/* Paeth predictor for eight byte lanes at once.
 *
 * For each lane the initial estimate is p = a + b - c.  The neighbour whose
 * value is closest to p is returned, with ties resolved in the order a, b, c.
 * All distances are computed in 16-bit lanes so that nothing wraps.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   /* |p - a| == |b - c|,  |p - b| == |a - c|,  |p - c| == |a + b - 2c| */
   const uint16x8_t dist_a = vabdl_u8(b, c);
   const uint16x8_t dist_b = vabdl_u8(a, c);
   const uint16x8_t dist_c = vabdq_u16(vaddl_u8(a, b), vaddl_u8(c, c));

   /* Lane masks (all ones / all zeros) for the two decisions. */
   const uint16x8_t a_is_best = vandq_u16(vcleq_u16(dist_a, dist_b),
                                          vcleq_u16(dist_a, dist_c));
   const uint16x8_t b_beats_c = vcleq_u16(dist_b, dist_c);

   /* Narrowing keeps the low byte of each mask lane, which is still
    * 0x00 or 0xFF, so it can drive the byte-wise selects directly.
    */
   const uint8x8_t b_or_c = vbsl_u8(vmovn_u16(b_beats_c), b, c);

   return vbsl_u8(vmovn_u16(a_is_best), a, b_or_c);
}
/* Issues a single vbsl_u8 call on three uint8x8_t operands.
 *
 * The operands are never initialised and the result is never read, so the
 * function has no observable output.
 * NOTE(review): this looks like a compile/codegen check for the intrinsic;
 * confirm before "fixing" the uninitialised operands, since giving them
 * constant values could change what the compiler emits.
 */
void test_vbslu8 (void)
{
  uint8x8_t select_mask;
  uint8x8_t when_set;
  uint8x8_t when_clear;
  uint8x8_t result;

  result = vbsl_u8 (select_mask, when_set, when_clear);
}
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 
(qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
// Blends a solid colour into a row of 32-bit destination pixels using a row
// of 16-bit LCD coverage masks, for the case handled by SkBlendLCD16Opaque.
//
// Eight pixels are processed per NEON iteration; whatever is left over
// (fewer than eight) goes through the scalar SkBlendLCD16Opaque.
//
// Per pixel in the vector path:
//   - mask == 0       : destination alpha is kept, otherwise alpha becomes 0xFF
//   - mask == 0xFFFF  : all four channels are taken from opaqueDst
//   - R, G, B         : SkBlend32_neon8(colour, dst, per-channel coverage)
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                               SkColor color, int width,
                               SkPMColor opaqueDst) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    if (width >= 8) {
        // Loop-invariant broadcasts.
        const uint8x8_t srcR = vdup_n_u8(colR);
        const uint8x8_t srcG = vdup_n_u8(colG);
        const uint8x8_t srcB = vdup_n_u8(colB);

        const uint8x8_t opqA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        const uint8x8_t opqR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        const uint8x8_t opqG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        const uint8x8_t opqB = vdup_n_u8(SkGetPackedB32(opaqueDst));

        const uint16x8_t maskNone = vdupq_n_u16(0);
        const uint16x8_t maskFull = vdupq_n_u16(0xFFFF);
        const uint16x8_t blueBits = vdupq_n_u16(SK_B16_MASK);
        const uint8x8_t alphaFull = vdup_n_u8(0xFF);

        do {
            // De-interleave eight destination pixels into per-channel vectors.
            uint8x8x4_t px = vld4_u8((uint8_t*)dst);
            const uint16x8_t lcd = vld1q_u16(src);

            // Byte-wide select masks for the two special coverage values.
            const uint8x8_t isTransparent = vmovn_u16(vceqq_u16(lcd, maskNone));
            const uint8x8_t isOpaque = vmovn_u16(vceqq_u16(lcd, maskFull));

            // Split the packed mask into per-channel coverage on 5 bits.
            uint16x8_t covR = vshrq_n_u16(lcd, SK_R16_SHIFT);
            uint16x8_t covG = vshrq_n_u16(vshlq_n_u16(lcd, SK_R16_BITS),
                                          SK_B16_BITS + SK_R16_BITS + 1);
            uint16x8_t covB = vandq_u16(lcd, blueBits);

            // Upscale to 0..32 by adding the top bit back in.
            covR = vaddq_u16(covR, vshrq_n_u16(covR, 4));
            covG = vaddq_u16(covG, vshrq_n_u16(covG, 4));
            covB = vaddq_u16(covB, vshrq_n_u16(covB, 4));

            // Alpha: keep dst where the mask is zero, else force 0xFF; fully
            // covered pixels then take the precomputed opaque alpha.
            px.val[NEON_A] = vbsl_u8(isTransparent, px.val[NEON_A], alphaFull);
            px.val[NEON_A] = vbsl_u8(isOpaque, opqA, px.val[NEON_A]);

            // Colour channels: blend first, then override fully covered pixels.
            px.val[NEON_R] = SkBlend32_neon8(srcR, px.val[NEON_R], covR);
            px.val[NEON_G] = SkBlend32_neon8(srcG, px.val[NEON_G], covG);
            px.val[NEON_B] = SkBlend32_neon8(srcB, px.val[NEON_B], covB);

            px.val[NEON_R] = vbsl_u8(isOpaque, opqR, px.val[NEON_R]);
            px.val[NEON_G] = vbsl_u8(isOpaque, opqG, px.val[NEON_G]);
            px.val[NEON_B] = vbsl_u8(isOpaque, opqB, px.val[NEON_B]);

            vst4_u8((uint8_t*)dst, px);

            dst += 8;
            src += 8;
            width -= 8;
        } while (width >= 8);
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh uint8x8_t d3u8, // p2 uint8x8_t d4u8, // p2 uint8x8_t d5u8, // p1 uint8x8_t d6u8, // p0 uint8x8_t d7u8, // q0 uint8x8_t d16u8, // q1 uint8x8_t d17u8, // q2 uint8x8_t d18u8, // q3 uint8x8_t *d0ru8, // p1 uint8x8_t *d1ru8, // p1 uint8x8_t *d2ru8, // p0 uint8x8_t *d3ru8, // q0 uint8x8_t *d4ru8, // q1 uint8x8_t *d5ru8) { // q1 uint32_t flat; uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; int16x8_t q15s16; uint16x8_t q10u16, q14u16; int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; d19u8 = vabd_u8(d3u8, d4u8); d20u8 = vabd_u8(d4u8, d5u8); d21u8 = vabd_u8(d5u8, d6u8); d22u8 = vabd_u8(d16u8, d7u8); d23u8 = vabd_u8(d17u8, d16u8); d24u8 = vabd_u8(d18u8, d17u8); d19u8 = vmax_u8(d19u8, d20u8); d20u8 = vmax_u8(d21u8, d22u8); d25u8 = vabd_u8(d6u8, d4u8); d23u8 = vmax_u8(d23u8, d24u8); d26u8 = vabd_u8(d7u8, d17u8); d19u8 = vmax_u8(d19u8, d20u8); d24u8 = vabd_u8(d6u8, d7u8); d27u8 = vabd_u8(d3u8, d6u8); d28u8 = vabd_u8(d18u8, d7u8); d19u8 = vmax_u8(d19u8, d23u8); d23u8 = vabd_u8(d5u8, d16u8); d24u8 = vqadd_u8(d24u8, d24u8); d19u8 = vcge_u8(dlimit, d19u8); d25u8 = vmax_u8(d25u8, d26u8); d26u8 = vmax_u8(d27u8, d28u8); d23u8 = vshr_n_u8(d23u8, 1); d25u8 = vmax_u8(d25u8, d26u8); d24u8 = vqadd_u8(d24u8, d23u8); d20u8 = vmax_u8(d20u8, d25u8); d23u8 = vdup_n_u8(1); d24u8 = vcge_u8(dblimit, d24u8); d21u8 = vcgt_u8(d21u8, dthresh); d20u8 = vcge_u8(d23u8, d20u8); d19u8 = vand_u8(d19u8, d24u8); d23u8 = vcgt_u8(d22u8, dthresh); d20u8 = vand_u8(d20u8, d19u8); d22u8 = vdup_n_u8(0x80); d23u8 = vorr_u8(d21u8, d23u8); q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); d30u8 = vshrn_n_u16(q10u16, 4); flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); if (flat == 0xffffffff) { // Check for all 1's, power_branch_only d27u8 = vdup_n_u8(3); d21u8 = vdup_n_u8(2); 
q14u16 = vaddl_u8(d6u8, d7u8); q14u16 = vmlal_u8(q14u16, d3u8, d27u8); q14u16 = vmlal_u8(q14u16, d4u8, d21u8); q14u16 = vaddw_u8(q14u16, d5u8); *d0ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d16u8); *d1ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d17u8); *d2ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d18u8); *d3ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vsubw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d18u8); *d4ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vsubw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d17u8); q14u16 = vaddw_u8(q14u16, d18u8); *d5ru8 = vqrshrn_n_u16(q14u16, 3); } else { d21u8 = veor_u8(d7u8, d22u8); d24u8 = veor_u8(d6u8, d22u8); d25u8 = veor_u8(d5u8, d22u8); d26u8 = veor_u8(d16u8, d22u8); d27u8 = vdup_n_u8(3); d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); q15s16 = vaddw_s8(q15s16, d29s8); d29u8 = vdup_n_u8(4); d28s8 = vqmovn_s16(q15s16); d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); d30s8 = vshr_n_s8(d30s8, 3); d29s8 = vshr_n_s8(d29s8, 3); d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); d29s8 = vrshr_n_s8(d29s8, 1); d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), 
d29s8); if (flat == 0) { // filter_branch_only *d0ru8 = d4u8; *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); *d5ru8 = d17u8; return; } d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); d23u8 = vdup_n_u8(2); q14u16 = vaddl_u8(d6u8, d7u8); q14u16 = vmlal_u8(q14u16, d3u8, d27u8); q14u16 = vmlal_u8(q14u16, d4u8, d23u8); d0u8 = vbsl_u8(d20u8, dblimit, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); d1u8 = vbsl_u8(d20u8, dlimit, d25u8); d30u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d16u8); d2u8 = vbsl_u8(d20u8, dthresh, d24u8); d31u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d17u8); *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); d23u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d7u8); *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); q14u16 = vaddw_u8(q14u16, d18u8); *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); d22u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vsubw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d16u8); d3u8 = vbsl_u8(d20u8, d3u8, d21u8); q14u16 = vaddw_u8(q14u16, d18u8); d4u8 = vbsl_u8(d20u8, d4u8, d26u8); d6u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vsubw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d17u8); q14u16 = vaddw_u8(q14u16, d18u8); d5u8 = vbsl_u8(d20u8, d5u8, d17u8); d7u8 = vqrshrn_n_u16(q14u16, 3); *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); } return; }
void yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count, unsigned char * destp) { const unsigned char *source_endp; const unsigned char *vector_endp; int remainder; const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0}; const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0}; const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF}; const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; int16x8_t mp0_rgba; /* macropixel 0's resulting RGBA RGBA pixels */ int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels */ uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ uint8x8_t rgba0, rgba1; /* rgba values as bytes */ uint8x16_t bothrgba; uint8_t * destinationp; /* pointer into output buffer destp */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; /* we're working with things in 4-byte macropixels */ remainder = source_byte_count % 4; source_endp = sourcep + source_byte_count; vector_endp = source_endp - remainder; destinationp = (uint8_t *)destp; while (sourcep < vector_endp) { /* pull YUYV from 2 four byte macropixels starting at sourcep. */ /* we'll increment sourcep as we go to save the array dereference */ /* and separate increment instruction at the end of the loop */ /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */ rawpixels = vld1_u8(sourcep); sourcep += sizeof(rawpixels); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* ---------- process macropixel 0 --------------- */ /* take macropixel zero ([YUYV]0) from rawpixels and */ /* compute the two RGBA pixels that come from it. 
store */ /* those two pixels in mp0_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]0 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP0_Y0); y1_vec = vdup_lane_u8(rawpixels, MP0_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); /* store ALPHA in elements 3 and 7 (after the RGB components) */ narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ /* change interpretation from unsigned 16-bit to signed */ /* 16-bit. */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_low_s16(widerpixels); /* form a vector of the U component from MP0 */ u_vec = vdupq_lane_s16(yuyv, MP0_U); /* subtract uvbias from u_vec */ u_vec = vsubq_s16(u_vec, uvbias); /* form a vector of the V component from MP0 */ v_vec = vdupq_lane_s16(yuyv, MP0_V); /* subtract uvbias from v_vec */ v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp0_rgba = y_vec + u_vec + v_vec */ mp0_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* ---------- process macropixel 1 --------------- */ /* take macropixel one ([YUYV]1) from rawpixels and */ /* compute the two RGBA pixels that come from it. 
store */ /* those two pixels in mp1_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]1 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP1_Y0); y1_vec = vdup_lane_u8(rawpixels, MP1_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_high_s16(widerpixels); u_vec = vdupq_lane_s16(yuyv, 1); u_vec = vsubq_s16(u_vec, uvbias); v_vec = vdupq_lane_s16(yuyv, 3); v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp1_rgba = y_vec + u_vec + v_vec */ mp1_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* turn mp0_rgba from a vector of shorts to a vector of */ /* unsigned unsigned chars. this will saturate: clipping */ /* the values between 0 and 255. */ rgba0 = vqmovun_s16(mp0_rgba); rgba1 = vqmovun_s16(mp1_rgba); /* make it faster to copy these back out of vector registers into */ /* memory by combining rgba0 and rgba1 into the larger bothrgba. */ /* then store that back into memory at destinationp. 
*/ bothrgba = vcombine_u8(rgba0, rgba1); vst1q_u8(destinationp, bothrgba); destinationp += 16; } }