void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor) { int colA = SkColorGetA(color); int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); colA = SkAlpha255To256(colA); uint8x8_t vcolR, vcolG, vcolB; uint16x8_t vcolA; if (width >= 8) { vcolA = vdupq_n_u16(colA); vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); vdst.val[NEON_A] = vdup_n_u8(0xFF); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); } }
inline int v_signmask(const v_uint16x8& a) { int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); }
QT_BEGIN_NAMESPACE static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half) { // result = (x + (x >> 8) + 0x80) >> 8 const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8 const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80 const int16x8_t sum = vaddq_s16(temp, sum_part); return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8)); }
SIMD_INLINE uint16x8_t DivideBy64(uint16x8_t value) { return vshrq_n_u16(vaddq_u16(value, K16_0020), 6); }
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst) { int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); uint8x8_t vcolR, vcolG, vcolB; uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; if (width >= 8) { vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; uint8x8_t vsel_trans, vsel_opq; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Prepare compare masks vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } // Leftovers for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], opaqueDst); } }
template <> SIMD_INLINE uint16x8_t DivideBy16<false>(uint16x8_t value) { return vshrq_n_u16(value, 4); }
template <> SIMD_INLINE uint16x8_t DivideBy16<true>(uint16x8_t value) { return vshrq_n_u16(vaddq_u16(value, K16_0008), 4); }
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) { int ay = _ay; int i; DATA32* pbuf = result; uint16x4_t ay_16x4; uint16x4_t p0_16x4; uint16x4_t p2_16x4; uint16x8_t ax_16x8; uint16x8_t p0_p2_16x8; uint16x8_t p1_p3_16x8; uint16x8_t x255_16x8; uint32x2_t p0_p2_32x2; uint32x2_t p1_p3_32x2; uint32x2_t res_32x2; uint8x8_t p0_p2_8x8; uint8x8_t p1_p3_8x8; uint8x8_t p2_8x8; uint16x4_t temp_16x4; ay_16x4 = vdup_n_u16(ay); x255_16x8 = vdupq_n_u16(0xff); for(i = 0; i < len; i++) { DATA32 p0 = *_p0++; DATA32 p1 = *_p1++; DATA32 p2 = *_p2++; DATA32 p3 = *_p3++; int ax = *_ax++; if (p0 | p1 | p2 | p3) { ax_16x8 = vdupq_n_u16(ax); p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0); p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1); p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0); p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1); p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2); p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2); p1_p3_16x8 = vmovl_u8(p1_p3_8x8); p0_p2_16x8 = vmovl_u8(p0_p2_8x8); p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8); p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8); p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8); p0_16x4 = vget_low_u16(p1_p3_16x8); p2_16x4 = vget_high_u16(p1_p3_16x8); p2_16x4 = vsub_u16(p2_16x4, p0_16x4); p2_16x4 = vmul_u16(p2_16x4, ay_16x4); p2_16x4 = vshr_n_u16(p2_16x4, 8); p2_16x4 = vadd_u16(p2_16x4, p0_16x4); p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4); p2_8x8 = vmovn_u16(p1_p3_16x8); res_32x2 = vreinterpret_u32_u8(p2_8x8); vst1_lane_u32(pbuf++, res_32x2, 1); } else *pbuf++ = p0; } return 0; }