void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor) { int colA = SkColorGetA(color); int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); colA = SkAlpha255To256(colA); uint8x8_t vcolR, vcolG, vcolB; uint16x8_t vcolA; if (width >= 8) { vcolA = vdupq_n_u16(colA); vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); vdst.val[NEON_A] = vdup_n_u8(0xFF); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); } }
void ComputesRGBLuminanceMask_NEON(const uint8_t *aSourceData, int32_t aSourceStride, uint8_t *aDestData, int32_t aDestStride, const IntSize &aSize, float aOpacity) { int32_t redFactor = 55 * aOpacity; // 255 * 0.2125 * opacity int32_t greenFactor = 183 * aOpacity; // 255 * 0.7154 * opacity int32_t blueFactor = 18 * aOpacity; // 255 * 0.0721 const uint8_t *sourcePixel = aSourceData; int32_t sourceOffset = aSourceStride - 4 * aSize.width; uint8_t *destPixel = aDestData; int32_t destOffset = aDestStride - aSize.width; sourcePixel = aSourceData; int32_t remainderWidth = aSize.width % 8; int32_t roundedWidth = aSize.width - remainderWidth; uint16x8_t temp; uint8x8_t gray; uint8x8_t redVector = vdup_n_u8(redFactor); uint8x8_t greenVector = vdup_n_u8(greenFactor); uint8x8_t blueVector = vdup_n_u8(blueFactor); uint8x8_t fullBitVector = vdup_n_u8(255); uint8x8_t oneVector = vdup_n_u8(1); for (int32_t y = 0; y < aSize.height; y++) { // Calculate luminance by neon with 8 pixels per loop for (int32_t x = 0; x < roundedWidth; x += 8) { uint8x8x4_t argb = vld4_u8(sourcePixel); temp = vmull_u8(argb.val[GFX_ARGB32_OFFSET_R], redVector); // temp = red * redFactor temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_G], greenVector); // temp += green * greenFactor temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_B], blueVector); // temp += blue * blueFactor gray = vshrn_n_u16(temp, 8); // gray = temp >> 8 // Check alpha value uint8x8_t alphaVector = vtst_u8(argb.val[GFX_ARGB32_OFFSET_A], fullBitVector); gray = vmul_u8(gray, vand_u8(alphaVector, oneVector)); // Put the result to the 8 pixels vst1_u8(destPixel, gray); sourcePixel += 8 * 4; destPixel += 8; } // Calculate the rest pixels of the line by cpu for (int32_t x = 0; x < remainderWidth; x++) { if (sourcePixel[GFX_ARGB32_OFFSET_A] > 0) { *destPixel = (redFactor * sourcePixel[GFX_ARGB32_OFFSET_R]+ greenFactor * sourcePixel[GFX_ARGB32_OFFSET_G] + blueFactor * sourcePixel[GFX_ARGB32_OFFSET_B]) >> 8; } else { *destPixel = 0; } sourcePixel += 4; destPixel++; }
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { int i; for (i = 0; i + 8 <= width; i += 8) { const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); vst1_u8(y + i, Y); } for (; i < width; ++i) { // left-over const uint32_t p = argb[i]; y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, YUV_HALF); } }
/** * @brief Grayscale 변환을 수행하는 thread 함수. NEON 명령어를 사용한다. * * @param arg 계산해야할 정보가 담겨있는 _thread_data 형식의 구조체 * * @return NULL */ static void *thread_calc(void *arg) { struct _thread_data *data = (struct _thread_data *)arg; uint8x8_t rfac = vdup_n_u8 (76); uint8x8_t gfac = vdup_n_u8 (151); uint8x8_t bfac = vdup_n_u8 (29); int n = data->size / 8; int m = data->size % 8; int iTemp; unsigned char szTemp[8]; unsigned int *data_in = data->data_in; unsigned int *data_out = data->data_out; // 한 루프당 8픽셀씩 변환 (32 bytes) while (n--) { uint16x8_t temp; uint8x8x4_t rgb = vld4_u8 ((unsigned char *)data_in); uint8x8_t result; temp = vmull_u8(rgb.val[0], bfac); temp = vmlal_u8(temp, rgb.val[1], gfac); temp = vmlal_u8(temp, rgb.val[2], rfac); result = vshrn_n_u16 (temp, 8); vst1_u8(szTemp, result); iTemp = szTemp[0]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[1]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[2]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[3]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[4]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[5]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[6]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[7]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; } if (m) { argb8888_to_gray(data_in, data_out, m); } return NULL; }
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst) { int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); uint8x8_t vcolR, vcolG, vcolB; uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; if (width >= 8) { vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; uint8x8_t vsel_trans, vsel_opq; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Prepare compare masks vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } // Leftovers for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], opaqueDst); } }
void test_vld4u8 (void) { uint8x8x4_t out_uint8x8x4_t; out_uint8x8x4_t = vld4_u8 (0); }
static INLINE void scaledconvolve_horiz_w4( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filters, const int x0_q4, const int x_step_q4, const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; y = h; do { int x_q4 = x0_q4; x = 0; do { // process 4 src_x steps for (z = 0; z < 4; ++z) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); t[0] = vget_low_s16(ss[0]); t[1] = vget_low_s16(ss[1]); t[2] = vget_low_s16(ss[2]); t[3] = vget_low_s16(ss[3]); t[4] = vget_high_s16(ss[0]); t[5] = vget_high_s16(ss[1]); t[6] = vget_high_s16(ss[2]); t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, filter3, filter4); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { int i; for (i = 0; i < 4; ++i) { temp[z * 4 + i] = src_x[i * src_stride + 3]; } } x_q4 += x_step_q4; } // transpose the 4x4 filters values back to dst { const uint8x8x4_t d4 = vld4_u8(temp); vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], vreinterpret_u32_u8(d4.val[0]), 0); vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], vreinterpret_u32_u8(d4.val[1]), 0); vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], vreinterpret_u32_u8(d4.val[2]), 0); vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], vreinterpret_u32_u8(d4.val[3]), 0); } x += 4; } while (x < w); src += src_stride * 4; dst += dst_stride * 4; y -= 4; } while (y > 0); }
inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); }
// // box blur a square array of pixels (power of 2, actually) // if we insist on powers of 2, we don't need to special case some end-of-row/col conditions // to a specific blur width // // also, we're using NEON to vectorize our arithmetic. // we need to do a division along the way, but NEON doesn't support integer division. // so rather than divide by, say "w", we multiply by magic(w). // magic(w) is chosen so that the result of multiplying by it will be the same as // dividing by w, except that the result will be in the high half of the result. // yes, dorothy... this is what compilers do, too... void NEONboxBlur(pixel *src, pixel *dest, unsigned int size, unsigned int blurRad) { unsigned int wid = 2 * blurRad + 1; // because NEON doesn't have integer division, we use "magic constants" that will give // use the result of division by multiplication -- the upper half of the result will be // (more or less) the result of the division. // for this, we need to compute the magic numbers corresponding to a given divisor struct magicu_info minfo = compute_unsigned_magic_info(wid, 16); int16x8_t preshift = vdupq_n_s16(-minfo.pre_shift); // negative means shift right int32x4_t postshift = vdupq_n_s32(-(minfo.post_shift+16)); // negative means shift right uint16x4_t magic = vdup_n_u16(minfo.multiplier); // fprintf(stderr,"width %5d, preshift %d, postshift %d + 16, increment %d, magic %d\n", wid, // minfo.pre_shift, minfo.post_shift, minfo.increment, minfo.multiplier); // if (minfo.pre_shift > 0) fprintf(stderr,"hey, not an odd number!\n"); int i, j, k, ch; for (i = 0 ; i < size ; i+=8) { // first, initialize the sum so that we can loop from 0 to size-1 // we'll initialize boxsum for index -1, so that we can move into 0 as part of our loop uint16x8x4_t boxsum; uint8x8x4_t firstpixel = vld4_u8((uint8_t *)(src + 0 * size + i)); for (ch = 0 ; ch < 4 ; ch++) { // boxsum[ch] = blurRad * srcpixel[ch] boxsum.val[ch] = vmulq_n_u16(vmovl_u8(firstpixel.val[ch]),(blurRad+1)+1); } for ( k = 1 ; k < blurRad ; k++) { uint8x8x4_t srcpixel = vld4_u8((uint8_t *)(src + k * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { boxsum.val[ch] = vaddw_u8(boxsum.val[ch], srcpixel.val[ch]); } } int right = blurRad-1; int left = -blurRad-1; if (minfo.increment) { for ( k = 0 ; k < size ; k++) { // move to next pixel unsigned int l = (left < 0)?0:left; // take off the old left left++; right++; unsigned int r = (right < size)?right:(size-1); // but add the new right uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i)); uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { // boxsum[ch] += addpixel[ch] - subpixel[ch]; boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]); } uint8x8x4_t destpixel; for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid // since 16bit multiplication leads to 32bit results, we need to // split our task into two chunks, for the hi and low half of our vector // (because otherwise, it won't all fit into 128 bits) // this is the meat of the magic division algorithm (see the include file...) uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift); // multiply by the magic number uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic); res_hi = vaddw_u16(res_hi, magic); // take the high half and post-shift uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift)); // pre-shift and multiply by the magic number uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic); res_lo = vaddw_u16(res_lo, magic); // take the high half and post-shift uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift)); destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi)); } pixel block[8]; vst4_u8((uint8_t *)&block, destpixel); for (j = 0 ; j < 8 ; j++ ) { dest[(i + j)*size + k] = block[j]; } // vst4_u8((uint8_t *)(dest + k * size + i), destpixel); } } else { for ( k = 0 ; k < size ; k++) { // move to next pixel unsigned int l = (left < 0)?0:left; // take off the old left left++; right++; unsigned int r = (right < size)?right:(size-1); // but add the new right uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i)); uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { // boxsum[ch] += addpixel[ch] - subpixel[ch]; boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]); } uint8x8x4_t destpixel; for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid // since 16bit multiplication leads to 32bit results, we need to // split our task into two chunks, for the hi and low half of our vector // (because otherwise, it won't all fit into 128 bits) // this is the meat of the magic division algorithm (see the include file...) uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift); // multiply by the magic number // take the high half and post-shift uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic); uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift)); // multiply by the magic number // take the high half and post-shift uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic); uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift)); destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi)); } pixel block[8]; vst4_u8((uint8_t *)&block, destpixel); for (j = 0 ; j < 8 ; j++ ) { dest[(i + j)*size + k] = block[j]; } // vst4_u8((uint8_t *)(dest + k * size + i), destpixel); } } } }