Ejemplo n.º 1
0
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                                   SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    colA = SkAlpha255To256(colA);

    uint8x8_t vcolR, vcolG, vcolB;
    uint16x8_t vcolA;

    if (width >= 8) {
        vcolA = vdupq_n_u16(colA);
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
Ejemplo n.º 2
0
void
ComputesRGBLuminanceMask_NEON(const uint8_t *aSourceData,
                              int32_t aSourceStride,
                              uint8_t *aDestData,
                              int32_t aDestStride,
                              const IntSize &aSize,
                              float aOpacity)
{
  int32_t redFactor = 55 * aOpacity; // 255 * 0.2125 * opacity
  int32_t greenFactor = 183 * aOpacity; // 255 * 0.7154 * opacity
  int32_t blueFactor = 18 * aOpacity; // 255 * 0.0721
  const uint8_t *sourcePixel = aSourceData;
  int32_t sourceOffset = aSourceStride - 4 * aSize.width;
  uint8_t *destPixel = aDestData;
  int32_t destOffset = aDestStride - aSize.width;

  sourcePixel = aSourceData;
  int32_t remainderWidth = aSize.width % 8;
  int32_t roundedWidth = aSize.width - remainderWidth;
  uint16x8_t temp;
  uint8x8_t gray;
  uint8x8_t redVector = vdup_n_u8(redFactor);
  uint8x8_t greenVector = vdup_n_u8(greenFactor);
  uint8x8_t blueVector = vdup_n_u8(blueFactor);
  uint8x8_t fullBitVector = vdup_n_u8(255);
  uint8x8_t oneVector = vdup_n_u8(1);
  for (int32_t y = 0; y < aSize.height; y++) {
    // Calculate luminance by neon with 8 pixels per loop
    for (int32_t x = 0; x < roundedWidth; x += 8) {
      uint8x8x4_t argb  = vld4_u8(sourcePixel);
      temp = vmull_u8(argb.val[GFX_ARGB32_OFFSET_R], redVector); // temp = red * redFactor
      temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_G], greenVector); // temp += green * greenFactor
      temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_B], blueVector); // temp += blue * blueFactor
      gray = vshrn_n_u16(temp, 8); // gray = temp >> 8

      // Check alpha value
      uint8x8_t alphaVector = vtst_u8(argb.val[GFX_ARGB32_OFFSET_A], fullBitVector);
      gray = vmul_u8(gray, vand_u8(alphaVector, oneVector));

      // Put the result to the 8 pixels
      vst1_u8(destPixel, gray);
      sourcePixel += 8 * 4;
      destPixel += 8;
    }

    // Calculate the rest pixels of the line by cpu
    for (int32_t x = 0; x < remainderWidth; x++) {
      if (sourcePixel[GFX_ARGB32_OFFSET_A] > 0) {
        *destPixel = (redFactor * sourcePixel[GFX_ARGB32_OFFSET_R]+
                      greenFactor * sourcePixel[GFX_ARGB32_OFFSET_G] +
                      blueFactor * sourcePixel[GFX_ARGB32_OFFSET_B]) >> 8;
      } else {
        *destPixel = 0;
      }
      sourcePixel += 4;
      destPixel++;
    }
Ejemplo n.º 3
0
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
  int i;
  for (i = 0; i + 8 <= width; i += 8) {
    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
    vst1_u8(y + i, Y);
  }
  for (; i < width; ++i) {   // left-over
    const uint32_t p = argb[i];
    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
                     YUV_HALF);
  }
}
/**
 * @brief Grayscale 변환을 수행하는 thread 함수. NEON 명령어를 사용한다.
 *
 * @param arg 계산해야할 정보가 담겨있는 _thread_data 형식의 구조체
 *
 * @return NULL
 */
static void *thread_calc(void *arg)
{
    struct _thread_data *data = (struct _thread_data *)arg;

    uint8x8_t rfac = vdup_n_u8 (76);
    uint8x8_t gfac = vdup_n_u8 (151);
    uint8x8_t bfac = vdup_n_u8 (29);
    int n = data->size / 8;
    int m = data->size % 8;

    int iTemp;
    unsigned char szTemp[8];

    unsigned int *data_in  = data->data_in;
    unsigned int *data_out = data->data_out;

    // 한 루프당 8픽셀씩 변환 (32 bytes)
    while (n--) {
        uint16x8_t  temp;
        uint8x8x4_t rgb  = vld4_u8 ((unsigned char *)data_in);
        uint8x8_t result;

        temp = vmull_u8(rgb.val[0],       bfac);
        temp = vmlal_u8(temp, rgb.val[1], gfac);
        temp = vmlal_u8(temp, rgb.val[2], rfac);

        result = vshrn_n_u16 (temp, 8);
        vst1_u8(szTemp, result);

        iTemp = szTemp[0]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[1]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[2]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[3]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[4]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[5]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[6]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[7]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
    }

    if (m) {
        argb8888_to_gray(data_in, data_out, m);
    }

    return NULL;
}
Ejemplo n.º 5
0
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                        SkColor color, int width,
                                        SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    if (width >= 8) {
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
void test_vld4u8 (void)
{
  uint8x8x4_t out_uint8x8x4_t;

  out_uint8x8x4_t = vld4_u8 (0);
}
Ejemplo n.º 7
0
static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filters values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}
Ejemplo n.º 8
0
inline   uint8x8x4_t vld4(const u8  * ptr) { return  vld4_u8(ptr); }
Ejemplo n.º 9
0
//
// box blur a square array of pixels (power of 2, actually)
// if we insist on powers of 2, we don't need to special case some end-of-row/col conditions
// to a specific blur width
//
// also, we're using NEON to vectorize our arithmetic.
// we need to do a division along the way, but NEON doesn't support integer division.
// so rather than divide by, say "w", we multiply by magic(w).
// magic(w) is chosen so that the result of multiplying by it will be the same as
// dividing by w, except that the result will be in the high half of the result.
// yes, dorothy... this is what compilers do, too...
void NEONboxBlur(pixel *src, pixel *dest, unsigned int size, unsigned int blurRad) {
	unsigned int wid = 2 * blurRad + 1;

	// because NEON doesn't have integer division, we use "magic constants" that will give
	// use the result of division by multiplication -- the upper half of the result will be
	// (more or less) the result of the division.
	// for this, we need to compute the magic numbers corresponding to a given divisor

	struct magicu_info minfo = compute_unsigned_magic_info(wid, 16);

	int16x8_t preshift  = vdupq_n_s16(-minfo.pre_shift); // negative means shift right
	int32x4_t postshift = vdupq_n_s32(-(minfo.post_shift+16)); // negative means shift right
	uint16x4_t magic    = vdup_n_u16(minfo.multiplier);

//	fprintf(stderr,"width %5d, preshift %d, postshift %d + 16, increment %d, magic %d\n", wid,
//			minfo.pre_shift, minfo.post_shift, minfo.increment, minfo.multiplier);

//	if (minfo.pre_shift > 0) fprintf(stderr,"hey, not an odd number!\n");

	int i, j, k, ch;
	for (i = 0 ; i < size ; i+=8) {
		// first, initialize the sum so that we can loop from 0 to size-1

		// we'll initialize boxsum for index -1, so that we can move into 0 as part of our loop
		uint16x8x4_t boxsum;
		uint8x8x4_t firstpixel = vld4_u8((uint8_t *)(src + 0 * size + i));
		for (ch = 0 ; ch < 4 ; ch++) {
			// boxsum[ch] = blurRad * srcpixel[ch]
			boxsum.val[ch] = vmulq_n_u16(vmovl_u8(firstpixel.val[ch]),(blurRad+1)+1);
		}
		for ( k = 1 ; k < blurRad ; k++) {
			uint8x8x4_t srcpixel = vld4_u8((uint8_t *)(src + k * size + i));
			for (ch = 0 ; ch < 4 ; ch++ ) {
				boxsum.val[ch] = vaddw_u8(boxsum.val[ch], srcpixel.val[ch]);
			}
		}

		int right = blurRad-1;
		int left = -blurRad-1;

		if (minfo.increment) {
			for ( k = 0 ; k < size ; k++) {
				// move to next pixel
				unsigned int l = (left < 0)?0:left; // take off the old left
				left++;
				right++;
				unsigned int r = (right < size)?right:(size-1); // but add the new right

				uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
				uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
				for (ch = 0 ; ch < 4 ; ch++ ) {
					// boxsum[ch] += addpixel[ch] - subpixel[ch];
					boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]);
				}

				uint8x8x4_t destpixel;
				for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid
					// since 16bit multiplication leads to 32bit results, we need to
					// split our task into two chunks, for the hi and low half of our vector
					// (because otherwise, it won't all fit into 128 bits)

					// this is the meat of the magic division algorithm (see the include file...)
					uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift);

					// multiply by the magic number
					uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
					res_hi = vaddw_u16(res_hi, magic);
					// take the high half and post-shift
					uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

					// pre-shift and multiply by the magic number
					uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
					res_lo = vaddw_u16(res_lo, magic);
					// take the high half and post-shift
					uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

					destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
				}
				pixel block[8];
				vst4_u8((uint8_t *)&block, destpixel);
				for (j = 0 ; j < 8 ; j++ ) {
					dest[(i + j)*size + k] = block[j];
				}
				//			vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
			}
		} else {
			for ( k = 0 ; k < size ; k++) {
				// move to next pixel
				unsigned int l = (left < 0)?0:left; // take off the old left
				left++;
				right++;
				unsigned int r = (right < size)?right:(size-1); // but add the new right

				uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
				uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
				for (ch = 0 ; ch < 4 ; ch++ ) {
					// boxsum[ch] += addpixel[ch] - subpixel[ch];
					boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]);
				}

				uint8x8x4_t destpixel;
				for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid
					// since 16bit multiplication leads to 32bit results, we need to
					// split our task into two chunks, for the hi and low half of our vector
					// (because otherwise, it won't all fit into 128 bits)

					// this is the meat of the magic division algorithm (see the include file...)
					uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift);

					// multiply by the magic number
					// take the high half and post-shift
					uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
					uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

					// multiply by the magic number
					// take the high half and post-shift
					uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
					uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

					destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
				}
				pixel block[8];
				vst4_u8((uint8_t *)&block, destpixel);
				for (j = 0 ; j < 8 ; j++ ) {
					dest[(i + j)*size + k] = block[j];
				}
				//			vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
			}
		}
	}
}