Ejemplo n.º 1
0
static void interpolate5LineNeon(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width, int k1, int k2) {
	uint16x4_t kRedBlueMask_4 = vdup_n_u16(ColorMask::kRedBlueMask);
	uint16x4_t kGreenMask_4 = vdup_n_u16(ColorMask::kGreenMask);
	uint16x4_t k1_4 = vdup_n_u16(k1);
	uint16x4_t k2_4 = vdup_n_u16(k2);
	while (width >= 4) {
		uint16x4_t srcA_4 = vld1_u16(srcA);
		uint16x4_t srcB_4 = vld1_u16(srcB);
		uint16x4_t p1_4 = srcB_4;
		uint16x4_t p2_4 = srcA_4;

		uint16x4_t p1_rb_4 = vand_u16(p1_4, kRedBlueMask_4);
		uint16x4_t p1_g_4  = vand_u16(p1_4, kGreenMask_4);
		uint16x4_t p2_rb_4 = vand_u16(p2_4, kRedBlueMask_4);
		uint16x4_t p2_g_4  = vand_u16(p2_4, kGreenMask_4);

		uint32x4_t tmp_rb_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_rb_4, k2_4), p1_rb_4, k1_4), 3);
		uint32x4_t tmp_g_4  = vshrq_n_u32(vmlal_u16(vmull_u16(p2_g_4, k2_4), p1_g_4, k1_4), 3);
		uint16x4_t p_rb_4 = vmovn_u32(tmp_rb_4);
		p_rb_4 = vand_u16(p_rb_4, kRedBlueMask_4);
		uint16x4_t p_g_4 = vmovn_u32(tmp_g_4);
		p_g_4 = vand_u16(p_g_4, kGreenMask_4);

		uint16x4_t result_4 = p_rb_4 | p_g_4;
		vst1_u16(dst, result_4);

		dst += 4;
		srcA += 4;
		srcB += 4;
		width -= 4;
	}
}
void test_vdup_nu16 (void)
{
  uint16x4_t out_uint16x4_t;
  uint16_t arg0_uint16_t;

  out_uint16x4_t = vdup_n_u16 (arg0_uint16_t);
}
Ejemplo n.º 3
0
uint16x4_t test_vdup_n_u16(uint16_t v1) {
  // CHECK: test_vdup_n_u16
  return vdup_n_u16(v1);
  // CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
}
Ejemplo n.º 4
0
inline  uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); }
Ejemplo n.º 5
0
//
// box blur a square array of pixels (power of 2, actually)
// if we insist on powers of 2, we don't need to special case some end-of-row/col conditions
// to a specific blur width
//
// also, we're using NEON to vectorize our arithmetic.
// we need to do a division along the way, but NEON doesn't support integer division.
// so rather than divide by, say "w", we multiply by magic(w).
// magic(w) is chosen so that the result of multiplying by it will be the same as
// dividing by w, except that the result will be in the high half of the result.
// yes, dorothy... this is what compilers do, too...
void NEONboxBlur(pixel *src, pixel *dest, unsigned int size, unsigned int blurRad) {
	unsigned int wid = 2 * blurRad + 1;

	// because NEON doesn't have integer division, we use "magic constants" that will give
	// use the result of division by multiplication -- the upper half of the result will be
	// (more or less) the result of the division.
	// for this, we need to compute the magic numbers corresponding to a given divisor

	struct magicu_info minfo = compute_unsigned_magic_info(wid, 16);

	int16x8_t preshift  = vdupq_n_s16(-minfo.pre_shift); // negative means shift right
	int32x4_t postshift = vdupq_n_s32(-(minfo.post_shift+16)); // negative means shift right
	uint16x4_t magic    = vdup_n_u16(minfo.multiplier);

//	fprintf(stderr,"width %5d, preshift %d, postshift %d + 16, increment %d, magic %d\n", wid,
//			minfo.pre_shift, minfo.post_shift, minfo.increment, minfo.multiplier);

//	if (minfo.pre_shift > 0) fprintf(stderr,"hey, not an odd number!\n");

	int i, j, k, ch;
	for (i = 0 ; i < size ; i+=8) {
		// first, initialize the sum so that we can loop from 0 to size-1

		// we'll initialize boxsum for index -1, so that we can move into 0 as part of our loop
		uint16x8x4_t boxsum;
		uint8x8x4_t firstpixel = vld4_u8((uint8_t *)(src + 0 * size + i));
		for (ch = 0 ; ch < 4 ; ch++) {
			// boxsum[ch] = blurRad * srcpixel[ch]
			boxsum.val[ch] = vmulq_n_u16(vmovl_u8(firstpixel.val[ch]),(blurRad+1)+1);
		}
		for ( k = 1 ; k < blurRad ; k++) {
			uint8x8x4_t srcpixel = vld4_u8((uint8_t *)(src + k * size + i));
			for (ch = 0 ; ch < 4 ; ch++ ) {
				boxsum.val[ch] = vaddw_u8(boxsum.val[ch], srcpixel.val[ch]);
			}
		}

		int right = blurRad-1;
		int left = -blurRad-1;

		if (minfo.increment) {
			for ( k = 0 ; k < size ; k++) {
				// move to next pixel
				unsigned int l = (left < 0)?0:left; // take off the old left
				left++;
				right++;
				unsigned int r = (right < size)?right:(size-1); // but add the new right

				uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
				uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
				for (ch = 0 ; ch < 4 ; ch++ ) {
					// boxsum[ch] += addpixel[ch] - subpixel[ch];
					boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]);
				}

				uint8x8x4_t destpixel;
				for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid
					// since 16bit multiplication leads to 32bit results, we need to
					// split our task into two chunks, for the hi and low half of our vector
					// (because otherwise, it won't all fit into 128 bits)

					// this is the meat of the magic division algorithm (see the include file...)
					uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift);

					// multiply by the magic number
					uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
					res_hi = vaddw_u16(res_hi, magic);
					// take the high half and post-shift
					uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

					// pre-shift and multiply by the magic number
					uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
					res_lo = vaddw_u16(res_lo, magic);
					// take the high half and post-shift
					uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

					destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
				}
				pixel block[8];
				vst4_u8((uint8_t *)&block, destpixel);
				for (j = 0 ; j < 8 ; j++ ) {
					dest[(i + j)*size + k] = block[j];
				}
				//			vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
			}
		} else {
			for ( k = 0 ; k < size ; k++) {
				// move to next pixel
				unsigned int l = (left < 0)?0:left; // take off the old left
				left++;
				right++;
				unsigned int r = (right < size)?right:(size-1); // but add the new right

				uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
				uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
				for (ch = 0 ; ch < 4 ; ch++ ) {
					// boxsum[ch] += addpixel[ch] - subpixel[ch];
					boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]);
				}

				uint8x8x4_t destpixel;
				for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid
					// since 16bit multiplication leads to 32bit results, we need to
					// split our task into two chunks, for the hi and low half of our vector
					// (because otherwise, it won't all fit into 128 bits)

					// this is the meat of the magic division algorithm (see the include file...)
					uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift);

					// multiply by the magic number
					// take the high half and post-shift
					uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
					uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

					// multiply by the magic number
					// take the high half and post-shift
					uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
					uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

					destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
				}
				pixel block[8];
				vst4_u8((uint8_t *)&block, destpixel);
				for (j = 0 ; j < 8 ; j++ ) {
					dest[(i + j)*size + k] = block[j];
				}
				//			vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
			}
		}
	}
}
Ejemplo n.º 6
0
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) {
  int ay = _ay;
  int i;
  DATA32* pbuf = result;
	    uint16x4_t ay_16x4;
	    uint16x4_t p0_16x4;
	    uint16x4_t p2_16x4;
	    uint16x8_t ax_16x8;
	    uint16x8_t p0_p2_16x8;
	    uint16x8_t p1_p3_16x8;
	    uint16x8_t x255_16x8;
	    uint32x2_t p0_p2_32x2;
	    uint32x2_t p1_p3_32x2;
	    uint32x2_t res_32x2;
	    uint8x8_t p0_p2_8x8;
	    uint8x8_t p1_p3_8x8;
	    uint8x8_t p2_8x8;
	    uint16x4_t temp_16x4;

	    ay_16x4 = vdup_n_u16(ay);
	    x255_16x8 = vdupq_n_u16(0xff);
  for(i = 0; i < len; i++) {
    DATA32 p0 = *_p0++;
    DATA32 p1 = *_p1++;
    DATA32 p2 = *_p2++;
    DATA32 p3 = *_p3++;
    int ax = *_ax++;
		if (p0 | p1 | p2 | p3)
		  {
		    ax_16x8 = vdupq_n_u16(ax);

		    p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
		    p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
		    p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
		    p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

		    p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
		    p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
		    p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
		    p0_p2_16x8 = vmovl_u8(p0_p2_8x8);

		    p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
		    p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
		    p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
		    p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
		    p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

		    p0_16x4 = vget_low_u16(p1_p3_16x8);
		    p2_16x4 = vget_high_u16(p1_p3_16x8);

		    p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
		    p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
		    p2_16x4 = vshr_n_u16(p2_16x4, 8);
		    p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

		    p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4);
		    p2_8x8 = vmovn_u16(p1_p3_16x8);
		    res_32x2 = vreinterpret_u32_u8(p2_8x8);
		    vst1_lane_u32(pbuf++, res_32x2, 1);
		  }
		else
		  *pbuf++ = p0;

  }
	return 0;
}