예제 #1
0
		// Produces eight 16-bit sums from the 18 bytes src[-1] .. src[16].
		// Output lane i is
		//     src[2*i - 1] + src[2*i + 2] + K16_0003 * (src[2*i] + src[2*i + 1]).
		// NOTE(review): K16_0003 is defined elsewhere; presumably every lane holds 3,
		// which would make this a 1-3-3-1 horizontal filter with 2:1 decimation - confirm.
		SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t *src)
		{
			// De-interleaving loads: val[0] receives the bytes at even offsets from the
			// load address, val[1] the bytes at odd offsets.
			const uint8x8x2_t fromPrev = vld2_u8(src - 1);
			const uint8x8x2_t fromNext = vld2_u8(src + 1);

			// Widen to 16 bits while adding so the byte sums cannot wrap.
			const uint16x8_t outerSum = vaddl_u8(fromPrev.val[0], fromNext.val[1]);
			const uint16x8_t innerSum = vaddl_u8(fromPrev.val[1], fromNext.val[0]);

			return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
		}
예제 #2
0
		// Tail variant of the column reduction for the <false> specialization.
		// The left-hand taps come from a de-interleaving load at src - 1. The
		// right-hand taps are not read from src + 1; they are rebuilt from a plain
		// 16-byte load at src - 1 passed through LoadAfterLast<1> twice and then
		// Deinterleave.
		// NOTE(review): LoadAfterLast and Deinterleave are defined elsewhere; this
		// presumably avoids reading past the last valid byte of the row - confirm.
		template <> SIMD_INLINE uint16x8_t ReduceColTail<false>(const uint8_t *src)
		{
			const uint8x8x2_t fromPrev = vld2_u8(src - 1);

			const uint8x16_t rowEnd = vld1q_u8(src - 1);
			const uint8x8x2_t fromNext = Deinterleave(LoadAfterLast<1>(LoadAfterLast<1>(rowEnd)));

			// Same combination as the body variant: outer taps plus weighted inner taps.
			const uint16x8_t outerSum = vaddl_u8(fromPrev.val[0], fromNext.val[1]);
			const uint16x8_t innerSum = vaddl_u8(fromPrev.val[1], fromNext.val[0]);

			return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
		}
예제 #3
0
		// Nose (row start) variant of the column reduction.
		// The right-hand taps come from a de-interleaving load at src + 1. The
		// left-hand taps are not read from src - 1; they are rebuilt from a plain
		// 16-byte load at src passed through LoadBeforeFirst<1> and Deinterleave.
		// NOTE(review): LoadBeforeFirst and Deinterleave are defined elsewhere; this
		// presumably avoids reading before the first valid byte of the row - confirm.
		SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src)
		{
			const uint8x16_t rowStart = vld1q_u8(src);
			const uint8x8x2_t fromPrev = Deinterleave(LoadBeforeFirst<1>(rowStart));

			const uint8x8x2_t fromNext = vld2_u8(src + 1);

			// Same combination as the body variant: outer taps plus weighted inner taps.
			const uint16x8_t outerSum = vaddl_u8(fromPrev.val[0], fromNext.val[1]);
			const uint16x8_t innerSum = vaddl_u8(fromPrev.val[1], fromNext.val[0]);

			return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
		}
예제 #4
0
/* Calls the vld2_u8 intrinsic once and stores its result in a uint8x8x2_t local.
   The result is never read afterwards.
   NOTE(review): the pointer argument is the literal 0, so this looks like a
   compile-only check of the intrinsic's signature and is presumably never
   meant to be executed - confirm against the test harness. */
void test_vld2u8 (void)
{
  /* Receives the two de-interleaved 8-byte vectors returned by vld2_u8. */
  uint8x8x2_t out_uint8x8x2_t;

  out_uint8x8x2_t = vld2_u8 (0);
}
예제 #5
0
// Type-agnostic spelling of the two-way de-interleaving load for u8 data:
// forwards the pointer unchanged to the vld2_u8 intrinsic and returns its result.
inline uint8x8x2_t vld2(const u8 * ptr)
{
    return vld2_u8(ptr);
}
예제 #6
0
/*
 * Interleaves three separate planes into one packed buffer laid out as
 * U, Y, V, Y for every chroma sample.
 *
 * size.width is the number of chroma samples handled per row: each one takes
 * one byte from the U row, one byte from the V row and two bytes from the Y row,
 * and produces four output bytes. size.height is the number of rows.
 * Each *Base / *Stride pair is handed to internal::getRowPtr together with the
 * row index to obtain that row's pointer.
 *
 * When CAROTENE_NEON is not defined, only the configuration check runs and the
 * arguments are otherwise unused.
 */
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
    // Exclusive bound for the 32-sample loop: x < width - 31 means x + 32 <= width.
    // Clamped to 0 so the unsigned subtraction cannot wrap.
    const size_t wideLimit = size.width >= 31 ? size.width - 31 : 0;
#endif
    // Exclusive bound for the 8-sample loop, clamped the same way.
    const size_t narrowLimit = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * yRow = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * uRow = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * vRow = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);

        // x counts chroma samples; the matching luma offset is 2 * x and the
        // matching output offset is 4 * x throughout all three loops.
        size_t x = 0u;

#ifndef __ANDROID__
        // 32 chroma samples per pass (64 luma bytes in, 128 bytes out), processed
        // as two 16-sample halves.
        for (; x < wideLimit; x += 32)
        {
            const u8 * yPtr = yRow + 2 * x;
            const u8 * uPtr = uRow + x;
            const u8 * vPtr = vRow + x;
            u8 * outPtr = outRow + 4 * x;

            internal::prefetch(yPtr);
            internal::prefetch(uPtr);
            internal::prefetch(vPtr);

            // First half: vld2q splits luma into even/odd bytes, vst4q then
            // interleaves the four vectors as U, Y(even), V, Y(odd).
            uint8x16x2_t lumaPair = vld2q_u8(yPtr);
            uint8x16x4_t packed;
            packed.val[0] = vld1q_u8(uPtr);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1q_u8(vPtr);
            packed.val[3] = lumaPair.val[1];
            vst4q_u8(outPtr, packed);

            // Second half: 16 chroma samples further on.
            lumaPair = vld2q_u8(yPtr + 32);
            packed.val[0] = vld1q_u8(uPtr + 16);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1q_u8(vPtr + 16);
            packed.val[3] = lumaPair.val[1];
            vst4q_u8(outPtr + 64, packed);
        }
#endif

        // 8 chroma samples per pass using the 64-bit vector forms.
        for (; x < narrowLimit; x += 8)
        {
            const uint8x8x2_t lumaPair = vld2_u8(yRow + 2 * x);
            uint8x8x4_t packed;
            packed.val[0] = vld1_u8(uRow + x);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1_u8(vRow + x);
            packed.val[3] = lumaPair.val[1];
            vst4_u8(outRow + 4 * x, packed);
        }

        // Scalar tail: one chroma sample (four output bytes) at a time.
        for (; x < size.width; ++x)
        {
            u8 * out = outRow + 4 * x;
            const u8 * luma = yRow + 2 * x;

            out[0] = uRow[x];
            out[1] = luma[0];
            out[2] = vRow[x];
            out[3] = luma[1];
        }
    }
#else
    // No NEON build: silence unused-parameter warnings.
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
예제 #7
0
/*
 * Splits an interleaved chroma plane (even bytes go to cb_dst, odd bytes to
 * cr_dst) into two separate planes, writing every source row along one
 * destination column so that the output is turned a quarter turn.
 *
 * wDest, hDest : destination row length (also used as destination stride) and
 *                number of destination rows.
 * full_width   : source row pitch in chroma pairs; the source pointer advances
 *                by 2*full_width bytes per source row.
 * clockWise    : non-zero starts at the last byte of the first destination row
 *                and moves down each column, stepping one column left per source row;
 *                zero starts at the first byte of the last destination row and
 *                moves up each column, stepping one column right per source row.
 * down_scale   : non-zero keeps only every second chroma pair of every second
 *                source row.
 *
 * NOTE(review): the inner loop always consumes 16 source bytes per iteration
 * and has no tail handling, so it relies on the source row byte count being a
 * multiple of 16 - confirm callers guarantee this.
 *
 * Without __ARM_NEON__ the function only logs an error.
 */
void rotate_down_scale_cbcr_to_cr_cb(int wDest, int hDest, int full_width, uint8_t* cbcr_src, uint8_t* cr_dst, uint8_t* cb_dst,bool_t clockWise,bool_t down_scale) {
#ifdef __ARM_NEON__
	/* Source geometry: the destination dimensions swapped, doubled when down-scaling. */
	const int src_rows = down_scale ? wDest*2 : wDest;
	const int src_cols = down_scale ? hDest*2 : hDest;
	const int row_step = down_scale ? 2 : 1;
	const int src_advance = 2*full_width*row_step;

	/* Step between consecutive outputs of one source row (one destination row down or up). */
	const int dst_step = clockWise ? wDest : -wDest;
	/* Column shift applied after each source row. */
	const int column_shift = clockWise ? -1 : 1;
	/* Clockwise: start at the top right. Counter-clockwise: start at the bottom left. */
	const int start_offset = clockWise ? wDest - 1 : wDest * (hDest - 1);

	int x,y;

	/* Store lane n of the current Cb and Cr vectors, then move both cursors by dst_step.
	   The lane index must be a compile-time constant, hence a macro and not a loop. */
#define ROTATE_EMIT_LANE(n) \
	do { \
		vst1_lane_u8 (cb_cursor, chroma.val[0], n); \
		vst1_lane_u8 (cr_cursor, chroma.val[1], n); \
		cb_cursor += dst_step; \
		cr_cursor += dst_step; \
	} while (0)

	cb_dst += start_offset;
	cr_dst += start_offset;

	for (y=0; y<src_rows; y+=row_step) {
		uint8_t* cb_cursor = cb_dst;
		uint8_t* cr_cursor = cr_dst;
		for (x=0; x<2*src_cols; x+=16) {
			/* De-interleave 8 Cb/Cr pairs: val[0] holds Cb, val[1] holds Cr. */
			uint8x8x2_t chroma = vld2_u8 (cbcr_src+x);

			if (down_scale) {
				/* Keep every second pair only. */
				ROTATE_EMIT_LANE(0);
				ROTATE_EMIT_LANE(2);
				ROTATE_EMIT_LANE(4);
				ROTATE_EMIT_LANE(6);
			} else {
				ROTATE_EMIT_LANE(0);
				ROTATE_EMIT_LANE(1);
				ROTATE_EMIT_LANE(2);
				ROTATE_EMIT_LANE(3);
				ROTATE_EMIT_LANE(4);
				ROTATE_EMIT_LANE(5);
				ROTATE_EMIT_LANE(6);
				ROTATE_EMIT_LANE(7);
			}
		}
		/* Next source row goes to the neighbouring destination column. */
		cb_dst += column_shift;
		cr_dst += column_shift;
		cbcr_src += src_advance;
	}
#undef ROTATE_EMIT_LANE
#else
	ms_error("Neon function '%s' used without hw neon support", __FUNCTION__);
#endif
}