// Horizontal 4-tap reduction producing 8 widened (16-bit) results for interior columns.
// For output lane i the taps are src[2i-1], src[2i], src[2i+1], src[2i+2]; the two
// inner taps are scaled by K16_0003 (presumably a vector of 3s - confirm at its definition).
// Reads 16 bytes starting at src - 1 and 16 bytes starting at src + 1.
SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t *src)
{
    // De-interleaving loads: val[0] holds the even-offset bytes, val[1] the odd-offset ones.
    const uint8x8x2_t lower = vld2_u8(src - 1);
    const uint8x8x2_t upper = vld2_u8(src + 1);

    const uint16x8_t outerSum = vaddl_u8(lower.val[0], upper.val[1]);
    const uint16x8_t innerSum = vaddl_u8(lower.val[1], upper.val[0]);

    return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
}
// Tail variant (specialisation for `false`) of the horizontal 4-tap reduction.
// The first tap pair comes from a de-interleaving load at src - 1, exactly as in the
// body variant. The second pair is rebuilt from a single 16-byte load at src - 1 that
// is passed twice through LoadAfterLast<1> and then Deinterleave, instead of loading
// from src + 1.
// NOTE(review): presumably LoadAfterLast<1> shifts the vector by one byte and repeats
// the last byte, so nothing past src + 14 is read - confirm against the helper.
template <> SIMD_INLINE uint16x8_t ReduceColTail<false>(const uint8_t *src)
{
    const uint8x8x2_t lower = vld2_u8(src - 1);

    const uint8x16_t raw = vld1q_u8(src - 1);
    const uint8x8x2_t upper = Deinterleave(LoadAfterLast<1>(LoadAfterLast<1>(raw)));

    const uint16x8_t outerSum = vaddl_u8(lower.val[0], upper.val[1]);
    const uint16x8_t innerSum = vaddl_u8(lower.val[1], upper.val[0]);

    return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
}
// Nose (row start) variant of the horizontal 4-tap reduction.
// The first tap pair is derived from a 16-byte load at src passed through
// LoadBeforeFirst<1> and Deinterleave, rather than from a load at src - 1; the second
// pair is a regular de-interleaving load at src + 1.
// NOTE(review): presumably LoadBeforeFirst<1> shifts the vector by one byte and
// duplicates the first byte so that src[-1] is never read - confirm against the helper.
SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src)
{
    const uint8x16_t raw = vld1q_u8(src);
    const uint8x8x2_t lower = Deinterleave(LoadBeforeFirst<1>(raw));
    const uint8x8x2_t upper = vld2_u8(src + 1);

    const uint16x8_t outerSum = vaddl_u8(lower.val[0], upper.val[1]);
    const uint16x8_t innerSum = vaddl_u8(lower.val[1], upper.val[0]);

    return vaddq_u16(outerSum, vmulq_u16(innerSum, K16_0003));
}
/* Exercises the vld2_u8 intrinsic with a literal 0 address and stores the result in a
   local that is never read.
   NOTE(review): this looks like a compile-only check; actually calling it would load
   from address 0. */
void test_vld2u8 (void)
{
  uint8x8x2_t loaded = vld2_u8 (0);
}
// Overload-friendly wrapper: forwards to the vld2_u8 intrinsic, which performs a
// 2-way de-interleaving load of 16 bytes starting at ptr.
inline uint8x8x2_t vld2(const u8 * ptr)
{
    const uint8x8x2_t deinterleaved = vld2_u8(ptr);
    return deinterleaved;
}
void combineUYVY(const Size2D &size, const u8 * srcyBase, ptrdiff_t srcyStride, const u8 * srcuBase, ptrdiff_t srcuStride, const u8 * srcvBase, ptrdiff_t srcvStride, u8 * dstBase, ptrdiff_t dstStride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON #ifndef __ANDROID__ size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; #endif size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; for (size_t i = 0u; i < size.height; ++i) { const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); u8 * dst = internal::getRowPtr(dstBase, dstStride, i); size_t syj = 0u, sj = 0u, dj = 0u; #ifndef __ANDROID__ for (; sj < roiw32; sj += 32, syj += 64, dj += 128) { internal::prefetch(srcy + syj); internal::prefetch(srcu + sj); internal::prefetch(srcv + sj); uint8x16x2_t v_y = vld2q_u8(srcy + syj); uint8x16x4_t v_dst; v_dst.val[0] = vld1q_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj, v_dst); v_y = vld2q_u8(srcy + syj + 32); v_dst.val[0] = vld1q_u8(srcu + sj + 16); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj + 16); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj + 64, v_dst); } #endif for (; sj < roiw8; sj += 8, syj += 16, dj += 32) { uint8x8x2_t v_y = vld2_u8(srcy + syj); uint8x8x4_t v_dst; v_dst.val[0] = vld1_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4_u8(dst + dj, v_dst); } for (; sj < size.width; ++sj, syj += 2, dj += 4) { dst[dj] = srcu[sj]; dst[dj + 1] = srcy[syj]; dst[dj + 2] = srcv[sj]; dst[dj + 3] = srcy[syj + 1]; } } #else (void)size; (void)srcyBase; (void)srcyStride; (void)srcuBase; (void)srcuStride; (void)srcvBase; (void)srcvStride; (void)dstBase; (void)dstStride; #endif }
void rotate_down_scale_cbcr_to_cr_cb(int wDest, int hDest, int full_width, uint8_t* cbcr_src, uint8_t* cr_dst, uint8_t* cb_dst,bool_t clockWise,bool_t down_scale) { #ifdef __ARM_NEON__ int hSrc = down_scale?wDest*2:wDest; int wSrc = down_scale?hDest*2:hDest; int src_stride = 2*full_width; int signed_dst_stride; int incr; int y_step=down_scale?2:1; if (clockWise) { /* ms_warning("start writing destination buffer from top right");*/ cb_dst += wDest - 1; cr_dst += wDest - 1; incr = 1; signed_dst_stride = wDest; } else { /* ms_warning("start writing destination buffer from top right");*/ cb_dst += wDest * (hDest - 1); cr_dst += wDest * (hDest - 1); incr = -1; signed_dst_stride = -wDest; } int x,y; for (y=0; y<hSrc; y+=y_step) { uint8_t* cb_dst2 = cb_dst; uint8_t* cr_dst2 = cr_dst; for (x=0; x<2*wSrc; x+=16) { uint8x8x2_t tmp = vld2_u8 (cbcr_src+x); vst1_lane_u8 (cb_dst2, tmp.val[0], 0); vst1_lane_u8 (cr_dst2, tmp.val[1], 0); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; if (!down_scale) { vst1_lane_u8 (cb_dst2, tmp.val[0], 1); vst1_lane_u8 (cr_dst2, tmp.val[1], 1); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; } vst1_lane_u8 (cb_dst2, tmp.val[0], 2); vst1_lane_u8 (cr_dst2, tmp.val[1], 2); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; if (!down_scale) { vst1_lane_u8 (cb_dst2, tmp.val[0], 3); vst1_lane_u8 (cr_dst2, tmp.val[1], 3); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; } vst1_lane_u8 (cb_dst2, tmp.val[0], 4); vst1_lane_u8 (cr_dst2, tmp.val[1], 4); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; if (!down_scale) { vst1_lane_u8 (cb_dst2, tmp.val[0], 5); vst1_lane_u8 (cr_dst2, tmp.val[1], 5); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; } vst1_lane_u8 (cb_dst2, tmp.val[0], 6); vst1_lane_u8 (cr_dst2, tmp.val[1], 6); cb_dst2+=signed_dst_stride; cr_dst2+=signed_dst_stride; if (!down_scale) { vst1_lane_u8 (cb_dst2, tmp.val[0], 7); vst1_lane_u8 (cr_dst2, tmp.val[1], 7); cb_dst2+=signed_dst_stride; 
cr_dst2+=signed_dst_stride; } } cb_dst -= incr; cr_dst -= incr; cbcr_src += src_stride*y_step; } #else ms_error("Neon function '%s' used without hw neon support", __FUNCTION__); #endif }