void ar_stride2_vmaxall_u8_neon(const uint8_t* a, uint32_t n, uint8_t* line_results) { uint8x16x2_t a_loaded; uint8x16_t line0_max = vdupq_n_u8(0); uint8x16_t line1_max = vdupq_n_u8(0); line_results[0] = 0; line_results[1] = 0; uint8_t line0_array[16]; uint8_t line1_array[16]; for (uint32_t i = 0; i < n; i += 32) { a_loaded = vld2q_u8(&(a[i])); line0_max = vmaxq_u8(a_loaded.val[0], line0_max); line1_max = vmaxq_u8(a_loaded.val[1], line1_max); } vst1q_u8(line0_array, line0_max); vst1q_u8(line1_array, line1_max); for (uint32_t i = 0; i < 16; i++) { line_results[0] = ar_max_u8(line_results[0], line0_array[i]); line_results[1] = ar_max_u8(line_results[1], line1_array[i]); } }
/* Shuffle kernel specialised for elements that are 2 bytes wide.
 *
 * Reads `vectorizable_elements * 2` bytes from `src` in 32-byte steps. Each
 * step is de-interleaved by vld2q_u8 into two 16-byte vectors: the first goes
 * to the byte plane starting at `dest`, the second to the plane starting at
 * `dest + total_elements`. */
static void
shuffle2_neon(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  const size_t nbytes = vectorizable_elements * bytesoftype;
  size_t offset = 0;  /* byte position in src */
  size_t block = 0;   /* index of the 16-byte slot written in each plane */

  while (offset < nbytes) {
    /* Load 32 bytes, split into even-indexed and odd-indexed bytes */
    const uint8x16x2_t planes = vld2q_u8(src + offset);

    /* First byte of every element goes to plane 0, second byte to plane 1 */
    vst1q_u8(dest + block * 16, planes.val[0]);
    vst1q_u8(dest + total_elements + block * 16, planes.val[1]);

    offset += 32;
    block++;
  }
}
/* Exercises the vld2q_u8 intrinsic and keeps its result in a local.
 * NOTE(review): the argument is a literal 0, so this is presumably meant as a
 * compile-only check -- confirm that it is never executed. */
void test_vld2Qu8 (void)
{
  uint8x16x2_t loaded = vld2q_u8 (0);
  (void) loaded;
}
out->buf[oy*out->stride+ox] = pad; } } return out; }
#ifdef __ARM_NEON__
#include <arm_neon.h>
/*
 * Halves an 8-bit image in both dimensions by averaging 2x2 blocks with NEON.
 *
 * dest, deststride      output buffer and the byte distance between its rows
 * destwidth, destheight number of output columns / rows to produce
 * src, srcstride        input buffer and the byte distance between its rows
 * srcwidth, srcheight   not referenced by this function
 *
 * For every output row, two consecutive source rows are read and the source
 * pointer then advances by two rows.
 *
 * NOTE(review): x advances by 8, but each iteration loads 32 source bytes per
 * row and stores 16 output bytes, so consecutive iterations overlap and the
 * last store of a row can extend past destwidth. Confirm that the buffers are
 * padded accordingly, or whether the step was meant to be 16.
 */
void neon_decimate2(uint8_t * __restrict dest, int destwidth, int destheight, int deststride, uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride) {
    for (int y = 0; y < destheight; y++) {
        for (int x = 0; x < destwidth; x+=8) {
            /* De-interleaving loads: val[0] holds the even-indexed source
             * bytes, val[1] the odd-indexed ones, for the upper (row0) and
             * lower (row1) source rows. */
            uint8x16x2_t row0 = vld2q_u8(src + 2*x);
            uint8x16x2_t row1 = vld2q_u8(src + 2*x + srcstride);
            /* Halving adds pair the pixels diagonally first ... */
            uint8x16_t sum0 = vhaddq_u8(row0.val[0], row1.val[1]);
            uint8x16_t sum1 = vhaddq_u8(row1.val[0], row0.val[1]);
            /* ... and a third halving add combines both diagonals, giving a
             * truncated average of the four pixels of each 2x2 block. */
            uint8x16_t sum = vhaddq_u8(sum0, sum1);
            vst1q_u8(dest + x, sum);
        }
        /* One output row consumes two source rows. */
        src += 2*srcstride;
        dest += deststride;
    }
}
void neon_decimate3(uint8_t * __restrict dest, int destwidth, int destheight, int deststride, uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride) { for (int y = 0; y < destheight; y++) {
// Type-suffix-free spelling of the 2-way de-interleaving 128-bit load for
// u8 data: forwards `ptr` to vld2q_u8 and returns the two resulting vectors.
inline uint8x16x2_t vld2q(const u8 * ptr)
{
    uint8x16x2_t planes = vld2q_u8(ptr);
    return planes;
}
void combineUYVY(const Size2D &size, const u8 * srcyBase, ptrdiff_t srcyStride, const u8 * srcuBase, ptrdiff_t srcuStride, const u8 * srcvBase, ptrdiff_t srcvStride, u8 * dstBase, ptrdiff_t dstStride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON #ifndef __ANDROID__ size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; #endif size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; for (size_t i = 0u; i < size.height; ++i) { const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); u8 * dst = internal::getRowPtr(dstBase, dstStride, i); size_t syj = 0u, sj = 0u, dj = 0u; #ifndef __ANDROID__ for (; sj < roiw32; sj += 32, syj += 64, dj += 128) { internal::prefetch(srcy + syj); internal::prefetch(srcu + sj); internal::prefetch(srcv + sj); uint8x16x2_t v_y = vld2q_u8(srcy + syj); uint8x16x4_t v_dst; v_dst.val[0] = vld1q_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj, v_dst); v_y = vld2q_u8(srcy + syj + 32); v_dst.val[0] = vld1q_u8(srcu + sj + 16); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1q_u8(srcv + sj + 16); v_dst.val[3] = v_y.val[1]; vst4q_u8(dst + dj + 64, v_dst); } #endif for (; sj < roiw8; sj += 8, syj += 16, dj += 32) { uint8x8x2_t v_y = vld2_u8(srcy + syj); uint8x8x4_t v_dst; v_dst.val[0] = vld1_u8(srcu + sj); v_dst.val[1] = v_y.val[0]; v_dst.val[2] = vld1_u8(srcv + sj); v_dst.val[3] = v_y.val[1]; vst4_u8(dst + dj, v_dst); } for (; sj < size.width; ++sj, syj += 2, dj += 4) { dst[dj] = srcu[sj]; dst[dj + 1] = srcy[syj]; dst[dj + 2] = srcv[sj]; dst[dj + 3] = srcy[syj + 1]; } } #else (void)size; (void)srcyBase; (void)srcyStride; (void)srcuBase; (void)srcuStride; (void)srcvBase; (void)srcvStride; (void)dstBase; (void)dstStride; #endif }