// Updates dst[] in place by the luma delta (ref - src), clamping each result
// to [0, MAX_Y], and returns the total sum of |ref[i] - src[i]| over len
// samples.  8 samples per vector iteration, scalar tail for the remainder.
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);             // diff_y
    const int16x8_t F = vaddq_s16(C, D);             // new_y
    // Clamp new_y to [0, MAX_Y] before storing back as u16.
    const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);                // abs(diff_y)
    vst1q_u16(dst + i, H);
    // Widen |diff| pairwise u16->u32, then pairwise-accumulate into u64 lanes.
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  // Reduce the two u64 accumulator lanes to a scalar.
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  // Scalar tail for the remaining (len % 8) samples.
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
// Compile-only intrinsics test: verifies that vst1q_u16 accepts a
// (uint16_t *, uint16x8_t) argument pair.  The arguments are deliberately
// left uninitialized -- this function is meant to be compiled, not executed.
void test_vst1Qu16 (void) { uint16_t *arg0_uint16_t; uint16x8_t arg1_uint16x8_t; vst1q_u16 (arg0_uint16_t, arg1_uint16x8_t); }
/* u16x8 mv mul */
// Matrix-vector multiply, u16 lanes with wrapping (modular) arithmetic:
// C[0..Row) = A (Row x T, stored column-major, column stride Row) * B[0..T).
// Processes an 8-row x 8-column tile per inner iteration: each of the 8
// B lanes is broadcast and multiply-accumulated against one column slice.
// NOTE(review): the column base is computed as j = k * T + i but advanced by
// j += Row per column -- consistent only when T == Row (square matrix), and
// Row/T are assumed to be multiples of 8.  Confirm against callers.
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C)
{
	int i = 0;
	int k = 0;
	uint16x8_t neon_b, neon_c;
	uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
	uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

	for (i = 0; i < Row; i+=8)
	{
		// Accumulator for 8 output rows.
		neon_c = vmovq_n_u16(0);

		for (k = 0; k < T; k+=8)
		{
			int j = k * T + i;

			// Load 8 consecutive column slices (8 rows each).
			neon_a0 = vld1q_u16(A + j);
			j+=Row;
			neon_a1 = vld1q_u16(A + j);
			j+=Row;
			neon_a2 = vld1q_u16(A + j);
			j+=Row;
			neon_a3 = vld1q_u16(A + j);
			j+=Row;
			neon_a4 = vld1q_u16(A + j);
			j+=Row;
			neon_a5 = vld1q_u16(A + j);
			j+=Row;
			neon_a6 = vld1q_u16(A + j);
			j+=Row;
			neon_a7 = vld1q_u16(A + j);

			// Broadcast each of the 8 vector elements B[k..k+7].
			neon_b = vld1q_u16(B + k);
			neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
			neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
			neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
			neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
			neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
			neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
			neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
			neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

			// Multiply-accumulate each column against its B lane.
			neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
		}

		vst1q_u16(C + i, neon_c);
	}
}
// Widens four source rows (rows 0, 2, 4 and 6 in units of `pitch`, 16 bytes
// each) from u8 to u16 and packs them contiguously into the output buffer,
// which the caller supplies as a float* but is written as 64 uint16 values.
void byte2word64_neon(const uint8_t *t, const int pitch, float *pf)
{
	uint16_t *out = (uint16_t *)pf;
	int row;
	for (row = 0; row < 4; ++row) {
		const uint8_t *src = t + pitch * 2 * row;
		vst1q_u16(out, vmovl_u8(vld1_u8(src)));
		vst1q_u16(out + 8, vmovl_u8(vld1_u8(src + 8)));
		out += 16;
	}
}
// 2x horizontal upsampling filter: combines rows A and B into 2*len filtered
// correction values, adds them to best_y, clamps to [0, MAX_Y] and writes the
// result to out.  Even/odd outputs are interleaved via vzipq_s16.
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    // Weighted averages, divided by 8 (the rounding happens in vrshrq below).
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    // Rounding halve -> the even (e0) and odd (e1) output corrections.
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    // Interleave even/odd so outputs land at 2*i .. 2*i+15 in order.
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    // Clamp to [0, MAX_Y].
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  // Scalar tail; same filter written with explicit rounding (+8, >> 4).
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
// Adds the constant (negative) DC residual `res` to one 32-pixel row of
// *dest.  vqshluq_n_s16(x, 0) converts signed->unsigned with saturation,
// clamping any negative sums to 0.  Advances *dest by one stride.
static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, const int stride, const int16x8_t res) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  // Saturating left shift by 0: clamps negatives to 0 on the s16->u16 cast.
  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
  vst1q_u16(*dest, c0);
  vst1q_u16(*dest + 8, c1);
  vst1q_u16(*dest + 16, c2);
  vst1q_u16(*dest + 24, c3);
  *dest += stride;
}
// Adds the constant (positive) DC residual `res` to one 32-pixel row of
// *dest, clamping each sum to the bit-depth maximum `max`, then advances
// *dest by one stride.  Processes the row as four 8-lane chunks.
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, const int stride, const int16x8_t res, const int16x8_t max) {
  int chunk;
  for (chunk = 0; chunk < 4; ++chunk) {
    const uint16x8_t pixels = vld1q_u16(*dest + 8 * chunk);
    int16x8_t sum = vaddq_s16(res, vreinterpretq_s16_u16(pixels));
    sum = vminq_s16(sum, max);
    vst1q_u16(*dest + 8 * chunk, vreinterpretq_u16_s16(sum));
  }
  *dest += stride;
}
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { int i; if (count >= 8) { /* SkFixed is 16.16 fixed point */ SkFixed dx2 = dx+dx; SkFixed dx4 = dx2+dx2; SkFixed dx8 = dx4+dx4; /* now build fx/fx+dx/fx+2dx/fx+3dx */ SkFixed fx1, fx2, fx3; int32x4_t lbase, hbase; uint16_t *dst16 = (uint16_t *)dst; fx1 = fx+dx; fx2 = fx1+dx; fx3 = fx2+dx; /* avoid an 'lbase unitialized' warning */ lbase = vdupq_n_s32(fx); lbase = vsetq_lane_s32(fx1, lbase, 1); lbase = vsetq_lane_s32(fx2, lbase, 2); lbase = vsetq_lane_s32(fx3, lbase, 3); hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); /* take upper 16 of each, store, and bump everything */ do { int32x4_t lout, hout; uint16x8_t hi16; lout = lbase; hout = hbase; /* gets hi's of all louts then hi's of all houts */ asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); hi16 = vreinterpretq_u16_s32(hout); vst1q_u16(dst16, hi16); /* on to the next */ lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); dst16 += 8; count -= 8; fx += dx8; } while (count >= 8); dst = (uint32_t *) dst16; }
/* u16x8 saturated sub */
// C = saturating(A - B), element-wise, over a Row x Col u16 matrix.
// The vector loop handles full groups of 8 lanes with vqsubq_u16; the scalar
// tail handles the remaining (size % 8) elements.
void mw_neon_mm_qsub_u16x8(unsigned short * A, int Row, int Col, unsigned short * B, unsigned short * C)
{
	uint16x8_t neon_a, neon_b, neon_c;

	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 8; i <= size ; i+=8)
	{
		k = i - 8;
		neon_a = vld1q_u16(A + k);
		neon_b = vld1q_u16(B + k);
		neon_c = vqsubq_u16(neon_a, neon_b);
		vst1q_u16(C + k, neon_c);
	}

	// Scalar tail.  BUG FIX: this previously computed A + B, which disagreed
	// with the saturated subtraction performed by the vector loop above.
	k = i - 8;
	for (i = 0; i < size % 8; i++)
	{
		C[k + i] = (A[k + i] > B[k + i]) ? (unsigned short)(A[k + i] - B[k + i]) : 0;
	}
}
// Widens 48 source bytes (two groups of three 16-byte-ish row segments,
// strided by `pitch`) from u8 to u16 and packs them contiguously into the
// output buffer, which the caller supplies as a float* but is written as
// 48 uint16 values.
void byte2word48_neon(const uint8_t *t, const int pitch, float *pf)
{
	uint16_t *p = (uint16_t *)pf;
	uint8x8_t m0, m1, m2, m3, m4, m5;

	// BUG FIX: m1 and m4 were previously passed to vld1_lane_u32 before ever
	// being written, reading uninitialized variables (undefined behavior even
	// though both lanes are subsequently overwritten).  Zero-initialize them.
	m1 = vdup_n_u8(0);
	m4 = vdup_n_u8(0);

	m0 = vld1_u8(t);
	// m1 gathers two 4-byte groups: t+8 into lane 0, t+pitch*2 into lane 1.
	m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m1), 0));
	m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m1), 1));
	m2 = vld1_u8(t + pitch * 2 + 4);
	t += pitch * 4;
	m3 = vld1_u8(t);
	// m4 gathers the same pattern for the second group of rows.
	m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m4), 0));
	m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m4), 1));
	m5 = vld1_u8(t + pitch * 2 + 4);

	vst1q_u16(p, vmovl_u8(m0));
	vst1q_u16(p + 8, vmovl_u8(m1));
	vst1q_u16(p + 16, vmovl_u8(m2));
	vst1q_u16(p + 24, vmovl_u8(m3));
	vst1q_u16(p + 32, vmovl_u8(m4));
	vst1q_u16(p + 40, vmovl_u8(m5));
}
// Unpacks 11-bit packed depth data: every XN_INPUT_ELEMENT_SIZE input bytes
// yield 8 shift values (a0..a7) which are written both to the shifts buffer
// (clamped: values >= XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1 become 0) and,
// via GetOutput(), to the depth buffer.  Returns the bytes consumed in
// *pnActualRead; fails with XN_STATUS_OUTPUT_BUFFER_OVERFLOW when the depth
// buffer cannot hold the decoded output.
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;
	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckDepthBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pShiftOut = GetShiftsOutputBuffer();
	XnUInt16* pnOutput = GetDepthOutputBuffer();
	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	// Staging arrays so each group of 8 results is stored with one vst1q_u16.
	XnUInt16 shift[8];
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
		// Bit layout of one element: 11 input bytes -> 8 x 11-bit values.
		// input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10
		//        -,---,---,-,---,---,---,-,---,---,-
		// bits:  8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//        ---,---,-----,---,---,-----,---,---
		// output: 0, 1, 2, 3, 4, 5, 6, 7
		a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
		a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
		a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
		a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
		a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
		a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
		a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
		a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);

#ifdef XN_NEON
		// Out-of-range shift values are replaced with 0.
		shift[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		shift[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		shift[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		shift[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		shift[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		shift[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		shift[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		shift[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		// NOTE(review): depth is computed from the raw (unclamped) a0..a7.
		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);

		// Load
		Q0 = vld1q_u16(shift);
		// Store
		vst1q_u16(pShiftOut, Q0);
#else
		pShiftOut[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		pShiftOut[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		pShiftOut[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		pShiftOut[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		pShiftOut[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		pShiftOut[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		pShiftOut[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		pShiftOut[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		pnOutput[0] = GetOutput(a0);
		pnOutput[1] = GetOutput(a1);
		pnOutput[2] = GetOutput(a2);
		pnOutput[3] = GetOutput(a3);
		pnOutput[4] = GetOutput(a4);
		pnOutput[5] = GetOutput(a5);
		pnOutput[6] = GetOutput(a6);
		pnOutput[7] = GetOutput(a7);
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
		pShiftOut += 8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
// Unpacks 12-bit packed depth data: every XN_INPUT_ELEMENT_SIZE (24) input
// bytes yield 16 shift values, written to the shifts buffer (clamped: values
// >= XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1 become 0) and, via GetOutput(), to
// the depth buffer.  Returns bytes consumed in *pnActualRead; fails with
// XN_STATUS_OUTPUT_BUFFER_OVERFLOW when the depth buffer is too small.
//
// BUG FIX: the clamp conditions for elements 8..15 previously tested
// shift[0..7] (a copy-paste error), in both the scalar and the NEON paths,
// so an element could be zeroed or kept based on the wrong shift value.
// Each element now tests its own value.
XnStatus XnPacked12DepthProcessor::Unpack12to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;
	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	if (!CheckDepthBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pnOutput = GetDepthOutputBuffer();
	XnUInt16* pShiftOut = GetShiftsOutputBuffer();
	XnUInt16 shift[16];
#ifdef XN_NEON
	XnUInt16 depth[16];
	uint8x8x3_t inD3;
	uint8x8_t rshft4D, lshft4D;
	uint16x8_t rshft4Q, lshft4Q;
	uint16x8_t depthQ;
	uint16x8x2_t shiftQ2;
#endif

	// Convert the 12bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
#ifndef XN_NEON
		// Bit layout: 24 input bytes -> 16 x 12-bit values.
		// input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
		//        -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
		// bits:  8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
		//        ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
		// output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
		shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
		shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
		shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
		shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
		shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
		shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
		shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
		shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
		shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
		shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
		shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
		shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
		shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
		shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
		shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
		shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

		// Clamp out-of-range shifts to 0 (each element tests its own value).
		pShiftOut[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
		pShiftOut[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
		pShiftOut[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
		pShiftOut[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
		pShiftOut[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
		pShiftOut[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
		pShiftOut[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
		pShiftOut[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
		pShiftOut[8] = (((shift[8]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
		pShiftOut[9] = (((shift[9]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
		pShiftOut[10] = (((shift[10]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
		pShiftOut[11] = (((shift[11]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
		pShiftOut[12] = (((shift[12]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
		pShiftOut[13] = (((shift[13]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
		pShiftOut[14] = (((shift[14]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
		pShiftOut[15] = (((shift[15]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

		pnOutput[0] = GetOutput(shift[0]);
		pnOutput[1] = GetOutput(shift[1]);
		pnOutput[2] = GetOutput(shift[2]);
		pnOutput[3] = GetOutput(shift[3]);
		pnOutput[4] = GetOutput(shift[4]);
		pnOutput[5] = GetOutput(shift[5]);
		pnOutput[6] = GetOutput(shift[6]);
		pnOutput[7] = GetOutput(shift[7]);
		pnOutput[8] = GetOutput(shift[8]);
		pnOutput[9] = GetOutput(shift[9]);
		pnOutput[10] = GetOutput(shift[10]);
		pnOutput[11] = GetOutput(shift[11]);
		pnOutput[12] = GetOutput(shift[12]);
		pnOutput[13] = GetOutput(shift[13]);
		pnOutput[14] = GetOutput(shift[14]);
		pnOutput[15] = GetOutput(shift[15]);
#else
		// input: 0, 1,2 (X8)
		//        -,---,-
		// bits:  8,4,4,8 (X8)
		//        ---,---
		// output: 0, 1 (X8)

		// Split 24 bytes into 3 vectors (64 bit each)
		inD3 = vld3_u8(pcInput);

		// rshft4D0 contains 4 MSB of second vector (placed at offset 0)
		rshft4D = vshr_n_u8(inD3.val[1], 4);
		// lshft4D0 contains 4 LSB of second vector (placed at offset 4)
		lshft4D = vshl_n_u8(inD3.val[1], 4);

		// Expand 64 bit vectors to 128 bit (8 values of 16 bits)
		shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
		shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
		rshft4Q = vmovl_u8(rshft4D);
		lshft4Q = vmovl_u8(lshft4D);

		// Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
		shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
		shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

		// Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
		lshft4Q = vshlq_n_u16(lshft4Q, 4);
		shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

		// Interleave shift values to a single vector
		vst2q_u16(shift, shiftQ2);

		// Clamp out-of-range shifts to 0 (each element tests its own value).
		shift[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
		shift[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
		shift[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
		shift[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
		shift[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
		shift[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
		shift[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
		shift[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
		shift[8] = (((shift[8]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
		shift[9] = (((shift[9]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
		shift[10] = (((shift[10]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
		shift[11] = (((shift[11]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
		shift[12] = (((shift[12]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
		shift[13] = (((shift[13]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
		shift[14] = (((shift[14]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
		shift[15] = (((shift[15]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

		depth[0] = GetOutput(shift[0]);
		depth[1] = GetOutput(shift[1]);
		depth[2] = GetOutput(shift[2]);
		depth[3] = GetOutput(shift[3]);
		depth[4] = GetOutput(shift[4]);
		depth[5] = GetOutput(shift[5]);
		depth[6] = GetOutput(shift[6]);
		depth[7] = GetOutput(shift[7]);

		// Load
		depthQ = vld1q_u16(depth);
		//Store
		vst1q_u16(pnOutput, depthQ);

		// Load
		depthQ = vld1q_u16(shift);
		// Store
		vst1q_u16(pShiftOut, depthQ);

		depth[8] = GetOutput(shift[8]);
		depth[9] = GetOutput(shift[9]);
		depth[10] = GetOutput(shift[10]);
		depth[11] = GetOutput(shift[11]);
		depth[12] = GetOutput(shift[12]);
		depth[13] = GetOutput(shift[13]);
		depth[14] = GetOutput(shift[14]);
		depth[15] = GetOutput(shift[15]);

		// Load
		depthQ = vld1q_u16(depth + 8);
		// Store
		vst1q_u16(pnOutput + 8, depthQ);

		// Load
		depthQ = vld1q_u16(shift + 8);
		// Store
		vst1q_u16(pShiftOut + 8, depthQ);
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 16;
		pShiftOut += 16;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
// High-bit-depth "convolve avg": dst = rounded average of src and dst over a
// w x h block.  The filter arguments are unused (this is the copy/average
// variant).  Branches on block width: 4, 8, 16, 32, or 64 pixels per row.
// All paths except the 64-wide one process two rows per iteration, so h is
// assumed to be even there.
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4: two 4-pixel rows combined into one 8-lane vector.
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);  // rounding halving add
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8: one vector per row, two rows per iteration.
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16: two vectors per row, two rows per iteration.
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;
      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);
      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32: four vectors per row, two rows per iteration.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64: eight vectors per row (two halves), one row per iteration.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}
// Unpacks 12-bit packed shift data into 16-bit depth values via the
// m_pShiftToDepth lookup table: every XN_INPUT_ELEMENT_SIZE (24) input bytes
// yield 16 output pixels in pDest.  No clamping is applied here (unlike the
// depth-processor variants); the table is indexed with the raw 12-bit shift.
// Reports bytes consumed in *pnActualRead and bytes produced in
// *pnActualWritten.
XnStatus Link12BitS2DParser::Unpack12to16(const XnUInt8* pcInput,XnUInt8* pDest, const XnUInt32 nInputSize, XnUInt32* pnActualRead, XnUInt32* pnActualWritten)
{
	const XnUInt8* pOrigInput = (XnUInt8*)pcInput;
	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	//XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	*pnActualRead = 0;

	XnUInt16 *pnOutput = (XnUInt16*)pDest;
	XnUInt16 shift[16];
#ifdef XN_NEON
	// Staging array so each group of 8 depths is stored with one vst1q_u16.
	XnUInt16 depth[16];
	uint8x8x3_t inD3;
	uint8x8_t rshft4D, lshft4D;
	uint16x8_t rshft4Q, lshft4Q;
	uint16x8_t depthQ;
	uint16x8x2_t shiftQ2;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
#ifndef XN_NEON
		// Bit layout: 24 input bytes -> 16 x 12-bit values.
		// input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
		//        -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
		// bits:  8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
		//        ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
		// output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
		shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
		shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
		shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
		shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
		shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
		shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
		shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
		shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
		shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
		shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
		shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
		shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
		shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
		shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
		shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
		shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

		pnOutput[0] = m_pShiftToDepth[(shift[0])];
		pnOutput[1] = m_pShiftToDepth[(shift[1])];
		pnOutput[2] = m_pShiftToDepth[(shift[2])];
		pnOutput[3] = m_pShiftToDepth[(shift[3])];
		pnOutput[4] = m_pShiftToDepth[(shift[4])];
		pnOutput[5] = m_pShiftToDepth[(shift[5])];
		pnOutput[6] = m_pShiftToDepth[(shift[6])];
		pnOutput[7] = m_pShiftToDepth[(shift[7])];
		pnOutput[8] = m_pShiftToDepth[(shift[8])];
		pnOutput[9] = m_pShiftToDepth[(shift[9])];
		pnOutput[10] = m_pShiftToDepth[(shift[10])];
		pnOutput[11] = m_pShiftToDepth[(shift[11])];
		pnOutput[12] = m_pShiftToDepth[(shift[12])];
		pnOutput[13] = m_pShiftToDepth[(shift[13])];
		pnOutput[14] = m_pShiftToDepth[(shift[14])];
		pnOutput[15] = m_pShiftToDepth[(shift[15])];
#else
		// input: 0, 1,2 (X8)
		//        -,---,-
		// bits:  8,4,4,8 (X8)
		//        ---,---
		// output: 0, 1 (X8)

		// Split 24 bytes into 3 vectors (64 bit each)
		inD3 = vld3_u8(pcInput);

		// rshft4D0 contains 4 MSB of second vector (placed at offset 0)
		rshft4D = vshr_n_u8(inD3.val[1], 4);
		// lshft4D0 contains 4 LSB of second vector (placed at offset 4)
		lshft4D = vshl_n_u8(inD3.val[1], 4);

		// Expand 64 bit vectors to 128 bit (8 values of 16 bits)
		shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
		shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
		rshft4Q = vmovl_u8(rshft4D);
		lshft4Q = vmovl_u8(lshft4D);

		// Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
		shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
		shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

		// Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
		lshft4Q = vshlq_n_u16(lshft4Q, 4);
		shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

		// Interleave shift values to a single vector
		vst2q_u16(shift, shiftQ2);

		depth[0] = m_pShiftToDepth[(shift[0])];
		depth[1] = m_pShiftToDepth[(shift[1])];
		depth[2] = m_pShiftToDepth[(shift[2])];
		depth[3] = m_pShiftToDepth[(shift[3])];
		depth[4] = m_pShiftToDepth[(shift[4])];
		depth[5] = m_pShiftToDepth[(shift[5])];
		depth[6] = m_pShiftToDepth[(shift[6])];
		depth[7] = m_pShiftToDepth[(shift[7])];

		// Load
		depthQ = vld1q_u16(depth);
		//Store
		vst1q_u16(pnOutput, depthQ);

		depth[8] = m_pShiftToDepth[(shift[8])];
		depth[9] = m_pShiftToDepth[(shift[9])];
		depth[10] = m_pShiftToDepth[(shift[10])];
		depth[11] = m_pShiftToDepth[(shift[11])];
		depth[12] = m_pShiftToDepth[(shift[12])];
		depth[13] = m_pShiftToDepth[(shift[13])];
		depth[14] = m_pShiftToDepth[(shift[14])];
		depth[15] = m_pShiftToDepth[(shift[15])];

		// Load
		depthQ = vld1q_u16(depth + 8);
		// Store
		vst1q_u16(pnOutput + 8, depthQ);
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 16;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput); // total bytes
	*pnActualWritten = (XnUInt32)((XnUInt8*)pnOutput - pDest);

	return XN_STATUS_OK;
}
// Unpacks 11-bit packed depth with optional downscaling (m_nScaleFactor):
// every XN_INPUT_ELEMENT_SIZE input bytes yield 8 output pixels.  For scale
// factors 2 and 4 only the needed sub-pixels are decoded and whole rows that
// would be discarded are skipped wholesale.  Returns bytes consumed in
// *pnActualRead.
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;
	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckWriteBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer();

	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
		if(m_nScaleFactor > 1)
		{
			// Current pixel position within the frame.
			XnUInt32 px = m_nOffsetInFrame%m_CurrentVideoMode.resolutionX;
			XnUInt32 py = (m_nOffsetInFrame)/m_CurrentVideoMode.resolutionX;
			if(py%m_nScaleFactor != 0)
			{
				// Skip as many pixels as possible
				XnUInt32 nEltsToSkip = XN_MIN(nElements - nElem, (m_CurrentVideoMode.resolutionX - px)/8 + (m_nScaleFactor-(py%m_nScaleFactor) - 1)*m_CurrentVideoMode.resolutionX/8);
				// ::memset(pnOutput, 0, nEltsToSkip*8*sizeof(XnUInt16));
				pcInput += nEltsToSkip*XN_INPUT_ELEMENT_SIZE;
				pnOutput += nEltsToSkip*8;
				m_nOffsetInFrame += nEltsToSkip*8;
				nElem += (nEltsToSkip-1);
				continue;
			}
		}

		// Bit layout of one element: 11 input bytes -> 8 x 11-bit values.
		// input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10
		//        -,---,---,-,---,---,---,-,---,---,-
		// bits:  8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//        ---,---,-----,---,---,-----,---,---
		// output: 0, 1, 2, 3, 4, 5, 6, 7
		if(m_nScaleFactor == 2)
		{
			// Only every second sub-pixel is decoded.
			a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
			a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
			a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
			a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
		}
		else if(m_nScaleFactor == 4)
		{
			// Only every fourth sub-pixel is decoded.
			a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
			a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
		}
		else
		{
			a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
			a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
			a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
			a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
			a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
			a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
			a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
			a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);
		}

#ifdef XN_NEON
		// NOTE(review): when m_nScaleFactor is 2 or 4 this path reads a1/a3/
		// a5/a7 (and a2/a6 for factor 4) uninitialized, and does not zero the
		// skipped pixels the way the non-NEON path below does -- confirm the
		// NEON build is only used with m_nScaleFactor == 1.
		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);
#else
		if(m_nScaleFactor == 2)
		{
			// Decoded pixels interleaved with zeroed (skipped) pixels.
			*pnOutput++ = GetOutput(a0);
			*pnOutput++ = 0;
			*pnOutput++ = GetOutput(a2);
			*pnOutput++ = 0;
			*pnOutput++ = GetOutput(a4);
			*pnOutput++ = 0;
			*pnOutput++ = GetOutput(a6);
			*pnOutput++ = 0;
		}
		else if(m_nScaleFactor == 4)
		{
			*pnOutput++ = GetOutput(a0);
			*pnOutput++ = 0;
			*pnOutput++ = 0;
			*pnOutput++ = 0;
			*pnOutput++ = GetOutput(a4);
			*pnOutput++ = 0;
			*pnOutput++ = 0;
			*pnOutput++ = 0;
		}
		else
		{
			*pnOutput++ = GetOutput(a0);
			*pnOutput++ = GetOutput(a1);
			*pnOutput++ = GetOutput(a2);
			*pnOutput++ = GetOutput(a3);
			*pnOutput++ = GetOutput(a4);
			*pnOutput++ = GetOutput(a5);
			*pnOutput++ = GetOutput(a6);
			*pnOutput++ = GetOutput(a7);
		}
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		m_nOffsetInFrame+=8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
// Type-dispatched store helper: forwards a u16x8 vector store to vst1q_u16.
inline void vst1q(u16 * ptr, const uint16x8_t & v)
{
    vst1q_u16(ptr, v);
}