static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);   // diff_y
    const int16x8_t F = vaddq_s16(C, D);   // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);      // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
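The absolute differences above are summed without widening each lane individually: vpaddlq_u16 pairwise-adds eight u16 lanes into four u32 lanes, and vpadalq_u32 pairwise-accumulates those into the two u64 lanes of the running sum. A minimal sketch of that reduction in isolation (assuming only arm_neon.h):

#include <arm_neon.h>

/* Sum eight u16 values into a u64 without overflow, as in the loop above:
 * u16x8 --(vpaddlq_u16)--> u32x4 --(vpadalq_u32)--> u64x2 --> scalar. */
static uint64_t sum_u16x8(uint16x8_t v) {
  uint64x2_t acc = vdupq_n_u64(0);
  acc = vpadalq_u32(acc, vpaddlq_u16(v));
  return vgetq_lane_u64(acc, 0) + vgetq_lane_u64(acc, 1);
}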
/* u16x8 mv mul */
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T,
                          unsigned short * B, unsigned short * C) {
  int i = 0;
  int k = 0;
  uint16x8_t neon_b, neon_c;
  uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
  uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

  for (i = 0; i < Row; i+=8) {
    neon_c = vmovq_n_u16(0);

    for (k = 0; k < T; k+=8) {
      int j = k * T + i;

      neon_a0 = vld1q_u16(A + j);
      j+=Row;
      neon_a1 = vld1q_u16(A + j);
      j+=Row;
      neon_a2 = vld1q_u16(A + j);
      j+=Row;
      neon_a3 = vld1q_u16(A + j);
      j+=Row;
      neon_a4 = vld1q_u16(A + j);
      j+=Row;
      neon_a5 = vld1q_u16(A + j);
      j+=Row;
      neon_a6 = vld1q_u16(A + j);
      j+=Row;
      neon_a7 = vld1q_u16(A + j);

      neon_b = vld1q_u16(B + k);
      neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
      neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
      neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
      neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
      neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
      neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
      neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
      neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

      neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
      neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
    }
    vst1q_u16(C + i, neon_c);
  }
}
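A scalar reference pins down the layout this (apparently machine-generated) kernel assumes: A is column-major with consecutive elements walking down a column, and B is read contiguously. Note that the kernel's column base `k * T + i` agrees with its `j += Row` column stride only when `T == Row` (square matrices). The sketch below is mine, not from the library; it uses the general column-major index and the same wrap-around u16 arithmetic as vmulq_u16/vaddq_u16:

/* Scalar reference for C = A * B with column-major A (Row x T).
 * All arithmetic deliberately wraps modulo 2^16. */
void mw_mv_mul_u16_ref(const unsigned short* A, int Row, int T,
                       const unsigned short* B, unsigned short* C) {
  for (int i = 0; i < Row; i++) {
    unsigned short acc = 0;
    for (int k = 0; k < T; k++) {
      acc = (unsigned short)(acc + (unsigned short)(A[k * Row + i] * B[k]));
    }
    C[i] = acc;
  }
}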
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                         SkColor color, int width, SkPMColor) {
  int colA = SkColorGetA(color);
  int colR = SkColorGetR(color);
  int colG = SkColorGetG(color);
  int colB = SkColorGetB(color);

  colA = SkAlpha255To256(colA);

  uint8x8_t vcolR, vcolG, vcolB;
  uint16x8_t vcolA;

  if (width >= 8) {
    vcolA = vdupq_n_u16(colA);
    vcolR = vdup_n_u8(colR);
    vcolG = vdup_n_u8(colG);
    vcolB = vdup_n_u8(colB);
  }

  while (width >= 8) {
    uint8x8x4_t vdst;
    uint16x8_t vmask;
    uint16x8_t vmaskR, vmaskG, vmaskB;

    vdst = vld4_u8((uint8_t*)dst);
    vmask = vld1q_u16(src);

    // Get all the color masks on 5 bits
    vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
    vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                         SK_B16_BITS + SK_R16_BITS + 1);
    vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

    // Upscale to 0..32
    vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
    vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
    vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

    vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
    vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
    vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

    vdst.val[NEON_A] = vdup_n_u8(0xFF);
    vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
    vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
    vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

    vst4_u8((uint8_t*)dst, vdst);

    dst += 8;
    src += 8;
    width -= 8;
  }

  for (int i = 0; i < width; i++) {
    dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
  }
}
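The per-pixel math compresses several conventions into a few instructions. Below is a scalar sketch of one pixel's coverage handling, assuming Skia's usual RGB565 constants (SK_R16_SHIFT == 11, five red/blue bits, six green bits) and that SkBlend32_neon8 blends on a 0..32 scale; the helper names and exact semantics here are assumptions for illustration, not quoted from Skia:

/* One pixel of the vector path above, in scalar form (a sketch). */
static inline int blend32(int src, int dst, int scale /* 0..32 */) {
  return dst + (((src - dst) * scale) >> 5);
}

static inline void lcd16_coverage(uint16_t m, int colA /* 0..256 */,
                                  int* mr, int* mg, int* mb) {
  int r = (m >> 11) & 0x1F;  /* 5 red coverage bits */
  int g = (m >> 6) & 0x1F;   /* top 5 of the 6 green bits */
  int b = m & 0x1F;          /* 5 blue coverage bits */
  r += r >> 4;               /* rescale 0..31 to 0..32 */
  g += g >> 4;
  b += b >> 4;
  *mr = (r * colA) >> 8;     /* modulate by the color's alpha */
  *mg = (g * colA) >> 8;
  *mb = (b * colA) >> 8;
}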
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm,
                                         const uint16_t* far_spectrum,
                                         int32_t* echo_est) {
  assert((uintptr_t)echo_est % 32 == 0);
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);

  // This is C code of following optimized code.
  // During startup we store the channel every block.
  //   memcpy(aecm->channelStored,
  //          aecm->channelAdapt16,
  //          sizeof(int16_t) * PART_LEN1);
  // Recalculate echo estimate
  //   for (i = 0; i < PART_LEN; i += 4) {
  //     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //     echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
  //                                             far_spectrum[i + 1]);
  //     echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
  //                                             far_spectrum[i + 2]);
  //     echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
  //                                             far_spectrum[i + 3]);
  //   }
  //   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                       far_spectrum[i]);
  const uint16_t* far_spectrum_p = far_spectrum;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int16_t* start_stored_p = aecm->channelStored;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  int32_t* echo_est_p = echo_est;

  uint16x8_t far_spectrum_v;
  int16x8_t adapt_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;

  while (start_stored_p < end_stored_p) {
    far_spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);

    vst1q_s16(start_stored_p, adapt_v);

    echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
    echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));

    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    far_spectrum_p += 8;
    start_adapt_p += 8;
    start_stored_p += 8;
    echo_est_p += 8;
  }
  aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN];
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B,
                                   int len, const uint16_t* best_y,
                                   uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
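The vector and scalar paths are bit-exact even though they factor the filter differently; a short check (my derivation, assuming the intermediate int16 sums do not overflow, which the file's small MAX_Y fixed-point range is meant to guarantee): let x = 2*(A1+B0) + (A0+A1+B0+B1) and write x = 8q + r with 0 <= r < 8, so q = x >> 3 (an arithmetic shift, so this holds for negative x as well). The scalar path computes v0 = (8*A0 + x + 8) >> 4 = (8*(A0 + q + 1) + r) >> 4 = (A0 + q + 1) >> 1, since r < 8 can never carry into the next multiple of 16. The vector path computes q with vshrq_n_s16(..., 3), adds A0 to form d0, then takes the rounding half with vrshrq_n_s16(d0, 1), i.e. (A0 + q + 1) >> 1 -- the same value.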
static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  // Saturating left shift by 0: converts to unsigned, clamping negatives to 0.
  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
  vst1q_u16(*dest, c0);
  vst1q_u16(*dest + 8, c1);
  vst1q_u16(*dest + 16, c2);
  vst1q_u16(*dest + 24, c3);
  *dest += stride;
}
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res,
                                                     const int16x8_t max) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  const int16x8_t c0 = vminq_s16(b0, max);
  const int16x8_t c1 = vminq_s16(b1, max);
  const int16x8_t c2 = vminq_s16(b2, max);
  const int16x8_t c3 = vminq_s16(b3, max);
  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
  *dest += stride;
}
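A hypothetical caller shows how the two kernels pair up (this mirrors how a DC-only 32x32 inverse transform would use them, but the function below is illustrative, not the vpx API): the residual is constant across the block, so a negative DC can only underflow toward 0 (the neg kernel's saturating shift) while a non-negative DC can only overflow toward the bit-depth maximum (the pos kernel's vminq_s16). Each kernel call writes one 32-pixel row and advances dest by stride.

/* Illustrative only: add a constant dc to a 32x32 high-bitdepth block. */
static void dc_only_add_32x32(uint16_t *dest, int stride, int dc, int bd) {
  const int16x8_t res = vdupq_n_s16((int16_t)dc);
  int row;
  if (dc < 0) {
    for (row = 0; row < 32; ++row) {
      highbd_idct32x32_1_add_neg_kernel(&dest, stride, res);
    }
  } else {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    for (row = 0; row < 32; ++row) {
      highbd_idct32x32_1_add_pos_kernel(&dest, stride, res, max);
    }
  }
}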
/* u16x8 saturated sub */
void mw_neon_mm_qsub_u16x8(unsigned short * A, int Row, int Col,
                           unsigned short * B, unsigned short * C) {
  uint16x8_t neon_a, neon_b, neon_c;
  int size = Row * Col;
  int i = 0;
  int k = 0;

  for (i = 8; i <= size; i+=8) {
    k = i - 8;
    neon_a = vld1q_u16(A + k);
    neon_b = vld1q_u16(B + k);
    neon_c = vqsubq_u16(neon_a, neon_b);
    vst1q_u16(C + k, neon_c);
  }

  k = i - 8;
  for (i = 0; i < size % 8; i++) {
    /* scalar tail: saturating subtract, matching vqsubq_u16 */
    C[k + i] = (A[k + i] > B[k + i]) ? (unsigned short)(A[k + i] - B[k + i]) : 0;
  }
}
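A quick standalone check of the saturating semantics (a sketch, not part of the library): vqsubq_u16 clamps at 0 where plain u16 subtraction would wrap.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint16_t a[8] = {0, 1, 2, 3, 100, 200, 300, 65535};
  uint16_t b[8] = {1, 1, 5, 3, 50, 300, 300, 1};
  uint16_t c[8];
  vst1q_u16(c, vqsubq_u16(vld1q_u16(a), vld1q_u16(b)));
  for (int i = 0; i < 8; ++i) printf("%d ", (int)c[i]);
  printf("\n");  /* prints: 0 0 0 0 50 0 0 65534 */
  return 0;
}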
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                               SkColor color, int width,
                               SkPMColor opaqueDst) {
  int colR = SkColorGetR(color);
  int colG = SkColorGetG(color);
  int colB = SkColorGetB(color);

  uint8x8_t vcolR, vcolG, vcolB;
  uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

  if (width >= 8) {
    vcolR = vdup_n_u8(colR);
    vcolG = vdup_n_u8(colG);
    vcolB = vdup_n_u8(colB);
    vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
    vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
    vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
    vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
  }

  while (width >= 8) {
    uint8x8x4_t vdst;
    uint16x8_t vmask;
    uint16x8_t vmaskR, vmaskG, vmaskB;
    uint8x8_t vsel_trans, vsel_opq;

    vdst = vld4_u8((uint8_t*)dst);
    vmask = vld1q_u16(src);

    // Prepare compare masks
    vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
    vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

    // Get all the color masks on 5 bits
    vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
    vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                         SK_B16_BITS + SK_R16_BITS + 1);
    vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

    // Upscale to 0..32
    vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
    vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
    vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

    vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
    vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

    vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
    vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
    vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

    vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
    vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
    vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

    vst4_u8((uint8_t*)dst, vdst);

    dst += 8;
    src += 8;
    width -= 8;
  }

  // Leftovers
  for (int i = 0; i < width; i++) {
    dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
  }
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput,
                                                const XnUInt32 nInputSize,
                                                XnUInt32* pnActualRead) {
  const XnUInt8* pOrigInput = pcInput;
  XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE;  // floored
  XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
  *pnActualRead = 0;
  XnBuffer* pWriteBuffer = GetWriteBuffer();

  // Check there is enough room for the depth pixels
  if (!CheckDepthBufferForOverflow(nNeededOutput)) {
    return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
  }

  XnUInt16* pShiftOut = GetShiftsOutputBuffer();
  XnUInt16* pnOutput = GetDepthOutputBuffer();
  XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
  XnUInt16 shift[8];
  XnUInt16 depth[8];
  uint16x8_t Q0;
#endif

  // Convert the 11bit packed data into 16bit shorts
  for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) {
    // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10
    //        -,---,---,-,---,---,---,-,---,---,-
    // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
    //       ---,---,-----,---,---,-----,---,---
    // output: 0, 1, 2, 3, 4, 5, 6, 7
    a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
    a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
    a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
    a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
    a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
    a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
    a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
    a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);

#ifdef XN_NEON
    shift[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
    shift[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
    shift[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
    shift[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
    shift[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
    shift[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
    shift[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
    shift[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);
    depth[0] = GetOutput(a0);
    depth[1] = GetOutput(a1);
    depth[2] = GetOutput(a2);
    depth[3] = GetOutput(a3);
    depth[4] = GetOutput(a4);
    depth[5] = GetOutput(a5);
    depth[6] = GetOutput(a6);
    depth[7] = GetOutput(a7);
    // Load
    Q0 = vld1q_u16(depth);
    // Store
    vst1q_u16(pnOutput, Q0);
    // Load
    Q0 = vld1q_u16(shift);
    // Store
    vst1q_u16(pShiftOut, Q0);
#else
    pShiftOut[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
    pShiftOut[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
    pShiftOut[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
    pShiftOut[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
    pShiftOut[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
    pShiftOut[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
    pShiftOut[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
    pShiftOut[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);
    pnOutput[0] = GetOutput(a0);
    pnOutput[1] = GetOutput(a1);
    pnOutput[2] = GetOutput(a2);
    pnOutput[3] = GetOutput(a3);
    pnOutput[4] = GetOutput(a4);
    pnOutput[5] = GetOutput(a5);
    pnOutput[6] = GetOutput(a6);
    pnOutput[7] = GetOutput(a7);
#endif
    pcInput += XN_INPUT_ELEMENT_SIZE;
    pnOutput += 8;
    pShiftOut += 8;
  }

  *pnActualRead = (XnUInt32)(pcInput - pOrigInput);
  pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

  return XN_STATUS_OK;
}
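XN_TAKE_BITS is the one non-obvious helper in these unpackers. Its assumed shape (the real macro lives in the sensor headers, so treat this as a sketch) extracts `count` bits starting `offset` bits above the LSB; with it, a0 above concatenates all 8 bits of byte 0 with the top 3 bits of byte 1 into an 11-bit value.

/* Assumed definition -- not quoted from the headers. */
#define XN_TAKE_BITS(data, count, offset) \
    (((data) >> (offset)) & ((1 << (count)) - 1))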
XnStatus XnPacked12DepthProcessor::Unpack12to16(const XnUInt8* pcInput,
                                                const XnUInt32 nInputSize,
                                                XnUInt32* pnActualRead) {
  const XnUInt8* pOrigInput = pcInput;
  XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE;  // floored
  XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
  *pnActualRead = 0;
  XnBuffer* pWriteBuffer = GetWriteBuffer();

  if (!CheckDepthBufferForOverflow(nNeededOutput)) {
    return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
  }

  XnUInt16* pnOutput = GetDepthOutputBuffer();
  XnUInt16* pShiftOut = GetShiftsOutputBuffer();
  XnUInt16 shift[16];
#ifdef XN_NEON
  XnUInt16 depth[16];
  uint8x8x3_t inD3;
  uint8x8_t rshft4D, lshft4D;
  uint16x8_t rshft4Q, lshft4Q;
  uint16x8_t depthQ;
  uint16x8x2_t shiftQ2;
#endif

  // Convert the 12bit packed data into 16bit shorts
  for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) {
#ifndef XN_NEON
    // input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
    //        -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
    // bits: 8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
    //       ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
    // output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
    shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
    shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
    shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
    shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
    shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
    shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
    shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
    shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
    shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
    shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
    shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
    shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
    shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
    shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
    shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

    pShiftOut[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
    pShiftOut[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
    pShiftOut[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
    pShiftOut[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
    pShiftOut[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
    pShiftOut[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
    pShiftOut[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
    pShiftOut[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
    pShiftOut[8] = (((shift[8]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
    pShiftOut[9] = (((shift[9]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
    pShiftOut[10] = (((shift[10]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
    pShiftOut[11] = (((shift[11]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
    pShiftOut[12] = (((shift[12]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
    pShiftOut[13] = (((shift[13]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
    pShiftOut[14] = (((shift[14]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
    pShiftOut[15] = (((shift[15]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

    pnOutput[0] = GetOutput(shift[0]);
    pnOutput[1] = GetOutput(shift[1]);
    pnOutput[2] = GetOutput(shift[2]);
    pnOutput[3] = GetOutput(shift[3]);
    pnOutput[4] = GetOutput(shift[4]);
    pnOutput[5] = GetOutput(shift[5]);
    pnOutput[6] = GetOutput(shift[6]);
    pnOutput[7] = GetOutput(shift[7]);
    pnOutput[8] = GetOutput(shift[8]);
    pnOutput[9] = GetOutput(shift[9]);
    pnOutput[10] = GetOutput(shift[10]);
    pnOutput[11] = GetOutput(shift[11]);
    pnOutput[12] = GetOutput(shift[12]);
    pnOutput[13] = GetOutput(shift[13]);
    pnOutput[14] = GetOutput(shift[14]);
    pnOutput[15] = GetOutput(shift[15]);
#else
    // input: 0, 1,2 (X8)
    //        -,---,-
    // bits: 8,4,4,8 (X8)
    //       ---,---
    // output: 0, 1 (X8)

    // Split 24 bytes into 3 vectors (64 bit each)
    inD3 = vld3_u8(pcInput);

    // rshft4D0 contains 4 MSB of second vector (placed at offset 0)
    rshft4D = vshr_n_u8(inD3.val[1], 4);
    // lshft4D0 contains 4 LSB of second vector (placed at offset 4)
    lshft4D = vshl_n_u8(inD3.val[1], 4);

    // Expand 64 bit vectors to 128 bit (8 values of 16 bits)
    shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
    shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
    rshft4Q = vmovl_u8(rshft4D);
    lshft4Q = vmovl_u8(lshft4D);

    // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
    shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
    shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

    // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
    lshft4Q = vshlq_n_u16(lshft4Q, 4);
    shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

    // Interleave shift values to a single vector
    vst2q_u16(shift, shiftQ2);

    shift[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
    shift[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
    shift[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
    shift[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
    shift[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
    shift[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
    shift[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
    shift[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
    shift[8] = (((shift[8]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
    shift[9] = (((shift[9]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
    shift[10] = (((shift[10]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
    shift[11] = (((shift[11]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
    shift[12] = (((shift[12]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
    shift[13] = (((shift[13]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
    shift[14] = (((shift[14]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
    shift[15] = (((shift[15]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

    depth[0] = GetOutput(shift[0]);
    depth[1] = GetOutput(shift[1]);
    depth[2] = GetOutput(shift[2]);
    depth[3] = GetOutput(shift[3]);
    depth[4] = GetOutput(shift[4]);
    depth[5] = GetOutput(shift[5]);
    depth[6] = GetOutput(shift[6]);
    depth[7] = GetOutput(shift[7]);
    // Load
    depthQ = vld1q_u16(depth);
    // Store
    vst1q_u16(pnOutput, depthQ);
    // Load
    depthQ = vld1q_u16(shift);
    // Store
    vst1q_u16(pShiftOut, depthQ);

    depth[8] = GetOutput(shift[8]);
    depth[9] = GetOutput(shift[9]);
    depth[10] = GetOutput(shift[10]);
    depth[11] = GetOutput(shift[11]);
    depth[12] = GetOutput(shift[12]);
    depth[13] = GetOutput(shift[13]);
    depth[14] = GetOutput(shift[14]);
    depth[15] = GetOutput(shift[15]);
    // Load
    depthQ = vld1q_u16(depth + 8);
    // Store
    vst1q_u16(pnOutput + 8, depthQ);
    // Load
    depthQ = vld1q_u16(shift + 8);
    // Store
    vst1q_u16(pShiftOut + 8, depthQ);
#endif
    pcInput += XN_INPUT_ELEMENT_SIZE;
    pnOutput += 16;
    pShiftOut += 16;
  }

  *pnActualRead = (XnUInt32)(pcInput - pOrigInput);
  pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

  return XN_STATUS_OK;
}
/* Compile-only intrinsic coverage test (apparently from a compiler test
   suite): it is never executed, so the null-pointer load argument only
   has to type-check. */
void test_vld1Qu16 (void)
{
  uint16x8_t out_uint16x8_t;

  out_uint16x8_t = vld1q_u16 (0);
}
void meanStdDev(const Size2D &size, const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev) {
  internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
  size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
  f64 fsum = 0.0, fsqsum = 0.0;

  f32 arsum[8];
  uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
  float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

  for (size_t i = 0; i < size.height; ++i) {
    const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
    size_t j = 0u;

    while (j < roiw4) {
      // end index of this block (at most blockSize0 elements past j)
      size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
      v_sum = v_zero;
      v_sqsum = v_zero_f;

      for ( ; j + 16 < blockSize ; j += 16) {
        internal::prefetch(src + j);
        uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

        // 0
        uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
        uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
        v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
        float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
        float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
        v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
        v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

        // 1
        v_srclo = vmovl_u16(vget_low_u16(v_src1));
        v_srchi = vmovl_u16(vget_high_u16(v_src1));
        v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
        v_srclo_f = vcvtq_f32_u32(v_srclo);
        v_srchi_f = vcvtq_f32_u32(v_srchi);
        v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
        v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
      }

      for ( ; j < blockSize; j += 4) {
        uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
        float32x4_t v_src_f = vcvtq_f32_u32(v_src);
        v_sum = vaddq_u32(v_sum, v_src);
        v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
      }

      vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
      vst1q_f32(arsum + 4, v_sqsum);

      fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
      fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
    }

    // collect a few last elements in the current row
    for ( ; j < size.width; ++j) {
      f32 srcval = src[j];
      fsum += srcval;
      fsqsum += srcval * srcval;
    }
  }

  // calc mean and stddev
  f64 itotal = 1.0 / size.total();
  f64 mean = fsum * itotal;
  f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

  if (pMean) *pMean = (f32)mean;
  if (pStdDev) *pStdDev = (f32)stddev;
#else
  (void)size;
  (void)srcBase;
  (void)srcStride;
  (void)pMean;
  (void)pStdDev;
#endif
}
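Why block the accumulation at blockSize0 = 1 << 10? A rough bound (my arithmetic, assuming u16 input as here): both inner loops feed each u32 lane of v_sum one value per four elements, so a full block contributes at most 1024 / 4 = 256 values of at most 65535 per lane, i.e. at most 256 * 65535 = 16776960. That is far below 2^32, so the integer accumulator cannot wrap, and it is also below 2^24, so the vcvtq_f32_u32 round-trip through f32 on the way out is exact. The f32 square-sum lane similarly stays below 256 * 65535^2, about 1.1e12, and flushing both into the f64 running totals after every block keeps f32 rounding error from compounding across the whole image.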
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;
      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);
      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}
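All five width paths reduce to the same primitive, vrhaddq_u16, the rounding halving add. A scalar model of one lane (standard NEON semantics, stated here for clarity):

#include <stdint.h>

/* One lane of vrhaddq_u16: (a + b + 1) >> 1, computed in 32 bits so
 * the +1 cannot overflow. */
static inline uint16_t rhadd_u16(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a + (uint32_t)b + 1) >> 1);
}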
XnStatus Link12BitS2DParser::Unpack12to16(const XnUInt8* pcInput, XnUInt8* pDest,
                                          const XnUInt32 nInputSize,
                                          XnUInt32* pnActualRead,
                                          XnUInt32* pnActualWritten) {
  const XnUInt8* pOrigInput = pcInput;
  XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE;  // floored
  //XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
  *pnActualRead = 0;

  XnUInt16 *pnOutput = (XnUInt16*)pDest;
  XnUInt16 shift[16];
#ifdef XN_NEON
  XnUInt16 depth[16];
  uint8x8x3_t inD3;
  uint8x8_t rshft4D, lshft4D;
  uint16x8_t rshft4Q, lshft4Q;
  uint16x8_t depthQ;
  uint16x8x2_t shiftQ2;
#endif

  // Convert the 12bit packed data into 16bit shorts
  for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) {
#ifndef XN_NEON
    // input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
    //        -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
    // bits: 8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
    //       ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
    // output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
    shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
    shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
    shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
    shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
    shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
    shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
    shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
    shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
    shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
    shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
    shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
    shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
    shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
    shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
    shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

    pnOutput[0] = m_pShiftToDepth[(shift[0])];
    pnOutput[1] = m_pShiftToDepth[(shift[1])];
    pnOutput[2] = m_pShiftToDepth[(shift[2])];
    pnOutput[3] = m_pShiftToDepth[(shift[3])];
    pnOutput[4] = m_pShiftToDepth[(shift[4])];
    pnOutput[5] = m_pShiftToDepth[(shift[5])];
    pnOutput[6] = m_pShiftToDepth[(shift[6])];
    pnOutput[7] = m_pShiftToDepth[(shift[7])];
    pnOutput[8] = m_pShiftToDepth[(shift[8])];
    pnOutput[9] = m_pShiftToDepth[(shift[9])];
    pnOutput[10] = m_pShiftToDepth[(shift[10])];
    pnOutput[11] = m_pShiftToDepth[(shift[11])];
    pnOutput[12] = m_pShiftToDepth[(shift[12])];
    pnOutput[13] = m_pShiftToDepth[(shift[13])];
    pnOutput[14] = m_pShiftToDepth[(shift[14])];
    pnOutput[15] = m_pShiftToDepth[(shift[15])];
#else
    // input: 0, 1,2 (X8)
    //        -,---,-
    // bits: 8,4,4,8 (X8)
    //       ---,---
    // output: 0, 1 (X8)

    // Split 24 bytes into 3 vectors (64 bit each)
    inD3 = vld3_u8(pcInput);

    // rshft4D0 contains 4 MSB of second vector (placed at offset 0)
    rshft4D = vshr_n_u8(inD3.val[1], 4);
    // lshft4D0 contains 4 LSB of second vector (placed at offset 4)
    lshft4D = vshl_n_u8(inD3.val[1], 4);

    // Expand 64 bit vectors to 128 bit (8 values of 16 bits)
    shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
    shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
    rshft4Q = vmovl_u8(rshft4D);
    lshft4Q = vmovl_u8(lshft4D);

    // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
    shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
    shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

    // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
    lshft4Q = vshlq_n_u16(lshft4Q, 4);
    shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

    // Interleave shift values to a single vector
    vst2q_u16(shift, shiftQ2);

    depth[0] = m_pShiftToDepth[(shift[0])];
    depth[1] = m_pShiftToDepth[(shift[1])];
    depth[2] = m_pShiftToDepth[(shift[2])];
    depth[3] = m_pShiftToDepth[(shift[3])];
    depth[4] = m_pShiftToDepth[(shift[4])];
    depth[5] = m_pShiftToDepth[(shift[5])];
    depth[6] = m_pShiftToDepth[(shift[6])];
    depth[7] = m_pShiftToDepth[(shift[7])];
    // Load
    depthQ = vld1q_u16(depth);
    // Store
    vst1q_u16(pnOutput, depthQ);

    depth[8] = m_pShiftToDepth[(shift[8])];
    depth[9] = m_pShiftToDepth[(shift[9])];
    depth[10] = m_pShiftToDepth[(shift[10])];
    depth[11] = m_pShiftToDepth[(shift[11])];
    depth[12] = m_pShiftToDepth[(shift[12])];
    depth[13] = m_pShiftToDepth[(shift[13])];
    depth[14] = m_pShiftToDepth[(shift[14])];
    depth[15] = m_pShiftToDepth[(shift[15])];
    // Load
    depthQ = vld1q_u16(depth + 8);
    // Store
    vst1q_u16(pnOutput + 8, depthQ);
#endif
    pcInput += XN_INPUT_ELEMENT_SIZE;
    pnOutput += 16;
  }

  *pnActualRead = (XnUInt32)(pcInput - pOrigInput);  // total bytes
  *pnActualWritten = (XnUInt32)((XnUInt8*)pnOutput - pDest);

  return XN_STATUS_OK;
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput,
                                                const XnUInt32 nInputSize,
                                                XnUInt32* pnActualRead) {
  const XnUInt8* pOrigInput = pcInput;
  XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE;  // floored
  XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
  *pnActualRead = 0;
  XnBuffer* pWriteBuffer = GetWriteBuffer();

  // Check there is enough room for the depth pixels
  if (!CheckWriteBufferForOverflow(nNeededOutput)) {
    return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
  }

  XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer();
  XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
  XnUInt16 depth[8];
  uint16x8_t Q0;
#endif

  // Convert the 11bit packed data into 16bit shorts
  for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) {
    if (m_nScaleFactor > 1) {
      XnUInt32 px = m_nOffsetInFrame%m_CurrentVideoMode.resolutionX;
      XnUInt32 py = (m_nOffsetInFrame)/m_CurrentVideoMode.resolutionX;
      if (py%m_nScaleFactor != 0) {
        // Skip as many pixels as possible
        XnUInt32 nEltsToSkip = XN_MIN(nElements - nElem,
            (m_CurrentVideoMode.resolutionX - px)/8 +
            (m_nScaleFactor-(py%m_nScaleFactor) - 1)*m_CurrentVideoMode.resolutionX/8);
        // ::memset(pnOutput, 0, nEltsToSkip*8*sizeof(XnUInt16));
        pcInput += nEltsToSkip*XN_INPUT_ELEMENT_SIZE;
        pnOutput += nEltsToSkip*8;
        m_nOffsetInFrame += nEltsToSkip*8;
        nElem += (nEltsToSkip-1);
        continue;
      }
    }

    // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10
    //        -,---,---,-,---,---,---,-,---,---,-
    // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
    //       ---,---,-----,---,---,-----,---,---
    // output: 0, 1, 2, 3, 4, 5, 6, 7
    if (m_nScaleFactor == 2) {
      // Pixels skipped at this scale; zeroed so the NEON path below
      // never reads uninitialized values.
      a1 = a3 = a5 = a7 = 0;
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
    } else if (m_nScaleFactor == 4) {
      // Pixels skipped at this scale; zeroed for the same reason.
      a1 = a2 = a3 = a5 = a6 = a7 = 0;
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
    } else {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
      a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);
    }

#ifdef XN_NEON
    depth[0] = GetOutput(a0);
    depth[1] = GetOutput(a1);
    depth[2] = GetOutput(a2);
    depth[3] = GetOutput(a3);
    depth[4] = GetOutput(a4);
    depth[5] = GetOutput(a5);
    depth[6] = GetOutput(a6);
    depth[7] = GetOutput(a7);
    // Load
    Q0 = vld1q_u16(depth);
    // Store
    vst1q_u16(pnOutput, Q0);
    pnOutput += 8;
#else
    if (m_nScaleFactor == 2) {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = 0;
    } else if (m_nScaleFactor == 4) {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
    } else {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = GetOutput(a1);
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = GetOutput(a3);
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = GetOutput(a5);
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = GetOutput(a7);
    }
#endif
    pcInput += XN_INPUT_ELEMENT_SIZE;
    m_nOffsetInFrame+=8;
  }

  *pnActualRead = (XnUInt32)(pcInput - pOrigInput);
  pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

  return XN_STATUS_OK;
}
/* u16x8 mm mul */
void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T,
                          unsigned short * B, int Col, unsigned short * C) {
  int i, k, j;
  uint16x8_t neon_b, neon_c;
  uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
  uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

  for (i = 0; i < Row; i+=8) {
    for (k = 0; k < Col; k+=1) {
      neon_c = vmovq_n_u16(0);

      for (j = 0; j < T; j+=8) {
        int j_T = j * T + i;
        int k_Row = k * Row;

        neon_a0 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a1 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a2 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a3 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a4 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a5 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a6 = vld1q_u16(A + j_T);
        j_T+=Row;
        neon_a7 = vld1q_u16(A + j_T);

        neon_b = vld1q_u16(B + k_Row + j);
        neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
        neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
        neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
        neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
        neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
        neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
        neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
        neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

        neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
        neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
      }

      /* Store the accumulated 8x1 block of C once per (i, k) pair,
       * instead of eight redundant lane stores inside the inner loop. */
      vst1q_u16(C + k * Row + i, neon_c);
    }
  }
}
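As with the matrix-vector kernel earlier, the addressing here (`j * T + i` combined with a `Row` column stride, and `B + k * Row + j`) is self-consistent only for square matrices with T == Row. A scalar reference under the general column-major convention, with the same wrap-around u16 arithmetic (my sketch, not from the library):

/* Scalar reference for C = A * B, all matrices column-major:
 * A is Row x T, B is T x Col, C is Row x Col. */
void mw_mm_mul_u16_ref(const unsigned short* A, int Row, int T,
                       const unsigned short* B, int Col, unsigned short* C) {
  for (int k = 0; k < Col; k++) {
    for (int i = 0; i < Row; i++) {
      unsigned short acc = 0;  /* wraps mod 2^16 like vmulq_u16/vaddq_u16 */
      for (int j = 0; j < T; j++) {
        acc = (unsigned short)(acc + (unsigned short)(A[j * Row + i] * B[k * T + j]));
      }
      C[k * Row + i] = acc;
    }
  }
}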
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;

  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //   for (i = 0; i < PART_LEN1; i++) {
  //     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //     (*far_energy) += (uint32_t)(far_spectrum[i]);
  //     *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //     (*echo_energy_stored) += (uint32_t)echo_est[i];
  //   }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
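AddLanes is the only helper not shown here. A sketch of its assumed shape (the real WebRTC helper may differ, e.g. it can use vaddvq_u32 on AArch64): horizontally add the four u32 lanes and store the scalar result.

#include <arm_neon.h>
#include <stdint.h>

/* Assumed horizontal-add helper: *ptr = lane0 + lane1 + lane2 + lane3. */
static void AddLanes(uint32_t* ptr, uint32x4_t v) {
  uint32x2_t tmp = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  tmp = vpadd_u32(tmp, tmp);
  *ptr = vget_lane_u32(tmp, 0);
}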
inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
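Carotene hides each typed NEON load behind an overloaded vld1q so template code can stay type-agnostic; presumably its headers pair every such load with a matching store overload in the same style, along the lines of:

/* Assumed u16 store counterpart of the wrapper above (same pattern). */
inline void vst1q(u16 * ptr, const uint16x8_t & v) { vst1q_u16(ptr, v); }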