Example #1
0
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
Example #2
0
/* u16x8 mv mul */
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C)
{
	int i = 0;
	int k = 0;

	uint16x8_t neon_b, neon_c;
	uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
	uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

	for (i = 0; i < Row; i+=8)
	{
		neon_c = vmovq_n_u16(0);

		for (k = 0; k < T; k+=8)
		{
			int j = k * T + i;

			neon_a0 = vld1q_u16(A + j);
			j+=Row;
			neon_a1 = vld1q_u16(A + j);
			j+=Row;
			neon_a2 = vld1q_u16(A + j);
			j+=Row;
			neon_a3 = vld1q_u16(A + j);
			j+=Row;
			neon_a4 = vld1q_u16(A + j);
			j+=Row;
			neon_a5 = vld1q_u16(A + j);
			j+=Row;
			neon_a6 = vld1q_u16(A + j);
			j+=Row;
			neon_a7 = vld1q_u16(A + j);

			neon_b = vld1q_u16(B + k);
			neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
			neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
			neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
			neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
			neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
			neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
			neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
			neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

			neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);

		}

		vst1q_u16(C + i, neon_c);
	}
}
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                                   SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    colA = SkAlpha255To256(colA);

    uint8x8_t vcolR, vcolG, vcolB;
    uint16x8_t vcolA;

    if (width >= 8) {
        vcolA = vdupq_n_u16(colA);
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
Example #4
0
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm,
                                         const uint16_t* far_spectrum,
                                         int32_t* echo_est) {
  assert((uintptr_t)echo_est % 32 == 0);
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);

  // This is C code of following optimized code.
  // During startup we store the channel every block.
  //  memcpy(aecm->channelStored,
  //         aecm->channelAdapt16,
  //         sizeof(int16_t) * PART_LEN1);
  // Recalculate echo estimate
  //  for (i = 0; i < PART_LEN; i += 4) {
  //    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                        far_spectrum[i]);
  //    echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
  //                                            far_spectrum[i + 1]);
  //    echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
  //                                            far_spectrum[i + 2]);
  //    echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
  //                                            far_spectrum[i + 3]);
  //  }
  //  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                     far_spectrum[i]);
  const uint16_t* far_spectrum_p = far_spectrum;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int16_t* start_stored_p = aecm->channelStored;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  int32_t* echo_est_p = echo_est;

  uint16x8_t far_spectrum_v;
  int16x8_t adapt_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;

  while (start_stored_p < end_stored_p) {
    far_spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);

    vst1q_s16(start_stored_p, adapt_v);

    echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
    echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));

    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    far_spectrum_p += 8;
    start_adapt_p += 8;
    start_stored_p += 8;
    echo_est_p += 8;
  }
  aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN];
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
}
Example #5
0
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
  vst1q_u16(*dest, c0);
  vst1q_u16(*dest + 8, c1);
  vst1q_u16(*dest + 16, c2);
  vst1q_u16(*dest + 24, c3);
  *dest += stride;
}
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res,
                                                     const int16x8_t max) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  const int16x8_t c0 = vminq_s16(b0, max);
  const int16x8_t c1 = vminq_s16(b1, max);
  const int16x8_t c2 = vminq_s16(b2, max);
  const int16x8_t c3 = vminq_s16(b3, max);
  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
  *dest += stride;
}
Example #8
0
/* u16x8 saturated sub */
void mw_neon_mm_qsub_u16x8(unsigned short * A, int Row, int Col, unsigned short * B, unsigned short * C)
{
	uint16x8_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 8; i <= size ; i+=8)
	{
		k = i - 8;
		neon_a = vld1q_u16(A + k);
		neon_b = vld1q_u16(B + k);
		neon_c = vqsubq_u16(neon_a, neon_b);
		vst1q_u16(C + k, neon_c);
	}

	k = i - 8;
    for (i = 0; i < size % 8; i++)
	{
		C[k + i] = A[k + i] + B[k + i];
	}
}
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                        SkColor color, int width,
                                        SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    if (width >= 8) {
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckDepthBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pShiftOut = GetShiftsOutputBuffer();
	XnUInt16* pnOutput = GetDepthOutputBuffer();

	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	XnUInt16 shift[8];
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
		// input:	0,  1,  2,3,  4,  5,  6,7,  8,  9,10
		//			-,---,---,-,---,---,---,-,---,---,-
		// bits:	8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//			---,---,-----,---,---,-----,---,---
		// output:	  0,  1,    2,  3,  4,    5,  6,  7

		a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
		a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
		a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
		a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
		a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
		a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
		a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
		a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);


#ifdef XN_NEON
		shift[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		shift[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		shift[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		shift[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		shift[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		shift[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		shift[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		shift[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);

		// Load
		Q0 = vld1q_u16(shift);
		// Store
		vst1q_u16(pShiftOut, Q0);
#else
		pShiftOut[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		pShiftOut[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		pShiftOut[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		pShiftOut[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		pShiftOut[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		pShiftOut[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		pShiftOut[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		pShiftOut[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		pnOutput[0] = GetOutput(a0);
		pnOutput[1] = GetOutput(a1);
		pnOutput[2] = GetOutput(a2);
		pnOutput[3] = GetOutput(a3);
		pnOutput[4] = GetOutput(a4);
		pnOutput[5] = GetOutput(a5);
		pnOutput[6] = GetOutput(a6);
		pnOutput[7] = GetOutput(a7);

#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
		pShiftOut += 8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
XnStatus XnPacked12DepthProcessor::Unpack12to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
    const XnUInt8* pOrigInput = pcInput;

    XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
    XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

    *pnActualRead = 0;
    XnBuffer* pWriteBuffer = GetWriteBuffer();

    if (!CheckDepthBufferForOverflow(nNeededOutput))
    {
        return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
    }

    XnUInt16* pnOutput = GetDepthOutputBuffer();
    XnUInt16* pShiftOut = GetShiftsOutputBuffer();
    XnUInt16 shift[16];
#ifdef XN_NEON
    XnUInt16 depth[16];
    uint8x8x3_t inD3;
    uint8x8_t rshft4D, lshft4D;
    uint16x8_t rshft4Q, lshft4Q;
    uint16x8_t depthQ;
    uint16x8x2_t shiftQ2;
#endif

    // Convert the 11bit packed data into 16bit shorts
    for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
    {
#ifndef XN_NEON
        // input:	0,  1,2,3,  4,5,6,  7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
        //			-,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
        // bits:	8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
        //			---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
        // output:	  0,  1,  2,  3,  4,  5,  6,   7,   8,   9,  10,  11,  12,  13,  14,  15

        shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
        shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
        shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
        shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
        shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
        shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
        shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
        shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
        shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
        shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
        shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
        shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
        shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
        shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
        shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
        shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

        pShiftOut[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
        pShiftOut[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
        pShiftOut[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
        pShiftOut[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
        pShiftOut[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
        pShiftOut[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
        pShiftOut[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
        pShiftOut[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
        pShiftOut[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
        pShiftOut[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
        pShiftOut[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
        pShiftOut[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
        pShiftOut[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
        pShiftOut[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
        pShiftOut[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
        pShiftOut[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

        pnOutput[0] = GetOutput(shift[0]);
        pnOutput[1] = GetOutput(shift[1]);
        pnOutput[2] = GetOutput(shift[2]);
        pnOutput[3] = GetOutput(shift[3]);
        pnOutput[4] = GetOutput(shift[4]);
        pnOutput[5] = GetOutput(shift[5]);
        pnOutput[6] = GetOutput(shift[6]);
        pnOutput[7] = GetOutput(shift[7]);
        pnOutput[8] = GetOutput(shift[8]);
        pnOutput[9] = GetOutput(shift[9]);
        pnOutput[10] = GetOutput(shift[10]);
        pnOutput[11] = GetOutput(shift[11]);
        pnOutput[12] = GetOutput(shift[12]);
        pnOutput[13] = GetOutput(shift[13]);
        pnOutput[14] = GetOutput(shift[14]);
        pnOutput[15] = GetOutput(shift[15]);

#else
        // input:	0,  1,2    (X8)
        //			-,---,-
        // bits:	8,4,4,8    (X8)
        //			---,---
        // output:	  0,  1    (X8)

        // Split 24 bytes into 3 vectors (64 bit each)
        inD3 = vld3_u8(pcInput);

        // rshft4D0 contains 4 MSB of second vector (placed at offset 0)
        rshft4D = vshr_n_u8(inD3.val[1], 4);
        // lshft4D0 contains 4 LSB of second vector (placed at offset 4)
        lshft4D = vshl_n_u8(inD3.val[1], 4);

        // Expand 64 bit vectors to 128 bit (8 values of 16 bits)
        shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
        shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
        rshft4Q = vmovl_u8(rshft4D);
        lshft4Q = vmovl_u8(lshft4D);

        // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
        shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
        shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

        // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
        lshft4Q = vshlq_n_u16(lshft4Q, 4);
        shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

        // Interleave shift values to a single vector
        vst2q_u16(shift, shiftQ2);

        shift[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
        shift[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
        shift[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
        shift[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
        shift[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
        shift[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
        shift[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
        shift[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
        shift[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
        shift[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
        shift[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
        shift[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
        shift[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
        shift[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
        shift[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
        shift[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

        depth[0] = GetOutput(shift[0]);
        depth[1] = GetOutput(shift[1]);

        depth[2] = GetOutput(shift[2]);
        depth[3] = GetOutput(shift[3]);

        depth[4] = GetOutput(shift[4]);
        depth[5] = GetOutput(shift[5]);

        depth[6] = GetOutput(shift[6]);
        depth[7] = GetOutput(shift[7]);

        // Load
        depthQ = vld1q_u16(depth);
        //Store
        vst1q_u16(pnOutput, depthQ);

        // Load
        depthQ = vld1q_u16(shift);
        // Store
        vst1q_u16(pShiftOut, depthQ);

        depth[8] = GetOutput(shift[8]);
        depth[9] = GetOutput(shift[9]);

        depth[10] = GetOutput(shift[10]);
        depth[11] = GetOutput(shift[11]);

        depth[12] = GetOutput(shift[12]);
        depth[13] = GetOutput(shift[13]);

        depth[14] = GetOutput(shift[14]);
        depth[15] = GetOutput(shift[15]);

        // Load
        depthQ = vld1q_u16(depth + 8);
        // Store
        vst1q_u16(pnOutput + 8, depthQ);

        // Load
        depthQ = vld1q_u16(shift + 8);
        // Store
        vst1q_u16(pShiftOut + 8, depthQ);

#endif

        pcInput += XN_INPUT_ELEMENT_SIZE;
        pnOutput += 16;
        pShiftOut += 16;
    }


    *pnActualRead = (XnUInt32)(pcInput - pOrigInput);
    pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

    return XN_STATUS_OK;
}
void test_vld1Qu16 (void)
{
  uint16x8_t out_uint16x8_t;

  out_uint16x8_t = vld1q_u16 (0);
}
Example #13
0
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);

      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;

      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);

      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;

      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);

      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}
XnStatus Link12BitS2DParser::Unpack12to16(const XnUInt8* pcInput,XnUInt8* pDest, const XnUInt32 nInputSize, XnUInt32* pnActualRead, XnUInt32* pnActualWritten)
{
	const XnUInt8* pOrigInput = (XnUInt8*)pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	//XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	
	*pnActualRead = 0;

	XnUInt16 *pnOutput = (XnUInt16*)pDest;
	XnUInt16 shift[16];
#ifdef XN_NEON
	XnUInt16 depth[16];
	uint8x8x3_t inD3;
	uint8x8_t rshft4D, lshft4D;
	uint16x8_t rshft4Q, lshft4Q;
	uint16x8_t depthQ;
	uint16x8x2_t shiftQ2;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
#ifndef XN_NEON
		// input:	0,  1,2,3,  4,5,6,  7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
		//			-,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
		// bits:	8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
		//			---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
		// output:	  0,  1,  2,  3,  4,  5,  6,   7,   8,   9,  10,  11,  12,  13,  14,  15

		shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
		shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
		shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
		shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
		shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
		shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
		shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
		shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
		shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
		shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
		shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
		shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
		shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
		shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
		shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
		shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

		pnOutput[0] = m_pShiftToDepth[(shift[0])];
		pnOutput[1] = m_pShiftToDepth[(shift[1])];
		pnOutput[2] = m_pShiftToDepth[(shift[2])];
		pnOutput[3] = m_pShiftToDepth[(shift[3])];
		pnOutput[4] = m_pShiftToDepth[(shift[4])];
		pnOutput[5] = m_pShiftToDepth[(shift[5])];
		pnOutput[6] = m_pShiftToDepth[(shift[6])];
		pnOutput[7] = m_pShiftToDepth[(shift[7])];
		pnOutput[8] = m_pShiftToDepth[(shift[8])];
		pnOutput[9] = m_pShiftToDepth[(shift[9])];
		pnOutput[10] = m_pShiftToDepth[(shift[10])];
		pnOutput[11] = m_pShiftToDepth[(shift[11])];
		pnOutput[12] = m_pShiftToDepth[(shift[12])];
		pnOutput[13] = m_pShiftToDepth[(shift[13])];
		pnOutput[14] = m_pShiftToDepth[(shift[14])];
		pnOutput[15] = m_pShiftToDepth[(shift[15])];
#else
		// input:	0,  1,2    (X8)
		//			-,---,-
		// bits:	8,4,4,8    (X8)
		//			---,---
		// output:	  0,  1    (X8)

		// Split 24 bytes into 3 vectors (64 bit each)
		inD3 = vld3_u8(pcInput);

		// rshft4D0 contains 4 MSB of second vector (placed at offset 0)
		rshft4D = vshr_n_u8(inD3.val[1], 4);
		// lshft4D0 contains 4 LSB of second vector (placed at offset 4)
		lshft4D = vshl_n_u8(inD3.val[1], 4);

		// Expand 64 bit vectors to 128 bit (8 values of 16 bits)
		shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
		shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
		rshft4Q = vmovl_u8(rshft4D);
		lshft4Q = vmovl_u8(lshft4D);

		// Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
		shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
		shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

		// Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
		lshft4Q = vshlq_n_u16(lshft4Q, 4);
		shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

		// Interleave shift values to a single vector
		vst2q_u16(shift, shiftQ2);

		depth[0] = m_pShiftToDepth[(shift[0])];
		depth[1] = m_pShiftToDepth[(shift[1])];

		depth[2] = m_pShiftToDepth[(shift[2])];
		depth[3] = m_pShiftToDepth[(shift[3])];

		depth[4] = m_pShiftToDepth[(shift[4])];
		depth[5] = m_pShiftToDepth[(shift[5])];

		depth[6] = m_pShiftToDepth[(shift[6])];
		depth[7] = m_pShiftToDepth[(shift[7])];

		// Load
		depthQ = vld1q_u16(depth);
		//Store
		vst1q_u16(pnOutput, depthQ);

		depth[8] = m_pShiftToDepth[(shift[8])];
		depth[9] = m_pShiftToDepth[(shift[9])];

		depth[10] = m_pShiftToDepth[(shift[10])];
		depth[11] = m_pShiftToDepth[(shift[11])];

		depth[12] = m_pShiftToDepth[(shift[12])];
		depth[13] = m_pShiftToDepth[(shift[13])];

		depth[14] = m_pShiftToDepth[(shift[14])];
		depth[15] = m_pShiftToDepth[(shift[15])];

		// Load
		depthQ = vld1q_u16(depth + 8);
		// Store
		vst1q_u16(pnOutput + 8, depthQ);
#endif
		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 16;
	}
	
	*pnActualRead = (XnUInt32)(pcInput - pOrigInput); // total bytes 
	*pnActualWritten = (XnUInt32)((XnUInt8*)pnOutput - pDest);

	return XN_STATUS_OK;
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckWriteBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer();

	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
    if(m_nScaleFactor > 1)
    {
      XnUInt32 px = m_nOffsetInFrame%m_CurrentVideoMode.resolutionX;
      XnUInt32 py = (m_nOffsetInFrame)/m_CurrentVideoMode.resolutionX;

      if(py%m_nScaleFactor != 0)
      {
        // Skip as many pixels as possible
        XnUInt32 nEltsToSkip =
            XN_MIN(nElements - nElem,
                   (m_CurrentVideoMode.resolutionX - px)/8
                   + (m_nScaleFactor-(py%m_nScaleFactor) - 1)*m_CurrentVideoMode.resolutionX/8);

        //      ::memset(pnOutput, 0, nEltsToSkip*8*sizeof(XnUInt16));
        pcInput += nEltsToSkip*XN_INPUT_ELEMENT_SIZE;
        pnOutput += nEltsToSkip*8;
        m_nOffsetInFrame += nEltsToSkip*8;
        nElem += (nEltsToSkip-1);
        continue;
      }
    }

    // input:  0,  1,  2,3,  4,  5,  6,7,  8,  9,10
    //         -,---,---,-,---,---,---,-,---,---,-
    // bits:   8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
    //         ---,---,-----,---,---,-----,---,---
    // output:   0,  1,    2,  3,  4,    5,  6,  7
    if(m_nScaleFactor == 2)
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
    }
    else if(m_nScaleFactor == 4)
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
    }
    else
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
      a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);
    }



#ifdef XN_NEON
		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);
#else
    if(m_nScaleFactor == 2)
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = 0;
    }
    else if(m_nScaleFactor == 4)
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
    }
    else
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = GetOutput(a1);
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = GetOutput(a3);
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = GetOutput(a5);
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = GetOutput(a7);
    }
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
    m_nOffsetInFrame+=8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
Example #17
0
/* u16x8 mm mul */
void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, int Col, unsigned short * C)
{
	int i, k, j;

	uint16x8_t neon_b, neon_c;
	uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
	uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

	for (i = 0; i < Row; i+=8)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_u16(0);

			for (j = 0; j < T; j+=8)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a4 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a5 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a6 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a7 = vld1q_u16(A + j_T);

				neon_b = vld1q_u16(B + k_Row + j);
				neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
				neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
				neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
				neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
				neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
				neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
				neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
				neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

				neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);

				vst1q_lane_u16(C + k_Row + i, neon_c, 0);
				vst1q_lane_u16(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_u16(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_u16(C + k_Row + i + 3, neon_c, 3);
				vst1q_lane_u16(C + k_Row + i + 4, neon_c, 4);
				vst1q_lane_u16(C + k_Row + i + 5, neon_c, 5);
				vst1q_lane_u16(C + k_Row + i + 6, neon_c, 6);
				vst1q_lane_u16(C + k_Row + i + 7, neon_c, 7);

			}
		}
	}
}
Example #18
0
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //  for (i = 0; i < PART_LEN1; i++) {
  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
  //  }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
Example #19
0
inline  uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }