XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckDepthBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pShiftOut = GetShiftsOutputBuffer();
	XnUInt16* pnOutput = GetDepthOutputBuffer();

	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	XnUInt16 shift[8];
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
		// input:	0,  1,  2,3,  4,  5,  6,7,  8,  9,10
		//			-,---,---,-,---,---,---,-,---,---,-
		// bits:	8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//			---,---,-----,---,---,-----,---,---
		// output:	  0,  1,    2,  3,  4,    5,  6,  7

		a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
		a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
		a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
		a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
		a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
		a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
		a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
		a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);


#ifdef XN_NEON
		shift[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		shift[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		shift[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		shift[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		shift[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		shift[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		shift[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		shift[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);

		// Load
		Q0 = vld1q_u16(shift);
		// Store
		vst1q_u16(pShiftOut, Q0);
#else
		pShiftOut[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0);
		pShiftOut[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0);
		pShiftOut[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0);
		pShiftOut[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0);
		pShiftOut[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0);
		pShiftOut[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0);
		pShiftOut[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0);
		pShiftOut[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0);

		pnOutput[0] = GetOutput(a0);
		pnOutput[1] = GetOutput(a1);
		pnOutput[2] = GetOutput(a2);
		pnOutput[3] = GetOutput(a3);
		pnOutput[4] = GetOutput(a4);
		pnOutput[5] = GetOutput(a5);
		pnOutput[6] = GetOutput(a6);
		pnOutput[7] = GetOutput(a7);

#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
		pShiftOut += 8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
예제 #2
0
XnStatus XnPacked12DepthProcessor::Unpack12to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
    const XnUInt8* pOrigInput = pcInput;

    XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
    XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

    *pnActualRead = 0;
    XnBuffer* pWriteBuffer = GetWriteBuffer();

    if (!CheckDepthBufferForOverflow(nNeededOutput))
    {
        return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
    }

    XnUInt16* pnOutput = GetDepthOutputBuffer();
    XnUInt16* pShiftOut = GetShiftsOutputBuffer();
    XnUInt16 shift[16];
#ifdef XN_NEON
    XnUInt16 depth[16];
    uint8x8x3_t inD3;
    uint8x8_t rshft4D, lshft4D;
    uint16x8_t rshft4Q, lshft4Q;
    uint16x8_t depthQ;
    uint16x8x2_t shiftQ2;
#endif

    // Convert the 11bit packed data into 16bit shorts
    for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
    {
#ifndef XN_NEON
        // input:	0,  1,2,3,  4,5,6,  7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
        //			-,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
        // bits:	8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
        //			---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
        // output:	  0,  1,  2,  3,  4,  5,  6,   7,   8,   9,  10,  11,  12,  13,  14,  15

        shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
        shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
        shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
        shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
        shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
        shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
        shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
        shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
        shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
        shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
        shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
        shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
        shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
        shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
        shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
        shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

        pShiftOut[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
        pShiftOut[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
        pShiftOut[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
        pShiftOut[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
        pShiftOut[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
        pShiftOut[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
        pShiftOut[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
        pShiftOut[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
        pShiftOut[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
        pShiftOut[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
        pShiftOut[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
        pShiftOut[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
        pShiftOut[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
        pShiftOut[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
        pShiftOut[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
        pShiftOut[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

        pnOutput[0] = GetOutput(shift[0]);
        pnOutput[1] = GetOutput(shift[1]);
        pnOutput[2] = GetOutput(shift[2]);
        pnOutput[3] = GetOutput(shift[3]);
        pnOutput[4] = GetOutput(shift[4]);
        pnOutput[5] = GetOutput(shift[5]);
        pnOutput[6] = GetOutput(shift[6]);
        pnOutput[7] = GetOutput(shift[7]);
        pnOutput[8] = GetOutput(shift[8]);
        pnOutput[9] = GetOutput(shift[9]);
        pnOutput[10] = GetOutput(shift[10]);
        pnOutput[11] = GetOutput(shift[11]);
        pnOutput[12] = GetOutput(shift[12]);
        pnOutput[13] = GetOutput(shift[13]);
        pnOutput[14] = GetOutput(shift[14]);
        pnOutput[15] = GetOutput(shift[15]);

#else
        // input:	0,  1,2    (X8)
        //			-,---,-
        // bits:	8,4,4,8    (X8)
        //			---,---
        // output:	  0,  1    (X8)

        // Split 24 bytes into 3 vectors (64 bit each)
        inD3 = vld3_u8(pcInput);

        // rshft4D0 contains 4 MSB of second vector (placed at offset 0)
        rshft4D = vshr_n_u8(inD3.val[1], 4);
        // lshft4D0 contains 4 LSB of second vector (placed at offset 4)
        lshft4D = vshl_n_u8(inD3.val[1], 4);

        // Expand 64 bit vectors to 128 bit (8 values of 16 bits)
        shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
        shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
        rshft4Q = vmovl_u8(rshft4D);
        lshft4Q = vmovl_u8(lshft4D);

        // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
        shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
        shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

        // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
        lshft4Q = vshlq_n_u16(lshft4Q, 4);
        shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

        // Interleave shift values to a single vector
        vst2q_u16(shift, shiftQ2);

        shift[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0);
        shift[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0);
        shift[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0);
        shift[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0);
        shift[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0);
        shift[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0);
        shift[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0);
        shift[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0);
        shift[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0);
        shift[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0);
        shift[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0);
        shift[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0);
        shift[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0);
        shift[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0);
        shift[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0);
        shift[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0);

        depth[0] = GetOutput(shift[0]);
        depth[1] = GetOutput(shift[1]);

        depth[2] = GetOutput(shift[2]);
        depth[3] = GetOutput(shift[3]);

        depth[4] = GetOutput(shift[4]);
        depth[5] = GetOutput(shift[5]);

        depth[6] = GetOutput(shift[6]);
        depth[7] = GetOutput(shift[7]);

        // Load
        depthQ = vld1q_u16(depth);
        //Store
        vst1q_u16(pnOutput, depthQ);

        // Load
        depthQ = vld1q_u16(shift);
        // Store
        vst1q_u16(pShiftOut, depthQ);

        depth[8] = GetOutput(shift[8]);
        depth[9] = GetOutput(shift[9]);

        depth[10] = GetOutput(shift[10]);
        depth[11] = GetOutput(shift[11]);

        depth[12] = GetOutput(shift[12]);
        depth[13] = GetOutput(shift[13]);

        depth[14] = GetOutput(shift[14]);
        depth[15] = GetOutput(shift[15]);

        // Load
        depthQ = vld1q_u16(depth + 8);
        // Store
        vst1q_u16(pnOutput + 8, depthQ);

        // Load
        depthQ = vld1q_u16(shift + 8);
        // Store
        vst1q_u16(pShiftOut + 8, depthQ);

#endif

        pcInput += XN_INPUT_ELEMENT_SIZE;
        pnOutput += 16;
        pShiftOut += 16;
    }


    *pnActualRead = (XnUInt32)(pcInput - pOrigInput);
    pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

    return XN_STATUS_OK;
}
예제 #3
0
int  Unpack11to16(const unsigned char* pcInput, unsigned short* pnOutput, const unsigned long  nInputSize)
{
	const unsigned char* pOrigInput = pcInput;
	uint8x8_t inputfield;
	uint16x4_t shiftfield;
	uint16_t test[4];

	unsigned long nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	unsigned long nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	// Convert the 11bit packed data into 16bit shorts
	for (unsigned long nElem = 0; nElem < nElements; ++nElem)
	{
		// input:	0,  1,  2,3,  4,  5,  6,7,  8,  9,10
		//		-,---,---,-,---,---,---,-,---,---,-
		// bits:	8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//		---,---,-----,---,---,-----,---,---
		// output:	  0,  1,    2,  3,  4,    5,  6,  7
#ifdef NEON
	        // Load 64 bits of data
		inputfield = vld1_u8(pcInput);
		// Reverse it since the endianess is wrong.
		inputfield = vrev16_u8(inputfield);

		// Debug -- let's make sure it looks ok by looking at 
		// it as a 16-bit element since that is ultimately what we want
		vst1_u16(test, inputfield);
		printf("i %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// Right shift by 5 bits to aling the first half-word
		// *note this does not compile since the compiler cannot deal with this 
		// conversion for some reason. It can deal with vshr_n_u32() and lower.
		// print out the results
		shiftfield = vshr_n_u64(inputfield, 5);
		vst1_u16( test,shiftfield);
		printf("1 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);
		
		// Right shift by 10 bits to aling the second half-word
		// print out the results
		shiftfield = vshr_n_u32(inputfield, 10);
		vst1_u16( test,shiftfield);
		printf("2 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// Right shift by 15 bits to aling the third half-word
		// print out the results
		shiftfield = vshr_n_u32(inputfield, 15);
		vst1_u16( test,shiftfield);
		printf("3 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// we would continue for all 8 half-word results
		
#else
		// This is the original Primesense code...
		// shift the output by 5 bits to the right to align 11 bits on the 16 bit field
		vsri_n_u64(leftfield, shiftfield, 5);

		vst1_u64((uint64_t*)pnOutput, shiftfield);

		pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5));
		pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2));
		pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7));
		pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4));
		pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1));
		pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6));
		pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3));
		pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0));
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
	}

	return (pcInput - pOrigInput);
}
XnStatus Link12BitS2DParser::Unpack12to16(const XnUInt8* pcInput,XnUInt8* pDest, const XnUInt32 nInputSize, XnUInt32* pnActualRead, XnUInt32* pnActualWritten)
{
	const XnUInt8* pOrigInput = (XnUInt8*)pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	//XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;
	
	*pnActualRead = 0;

	XnUInt16 *pnOutput = (XnUInt16*)pDest;
	XnUInt16 shift[16];
#ifdef XN_NEON
	XnUInt16 depth[16];
	uint8x8x3_t inD3;
	uint8x8_t rshft4D, lshft4D;
	uint16x8_t rshft4Q, lshft4Q;
	uint16x8_t depthQ;
	uint16x8x2_t shiftQ2;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
#ifndef XN_NEON
		// input:	0,  1,2,3,  4,5,6,  7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23
		//			-,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,--
		// bits:	8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8
		//			---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,----
		// output:	  0,  1,  2,  3,  4,  5,  6,   7,   8,   9,  10,  11,  12,  13,  14,  15

		shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4);
		shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0);
		shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4);
		shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0);
		shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4);
		shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0);
		shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4);
		shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0);
		shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4);
		shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0);
		shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4);
		shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0);
		shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4);
		shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0);
		shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4);
		shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0);

		pnOutput[0] = m_pShiftToDepth[(shift[0])];
		pnOutput[1] = m_pShiftToDepth[(shift[1])];
		pnOutput[2] = m_pShiftToDepth[(shift[2])];
		pnOutput[3] = m_pShiftToDepth[(shift[3])];
		pnOutput[4] = m_pShiftToDepth[(shift[4])];
		pnOutput[5] = m_pShiftToDepth[(shift[5])];
		pnOutput[6] = m_pShiftToDepth[(shift[6])];
		pnOutput[7] = m_pShiftToDepth[(shift[7])];
		pnOutput[8] = m_pShiftToDepth[(shift[8])];
		pnOutput[9] = m_pShiftToDepth[(shift[9])];
		pnOutput[10] = m_pShiftToDepth[(shift[10])];
		pnOutput[11] = m_pShiftToDepth[(shift[11])];
		pnOutput[12] = m_pShiftToDepth[(shift[12])];
		pnOutput[13] = m_pShiftToDepth[(shift[13])];
		pnOutput[14] = m_pShiftToDepth[(shift[14])];
		pnOutput[15] = m_pShiftToDepth[(shift[15])];
#else
		// input:	0,  1,2    (X8)
		//			-,---,-
		// bits:	8,4,4,8    (X8)
		//			---,---
		// output:	  0,  1    (X8)

		// Split 24 bytes into 3 vectors (64 bit each)
		inD3 = vld3_u8(pcInput);

		// rshft4D0 contains 4 MSB of second vector (placed at offset 0)
		rshft4D = vshr_n_u8(inD3.val[1], 4);
		// lshft4D0 contains 4 LSB of second vector (placed at offset 4)
		lshft4D = vshl_n_u8(inD3.val[1], 4);

		// Expand 64 bit vectors to 128 bit (8 values of 16 bits)
		shiftQ2.val[0] = vmovl_u8(inD3.val[0]);
		shiftQ2.val[1] = vmovl_u8(inD3.val[2]);
		rshft4Q = vmovl_u8(rshft4D);
		lshft4Q = vmovl_u8(lshft4D);

		// Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector
		shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4);
		shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q);

		// Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector
		lshft4Q = vshlq_n_u16(lshft4Q, 4);
		shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q);

		// Interleave shift values to a single vector
		vst2q_u16(shift, shiftQ2);

		depth[0] = m_pShiftToDepth[(shift[0])];
		depth[1] = m_pShiftToDepth[(shift[1])];

		depth[2] = m_pShiftToDepth[(shift[2])];
		depth[3] = m_pShiftToDepth[(shift[3])];

		depth[4] = m_pShiftToDepth[(shift[4])];
		depth[5] = m_pShiftToDepth[(shift[5])];

		depth[6] = m_pShiftToDepth[(shift[6])];
		depth[7] = m_pShiftToDepth[(shift[7])];

		// Load
		depthQ = vld1q_u16(depth);
		//Store
		vst1q_u16(pnOutput, depthQ);

		depth[8] = m_pShiftToDepth[(shift[8])];
		depth[9] = m_pShiftToDepth[(shift[9])];

		depth[10] = m_pShiftToDepth[(shift[10])];
		depth[11] = m_pShiftToDepth[(shift[11])];

		depth[12] = m_pShiftToDepth[(shift[12])];
		depth[13] = m_pShiftToDepth[(shift[13])];

		depth[14] = m_pShiftToDepth[(shift[14])];
		depth[15] = m_pShiftToDepth[(shift[15])];

		// Load
		depthQ = vld1q_u16(depth + 8);
		// Store
		vst1q_u16(pnOutput + 8, depthQ);
#endif
		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 16;
	}
	
	*pnActualRead = (XnUInt32)(pcInput - pOrigInput); // total bytes 
	*pnActualWritten = (XnUInt32)((XnUInt8*)pnOutput - pDest);

	return XN_STATUS_OK;
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	// Check there is enough room for the depth pixels
	if (!CheckWriteBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer();

	XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7;
#ifdef XN_NEON
	XnUInt16 depth[8];
	uint16x8_t Q0;
#endif

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
    if(m_nScaleFactor > 1)
    {
      XnUInt32 px = m_nOffsetInFrame%m_CurrentVideoMode.resolutionX;
      XnUInt32 py = (m_nOffsetInFrame)/m_CurrentVideoMode.resolutionX;

      if(py%m_nScaleFactor != 0)
      {
        // Skip as many pixels as possible
        XnUInt32 nEltsToSkip =
            XN_MIN(nElements - nElem,
                   (m_CurrentVideoMode.resolutionX - px)/8
                   + (m_nScaleFactor-(py%m_nScaleFactor) - 1)*m_CurrentVideoMode.resolutionX/8);

        //      ::memset(pnOutput, 0, nEltsToSkip*8*sizeof(XnUInt16));
        pcInput += nEltsToSkip*XN_INPUT_ELEMENT_SIZE;
        pnOutput += nEltsToSkip*8;
        m_nOffsetInFrame += nEltsToSkip*8;
        nElem += (nEltsToSkip-1);
        continue;
      }
    }

    // input:  0,  1,  2,3,  4,  5,  6,7,  8,  9,10
    //         -,---,---,-,---,---,---,-,---,---,-
    // bits:   8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
    //         ---,---,-----,---,---,-----,---,---
    // output:   0,  1,    2,  3,  4,    5,  6,  7
    if(m_nScaleFactor == 2)
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
    }
    else if(m_nScaleFactor == 4)
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
    }
    else
    {
      a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5);
      a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2);
      a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7);
      a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4);
      a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1);
      a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6);
      a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3);
      a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0);
    }



#ifdef XN_NEON
		depth[0] = GetOutput(a0);
		depth[1] = GetOutput(a1);
		depth[2] = GetOutput(a2);
		depth[3] = GetOutput(a3);
		depth[4] = GetOutput(a4);
		depth[5] = GetOutput(a5);
		depth[6] = GetOutput(a6);
		depth[7] = GetOutput(a7);

		// Load
		Q0 = vld1q_u16(depth);
		// Store
		vst1q_u16(pnOutput, Q0);
#else
    if(m_nScaleFactor == 2)
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = 0;
    }
    else if(m_nScaleFactor == 4)
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = 0;
      *pnOutput++ = 0;
      *pnOutput++ = 0;
    }
    else
    {
      *pnOutput++ = GetOutput(a0);
      *pnOutput++ = GetOutput(a1);
      *pnOutput++ = GetOutput(a2);
      *pnOutput++ = GetOutput(a3);
      *pnOutput++ = GetOutput(a4);
      *pnOutput++ = GetOutput(a5);
      *pnOutput++ = GetOutput(a6);
      *pnOutput++ = GetOutput(a7);
    }
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
    m_nOffsetInFrame+=8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead)
{
	const XnUInt8* pOrigInput = pcInput;

	XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored
	XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE;

	*pnActualRead = 0;
	XnBuffer* pWriteBuffer = GetWriteBuffer();

	if (!CheckWriteBufferForOverflow(nNeededOutput))
	{
		return XN_STATUS_OUTPUT_BUFFER_OVERFLOW;
	}

	XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer();

	// Convert the 11bit packed data into 16bit shorts
	for (XnUInt32 nElem = 0; nElem < nElements; ++nElem)
	{
		// input:	0,  1,  2,3,  4,  5,  6,7,  8,  9,10
		//			-,---,---,-,---,---,---,-,---,---,-
		// bits:	8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//			---,---,-----,---,---,-----,---,---
		// output:	  0,  1,    2,  3,  4,    5,  6,  7

		pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5));
		pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2));
		pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7));
		pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4));
		pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1));
		pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6));
		pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3));
		pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0));

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
	}

	*pnActualRead = (XnUInt32)(pcInput - pOrigInput);
	pWriteBuffer->UnsafeUpdateSize(nNeededOutput);

	return XN_STATUS_OK;
}