XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead) { const XnUInt8* pOrigInput = pcInput; XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; *pnActualRead = 0; XnBuffer* pWriteBuffer = GetWriteBuffer(); // Check there is enough room for the depth pixels if (!CheckDepthBufferForOverflow(nNeededOutput)) { return XN_STATUS_OUTPUT_BUFFER_OVERFLOW; } XnUInt16* pShiftOut = GetShiftsOutputBuffer(); XnUInt16* pnOutput = GetDepthOutputBuffer(); XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7; #ifdef XN_NEON XnUInt16 shift[8]; XnUInt16 depth[8]; uint16x8_t Q0; #endif // Convert the 11bit packed data into 16bit shorts for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) { // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10 // -,---,---,-,---,---,---,-,---,---,- // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8 // ---,---,-----,---,---,-----,---,--- // output: 0, 1, 2, 3, 4, 5, 6, 7 a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5); a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2); a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7); a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4); a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1); a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6); a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3); a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0); #ifdef XN_NEON shift[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0); shift[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0); shift[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0); shift[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0); shift[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0); shift[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0); shift[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0); shift[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0); depth[0] = GetOutput(a0); depth[1] = GetOutput(a1); depth[2] = GetOutput(a2); depth[3] = GetOutput(a3); depth[4] = GetOutput(a4); depth[5] = GetOutput(a5); depth[6] = GetOutput(a6); depth[7] = GetOutput(a7); // Load Q0 = vld1q_u16(depth); // Store vst1q_u16(pnOutput, Q0); // Load Q0 = vld1q_u16(shift); // Store vst1q_u16(pShiftOut, Q0); #else pShiftOut[0] = (((a0) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a0) : 0); pShiftOut[1] = (((a1) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a1) : 0); pShiftOut[2] = (((a2) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a2) : 0); pShiftOut[3] = (((a3) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a3) : 0); pShiftOut[4] = (((a4) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a4) : 0); pShiftOut[5] = (((a5) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a5) : 0); pShiftOut[6] = (((a6) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a6) : 0); pShiftOut[7] = (((a7) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (a7) : 0); pnOutput[0] = GetOutput(a0); pnOutput[1] = GetOutput(a1); pnOutput[2] = GetOutput(a2); pnOutput[3] = GetOutput(a3); pnOutput[4] = GetOutput(a4); pnOutput[5] = GetOutput(a5); pnOutput[6] = GetOutput(a6); pnOutput[7] = GetOutput(a7); #endif pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 8; pShiftOut += 8; } *pnActualRead = (XnUInt32)(pcInput - pOrigInput); pWriteBuffer->UnsafeUpdateSize(nNeededOutput); return XN_STATUS_OK; }
XnStatus XnPacked12DepthProcessor::Unpack12to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead) { const XnUInt8* pOrigInput = pcInput; XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; *pnActualRead = 0; XnBuffer* pWriteBuffer = GetWriteBuffer(); if (!CheckDepthBufferForOverflow(nNeededOutput)) { return XN_STATUS_OUTPUT_BUFFER_OVERFLOW; } XnUInt16* pnOutput = GetDepthOutputBuffer(); XnUInt16* pShiftOut = GetShiftsOutputBuffer(); XnUInt16 shift[16]; #ifdef XN_NEON XnUInt16 depth[16]; uint8x8x3_t inD3; uint8x8_t rshft4D, lshft4D; uint16x8_t rshft4Q, lshft4Q; uint16x8_t depthQ; uint16x8x2_t shiftQ2; #endif // Convert the 11bit packed data into 16bit shorts for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) { #ifndef XN_NEON // input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23 // -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,-- // bits: 8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8 // ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,---- // output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4); shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0); shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4); shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0); shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4); shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0); shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4); shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0); shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4); shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0); shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4); shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0); shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4); shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0); shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4); shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0); pShiftOut[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0); pShiftOut[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0); pShiftOut[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0); pShiftOut[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0); pShiftOut[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0); pShiftOut[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0); pShiftOut[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0); pShiftOut[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0); pShiftOut[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0); pShiftOut[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0); pShiftOut[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0); pShiftOut[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0); pShiftOut[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0); pShiftOut[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0); pShiftOut[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0); pShiftOut[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0); pnOutput[0] = GetOutput(shift[0]); pnOutput[1] = GetOutput(shift[1]); pnOutput[2] = GetOutput(shift[2]); pnOutput[3] = GetOutput(shift[3]); pnOutput[4] = GetOutput(shift[4]); pnOutput[5] = GetOutput(shift[5]); pnOutput[6] = GetOutput(shift[6]); pnOutput[7] = GetOutput(shift[7]); pnOutput[8] = GetOutput(shift[8]); pnOutput[9] = GetOutput(shift[9]); pnOutput[10] = GetOutput(shift[10]); pnOutput[11] = GetOutput(shift[11]); pnOutput[12] = GetOutput(shift[12]); pnOutput[13] = GetOutput(shift[13]); pnOutput[14] = GetOutput(shift[14]); pnOutput[15] = GetOutput(shift[15]); #else // input: 0, 1,2 (X8) // -,---,- // bits: 8,4,4,8 (X8) // ---,--- // output: 0, 1 (X8) // Split 24 bytes into 3 vectors (64 bit each) inD3 = vld3_u8(pcInput); // rshft4D0 contains 4 MSB of second vector (placed at offset 0) rshft4D = vshr_n_u8(inD3.val[1], 4); // lshft4D0 contains 4 LSB of second vector (placed at offset 4) lshft4D = vshl_n_u8(inD3.val[1], 4); // Expand 64 bit vectors to 128 bit (8 values of 16 bits) shiftQ2.val[0] = vmovl_u8(inD3.val[0]); shiftQ2.val[1] = vmovl_u8(inD3.val[2]); rshft4Q = vmovl_u8(rshft4D); lshft4Q = vmovl_u8(lshft4D); // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4); shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q); // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector lshft4Q = vshlq_n_u16(lshft4Q, 4); shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q); // Interleave shift values to a single vector vst2q_u16(shift, shiftQ2); shift[0] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[0]) : 0); shift[1] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[1]) : 0); shift[2] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[2]) : 0); shift[3] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[3]) : 0); shift[4] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[4]) : 0); shift[5] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[5]) : 0); shift[6] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[6]) : 0); shift[7] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[7]) : 0); shift[8] = (((shift[0]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[8]) : 0); shift[9] = (((shift[1]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[9]) : 0); shift[10] = (((shift[2]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[10]) : 0); shift[11] = (((shift[3]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[11]) : 0); shift[12] = (((shift[4]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[12]) : 0); shift[13] = (((shift[5]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[13]) : 0); shift[14] = (((shift[6]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[14]) : 0); shift[15] = (((shift[7]) < (XN_DEVICE_SENSOR_MAX_SHIFT_VALUE-1)) ? (shift[15]) : 0); depth[0] = GetOutput(shift[0]); depth[1] = GetOutput(shift[1]); depth[2] = GetOutput(shift[2]); depth[3] = GetOutput(shift[3]); depth[4] = GetOutput(shift[4]); depth[5] = GetOutput(shift[5]); depth[6] = GetOutput(shift[6]); depth[7] = GetOutput(shift[7]); // Load depthQ = vld1q_u16(depth); //Store vst1q_u16(pnOutput, depthQ); // Load depthQ = vld1q_u16(shift); // Store vst1q_u16(pShiftOut, depthQ); depth[8] = GetOutput(shift[8]); depth[9] = GetOutput(shift[9]); depth[10] = GetOutput(shift[10]); depth[11] = GetOutput(shift[11]); depth[12] = GetOutput(shift[12]); depth[13] = GetOutput(shift[13]); depth[14] = GetOutput(shift[14]); depth[15] = GetOutput(shift[15]); // Load depthQ = vld1q_u16(depth + 8); // Store vst1q_u16(pnOutput + 8, depthQ); // Load depthQ = vld1q_u16(shift + 8); // Store vst1q_u16(pShiftOut + 8, depthQ); #endif pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 16; pShiftOut += 16; } *pnActualRead = (XnUInt32)(pcInput - pOrigInput); pWriteBuffer->UnsafeUpdateSize(nNeededOutput); return XN_STATUS_OK; }
int Unpack11to16(const unsigned char* pcInput, unsigned short* pnOutput, const unsigned long nInputSize) { const unsigned char* pOrigInput = pcInput; uint8x8_t inputfield; uint16x4_t shiftfield; uint16_t test[4]; unsigned long nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored unsigned long nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; // Convert the 11bit packed data into 16bit shorts for (unsigned long nElem = 0; nElem < nElements; ++nElem) { // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10 // -,---,---,-,---,---,---,-,---,---,- // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8 // ---,---,-----,---,---,-----,---,--- // output: 0, 1, 2, 3, 4, 5, 6, 7 #ifdef NEON // Load 64 bits of data inputfield = vld1_u8(pcInput); // Reverse it since the endianess is wrong. inputfield = vrev16_u8(inputfield); // Debug -- let's make sure it looks ok by looking at // it as a 16-bit element since that is ultimately what we want vst1_u16(test, inputfield); printf("i %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 5 bits to aling the first half-word // *note this does not compile since the compiler cannot deal with this // conversion for some reason. It can deal with vshr_n_u32() and lower. // print out the results shiftfield = vshr_n_u64(inputfield, 5); vst1_u16( test,shiftfield); printf("1 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 10 bits to aling the second half-word // print out the results shiftfield = vshr_n_u32(inputfield, 10); vst1_u16( test,shiftfield); printf("2 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 15 bits to aling the third half-word // print out the results shiftfield = vshr_n_u32(inputfield, 15); vst1_u16( test,shiftfield); printf("3 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // we would continue for all 8 half-word results #else // This is the original Primesense code... // shift the output by 5 bits to the right to align 11 bits on the 16 bit field vsri_n_u64(leftfield, shiftfield, 5); vst1_u64((uint64_t*)pnOutput, shiftfield); pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5)); pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2)); pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7)); pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4)); pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1)); pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6)); pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3)); pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0)); #endif pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 8; } return (pcInput - pOrigInput); }
XnStatus Link12BitS2DParser::Unpack12to16(const XnUInt8* pcInput,XnUInt8* pDest, const XnUInt32 nInputSize, XnUInt32* pnActualRead, XnUInt32* pnActualWritten) { const XnUInt8* pOrigInput = (XnUInt8*)pcInput; XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored //XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; *pnActualRead = 0; XnUInt16 *pnOutput = (XnUInt16*)pDest; XnUInt16 shift[16]; #ifdef XN_NEON XnUInt16 depth[16]; uint8x8x3_t inD3; uint8x8_t rshft4D, lshft4D; uint16x8_t rshft4Q, lshft4Q; uint16x8_t depthQ; uint16x8x2_t shiftQ2; #endif // Convert the 11bit packed data into 16bit shorts for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) { #ifndef XN_NEON // input: 0, 1,2,3, 4,5,6, 7,8,9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23 // -,---,-,-,---,-,-,---,-,-,---,--,--,---,--,--,---,--,--,---,--,--,---,-- // bits: 8,4,4,8,8,4,4,8,8,4,4,8,8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8, 8,4,4, 8 // ---,---,---,---,---,---,---,----,----,----,----,----,----,----,----,---- // output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shift[0] = (XN_TAKE_BITS(pcInput[0],8,0) << 4) | XN_TAKE_BITS(pcInput[1],4,4); shift[1] = (XN_TAKE_BITS(pcInput[1],4,0) << 8) | XN_TAKE_BITS(pcInput[2],8,0); shift[2] = (XN_TAKE_BITS(pcInput[3],8,0) << 4) | XN_TAKE_BITS(pcInput[4],4,4); shift[3] = (XN_TAKE_BITS(pcInput[4],4,0) << 8) | XN_TAKE_BITS(pcInput[5],8,0); shift[4] = (XN_TAKE_BITS(pcInput[6],8,0) << 4) | XN_TAKE_BITS(pcInput[7],4,4); shift[5] = (XN_TAKE_BITS(pcInput[7],4,0) << 8) | XN_TAKE_BITS(pcInput[8],8,0); shift[6] = (XN_TAKE_BITS(pcInput[9],8,0) << 4) | XN_TAKE_BITS(pcInput[10],4,4); shift[7] = (XN_TAKE_BITS(pcInput[10],4,0) << 8) | XN_TAKE_BITS(pcInput[11],8,0); shift[8] = (XN_TAKE_BITS(pcInput[12],8,0) << 4) | XN_TAKE_BITS(pcInput[13],4,4); shift[9] = (XN_TAKE_BITS(pcInput[13],4,0) << 8) | XN_TAKE_BITS(pcInput[14],8,0); shift[10] = (XN_TAKE_BITS(pcInput[15],8,0) << 4) | XN_TAKE_BITS(pcInput[16],4,4); shift[11] = (XN_TAKE_BITS(pcInput[16],4,0) << 8) | XN_TAKE_BITS(pcInput[17],8,0); shift[12] = (XN_TAKE_BITS(pcInput[18],8,0) << 4) | XN_TAKE_BITS(pcInput[19],4,4); shift[13] = (XN_TAKE_BITS(pcInput[19],4,0) << 8) | XN_TAKE_BITS(pcInput[20],8,0); shift[14] = (XN_TAKE_BITS(pcInput[21],8,0) << 4) | XN_TAKE_BITS(pcInput[22],4,4); shift[15] = (XN_TAKE_BITS(pcInput[22],4,0) << 8) | XN_TAKE_BITS(pcInput[23],8,0); pnOutput[0] = m_pShiftToDepth[(shift[0])]; pnOutput[1] = m_pShiftToDepth[(shift[1])]; pnOutput[2] = m_pShiftToDepth[(shift[2])]; pnOutput[3] = m_pShiftToDepth[(shift[3])]; pnOutput[4] = m_pShiftToDepth[(shift[4])]; pnOutput[5] = m_pShiftToDepth[(shift[5])]; pnOutput[6] = m_pShiftToDepth[(shift[6])]; pnOutput[7] = m_pShiftToDepth[(shift[7])]; pnOutput[8] = m_pShiftToDepth[(shift[8])]; pnOutput[9] = m_pShiftToDepth[(shift[9])]; pnOutput[10] = m_pShiftToDepth[(shift[10])]; pnOutput[11] = m_pShiftToDepth[(shift[11])]; pnOutput[12] = m_pShiftToDepth[(shift[12])]; pnOutput[13] = m_pShiftToDepth[(shift[13])]; pnOutput[14] = m_pShiftToDepth[(shift[14])]; pnOutput[15] = m_pShiftToDepth[(shift[15])]; #else // input: 0, 1,2 (X8) // -,---,- // bits: 8,4,4,8 (X8) // ---,--- // output: 0, 1 (X8) // Split 24 bytes into 3 vectors (64 bit each) inD3 = vld3_u8(pcInput); // rshft4D0 contains 4 MSB of second vector (placed at offset 0) rshft4D = vshr_n_u8(inD3.val[1], 4); // lshft4D0 contains 4 LSB of second vector (placed at offset 4) lshft4D = vshl_n_u8(inD3.val[1], 4); // Expand 64 bit vectors to 128 bit (8 values of 16 bits) shiftQ2.val[0] = vmovl_u8(inD3.val[0]); shiftQ2.val[1] = vmovl_u8(inD3.val[2]); rshft4Q = vmovl_u8(rshft4D); lshft4Q = vmovl_u8(lshft4D); // Even indexed shift = 8 bits from first vector + 4 MSB bits of second vector shiftQ2.val[0] = vshlq_n_u16(shiftQ2.val[0], 4); shiftQ2.val[0] = vorrq_u16(shiftQ2.val[0], rshft4Q); // Odd indexed shift = 4 LSB bits of second vector + 8 bits from third vector lshft4Q = vshlq_n_u16(lshft4Q, 4); shiftQ2.val[1] = vorrq_u16(shiftQ2.val[1], lshft4Q); // Interleave shift values to a single vector vst2q_u16(shift, shiftQ2); depth[0] = m_pShiftToDepth[(shift[0])]; depth[1] = m_pShiftToDepth[(shift[1])]; depth[2] = m_pShiftToDepth[(shift[2])]; depth[3] = m_pShiftToDepth[(shift[3])]; depth[4] = m_pShiftToDepth[(shift[4])]; depth[5] = m_pShiftToDepth[(shift[5])]; depth[6] = m_pShiftToDepth[(shift[6])]; depth[7] = m_pShiftToDepth[(shift[7])]; // Load depthQ = vld1q_u16(depth); //Store vst1q_u16(pnOutput, depthQ); depth[8] = m_pShiftToDepth[(shift[8])]; depth[9] = m_pShiftToDepth[(shift[9])]; depth[10] = m_pShiftToDepth[(shift[10])]; depth[11] = m_pShiftToDepth[(shift[11])]; depth[12] = m_pShiftToDepth[(shift[12])]; depth[13] = m_pShiftToDepth[(shift[13])]; depth[14] = m_pShiftToDepth[(shift[14])]; depth[15] = m_pShiftToDepth[(shift[15])]; // Load depthQ = vld1q_u16(depth + 8); // Store vst1q_u16(pnOutput + 8, depthQ); #endif pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 16; } *pnActualRead = (XnUInt32)(pcInput - pOrigInput); // total bytes *pnActualWritten = (XnUInt32)((XnUInt8*)pnOutput - pDest); return XN_STATUS_OK; }
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead) { const XnUInt8* pOrigInput = pcInput; XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; *pnActualRead = 0; XnBuffer* pWriteBuffer = GetWriteBuffer(); // Check there is enough room for the depth pixels if (!CheckWriteBufferForOverflow(nNeededOutput)) { return XN_STATUS_OUTPUT_BUFFER_OVERFLOW; } XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer(); XnUInt16 a0,a1,a2,a3,a4,a5,a6,a7; #ifdef XN_NEON XnUInt16 depth[8]; uint16x8_t Q0; #endif // Convert the 11bit packed data into 16bit shorts for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) { if(m_nScaleFactor > 1) { XnUInt32 px = m_nOffsetInFrame%m_CurrentVideoMode.resolutionX; XnUInt32 py = (m_nOffsetInFrame)/m_CurrentVideoMode.resolutionX; if(py%m_nScaleFactor != 0) { // Skip as many pixels as possible XnUInt32 nEltsToSkip = XN_MIN(nElements - nElem, (m_CurrentVideoMode.resolutionX - px)/8 + (m_nScaleFactor-(py%m_nScaleFactor) - 1)*m_CurrentVideoMode.resolutionX/8); // ::memset(pnOutput, 0, nEltsToSkip*8*sizeof(XnUInt16)); pcInput += nEltsToSkip*XN_INPUT_ELEMENT_SIZE; pnOutput += nEltsToSkip*8; m_nOffsetInFrame += nEltsToSkip*8; nElem += (nEltsToSkip-1); continue; } } // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10 // -,---,---,-,---,---,---,-,---,---,- // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8 // ---,---,-----,---,---,-----,---,--- // output: 0, 1, 2, 3, 4, 5, 6, 7 if(m_nScaleFactor == 2) { a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5); a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7); a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1); a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3); } else if(m_nScaleFactor == 4) { a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5); a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1); } else { a0 = (XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5); a1 = (XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2); a2 = (XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7); a3 = (XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4); a4 = (XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1); a5 = (XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6); a6 = (XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3); a7 = (XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0); } #ifdef XN_NEON depth[0] = GetOutput(a0); depth[1] = GetOutput(a1); depth[2] = GetOutput(a2); depth[3] = GetOutput(a3); depth[4] = GetOutput(a4); depth[5] = GetOutput(a5); depth[6] = GetOutput(a6); depth[7] = GetOutput(a7); // Load Q0 = vld1q_u16(depth); // Store vst1q_u16(pnOutput, Q0); #else if(m_nScaleFactor == 2) { *pnOutput++ = GetOutput(a0); *pnOutput++ = 0; *pnOutput++ = GetOutput(a2); *pnOutput++ = 0; *pnOutput++ = GetOutput(a4); *pnOutput++ = 0; *pnOutput++ = GetOutput(a6); *pnOutput++ = 0; } else if(m_nScaleFactor == 4) { *pnOutput++ = GetOutput(a0); *pnOutput++ = 0; *pnOutput++ = 0; *pnOutput++ = 0; *pnOutput++ = GetOutput(a4); *pnOutput++ = 0; *pnOutput++ = 0; *pnOutput++ = 0; } else { *pnOutput++ = GetOutput(a0); *pnOutput++ = GetOutput(a1); *pnOutput++ = GetOutput(a2); *pnOutput++ = GetOutput(a3); *pnOutput++ = GetOutput(a4); *pnOutput++ = GetOutput(a5); *pnOutput++ = GetOutput(a6); *pnOutput++ = GetOutput(a7); } #endif pcInput += XN_INPUT_ELEMENT_SIZE; m_nOffsetInFrame+=8; } *pnActualRead = (XnUInt32)(pcInput - pOrigInput); pWriteBuffer->UnsafeUpdateSize(nNeededOutput); return XN_STATUS_OK; }
XnStatus XnPacked11DepthProcessor::Unpack11to16(const XnUInt8* pcInput, const XnUInt32 nInputSize, XnUInt32* pnActualRead) { const XnUInt8* pOrigInput = pcInput; XnUInt32 nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored XnUInt32 nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; *pnActualRead = 0; XnBuffer* pWriteBuffer = GetWriteBuffer(); if (!CheckWriteBufferForOverflow(nNeededOutput)) { return XN_STATUS_OUTPUT_BUFFER_OVERFLOW; } XnUInt16* pnOutput = (XnUInt16*)pWriteBuffer->GetUnsafeWritePointer(); // Convert the 11bit packed data into 16bit shorts for (XnUInt32 nElem = 0; nElem < nElements; ++nElem) { // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10 // -,---,---,-,---,---,---,-,---,---,- // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8 // ---,---,-----,---,---,-----,---,--- // output: 0, 1, 2, 3, 4, 5, 6, 7 pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5)); pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2)); pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7)); pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4)); pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1)); pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6)); pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3)); pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0)); pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 8; } *pnActualRead = (XnUInt32)(pcInput - pOrigInput); pWriteBuffer->UnsafeUpdateSize(nNeededOutput); return XN_STATUS_OK; }