/* Exercise vsri_n_u64 (shift right and insert, 64-bit single lane) with an
   immediate shift of 1.

   The two operands are never initialized and the result is never read, so
   the function only exercises the intrinsic call itself.
   NOTE(review): presumably a compile-only test stub -- confirm before
   relying on any runtime value.  */
void
test_vsri_nu64 (void)
{
  uint64x1_t insert_target;
  uint64x1_t shift_source;
  uint64x1_t result;

  result = vsri_n_u64 (insert_target, shift_source, 1);
}
// Returns vsri_n_u64(a1, a2, 1): shift right and insert on a single 64-bit
// lane with an immediate shift of 1.
//
// The directive comments inside the body appear to be FileCheck-style
// matchers (verify against the test runner). Each one must stay on its own
// line: a line comment placed in front of the return statement would comment
// out the rest of the function.
uint64x1_t test_vsri_n_u64(uint64x1_t a1, uint64x1_t a2) {
  // CHECK: test_vsri_n_u64
  return vsri_n_u64(a1, a2, 1);
  // CHECK: llvm.arm64.neon.vsri.v1i64
  // CHECK_CODEGEN: sri d0, d1, #1
}
/*
 * Unpack 11-bit packed samples into 16-bit words.
 *
 * The input is consumed in groups of XN_INPUT_ELEMENT_SIZE bytes; bytes left
 * over after the last whole group are ignored. For every group the output
 * pointer advances by 8 entries.
 *
 * pcInput     packed input bytes. The scalar path reads pcInput[0..10] per
 *             group, so XN_INPUT_ELEMENT_SIZE is presumably 11 -- confirm
 *             against its definition.
 * pnOutput    destination buffer; the scalar path writes 8 values per group,
 *             each passed through GetOutput().
 * nInputSize  number of bytes available at pcInput.
 *
 * Returns the number of input bytes consumed.
 *
 * NOTE(review): when NEON is defined the loop body is an unfinished
 * experiment. It only prints intermediate shift results and never writes to
 * pnOutput, although pnOutput is still advanced.
 */
int Unpack11to16(const unsigned char* pcInput, unsigned short* pnOutput, const unsigned long nInputSize)
{
    const unsigned char* pOrigInput = pcInput;
    const unsigned long nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored

    // Convert the 11bit packed data into 16bit shorts
    for (unsigned long nElem = 0; nElem < nElements; ++nElem)
    {
        // input:   0,  1,  2,3,  4,  5,  6,7,  8,  9,10
        //          -,---,---,-,---,---,---,-,---,---,-
        // bits:    8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
        //          ---,---,-----,---,---,-----,---,---
        // output:    0,  1,    2,  3,  4,    5,  6,  7
#ifdef NEON
        uint16_t test[4];
        uint16x4_t shiftfield;

        // Load 64 bits of data
        uint8x8_t inputfield = vld1_u8(pcInput);

        // Swap the two bytes of every half-word since the endianness is wrong.
        inputfield = vrev16_u8(inputfield);

        // Debug -- make sure it looks ok by viewing it as 16-bit elements,
        // since that is ultimately what we want. The vector holds byte
        // lanes, so it is reinterpreted (no bits change) before each use at
        // a different lane width.
        vst1_u16(test, vreinterpret_u16_u8(inputfield));
        printf("i %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]);

        // Right shift by 5 bits to align the first half-word and print out
        // the results. The shift is done on the whole 64-bit lane.
        shiftfield = vreinterpret_u16_u64(vshr_n_u64(vreinterpret_u64_u8(inputfield), 5));
        vst1_u16(test, shiftfield);
        printf("1 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]);

        // Right shift by 10 bits to align the second half-word and print out
        // the results. This shift is done per 32-bit lane.
        shiftfield = vreinterpret_u16_u32(vshr_n_u32(vreinterpret_u32_u8(inputfield), 10));
        vst1_u16(test, shiftfield);
        printf("2 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]);

        // Right shift by 15 bits to align the third half-word and print out
        // the results.
        shiftfield = vreinterpret_u16_u32(vshr_n_u32(vreinterpret_u32_u8(inputfield), 15));
        vst1_u16(test, shiftfield);
        printf("3 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]);

        // We would continue for all 8 half-word results.
        // TODO(review): the aligned fields still have to be merged and
        // stored to pnOutput; presumably via a shift-right-insert
        // (vsri_n_u64) followed by vst1_u64 -- not implemented here.
#else
        // This is the original Primesense code: each output takes 11
        // consecutive bits from the packed stream, assembled from the two or
        // three input bytes it straddles.
        pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5));
        pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2));
        pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7));
        pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4));
        pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1));
        pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6));
        pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3));
        pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0));
#endif
        pcInput += XN_INPUT_ELEMENT_SIZE;
        pnOutput += 8;
    }

    // Number of input bytes consumed.
    return (int)(pcInput - pOrigInput);
}
/* Return the result of vsri_n_u64 (shift right and insert, 64-bit single
   lane) applied to A and B with an immediate shift of 9.  */
uint64x1_t
test_vsri_n_u64 (uint64x1_t a, uint64x1_t b)
{
  const uint64x1_t merged = vsri_n_u64 (a, b, 9);

  return merged;
}