Exemple #1
0
// 4x4 D135 intra predictor (NEON).
//
// Builds the 8-byte edge  L K J I X A B C  (left column bottom-to-top, the
// top-left corner X, then the row above) and runs the 3-tap filter
// (a + 2*b + c + 2) >> 2 over every three consecutive edge bytes, computed as
// vrhadd(vhadd(a, c), b). Each output row is a 4-byte window into the seven
// filtered values, sliding one byte per row.
//
// dst    - top-left of the 4x4 destination block
// stride - distance in bytes between destination rows
// above  - row above the block; above[-1] .. above[6] are loaded
// left   - column left of the block; left[0] .. left[3] are loaded
//
// Lane diagrams below list lane 0 first; '_' marks a zero lane.
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // X A B C D plus three trailing bytes that are never used.
  const uint8x8_t top = vld1_u8(above - 1);
  // I J K L _ _ _ _
  const uint8x8_t side = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0));

  // _ _ _ _ X A B C
  const uint64x1_t top_hi = vshl_n_u64(vreinterpret_u64_u8(top), 32);
  // L K J I _ _ _ _
  const uint64x1_t side_rev = vreinterpret_u64_u8(vrev32_u8(side));
  // L K J I X A B C
  const uint64x1_t edge = vorr_u64(side_rev, top_hi);

  // The three filter taps are the edge at byte offsets 0, 1 and 2.
  const uint8x8_t tap_near = vreinterpret_u8_u64(edge);
  // K J I X A B C _
  const uint8x8_t tap_mid = vreinterpret_u8_u64(vshr_n_u64(edge, 8));
  // J I X A B C D _ : D was shifted out of the 64-bit edge, so patch it
  // back into lane 6 from the original top-row load.
  const uint8x8_t tap_far =
      vset_lane_u8(vget_lane_u8(top, 4),
                   vreinterpret_u8_u64(vshr_n_u64(edge, 16)), 6);

  // Truncating half-add of the outer taps, then rounding half-add with the
  // centre tap.
  const uint8x8_t filtered = vrhadd_u8(vhadd_u8(tap_far, tap_near), tap_mid);
  const uint64x1_t diag = vreinterpret_u64_u8(filtered);

  // Row 0 starts at filtered lane 3; each later row starts one lane earlier.
  const uint32x2_t row0 = vreinterpret_u32_u64(vshr_n_u64(diag, 24));
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(diag, 16));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(diag, 8));
  const uint32x2_t row3 = vreinterpret_u32_u8(filtered);

  vst1_lane_u32((uint32_t *)(dst + 0 * stride), row0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), row1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), row2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), row3, 0);
}
Exemple #2
0
// 4x4 D45 intra predictor (NEON).
//
// Runs the 3-tap filter (a + 2*b + c + 2) >> 2 over the row above the block,
// computed as vrhadd(vhadd(a, c), b). Each output row is a 4-byte window into
// the filtered values, sliding one byte per row. The bottom-right pixel is
// then overwritten with above[7].
//
// dst    - top-left of the 4x4 destination block
// stride - distance in bytes between destination rows
// above  - row above the block; above[0] .. above[7] are loaded
// left   - unused
//
// Lane diagrams below list lane 0 first; '_' marks a zero lane.
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // A B C D E F G H
  const uint8x8_t top = vld1_u8(above);
  const uint64x1_t top_bits = vreinterpret_u64_u8(top);
  // B C D E F G H _
  const uint8x8_t top_plus1 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 8));
  // C D E F G H _ _
  const uint8x8_t top_plus2 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 16));

  // Truncating half-add of the outer taps, then rounding half-add with the
  // centre tap.
  const uint8x8_t filtered = vrhadd_u8(vhadd_u8(top, top_plus2), top_plus1);
  const uint64x1_t diag = vreinterpret_u64_u8(filtered);

  // Row n starts at filtered lane n.
  const uint32x2_t row0 = vreinterpret_u32_u8(filtered);
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(diag, 8));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(diag, 16));
  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(diag, 24));

  (void)left;

  vst1_lane_u32((uint32_t *)(dst + 0 * stride), row0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), row1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), row2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), row3, 0);

  // Filtered lane 6 was computed with a zero shifted in as its last tap, so
  // the bottom-right pixel is replaced with the last loaded top sample.
  dst[3 * stride + 3] = above[7];
}
Exemple #3
0
// Unpack 11-bit packed samples into 16-bit values.
//
// The input is consumed in groups of XN_INPUT_ELEMENT_SIZE bytes; each group
// produces 8 output values. A trailing partial group is ignored.
//
// pcInput    - packed input bytes
// pnOutput   - destination; must have room for 8 values per complete group
// nInputSize - number of bytes available at pcInput
//
// Returns the number of input bytes consumed.
//
// NOTE: when NEON is defined, the loop body is an unfinished experiment. It
// only loads the first 8 bytes of each group, shifts them and prints the
// intermediate results; it does not write to pnOutput yet.
int  Unpack11to16(const unsigned char* pcInput, unsigned short* pnOutput, const unsigned long  nInputSize)
{
	const unsigned char* pOrigInput = pcInput;
#ifdef NEON
	uint8x8_t inputfield;
	uint16x4_t shiftfield;
	uint16_t test[4];	// scratch buffer for the debug dumps
#endif

	unsigned long nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored

	// Convert the 11bit packed data into 16bit shorts
	for (unsigned long nElem = 0; nElem < nElements; ++nElem)
	{
		// input:	0,  1,  2,3,  4,  5,  6,7,  8,  9,10
		//		-,---,---,-,---,---,---,-,---,---,-
		// bits:	8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8
		//		---,---,-----,---,---,-----,---,---
		// output:	  0,  1,    2,  3,  4,    5,  6,  7
#ifdef NEON
		// Load 64 bits of data
		inputfield = vld1_u8(pcInput);
		// Swap the bytes within each half-word since the endianness is wrong.
		inputfield = vrev16_u8(inputfield);

		// Debug -- let's make sure it looks ok by looking at
		// it as a 16-bit element since that is ultimately what we want
		vst1_u16(test, vreinterpret_u16_u8(inputfield));
		printf("i %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// Right shift by 5 bits to align the first half-word.
		// NEON vector types do not convert implicitly, so the register
		// must be reinterpreted as 64-bit before the shift and as
		// 16-bit lanes afterwards.
		shiftfield = vreinterpret_u16_u64(
			vshr_n_u64(vreinterpret_u64_u8(inputfield), 5));
		vst1_u16(test, shiftfield);
		printf("1 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// Right shift by 10 bits to align the second half-word
		// print out the results
		shiftfield = vreinterpret_u16_u32(
			vshr_n_u32(vreinterpret_u32_u8(inputfield), 10));
		vst1_u16(test, shiftfield);
		printf("2 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// Right shift by 15 bits to align the third half-word
		// print out the results
		shiftfield = vreinterpret_u16_u32(
			vshr_n_u32(vreinterpret_u32_u8(inputfield), 15));
		vst1_u16(test, shiftfield);
		printf("3 %04x %04x %04x %04x\n",
			test[0], test[1], test[2], test[3]);

		// we would continue for all 8 half-word results

#else
		// This is the original Primesense code: each output is assembled
		// from the bit fields shown in the diagram above.
		pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5));
		pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2));
		pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7));
		pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4));
		pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1));
		pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6));
		pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3));
		pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0));
#endif

		pcInput += XN_INPUT_ELEMENT_SIZE;
		pnOutput += 8;
	}

	// Pointer difference is ptrdiff_t; narrow explicitly to the declared
	// return type.
	return (int)(pcInput - pOrigInput);
}
/* Logical right shift of the single 64-bit lane of A by an immediate of 3,
   using the vshr_n_u64 intrinsic.  */
uint64x1_t
test_vshr_n_u64 (uint64x1_t a)
{
  /* The shift count must be a compile-time constant.  */
  enum { shift_bits = 3 };

  return vshr_n_u64 (a, shift_bits);
}