/* Compile-only smoke test for the vst1_u16 NEON intrinsic: verifies that the
 * compiler accepts a uint16_t* destination with a uint16x4_t source operand.
 * The arguments are deliberately left uninitialized -- this function is only
 * compiled, never executed. */
void test_vst1u16 (void) { uint16_t *arg0_uint16_t; uint16x4_t arg1_uint16x4_t; vst1_u16 (arg0_uint16_t, arg1_uint16x4_t); }
// Blend two source rows into dst, four 16-bit pixels at a time:
// dst = (srcB * k1 + srcA * k2) >> 3, evaluated separately on the
// red/blue bits and the green bits of each pixel so the channels
// cannot bleed into one another during the widening multiply.
// NOTE(review): the >>3 presumes k1 + k2 == 8 -- confirm at call sites.
// Any trailing width % 4 pixels are left for the caller to handle.
static void interpolate5LineNeon(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width, int k1, int k2) {
	const uint16x4_t maskRB = vdup_n_u16(ColorMask::kRedBlueMask);
	const uint16x4_t maskG = vdup_n_u16(ColorMask::kGreenMask);
	const uint16x4_t weightB = vdup_n_u16(k1); // applied to srcB pixels
	const uint16x4_t weightA = vdup_n_u16(k2); // applied to srcA pixels
	for (; width >= 4; width -= 4, dst += 4, srcA += 4, srcB += 4) {
		const uint16x4_t rowA = vld1_u16(srcA);
		const uint16x4_t rowB = vld1_u16(srcB);
		// Split each row into its red/blue and green component groups.
		const uint16x4_t rbB = vand_u16(rowB, maskRB);
		const uint16x4_t gB = vand_u16(rowB, maskG);
		const uint16x4_t rbA = vand_u16(rowA, maskRB);
		const uint16x4_t gA = vand_u16(rowA, maskG);
		// Widen to 32 bits for the weighted sum, then scale back down.
		const uint32x4_t rbSum = vshrq_n_u32(vmlal_u16(vmull_u16(rbA, weightA), rbB, weightB), 3);
		const uint32x4_t gSum = vshrq_n_u32(vmlal_u16(vmull_u16(gA, weightA), gB, weightB), 3);
		// Narrow to 16 bits and mask off bits that crossed channel fields.
		const uint16x4_t rbOut = vand_u16(vmovn_u32(rbSum), maskRB);
		const uint16x4_t gOut = vand_u16(vmovn_u32(gSum), maskG);
		vst1_u16(dst, vorr_u16(rbOut, gOut));
	}
}
// res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, const int16x8_t res, const int16x8_t max) { const uint16x4_t a0 = vld1_u16(*dest); const uint16x4_t a1 = vld1_u16(*dest + stride); const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); // Note: In some profile tests, res is quite close to +/-32767. // We use saturating addition. const int16x8_t b = vqaddq_s16(res, a); const int16x8_t c = vminq_s16(b, max); const uint16x8_t d = vqshluq_n_s16(c, 0); vst1_u16(*dest, vget_high_u16(d)); *dest += stride; vst1_u16(*dest, vget_low_u16(d)); *dest += stride; }
int Unpack11to16(const unsigned char* pcInput, unsigned short* pnOutput, const unsigned long nInputSize) { const unsigned char* pOrigInput = pcInput; uint8x8_t inputfield; uint16x4_t shiftfield; uint16_t test[4]; unsigned long nElements = nInputSize / XN_INPUT_ELEMENT_SIZE; // floored unsigned long nNeededOutput = nElements * XN_OUTPUT_ELEMENT_SIZE; // Convert the 11bit packed data into 16bit shorts for (unsigned long nElem = 0; nElem < nElements; ++nElem) { // input: 0, 1, 2,3, 4, 5, 6,7, 8, 9,10 // -,---,---,-,---,---,---,-,---,---,- // bits: 8,3,5,6,2,8,1,7,4,4,7,1,8,2,6,5,3,8 // ---,---,-----,---,---,-----,---,--- // output: 0, 1, 2, 3, 4, 5, 6, 7 #ifdef NEON // Load 64 bits of data inputfield = vld1_u8(pcInput); // Reverse it since the endianess is wrong. inputfield = vrev16_u8(inputfield); // Debug -- let's make sure it looks ok by looking at // it as a 16-bit element since that is ultimately what we want vst1_u16(test, inputfield); printf("i %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 5 bits to aling the first half-word // *note this does not compile since the compiler cannot deal with this // conversion for some reason. It can deal with vshr_n_u32() and lower. // print out the results shiftfield = vshr_n_u64(inputfield, 5); vst1_u16( test,shiftfield); printf("1 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 10 bits to aling the second half-word // print out the results shiftfield = vshr_n_u32(inputfield, 10); vst1_u16( test,shiftfield); printf("2 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // Right shift by 15 bits to aling the third half-word // print out the results shiftfield = vshr_n_u32(inputfield, 15); vst1_u16( test,shiftfield); printf("3 %04x %04x %04x %04x\n", test[0], test[1], test[2], test[3]); // we would continue for all 8 half-word results #else // This is the original Primesense code... 
// shift the output by 5 bits to the right to align 11 bits on the 16 bit field vsri_n_u64(leftfield, shiftfield, 5); vst1_u64((uint64_t*)pnOutput, shiftfield); pnOutput[0] = GetOutput((XN_TAKE_BITS(pcInput[0],8,0) << 3) | XN_TAKE_BITS(pcInput[1],3,5)); pnOutput[1] = GetOutput((XN_TAKE_BITS(pcInput[1],5,0) << 6) | XN_TAKE_BITS(pcInput[2],6,2)); pnOutput[2] = GetOutput((XN_TAKE_BITS(pcInput[2],2,0) << 9) | (XN_TAKE_BITS(pcInput[3],8,0) << 1) | XN_TAKE_BITS(pcInput[4],1,7)); pnOutput[3] = GetOutput((XN_TAKE_BITS(pcInput[4],7,0) << 4) | XN_TAKE_BITS(pcInput[5],4,4)); pnOutput[4] = GetOutput((XN_TAKE_BITS(pcInput[5],4,0) << 7) | XN_TAKE_BITS(pcInput[6],7,1)); pnOutput[5] = GetOutput((XN_TAKE_BITS(pcInput[6],1,0) << 10) | (XN_TAKE_BITS(pcInput[7],8,0) << 2) | XN_TAKE_BITS(pcInput[8],2,6)); pnOutput[6] = GetOutput((XN_TAKE_BITS(pcInput[8],6,0) << 5) | XN_TAKE_BITS(pcInput[9],5,3)); pnOutput[7] = GetOutput((XN_TAKE_BITS(pcInput[9],3,0) << 8) | XN_TAKE_BITS(pcInput[10],8,0)); #endif pcInput += XN_INPUT_ELEMENT_SIZE; pnOutput += 8; } return (pcInput - pOrigInput); }
/*
 * Rounding average of a high-bitdepth block with the destination:
 * dst = ROUND_POWER_OF_TWO(dst + src, 1), element-wise over a w x h block.
 * src8/dst8 are CONVERT_TO_SHORTPTR-style pointers to 16-bit pixels; the
 * filter arguments are part of the convolve signature but unused here.
 * Assumes h is even for the paths that process two rows per iteration,
 * and that w is one of the standard block widths (4/8/16/32/64) --
 * presumably guaranteed by the convolve dispatch; confirm at call sites.
 */
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4
    // Two 4-wide rows are packed into one 8-lane vector per iteration.
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      // vrhaddq: halving add with rounding, i.e. (a + b + 1) >> 1.
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    // One 8-lane vector per row, two rows per iteration.
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    // Two 8-lane vectors (low/high halves) per row, two rows per iteration.
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;
      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);
      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    // Four 8-lane vectors cover one 32-wide row; the loop body is
    // unrolled twice so each iteration still consumes two rows.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    // A 64-wide row is processed as two 32-wide halves, one row per
    // iteration (no two-row unroll, so any h > 0 works here).
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);
      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);
      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}
// Type-dispatching overload: stores a 4-lane u16 vector via the matching
// NEON intrinsic.
inline void vst1(u16 * ptr, const uint16x4_t & v)
{
    vst1_u16(ptr, v);
}