Esempio n. 1
0
inline unsigned int GetByteMask2(uint8x16_t a, uint8x16_t b) 
{
	uint8x16_t am = vandq_u8(a, compaction_mask);
	uint8x16_t bm = vandq_u8(b, compaction_mask);
	uint8x8_t a_sum = vpadd_u8(vget_high_u8(am), vget_low_u8(am)); 
	uint8x8_t b_sum = vpadd_u8(vget_high_u8(bm), vget_low_u8(bm)); 
	a_sum = vpadd_u8(b_sum, a_sum);
	a_sum = vpadd_u8(a_sum, a_sum);
	return vget_lane_u32(vreinterpret_u32_u8(a_sum), 0);   
}
 SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask)
 {
     uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask);
     uint16x8_t lo = Square(vget_low_u8(ad));
     uint16x8_t hi = Square(vget_high_u8(ad));
     return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi));
 }
		template <bool align> SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const uint8_t * value, uint8_t * background, uint8x16_t mask)
		{
			const uint8x16_t _value = Load<align>(value);
			const uint8x16_t _background = Load<align>(background);
			const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(_value, _background));
			Store<align>(background, vqaddq_u8(_background, inc));
		}
Esempio n. 4
0
void test_vandQu8 (void)
{
  uint8x16_t out_uint8x16_t;
  uint8x16_t arg0_uint8x16_t;
  uint8x16_t arg1_uint8x16_t;

  out_uint8x16_t = vandq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
}
static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Divide bits of three input bytes over four output bytes:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4));
	out.val[2] = vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2));
	out.val[3] = in.val[2];

	// Clear top two bits:
	out.val[0] = vandq_u8(out.val[0], vdupq_n_u8(0x3F));
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(out.val[3], vdupq_n_u8(0x3F));

	return out;
}
		template <bool align> SIMD_INLINE void EdgeBackgroundIncrementCount(const uint8_t * value,
			const uint8_t * backgroundValue, uint8_t * backgroundCount, size_t offset, uint8x16_t mask)
		{
			const uint8x16_t _value = Load<align>(value + offset);
			const uint8x16_t _backgroundValue = Load<align>(backgroundValue + offset);
			const uint8x16_t _backgroundCount = Load<align>(backgroundCount + offset);

			const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(_value, _backgroundValue));

			Store<align>(backgroundCount + offset, vqaddq_u8(_backgroundCount, inc));
		}
        template <bool align> void SquaredDifferenceSumMasked(
            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
        {
            assert(width < 0x10000);
            if (align)
            {
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
                assert(Aligned(mask) && Aligned(maskStride));
            }

            size_t alignedWidth = Simd::AlignLo(width, A);
            uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);
            uint8x16_t _index = vdupq_n_u8(index);
            uint64x2_t _sum = K64_0000000000000000;

            for (size_t row = 0; row < height; ++row)
            {
                uint32x4_t rowSum = K32_00000000;
                for (size_t col = 0; col < alignedWidth; col += A)
                {
                    uint8x16_t _mask = vceqq_u8(Load<align>(mask + col), _index);
                    uint8x16_t _a = Load<align>(a + col);
                    uint8x16_t _b = Load<align>(b + col);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask));
                }
                if (width - alignedWidth)
                {
                    uint8x16_t _mask = vandq_u8(tailMask, vceqq_u8(Load<align>(mask + width - A), _index));
                    uint8x16_t _a = Load<align>(a + width - A);
                    uint8x16_t _b = Load<align>(b + width - A);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask));
                }
                _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum));
                a += aStride;
                b += bStride;
                mask += maskStride;
            }
            *sum = ExtractSum64u(_sum);
        }
		template <bool align> SIMD_INLINE void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, uint8_t * backgroundValue,
			const uint8_t * mask, size_t offset, const uint8x16_t & threshold, const uint8x16_t & tailMask)
		{
			const uint8x16_t _mask = Load<align>(mask + offset);
			EdgeBackgroundAdjustRange<align>(backgroundCount, backgroundValue, offset, threshold, vandq_u8(_mask, tailMask));
		}
		SIMD_INLINE uint8x16_t AdjustEdge(const uint8x16_t & count, const uint8x16_t & value, const uint8x16_t & mask, const uint8x16_t & threshold)
		{
			const uint8x16_t inc = vandq_u8(mask, vcgtq_u8(count, threshold));
			const uint8x16_t dec = vandq_u8(mask, vcltq_u8(count, threshold));
			return vqsubq_u8(vqaddq_u8(value, inc), dec);
		}
Esempio n. 10
0
 /*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 1 mbuf point */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value  */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc avaialbe number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
Esempio n. 11
0
static
void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {

      v0 = vld1q_u32(src); src += 4;
      v1 = vld1q_u32(src); src += 4;
      v2 = vld1q_u32(src); src += 4;
      v3 = vld1q_u32(src); src += 4;

      if (altmap) {
          q0.val[0] = vreinterpretq_u8_u32(v0);
          q0.val[1] = vreinterpretq_u8_u32(v1);
          q1.val[0] = vreinterpretq_u8_u32(v2);
          q1.val[1] = vreinterpretq_u8_u32(v3);
      } else {
          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));

          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
                        vreinterpretq_u8_u16(r1.val[0]));
          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
                        vreinterpretq_u8_u16(r1.val[1]));
      }

      si = vandq_u8(q0.val[0], mask1);
      p0 = vqtbl1q_u8(tables[0][0], si);
      p1 = vqtbl1q_u8(tables[0][1], si);
      p2 = vqtbl1q_u8(tables[0][2], si);
      p3 = vqtbl1q_u8(tables[0][3], si);

      si = vshrq_n_u8(q0.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

      si = vandq_u8(q0.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

      si = vshrq_n_u8(q0.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

      si = vandq_u8(q1.val[0], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

      si = vshrq_n_u8(q1.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

      si = vandq_u8(q1.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

      si = vshrq_n_u8(q1.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

      if (altmap) {
          s0 = vreinterpretq_u32_u8(p0);
          s1 = vreinterpretq_u32_u8(p1);
          s2 = vreinterpretq_u32_u8(p2);
          s3 = vreinterpretq_u32_u8(p3);
      } else {
          q0 = vtrnq_u8(p0, p1);
          q1 = vtrnq_u8(p2, p3);

          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
                         vreinterpretq_u16_u8(q1.val[0]));
          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
                         vreinterpretq_u16_u8(q1.val[1]));

          s0 = vreinterpretq_u32_u16(r0.val[0]);
          s1 = vreinterpretq_u32_u16(r1.val[0]);
          s2 = vreinterpretq_u32_u16(r0.val[1]);
          s3 = vreinterpretq_u32_u16(r1.val[1]);
      }

      if (xor) {
          v0 = vld1q_u32(dst);
          v1 = vld1q_u32(dst + 4);
          v2 = vld1q_u32(dst + 8);
          v3 = vld1q_u32(dst + 12);
          s0 = veorq_u32(s0, v0);
          s1 = veorq_u32(s1, v1);
          s2 = veorq_u32(s2, v2);
          s3 = veorq_u32(s3, v3);
      }

      vst1q_u32(dst,      s0);
      vst1q_u32(dst + 4,  s1);
      vst1q_u32(dst + 8,  s2);
      vst1q_u32(dst + 12, s3);

      dst += 16;
  }
}
Esempio n. 12
0
int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
                             YV12_BUFFER_CONFIG *running_avg,
                             MACROBLOCK *signal, unsigned int motion_magnitude,
                             int y_offset, int uv_offset) {
    /* If motion_magnitude is small, making the denoiser more aggressive by
     * increasing the adjustment for each level, level1 adjustment is
     * increased, the deltas stay the same.
     */
    const uint8x16_t v_level1_adjustment = vdupq_n_u8(
        (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 : 3);
    const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
    const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
    const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
    const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
    const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

    /* Local variables for array pointers and strides. */
    unsigned char *sig = signal->thismb;
    int            sig_stride = 16;
    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
    int            mc_running_avg_y_stride = mc_running_avg->y_stride;
    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
    int            running_avg_y_stride = running_avg->y_stride;

    /* Go over lines. */
    int i;
    int sum_diff = 0;
    for (i = 0; i < 16; ++i) {
        int8x16_t v_sum_diff = vdupq_n_s8(0);
        uint8x16_t v_running_avg_y;

        /* Load inputs. */
        const uint8x16_t v_sig = vld1q_u8(sig);
        const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

        /* Calculate absolute difference and sign masks. */
        const uint8x16_t v_abs_diff      = vabdq_u8(v_sig, v_mc_running_avg_y);
        const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
        const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);

        /* Figure out which level that put us in. */
        const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
                                                  v_abs_diff);
        const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
                                                  v_abs_diff);
        const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
                                                  v_abs_diff);

        /* Calculate absolute adjustments for level 1, 2 and 3. */
        const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
                                                        v_delta_level_1_and_2);
        const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
                                                        v_delta_level_2_and_3);
        const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
            v_level2_adjustment);
        const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
            v_level1and2_adjustment, v_level3_adjustment);

        /* Figure adjustment absolute value by selecting between the absolute
         * difference if in level0 or the value for level 1, 2 and 3.
         */
        const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
            v_level1and2and3_adjustment, v_abs_diff);

        /* Calculate positive and negative adjustments. Apply them to the signal
         * and accumulate them. Adjustments are less than eight and the maximum
         * sum of them (7 * 16) can fit in a signed char.
         */
        const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
                                                     v_abs_adjustment);
        const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
                                                     v_abs_adjustment);
        v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
        v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
        v_sum_diff = vqaddq_s8(v_sum_diff,
                               vreinterpretq_s8_u8(v_pos_adjustment));
        v_sum_diff = vqsubq_s8(v_sum_diff,
                               vreinterpretq_s8_u8(v_neg_adjustment));

        /* Store results. */
        vst1q_u8(running_avg_y, v_running_avg_y);

        /* Sum all the accumulators to have the sum of all pixel differences
         * for this macroblock.
         */
        {
            int s0 = vgetq_lane_s8(v_sum_diff,  0) +
                     vgetq_lane_s8(v_sum_diff,  1) +
                     vgetq_lane_s8(v_sum_diff,  2) +
                     vgetq_lane_s8(v_sum_diff,  3);
            int s1 = vgetq_lane_s8(v_sum_diff,  4) +
                     vgetq_lane_s8(v_sum_diff,  5) +
                     vgetq_lane_s8(v_sum_diff,  6) +
                     vgetq_lane_s8(v_sum_diff,  7);
            int s2 = vgetq_lane_s8(v_sum_diff,  8) +
                     vgetq_lane_s8(v_sum_diff,  9) +
                     vgetq_lane_s8(v_sum_diff, 10) +
                     vgetq_lane_s8(v_sum_diff, 11);
            int s3 = vgetq_lane_s8(v_sum_diff, 12) +
                     vgetq_lane_s8(v_sum_diff, 13) +
                     vgetq_lane_s8(v_sum_diff, 14) +
                     vgetq_lane_s8(v_sum_diff, 15);
            sum_diff += s0 + s1+ s2 + s3;
        }

        /* Update pointers for next iteration. */
        sig += sig_stride;
        mc_running_avg_y += mc_running_avg_y_stride;
        running_avg_y += running_avg_y_stride;
    }

    /* Too much adjustments => copy block. */
    if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
        return COPY_BLOCK;

    /* Tell above level that block was filtered. */
    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride,
                      signal->thismb, sig_stride);
    return FILTER_BLOCK;
}
Esempio n. 13
0
inline  uint8x16_t vandq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vandq_u8 (v0, v1); }
Esempio n. 14
0
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
  // CHECK-LABEL: test_vandq_u8
  return vandq_u8(a, b);
  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
Esempio n. 15
0
void FORCE_INLINE CSA(uint8x16_t& h, uint8x16_t& l, uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
  uint8x16_t u = veorq_u8(a, b);
  h = vorrq_u8(vandq_u8(a, b), vandq_u8(u, c));
  l = veorq_u8(u, c);
}
Esempio n. 16
0
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;

  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q9 = vabdq_u8(q6, q7);

  // vp8_hevmask
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);

  q2u8 = vabdq_u8(q5, q8);
  q9 = vqaddq_u8(q9, q9);

  q15u8 = vcgeq_u8(qlimit, q15u8);

  // vp8_filter() function
  // convert to signed
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);

  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);

  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q9 = vcgeq_u8(qblimit, q9);

  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

  q4u16 = vdupq_n_u16(3);
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);

  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

  q4 = vdupq_n_u8(3);
  q9 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);

  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);

  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
  return;
}
Esempio n. 17
0
int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
                             int mc_running_avg_y_stride,
                             unsigned char *running_avg_y,
                             int running_avg_y_stride,
                             unsigned char *sig, int sig_stride,
                             unsigned int motion_magnitude,
                             int increase_denoising) {
    /* If motion_magnitude is small, making the denoiser more aggressive by
     * increasing the adjustment for each level, level1 adjustment is
     * increased, the deltas stay the same.
     */
    int shift_inc  = (increase_denoising &&
        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
    const uint8x16_t v_level1_adjustment = vmovq_n_u8(
        (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
    const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
    const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
    const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
    const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
    const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
    int64x2_t v_sum_diff_total = vdupq_n_s64(0);

    /* Go over lines. */
    int r;
    for (r = 0; r < 16; ++r) {
        /* Load inputs. */
        const uint8x16_t v_sig = vld1q_u8(sig);
        const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

        /* Calculate absolute difference and sign masks. */
        const uint8x16_t v_abs_diff      = vabdq_u8(v_sig, v_mc_running_avg_y);
        const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
        const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);

        /* Figure out which level that put us in. */
        const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
                                                  v_abs_diff);
        const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
                                                  v_abs_diff);
        const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
                                                  v_abs_diff);

        /* Calculate absolute adjustments for level 1, 2 and 3. */
        const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
                                                        v_delta_level_1_and_2);
        const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
                                                        v_delta_level_2_and_3);
        const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
            v_level2_adjustment);
        const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
            v_level1and2_adjustment, v_level3_adjustment);

        /* Figure adjustment absolute value by selecting between the absolute
         * difference if in level0 or the value for level 1, 2 and 3.
         */
        const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
            v_level1and2and3_adjustment, v_abs_diff);

        /* Calculate positive and negative adjustments. Apply them to the signal
         * and accumulate them. Adjustments are less than eight and the maximum
         * sum of them (7 * 16) can fit in a signed char.
         */
        const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
                                                     v_abs_adjustment);
        const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
                                                     v_abs_adjustment);

        uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
        v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);

        /* Store results. */
        vst1q_u8(running_avg_y, v_running_avg_y);

        /* Sum all the accumulators to have the sum of all pixel differences
         * for this macroblock.
         */
        {
            const int8x16_t v_sum_diff =
                vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
                          vreinterpretq_s8_u8(v_neg_adjustment));

            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);

            const int32x4_t fedc_ba98_7654_3210 =
                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);

            const int64x2_t fedcba98_76543210 =
                vpaddlq_s32(fedc_ba98_7654_3210);

            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
        }

        /* Update pointers for next iteration. */
        sig += sig_stride;
        mc_running_avg_y += mc_running_avg_y_stride;
        running_avg_y += running_avg_y_stride;
    }

    /* Too much adjustments => copy block. */
    {
        int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
                                      vget_low_s64(v_sum_diff_total));
        int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
        int sum_diff_thresh = SUM_DIFF_THRESHOLD;

        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
        if (sum_diff > sum_diff_thresh) {
          // Before returning to copy the block (i.e., apply no denoising),
          // checK if we can still apply some (weaker) temporal filtering to
          // this block, that would otherwise not be denoised at all. Simplest
          // is to apply an additional adjustment to running_avg_y to bring it
          // closer to sig. The adjustment is capped by a maximum delta, and
          // chosen such that in most cases the resulting sum_diff will be
          // within the accceptable range given by sum_diff_thresh.

          // The delta is set by the excess of absolute pixel diff over the
          // threshold.
          int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
          // Only apply the adjustment for max delta up to 3.
          if (delta < 4) {
            const uint8x16_t k_delta = vmovq_n_u8(delta);
            sig -= sig_stride * 16;
            mc_running_avg_y -= mc_running_avg_y_stride * 16;
            running_avg_y -= running_avg_y_stride * 16;
            for (r = 0; r < 16; ++r) {
              uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
              const uint8x16_t v_sig = vld1q_u8(sig);
              const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

              /* Calculate absolute difference and sign masks. */
              const uint8x16_t v_abs_diff      = vabdq_u8(v_sig,
                                                          v_mc_running_avg_y);
              const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
                                                          v_mc_running_avg_y);
              const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
                                                          v_mc_running_avg_y);
              // Clamp absolute difference to delta to get the adjustment.
              const uint8x16_t v_abs_adjustment =
                  vminq_u8(v_abs_diff, (k_delta));

              const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
                                                           v_abs_adjustment);
              const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
                                                           v_abs_adjustment);

              v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
              v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);

              /* Store results. */
              vst1q_u8(running_avg_y, v_running_avg_y);

              {
                  const int8x16_t v_sum_diff =
                      vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
                                vreinterpretq_s8_u8(v_pos_adjustment));

                  const int16x8_t fe_dc_ba_98_76_54_32_10 =
                      vpaddlq_s8(v_sum_diff);
                  const int32x4_t fedc_ba98_7654_3210 =
                      vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
                  const int64x2_t fedcba98_76543210 =
                      vpaddlq_s32(fedc_ba98_7654_3210);

                  v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
                                                fedcba98_76543210);
              }
              /* Update pointers for next iteration. */
              sig += sig_stride;
              mc_running_avg_y += mc_running_avg_y_stride;
              running_avg_y += running_avg_y_stride;
            }
            {
              // Update the sum of all pixel differences of this MB.
              x = vqadd_s64(vget_high_s64(v_sum_diff_total),
                            vget_low_s64(v_sum_diff_total));
              sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);

              if (sum_diff > sum_diff_thresh) {
                return COPY_BLOCK;
              }
            }
          } else {
            return COPY_BLOCK;
          }
        }
    }
Esempio n. 18
0
static INLINE void vp8_mbloop_filter_neon(
        uint8x16_t qblimit,  // mblimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p2
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q4r,     // p1
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r,     // q1
        uint8x16_t *q9r) {   // q1
    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
    int8x16_t q0s8, q12s8, q14s8, q15s8;
    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;

    q11u8 = vabdq_u8(q3, q4);
    q12u8 = vabdq_u8(q4, q5);
    q13u8 = vabdq_u8(q5, q6);
    q14u8 = vabdq_u8(q8, q7);
    q1u8  = vabdq_u8(q9, q8);
    q0u8  = vabdq_u8(q10, q9);

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q1u8  = vmaxq_u8(q1u8, q0u8);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q12u8 = vabdq_u8(q6, q7);

    // vp8_hevmask
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q1u8);

    q15u8 = vcgeq_u8(qlimit, q15u8);

    q1u8 = vabdq_u8(q5, q8);
    q12u8 = vqaddq_u8(q12u8, q12u8);

    // vp8_filter() function
    // convert to signed
    q0u8 = vdupq_n_u8(0x80);
    q9 = veorq_u8(q9, q0u8);
    q8 = veorq_u8(q8, q0u8);
    q7 = veorq_u8(q7, q0u8);
    q6 = veorq_u8(q6, q0u8);
    q5 = veorq_u8(q5, q0u8);
    q4 = veorq_u8(q4, q0u8);

    q1u8 = vshrq_n_u8(q1u8, 1);
    q12u8 = vqaddq_u8(q12u8, q1u8);

    q14u8 = vorrq_u8(q13u8, q14u8);
    q12u8 = vcgeq_u8(qblimit, q12u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));

    q11s16 = vdupq_n_s16(3);
    q2s16  = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);

    q15u8 = vandq_u8(q15u8, q12u8);

    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));

    q12u8 = vdupq_n_u8(3);
    q11u8 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2 = vqmovn_s16(q2s16);
    d3 = vqmovn_s16(q13s16);
    q1s8 = vcombine_s8(d2, d3);
    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q13s8 = vshrq_n_s8(q13s8, 3);

    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);

    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
    d5 = vdup_n_s8(9);
    d4 = vdup_n_s8(18);

    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
    d5 = vdup_n_s8(27);
    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);

    d0  = vqshrn_n_s16(q0s16 , 7);
    d1  = vqshrn_n_s16(q11s16, 7);
    d24 = vqshrn_n_s16(q12s16, 7);
    d25 = vqshrn_n_s16(q13s16, 7);
    d28 = vqshrn_n_s16(q14s16, 7);
    d29 = vqshrn_n_s16(q15s16, 7);

    q0s8  = vcombine_s8(d0, d1);
    q12s8 = vcombine_s8(d24, d25);
    q14s8 = vcombine_s8(d28, d29);

    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
    q15s8 = vqsubq_s8((q7s8), q14s8);
    q14s8 = vqaddq_s8((q6s8), q14s8);

    q1u8 = vdupq_n_u8(0x80);
    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
    return;
}