Example No. 1
void test_vshrQ_ns16 (void)
{
  int16x8_t out_int16x8_t;
  int16x8_t arg0_int16x8_t;

  out_int16x8_t = vshrq_n_s16 (arg0_int16x8_t, 1);
}
Example No. 2
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
Example No. 3
QT_BEGIN_NAMESPACE

static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half)
{
    // result = (x + (x >> 8) + 0x80) >> 8

    const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8
    const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80
    const int16x8_t sum = vaddq_s16(temp, sum_part);

    return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8));
}
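The comment above is the standard fixed-point trick for a rounding division by 255 of an 8-bit-by-8-bit product. A minimal scalar sketch of the same expression, usable as a cross-check against the vector lanes (hypothetical helper, not part of Qt):

#include <stdint.h>

/* Hypothetical scalar mirror of qvdiv_255_s16 for one lane; x is expected to
 * be an 8-bit * 8-bit product in the range 0..65025. */
static inline int scalar_div_255(int x)
{
    return (x + (x >> 8) + 0x80) >> 8;   /* same expression as the comment above */
}
/* e.g. scalar_div_255(255 * 255) == 255, scalar_div_255(128 * 255) == 128 */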
Example No. 4
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
  int i;
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint8x8_t d1u8;
  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
  uint16x8_t q1u16;
  int16x8_t q1s16, q2s16, q3s16, q4s16;
  int32x2x2_t v2tmp0, v2tmp1;
  int16x4x2_t v2tmp2, v2tmp3;

  d2 = vld1_s16(input);
  d3 = vld1_s16(input + 4);
  d4 = vld1_s16(input + 8);
  d5 = vld1_s16(input + 12);

  // 1st for loop
  q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
  q2s16 = vcombine_s16(d3, d5);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  // 2nd for loop
  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
  q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  d2 = vrshr_n_s16(d2, 3);
  d3 = vrshr_n_s16(d3, 3);
  d4 = vrshr_n_s16(d4, 3);
  d5 = vrshr_n_s16(d5, 3);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
  q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);

  // dc_only_idct_add
  for (i = 0; i < 2; i++, q1s16 = q2s16) {
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
    pred_ptr += pred_stride;
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
    pred_ptr += pred_stride;

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
    dst_ptr += dst_stride;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
    dst_ptr += dst_stride;
  }
  return;
}
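For context, the two passes above mirror the scalar VP8 inverse transform from RFC 6386; vqdmulhq_n_s16 returns the high half of a doubled product, which is why its results are shifted right by one before being combined. A hypothetical scalar sketch of one column butterfly, with the Q16 constants assumed (they are defined elsewhere in libvpx, not in this snippet):

#include <stdint.h>

/* Hypothetical scalar sketch of one column (stride 4) of the VP8 inverse DCT,
 * following RFC 6386; the Q16 constants below are assumed values. */
enum { kSinPi8Sqrt2 = 35468, kCosPi8Sqrt2Minus1 = 20091 };

static void idct_col_sketch(const int16_t ip[16], int out[4])
{
    int a1 = ip[0] + ip[8];
    int b1 = ip[0] - ip[8];
    int c1 = ((ip[4] * kSinPi8Sqrt2) >> 16)
             - (ip[12] + ((ip[12] * kCosPi8Sqrt2Minus1) >> 16));
    int d1 = (ip[4] + ((ip[4] * kCosPi8Sqrt2Minus1) >> 16))
             + ((ip[12] * kSinPi8Sqrt2) >> 16);
    out[0] = a1 + d1;  out[1] = b1 + c1;  out[2] = b1 - c1;  out[3] = a1 - d1;
}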
Example No. 5
static void rfx_dwt_2d_decode_block_horiz_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width)
{
	int y, n;
	INT16 * l_ptr = l;
	INT16 * h_ptr = h;
	INT16 * dst_ptr = dst;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n+=8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			int16x8_t l_n = vld1q_s16(l_ptr);

			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

			if (n == 0)
			{
				int16_t first = vgetq_lane_s16(h_n_m, 1);
				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
			tmp_n = vshrq_n_s16(tmp_n, 1);

			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);

			vst1q_s16(l_ptr, dst_n);

			l_ptr+=8;
			h_ptr+=8;
		}
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n+=8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);

			int16x8_t h_n = vld1q_s16(h_ptr);

			h_n = vshlq_n_s16(h_n, 1);

			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
			if (n == subband_width - 8)
			{
				int16_t last = vgetq_lane_s16(dst_n_p, 6);
				dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);

			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);

			vst2q_s16(dst_ptr, dst_n);

			l_ptr+=8;
			h_ptr+=8;
			dst_ptr+=16;
		}
	}
}
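The two inner loops implement the inverse lifting steps quoted in the comments, and the lane fix-ups at n == 0 and n == subband_width - 8 mirror the missing neighbour sample at the row edges. A hypothetical scalar reference for one row, using int16_t in place of FreeRDP's INT16:

#include <stdint.h>

/* Hypothetical scalar reference for one row of the horizontal inverse DWT,
 * matching the formulas in the comments above (edge samples mirrored). */
static void dwt_decode_row_scalar(const int16_t *l, const int16_t *h,
                                  int16_t *dst, int subband_width)
{
	int n;
	for (n = 0; n < subband_width; n++) {          /* even samples */
		int h_prev = h[n > 0 ? n - 1 : 0];
		dst[2 * n] = (int16_t)(l[n] - ((h_prev + h[n] + 1) >> 1));
	}
	for (n = 0; n < subband_width; n++) {          /* odd samples */
		int next = dst[n < subband_width - 1 ? 2 * n + 2 : 2 * n];
		dst[2 * n + 1] = (int16_t)((h[n] << 1) + ((dst[2 * n] + next) >> 1));
	}
}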
Example No. 6
static void rfx_dwt_2d_decode_block_vert_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width)
{
	int x, n;
	INT16 * l_ptr = l;
	INT16 * h_ptr = h;
	INT16 * dst_ptr = dst;

	int total_width = subband_width + subband_width;

	/* Even coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x+=8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);

			int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);

			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
			if (n == 0)
				tmp_n = vaddq_s16(tmp_n, h_n);
			else
			{
				int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}
			tmp_n = vshrq_n_s16(tmp_n, 1);

			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
			vst1q_s16(dst_ptr, dst_n);

			l_ptr+=8;
			h_ptr+=8;
			dst_ptr+=8;
		}
		dst_ptr+=total_width;
	}

	h_ptr = h;
	dst_ptr = dst + total_width;

	/* Odd coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x+=8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);

			h_n = vshlq_n_s16(h_n, 1);

			int16x8_t tmp_n = dst_n_m;
			if (n == subband_width - 1)
				tmp_n = vaddq_s16(tmp_n, dst_n_m);
			else
			{
				int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
			}
			tmp_n = vshrq_n_s16(tmp_n, 1);

			int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
			vst1q_s16(dst_ptr, dst_n);

			h_ptr+=8;
			dst_ptr+=8;
		}
		dst_ptr+=total_width;
	}
}
Example No. 7
void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
    int16x8_t zero = vdupq_n_s16(0);
    int16x8_t max = vdupq_n_s16(255);
    int16x8_t y_add = vdupq_n_s16(128);

    int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
    int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
    int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;

    int i;
    for (i = 0; i < 4096 / 8; i++)
    {
        int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
        y = vaddq_s16(y, y_add);

        int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);

        // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
        int16x8_t r = vaddq_s16(y, cr);
        r = vaddq_s16(r, vshrq_n_s16(cr, 2));
        r = vaddq_s16(r, vshrq_n_s16(cr, 3));
        r = vaddq_s16(r, vshrq_n_s16(cr, 5));
        r = vminq_s16(vmaxq_s16(r, zero), max);
        vst1q_s16((sint16*)&y_r_buf[i], r);

        // cb = cb_g_buf[i];
        int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);

        // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
        int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
        g = vsubq_s16(g, vshrq_n_s16(cb, 4));
        g = vsubq_s16(g, vshrq_n_s16(cb, 5));
        g = vsubq_s16(g, vshrq_n_s16(cr, 1));
        g = vsubq_s16(g, vshrq_n_s16(cr, 3));
        g = vsubq_s16(g, vshrq_n_s16(cr, 4));
        g = vsubq_s16(g, vshrq_n_s16(cr, 5));
        g = vminq_s16(vmaxq_s16(g, zero), max);
        vst1q_s16((sint16*)&cb_g_buf[i], g);

        // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
        int16x8_t b = vaddq_s16(y, cb);
        b = vaddq_s16(b, vshrq_n_s16(cb, 1));
        b = vaddq_s16(b, vshrq_n_s16(cb, 2));
        b = vaddq_s16(b, vshrq_n_s16(cb, 6));
        b = vminq_s16(vmaxq_s16(b, zero), max);
        vst1q_s16((sint16*)&cr_b_buf[i], b);
    }

}
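For reference, the shift-and-add chains above are fixed-point approximations of the usual YCbCr-to-RGB factors (the same 1.403 / 0.344 / 0.714 / 1.770 constants spelled out in Example No. 14 below):

/* cr + (cr>>2) + (cr>>3) + (cr>>5)        ~ 1.40625  * cr   (~1.403, R)
 * (cb>>2) + (cb>>4) + (cb>>5)             ~ 0.34375  * cb   (~0.344, G)
 * (cr>>1) + (cr>>3) + (cr>>4) + (cr>>5)   ~ 0.71875  * cr   (~0.714, G)
 * cb + (cb>>1) + (cb>>2) + (cb>>6)        ~ 1.765625 * cb   (~1.770, B)
 */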
Example No. 8
 /*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts shall be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 1 mbuf point */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value  */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
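At the bottom of the loop, each descriptor's DD status has been spread across one 16-bit lane of staterr (all ones when set), so the complemented value stat yields the number of consecutively completed descriptors via a trailing-zero count. A small standalone illustration of that step (hypothetical helper, not part of DPDK):

#include <stdint.h>

/* Hypothetical illustration of the DD-bit count above: 16 bits per descriptor,
 * a lane of all ones in staterr_lanes means that descriptor is done. */
static int count_done_descs(uint64_t staterr_lanes)
{
	uint64_t stat = ~staterr_lanes;                /* as in the loop above */
	return stat == 0 ? 4 : (int)(__builtin_ctzll(stat) / 16);
}
/* e.g. count_done_descs(0x0000FFFFFFFFFFFFULL) == 3 (first three descriptors done) */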
Example No. 9
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  if (!skip_block) {
    // Quantization pass: All coefficients with index >= zero_flag are
    // skippable. Note: zero_flag can be zero.
    int i;
    const int16x8_t v_zero = vdupq_n_s16(0);
    const int16x8_t v_one = vdupq_n_s16(1);
    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
    // adjust for dc
    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
    // process dc and the first seven ac coeffs
    {
      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
      v_round = vmovq_n_s16(round_ptr[1]);
      v_quant = vmovq_n_s16(quant_ptr[1]);
      v_dequant = vmovq_n_s16(dequant_ptr[1]);
    }
    // now process the rest of the ac coeffs
    for (i = 8; i < count; i += 8) {
      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
    }
    {
      const int16x4_t v_eobmax_3210 = vmax_s16(
          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
      const int64x1_t v_eobmax_xx32 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
      const int16x4_t v_eobmax_tmp =
          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
      const int64x1_t v_eobmax_xxx3 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
      const int16x4_t v_eobmax_final =
          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));

      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
    }
  } else {
    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
    *eob_ptr = 0;
  }
}
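Per coefficient, the vector loop above is roughly equivalent to the scalar sketch below, assuming (as in libvpx's fp quantizer) that quant_ptr[] holds Q16 fixed-point multipliers:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar equivalent of one lane of the quantization loop above. */
static void quantize_one(int16_t coeff, int16_t round, int16_t quant,
                         int16_t dequant, int16_t iscan,
                         int16_t *qcoeff, int16_t *dqcoeff, int *eob)
{
  const int16_t tmp = (int16_t)(((abs(coeff) + round) * quant) >> 16); /* magnitude */
  *qcoeff = (int16_t)(coeff < 0 ? -tmp : tmp);                         /* restore sign */
  *dqcoeff = (int16_t)(*qcoeff * dequant);
  if (tmp != 0 && iscan + 1 > *eob) *eob = iscan + 1;                  /* end of block */
}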
Example No. 10
void  yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count,
			 unsigned char * destp)
{
  const unsigned char *source_endp;
  const unsigned char *vector_endp;
  int remainder;
  const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
  const int16x8_t v_coeff = {90, -46, 0,  0, 90, -46, 0,  0};
  const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF};
  const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 
  int16x8_t mp0_rgba;  /* macropixel 0's resulting RGBA RGBA pixels  */
  int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels  */
  uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
  uint8x8_t rgba0, rgba1; /* rgba values as bytes  */
  uint8x16_t bothrgba;
  uint8_t * destinationp; /* pointer into output buffer destp  */
  int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
  const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff,
			     0x00, 0x00, 0x00, 0x00};
  
  
  /* we're working with things in 4-byte macropixels  */
  remainder = source_byte_count % 4;

  source_endp = sourcep + source_byte_count;
  vector_endp = source_endp - remainder;
  destinationp = (uint8_t *)destp;

  while (sourcep < vector_endp)
    {
     /* pull YUYV from 2 four byte macropixels starting at sourcep. */
      /* we'll increment sourcep as we go to save the array dereference */
      /* and separate increment instruction at the end of the loop  */

      /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */
      rawpixels = vld1_u8(sourcep);
      sourcep += sizeof(rawpixels);

      widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

      /* ---------- process macropixel 0 --------------- */
      /* take macropixel zero ([YUYV]0) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp0_rgba  */
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]0 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP0_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP0_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);

	/* store ALPHA in elements 3 and 7 (after the RGB components)  */
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, then use vreinterpretq_s16_u16 to  */
	/* change interpretation from unsigned 16-bit to signed  */
	/* 16-bit.   */
	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_low_s16(widerpixels);
	
	/* form a vector of the U component from MP0  */
	u_vec = vdupq_lane_s16(yuyv, MP0_U);
	
	/* subtract uvbias from u_vec */
	u_vec = vsubq_s16(u_vec, uvbias);

	/* form a vector of the V component from MP0  */
	v_vec = vdupq_lane_s16(yuyv, MP0_V);
	
	/* subtract uvbias from v_vec */
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);

	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec = vaddq_s16(u_vec, v_vec);
	  
	uv_vec = vshrq_n_s16(uv_vec, 6);

	/* now mp0_rgba = y_vec + u_vec + v_vec  */
	mp0_rgba = vaddq_s16(wider_yalpha, uv_vec);

      }

      /* ---------- process macropixel 1 --------------- */
      /* take macropixel one ([YUYV]1) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp1_rgba  */      
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]1 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP1_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP1_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
	  
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, then use vreinterpretq_s16_u16 to  */
	/* change interpretation from unsigned 16-bit to signed  */
	/* 16-bit.   */

	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_high_s16(widerpixels);
	u_vec = vdupq_lane_s16(yuyv, 1);
	u_vec = vsubq_s16(u_vec, uvbias);
	
	v_vec = vdupq_lane_s16(yuyv, 3);
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);
     
	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec  = vaddq_s16(u_vec, v_vec);
	uv_vec = vshrq_n_s16(uv_vec, 6);


	/* now mp1_rgba = y_vec + u_vec + v_vec  */
	mp1_rgba = vaddq_s16(wider_yalpha, uv_vec);
      }
      

      /* turn mp0_rgba from a vector of shorts to a vector of  */
      /* unsigned unsigned chars. this will saturate: clipping  */
      /* the values between 0 and 255.   */
      
      rgba0 = vqmovun_s16(mp0_rgba);
      rgba1 = vqmovun_s16(mp1_rgba);

      /* make it faster to copy these back out of vector registers into  */
      /* memory by combining rgba0 and rgba1 into the larger bothrgba.   */
      /* then store that back into memory at destinationp.               */

      bothrgba = vcombine_u8(rgba0, rgba1);
      
      vst1q_u8(destinationp, bothrgba);
      destinationp += 16;
      
      
    }
}
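A short note on the coefficient vectors: each output pixel is laid out as {R, G, B, A}, and after the final shift right by 6 the integer entries act as the usual conversion factors divided by 64:

/* v_coeff:  90/64 = 1.40625 (Cr -> R),  -46/64 = -0.71875 (Cr -> G)
 * u_coeff: -22/64 = -0.34375 (Cb -> G),  113/64 = 1.765625 (Cb -> B)
 */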
Example No. 11
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	/* TODO: If necessary, check alignments and call the general version. */

	int16x8_t zero = vdupq_n_s16(0);
	int16x8_t max = vdupq_n_s16(255);
	int16x8_t y_add = vdupq_n_s16(128);

	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
	int16x8_t* b_buf  = (int16x8_t*) pDst[2];

	int srcbump = srcStep / sizeof(int16x8_t);
	int dstbump = dstStep / sizeof(int16x8_t);
	int yp;

	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
			y = vaddq_s16(y, y_add);

			int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));

			/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
			 *    0, 255);
			 */
			int16x8_t r = vaddq_s16(y, cr);
			r = vaddq_s16(r, vshrq_n_s16(cr, 2));
			r = vaddq_s16(r, vshrq_n_s16(cr, 3));
			r = vaddq_s16(r, vshrq_n_s16(cr, 5));
			r = vminq_s16(vmaxq_s16(r, zero), max);
			vst1q_s16((INT16*) (r_buf+i), r);

			/* cb = cb_g_buf[i]; */
			int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));

			/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
			 * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
			 */
			int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
			g = vsubq_s16(g, vshrq_n_s16(cb, 4));
			g = vsubq_s16(g, vshrq_n_s16(cb, 5));
			g = vsubq_s16(g, vshrq_n_s16(cr, 1));
			g = vsubq_s16(g, vshrq_n_s16(cr, 3));
			g = vsubq_s16(g, vshrq_n_s16(cr, 4));
			g = vsubq_s16(g, vshrq_n_s16(cr, 5));
			g = vminq_s16(vmaxq_s16(g, zero), max);
			vst1q_s16((INT16*) (g_buf+i), g);

			/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
			 * 0, 255);
			 */
			int16x8_t b = vaddq_s16(y, cb);
			b = vaddq_s16(b, vshrq_n_s16(cb, 1));
			b = vaddq_s16(b, vshrq_n_s16(cb, 2));
			b = vaddq_s16(b, vshrq_n_s16(cb, 6));
			b = vminq_s16(vmaxq_s16(b, zero), max);
			vst1q_s16((INT16*) (b_buf+i), b);
		}
		y_buf  += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
Example No. 12
static int trx_usrp_read(openair0_device *device, openair0_timestamp *ptimestamp, void **buff, int nsamps, int cc)
{
   usrp_state_t *s = (usrp_state_t*)device->priv;
   int samples_received=0,i,j;
   int nsamps2;  // aligned to upper 32 or 16 byte boundary
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
   __m256i buff_tmp[2][nsamps>>3];
   nsamps2 = (nsamps+7)>>3;
#else
   __m128i buff_tmp[2][nsamps>>2];
   nsamps2 = (nsamps+3)>>2;
#endif
#elif defined(__arm__)
   int16x8_t buff_tmp[2][nsamps>>2];
   nsamps2 = (nsamps+3)>>2;
#endif


  if (device->type == USRP_B200_IF) {  
    if (cc>1) {
    // receive multiple channels (e.g. RF A and RF B)
      std::vector<void *> buff_ptrs;
 
      for (int i=0;i<cc;i++) buff_ptrs.push_back(buff_tmp[i]);
      samples_received = s->rx_stream->recv(buff_ptrs, nsamps, s->rx_md);
    } else {
    // receive a single channel (e.g. from connector RF A)
      samples_received = s->rx_stream->recv(buff_tmp[0], nsamps, s->rx_md);
    }
   
  // bring RX data into 12 LSBs for softmodem RX
    for (int i=0;i<cc;i++) {
      for (int j=0; j<nsamps2; j++) {      
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
        ((__m256i *)buff[i])[j] = _mm256_srai_epi16(buff_tmp[i][j],4);
#else
        ((__m128i *)buff[i])[j] = _mm_srai_epi16(buff_tmp[i][j],4);
#endif
#elif defined(__arm__)
        ((int16x8_t*)buff[i])[j] = vshrq_n_s16(buff_tmp[i][j],4);
#endif
      }
    }
  } else if (device->type == USRP_X300_IF) {
    if (cc>1) {
    // receive multiple channels (e.g. RF A and RF B)
      std::vector<void *> buff_ptrs;
 
      for (int i=0;i<cc;i++) buff_ptrs.push_back(buff[i]);
      samples_received = s->rx_stream->recv(buff_ptrs, nsamps, s->rx_md);
    } else {
    // receive a single channel (e.g. from connector RF A)
      samples_received = s->rx_stream->recv(buff[0], nsamps, s->rx_md);
    }
  }

  if (samples_received < nsamps) {
    printf("[recv] received %d samples out of %d\n",samples_received,nsamps);
    
  }

  //handle the error code
  switch(s->rx_md.error_code){
  case uhd::rx_metadata_t::ERROR_CODE_NONE:
    break;
  case uhd::rx_metadata_t::ERROR_CODE_OVERFLOW:
    printf("[recv] USRP RX OVERFLOW!\n");
    s->num_overflows++;
    break;
  case uhd::rx_metadata_t::ERROR_CODE_TIMEOUT:
    printf("[recv] USRP RX TIMEOUT!\n");
    break;
  default:
    printf("[recv] Unexpected error on RX, Error code: 0x%x\n",s->rx_md.error_code);
    break;
  }
  s->rx_count += nsamps;
  s->rx_timestamp = s->rx_md.time_spec.to_ticks(s->sample_rate);
  *ptimestamp = s->rx_timestamp;

  return samples_received;
}
Example No. 13
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                      int stride) {
  int i;
  // input[M * stride] * 16
  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);

  // If the very first value != 0, then add 1.
  if (input[0] != 0) {
    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
    input_0 = vadd_s16(input_0, one);
  }

  for (i = 0; i < 2; ++i) {
    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
    const int16x8_t input_32 = vcombine_s16(input_3, input_2);

    // in_0 +/- in_3, in_1 +/- in_2
    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
    const int16x8_t s_32 = vsubq_s16(input_01, input_32);

    // step_0 +/- step_1, step_2 +/- step_3
    const int16x4_t s_0 = vget_low_s16(s_01);
    const int16x4_t s_1 = vget_high_s16(s_01);
    const int16x4_t s_2 = vget_high_s16(s_32);
    const int16x4_t s_3 = vget_low_s16(s_32);

    // (s_0 +/- s_1) * cospi_16_64
    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);

    // fdct_round_shift
    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);

    // s_3 * cospi_8_64 + s_2 * cospi_24_64
    // s_3 * cospi_24_64 - s_2 * cospi_8_64
    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);

    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);

    // fdct_round_shift
    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);

    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);

    input_0 = out_0;
    input_1 = out_1;
    input_2 = out_2;
    input_3 = out_3;
  }

  {
    // Not quite a rounding shift. Only add 1 despite shifting by 2.
    const int16x8_t one = vdupq_n_s16(1);
    int16x8_t out_01 = vcombine_s16(input_0, input_1);
    int16x8_t out_23 = vcombine_s16(input_2, input_3);
    out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
    out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
    store_s16q_to_tran_low(final_output + 0 * 8, out_01);
    store_s16q_to_tran_low(final_output + 1 * 8, out_23);
  }
}
Example No. 14
pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
    const INT16 *pSrc[3],
    int srcStep,
    INT16 *pDst[3],
    int dstStep,
    const prim_size_t *roi)	/* region of interest */
{
    /* TODO: If necessary, check alignments and call the general version. */

    int16x8_t zero = vdupq_n_s16(0);
    int16x8_t max = vdupq_n_s16(255);

    int16x8_t r_cr = vdupq_n_s16(22986);	//  1.403 << 14
    int16x8_t g_cb = vdupq_n_s16(-5636);	// -0.344 << 14
    int16x8_t g_cr = vdupq_n_s16(-11698);	// -0.714 << 14
    int16x8_t b_cb = vdupq_n_s16(28999);	//  1.770 << 14
    int16x8_t c4096 = vdupq_n_s16(4096);

    int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
    int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
    int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
    int16x8_t* r_buf  = (int16x8_t*) pDst[0];
    int16x8_t* g_buf  = (int16x8_t*) pDst[1];
    int16x8_t* b_buf  = (int16x8_t*) pDst[2];

    int srcbump = srcStep / sizeof(int16x8_t);
    int dstbump = dstStep / sizeof(int16x8_t);
    int yp;

    int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
    for (yp=0; yp<roi->height; ++yp)
    {
        int i;
        for (i=0; i<imax; i++)
        {
            /*
            	In order to use NEON signed 16-bit integer multiplication we need to convert
            	the floating point factors to signed int without losing information.
            	The result of this multiplication is 32 bit and we have a NEON instruction
            	that returns the high word of the saturated, doubled product.
            	Thus we will multiply the factors by the highest possible 2^n, take the
            	upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
            	shift by 1 to reverse the doubling) and correct this result by multiplying it
            	by 2^(16-n).
            	For the given factors in the conversion matrix the best possible n is 14.

            	Example for calculating r:
            	r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
            	r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
            	r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
            	r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
            */

            /* y = (y_buf[i] + 4096) >> 2 */
            int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
            y = vaddq_s16(y, c4096);
            y = vshrq_n_s16(y, 2);
            /* cb = cb_buf[i]; */
            int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
            /* cr = cr_buf[i]; */
            int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);

            /* (y + HIWORD(cr*22986)) >> 3 */
            int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
            r = vshrq_n_s16(r, 3);
            /* r_buf[i] = MINMAX(r, 0, 255); */
            r = vminq_s16(vmaxq_s16(r, zero), max);
            vst1q_s16((INT16*)&r_buf[i], r);

            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
            g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
            g = vshrq_n_s16(g, 3);
            /* g_buf[i] = MINMAX(g, 0, 255); */
            g = vminq_s16(vmaxq_s16(g, zero), max);
            vst1q_s16((INT16*)&g_buf[i], g);

            /* (y + HIWORD(cb*28999)) >> 3 */
            int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
            b = vshrq_n_s16(b, 3);
            /* b_buf[i] = MINMAX(b, 0, 255); */
            b = vminq_s16(vmaxq_s16(b, zero), max);
            vst1q_s16((INT16*)&b_buf[i], b);
        }

        y_buf  += srcbump;
        cb_buf += srcbump;
        cr_buf += srcbump;
        r_buf += dstbump;
        g_buf += dstbump;
        b_buf += dstbump;
    }
    return PRIMITIVES_SUCCESS;
}
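A scalar mirror of the red-channel computation derived in the comment block above, assuming vqdmulhq_s16(a, b) followed by a shift right by 1 behaves as HIWORD(a * b) for the value ranges involved (hypothetical helper, not part of FreeRDP):

#include <stdint.h>

/* Hypothetical scalar mirror of the r computation above:
 * r = ((y + 4096) >> 2 + HIWORD(cr * 22986)) >> 3, clamped to 0..255. */
static int16_t scalar_red(int16_t y_in, int16_t cr)
{
    int y  = (y_in + 4096) >> 2;
    int hi = (int)(((int32_t)cr * 22986) >> 16);   /* HIWORD(cr * (1.403 << 14)) */
    int r  = (y + hi) >> 3;
    return (int16_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
}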
Example No. 15
void idct_dequant_full_2x_neon(
        int16_t *q,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0, *dst1;
    int32x2_t d28, d29, d30, d31;
    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x4x2_t q2tmp0, q2tmp1;
    int16x8x2_t q2tmp2, q2tmp3;
    int16x4_t dLow0, dLow1, dHigh0, dHigh1;

    d28 = d29 = d30 = d31 = vdup_n_s32(0);

    // load dq
    q0 = vld1q_s16(dq);
    dq += 8;
    q1 = vld1q_s16(dq);

    // load q
    q2 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q3 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q4 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q5 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);

    // load src from dst
    dst0 = dst;
    dst1 = dst + 4;
    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
    dst0 += stride;
    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
    dst1 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
    dst0 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
    dst1 += stride;

    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
    dst0 += stride;
    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
    dst1 += stride;
    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

    q2 = vmulq_s16(q2, q0);
    q3 = vmulq_s16(q3, q1);
    q4 = vmulq_s16(q4, q0);
    q5 = vmulq_s16(q5, q1);

    // vswp
    dLow0 = vget_low_s16(q2);
    dHigh0 = vget_high_s16(q2);
    dLow1 = vget_low_s16(q4);
    dHigh1 = vget_high_s16(q4);
    q2 = vcombine_s16(dLow0, dLow1);
    q4 = vcombine_s16(dHigh0, dHigh1);

    dLow0 = vget_low_s16(q3);
    dHigh0 = vget_high_s16(q3);
    dLow1 = vget_low_s16(q5);
    dHigh1 = vget_high_s16(q5);
    q3 = vcombine_s16(dLow0, dLow1);
    q5 = vcombine_s16(dHigh0, dHigh1);

    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

    q10 = vqaddq_s16(q2, q3);
    q11 = vqsubq_s16(q2, q3);

    q8 = vshrq_n_s16(q8, 1);
    q9 = vshrq_n_s16(q9, 1);

    q4 = vqaddq_s16(q4, q8);
    q5 = vqaddq_s16(q5, q9);

    q2 = vqsubq_s16(q6, q5);
    q3 = vqaddq_s16(q7, q4);

    q4 = vqaddq_s16(q10, q3);
    q5 = vqaddq_s16(q11, q2);
    q6 = vqsubq_s16(q11, q2);
    q7 = vqsubq_s16(q10, q3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    // loop 2
    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

    q10 = vshrq_n_s16(q10, 1);
    q11 = vshrq_n_s16(q11, 1);

    q10 = vqaddq_s16(q2tmp2.val[1], q10);
    q11 = vqaddq_s16(q2tmp3.val[1], q11);

    q8 = vqsubq_s16(q8, q11);
    q9 = vqaddq_s16(q9, q10);

    q4 = vqaddq_s16(q2, q9);
    q5 = vqaddq_s16(q3, q8);
    q6 = vqsubq_s16(q3, q8);
    q7 = vqsubq_s16(q2, q9);

    q4 = vrshrq_n_s16(q4, 3);
    q5 = vrshrq_n_s16(q5, 3);
    q6 = vrshrq_n_s16(q6, 3);
    q7 = vrshrq_n_s16(q7, 3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                          vreinterpret_u8_s32(d28)));
    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                          vreinterpret_u8_s32(d29)));
    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                          vreinterpret_u8_s32(d30)));
    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                          vreinterpret_u8_s32(d31)));

    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

    dst0 = dst;
    dst1 = dst + 4;
    vst1_lane_s32((int32_t *)dst0, d28, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d28, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d29, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d29, 1);
    dst1 += stride;

    vst1_lane_s32((int32_t *)dst0, d30, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d30, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d31, 0);
    vst1_lane_s32((int32_t *)dst1, d31, 1);
    return;
}