Example #1
0
// c = a - b
FORCE_INLINE
int __ext_v_sub_complex16(struct complex16* c, int len, struct complex16* a,
          int __unused_2, struct complex16* b, int __unused_1)
{
  const int wlen = 4; // sizeof(vcs) / sizeof(complex16);
  __m128i* As = (__m128i*) a;
  __m128i* Bs = (__m128i*) b;
  __m128i* Cs = (__m128i*) c;
  for (int i = 0; i < len / wlen; i++)
  {
    __m128i ma = _mm_loadu_si128(&As[i]);
    __m128i mb = _mm_loadu_si128(&Bs[i]);
    _mm_storeu_si128(&Cs[i], _mm_sub_epi16(ma, mb));
  }
  for (int i = (len / wlen) * wlen; i < len; i++)
  {
    c[i].re = a[i].re - b[i].re;
    c[i].im = a[i].im - b[i].im;
  }
  return 0;
}
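A minimal harness (not part of the original source) can exercise both the SIMD body and the scalar tail; it assumes complex16 is a pair of int16_t fields, that FORCE_INLINE reduces to static inline, and that the routine above is pasted into the same file:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

struct complex16 { int16_t re, im; };
#define FORCE_INLINE static inline

/* ... __ext_v_sub_complex16 from Example #1 goes here ... */

int main(void)
{
  struct complex16 a[6], b[6], c[6];
  for (int i = 0; i < 6; i++) {
    a[i].re = (int16_t)(10 * i); a[i].im = (int16_t)(-3 * i);
    b[i].re = (int16_t)(i + 1);  b[i].im = (int16_t)(2 * i);
  }
  /* len = 6 is not a multiple of the vector width (4), so both loops run. */
  __ext_v_sub_complex16(c, 6, a, 0, b, 0);
  for (int i = 0; i < 6; i++)
    printf("c[%d] = %d%+dj\n", i, c[i].re, c[i].im);
  return 0;
}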
Example #2
0
//FINL 
int __ext_v_shift_right_complex32(struct complex32* z, int __unused_3, struct complex32* x, int len, int shift)
{
	const int wlen = 2;// sizeof(vci) / sizeof(complex32);
	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi32(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		z[i].re = x[i].re >> shift;
		z[i].im = x[i].im >> shift;
	}
	return 0;
}
Example #3
0
FORCE_INLINE
int __ext_v_add_complex32(struct complex32* c, int len, struct complex32* a,
  int __unused_2, struct complex32* b, int __unused_1)
{
  const int wlen = 2; // sizeof(vci) / sizeof(complex32);
  __m128i* As = (__m128i*) a;
  __m128i* Bs = (__m128i*) b;
  __m128i* Cs = (__m128i*) c;
  for (int i = 0; i < len / wlen; i++)
  {
    __m128i ma = _mm_loadu_si128(&As[i]);
    __m128i mb = _mm_loadu_si128(&Bs[i]);
    _mm_storeu_si128(&Cs[i], _mm_add_epi32(ma, mb));
  }
  for (int i = (len / wlen) * wlen; i < len; i++)
  {
    c[i].re = a[i].re + b[i].re;
    c[i].im = a[i].im + b[i].im;
  }
  return 0;
}
Example #4
0
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                     int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
Example #5
0
// Multiply the first source vector by the conjugate of the second source vector,
// i.e. re + j * im = x * conj(y).
// The result is returned by reference (through out) for performance.
FORCE_INLINE
int __ext_v_conj_mul_complex16(struct complex16* out, int lenout,
                struct complex16* x, int len1,
                struct complex16* y, int len2, int shift){
  const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Outs = (__m128i*) out;
  for (int i = 0; i < len1 / wlen; i++){
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);
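    // The xor/add pair negates the imaginary (high) half of each complex16 in y,
    // and the two shuffles swap re/im within each 32-bit lane, so that
    // madd(my, mx) gives the real parts and madd(ms2, mx) the imaginary parts
    // of x * conj(y) before the arithmetic shift.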

    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);

    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_srai_epi32(_mm_madd_epi16(my, mx), shift);
    __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, mx), shift);

    mre = _mm_and_si128(mre, xmm6);
    mim = _mm_and_si128(mim, xmm6);

    mim = _mm_slli_epi32(mim, 0x10);

    _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    out[i].re = (x[i].re * y[i].re + x[i].im * y[i].im) >> shift;
    out[i].im = (x[i].im * y[i].re - x[i].re * y[i].im) >> shift;
  }

  return 0;
}
Example #6
0
static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
                                int tapsNum, uint32_t *buf) {
  __m128i u[8], v[6];

  assert(tapsNum == 10 || tapsNum == 12);
  if (tapsNum == 10) {
    src -= 1;
  }

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
  u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
  u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
  u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));

  transpose_pair(u, v);

  u[0] = _mm_madd_epi16(v[0], f[0]);
  u[1] = _mm_madd_epi16(v[1], f[1]);
  u[2] = _mm_madd_epi16(v[2], f[2]);
  u[3] = _mm_madd_epi16(v[3], f[3]);
  u[4] = _mm_madd_epi16(v[4], f[4]);
  u[5] = _mm_madd_epi16(v[5], f[5]);

  u[6] = _mm_min_epi32(u[2], u[3]);
  u[7] = _mm_max_epi32(u[2], u[3]);

  u[0] = _mm_add_epi32(u[0], u[1]);
  u[0] = _mm_add_epi32(u[0], u[5]);
  u[0] = _mm_add_epi32(u[0], u[4]);
  u[0] = _mm_add_epi32(u[0], u[6]);
  u[0] = _mm_add_epi32(u[0], u[7]);

  _mm_storeu_si128((__m128i *)buf, u[0]);
}
Example #7
0
// TODO: These should eat clock cycles.
static void gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size)
{
	u8* dst = ((u8*)g_dsp.dram);

#if _M_SSE >= 0x301
	if (cpu_info.bSSSE3 && !(size % 16))
	{
		for (u32 i = 0; i < size; i += 16)
		{
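			// s_mask byte-swaps each 16-bit word, mirroring the Common::swap16 in the scalar fallback below.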
			_mm_storeu_si128((__m128i *)&dst[dsp_addr + i], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), s_mask));
		}
	}
	else
#endif
	{
		for (u32 i = 0; i < size; i += 2)
		{
			*(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]);
		}
	}
	INFO_LOG(DSPLLE, "*** ddma_in RAM (0x%08x) -> DRAM_DSP (0x%04x) : size (0x%08x)", addr, dsp_addr / 2, size);
}
Example #8
0
static void AESNI_CBC_decrypt(const unsigned char *in, unsigned char *out,
                              unsigned char ivec[16], unsigned long length,
                              unsigned char *key, int number_of_rounds)
{
    __m128i data, feedback, last_in;
    unsigned long i;
    int j;

    if (length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i *)ivec);
    for (i = 0; i < length; i++)
    {
        last_in = _mm_loadu_si128(&((const __m128i *)in)[i]);
        data = _mm_xor_si128(last_in, ((__m128i *)key)[0]);
        for (j = 1; j < number_of_rounds; j++)
        {
            data = _mm_aesdec_si128(data, ((__m128i *)key)[j]);
        }
        data = _mm_aesdeclast_si128(data, ((__m128i *)key)[j]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i *)out)[i], data);
        feedback = last_in;
    }
}
Example #9
0
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
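  // Only the low four 16-bit lanes of diff are meaningful: one per byte (channel) of the 32-bit pixel.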
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
Example #10
0
//FINL 
int __ext_v_shift_right_int16(int16* z, int __unused_3, int16* x, int len, int shift)
{
	const int wlen = 8;// sizeof(vs) / sizeof(int16);
	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi16(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		z[i] = x[i] >> shift;
	}
	return 0;
}
Example #11
0
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (128 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}
Example #12
0
void __ext_v_andnot(unsigned char *output, int outlen, unsigned char *input1, int inlen1, unsigned char *input2, int inlen2)
{
	int cnt = 0;
	int bytelen1 = inlen1 / 8 + ((inlen1 % 8) > 0);

	while (cnt + 16 <= bytelen1)
	{
		__m128i mi1 = _mm_loadu_si128((__m128i *) (input1 + cnt));
		__m128i mi2 = _mm_loadu_si128((__m128i *) (input2 + cnt));

		_mm_storeu_si128((__m128i *) (output + cnt), _mm_andnot_si128(mi1, mi2));

		cnt += 16;
	}

	while (cnt < bytelen1)
	{
		output[cnt] = (~input1[cnt]) & input2[cnt];
		cnt++;
	}
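	// Note: the assignment below has no effect for the caller, since outlen is passed by value.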
	outlen = inlen1;
}
Example #13
0
/* The GCM counter. Counts on the last 32 bits, ignoring carry. */
static inline void _nc_count_16_be_4 (uint64_t *init, uint64_t *dst, size_t blocks) {

#if defined (__nc_SSE__)
  __m128i ctr, c1   = _mm_set_epi32 (1, 0, 0, 0),
               mask = _mm_set_epi64x (0x0c0d0e0f0b0a0908, 0x0706050403020100);
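  /* The mask byte-swaps only the final 32-bit word, so the big-endian counter
     can be incremented with a plain _mm_add_epi32 on that lane. */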
  ctr = _mm_shuffle_epi8 (_mm_loadu_si128 ((__m128i *) init), mask);
  for (; blocks --; dst += 2) {
    _mm_storeu_si128 ((__m128i *) dst, _mm_shuffle_epi8 (ctr, mask));
    ctr = _mm_add_epi32 (ctr, c1);
  }
#else
  uint64_t qw1 = init[0];
  uint32_t dw3 = ((uint32_t*) init)[2],
           dw4 = be32toh (((uint32_t*) init)[3]);
  for (; blocks --; dst += 2) {
    dst[0] = qw1;
    ((uint32_t*) dst)[2] = dw3;
    ((uint32_t*) dst)[3] = htobe32 (dw4 ++);
  }
#endif

}
Example #14
0
void InvShiftRows_sse(BYTE state[][4])
{
        __m128i stateSse = _mm_set_epi8(state[3][3],
                                        state[3][2],
                                        state[3][1],
                                        state[3][0],
                                        state[2][3],
                                        state[2][2],
                                        state[2][1],
                                        state[2][0],
                                        state[1][3],
                                        state[1][2],
                                        state[1][1],
                                        state[1][0],
                                        state[0][3],
                                        state[0][2],
                                        state[0][1],
                                        state[0][0]);
        __m128i shuffleVar = _mm_set_epi8(12, 15, 14, 13, 9, 8, 11, 10, 6, 5, 4, 7, 3, 2, 1, 0);
        __m128i shuffledState = _mm_shuffle_epi8(stateSse, shuffleVar);
        _mm_storeu_si128((__m128i *)state, shuffledState);
}
Example #15
0
void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}
Example #16
0
void
png_read_filter_row_sub3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   __m128i racc = _mm_setzero_si128();

   PNG_UNUSED(prev_row)

   __m128i nrb = _mm_load_si128((__m128i*)(rp));

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15)
   {
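      // Each iteration covers 15 bytes (five 3-byte pixels); racc carries the
      // previous block's last accumulated pixel into this one.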
      __m128i rb = nrb;
#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      racc = _mm_srli_si128(_mm_slli_si128(racc, 1), 13);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 3));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      racc =  _mm_alignr_epi8(rb, _mm_slli_si128(racc, 1), 13);
#endif

      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = rb;

      _mm_storeu_si128((__m128i*)rp, rb);
   }
}
Example #17
0
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);

  int i;

  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;

    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
    const __m128i r_new =
        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);

    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);

    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }

  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
Example #18
0
template <bool useShift>
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		if (useShift) {
			packed = _mm_srai_epi16(packed, volShift);
		}
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#elif PPSSPP_ARCH(ARM_NEON)
	int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer
	while (size >= 8) {
		int32x4_t in1 = vld1q_s32(in);
		int32x4_t in2 = vld1q_s32(in + 4);
		int16x4_t packed1 = vqmovn_s32(in1);
		int16x4_t packed2 = vqmovn_s32(in2);
		if (useShift) {
			packed1 = vshl_s16(packed1, signedVolShift);
			packed2 = vshl_s16(packed2, signedVolShift);
		}
		vst1_s16(out, packed1);
		vst1_s16(out + 4, packed2);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	// This does the remainder if SIMD was used, otherwise it does it all.
	for (size_t i = 0; i < size; i++) {
		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
	}
}
Example #19
0
//FINL 
int __ext_v_shift_right_complex16(struct complex16* z, int __unused_3, struct complex16* x, int len, int shift)
{
	const int wlen = 4;// sizeof(vcs) / sizeof(complex16);
	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi16(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		z[i].re = x[i].re >> shift;
		z[i].im = x[i].im >> shift;
	}
	return 0;
}
Example #20
0
// This function is called in code/WiFi/receiver/downSample.blk
//FINL 
int __ext_interleave_loww( struct complex16* x, int __unused_5,
                     struct complex16* y, int __unused_4,
                     struct complex16* z, int __unused_3)
{
	assert (__unused_4 == 4);
	assert (__unused_3 == 4);
	assert (__unused_5 == 4);
	
	__m128i mx = _mm_loadu_si128((__m128i *)x);
	__m128i my = _mm_loadu_si128((__m128i *)y);
	_mm_storeu_si128((__m128i *)z, _mm_unpacklo_epi64(mx, my));

	return 0;
}
Example #21
0
static FORCE_INLINE void blur_r6_h_right_sse2(const PixelType *srcp, PixelType *dstp) {
    __m128i avg12 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 1)), _mm_loadu_si128((const __m128i *)(srcp - 2)));
    __m128i avg34 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 3)), _mm_loadu_si128((const __m128i *)(srcp - 4)));
    __m128i avg56 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 5)), _mm_loadu_si128((const __m128i *)(srcp - 6)));
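    // The cascade below forms a weighted average of srcp[-6..0]; mm_avg_epu
    // rounds up at every stage, so the result is not an exact mean.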

    __m128i avg012 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp)), avg12);
    __m128i avg3456 = mm_avg_epu<PixelType>(avg34, avg56);
    __m128i avg0123456 = mm_avg_epu<PixelType>(avg012, avg3456);
    __m128i avg = mm_avg_epu<PixelType>(avg012, avg0123456);

    // This is the right edge. Only the highest six pixels are needed.
    if (sizeof(PixelType) == 1) {
        int extra_bytes = *(int16_t *)(dstp + 8);
        avg = _mm_insert_epi16(avg, extra_bytes, 4);
        _mm_storeh_pi((__m64 *)(dstp + 8), _mm_castsi128_ps(avg));
    } else {
        int extra_bytes = dstp[0];
        avg = _mm_insert_epi16(avg, extra_bytes, 0);
        extra_bytes = dstp[1];
        avg = _mm_insert_epi16(avg, extra_bytes, 1);
        _mm_storeu_si128((__m128i *)(dstp), avg);
    }
}
Example #22
0
int aesni_xcryptecb( aes_context *ctx,
                     int mode,
                     const unsigned char input[16],
                     unsigned char output[16] )
{
    __m128i block;
    const __m128i *subkeys = (__m128i *) ctx->rk;
    const int rounds = ctx->nr;
    int i;

    /* This could be faster if more data was provided at once. */

    block = _mm_loadu_si128( (__m128i *) input );
    block = _mm_xor_si128( block, subkeys[0] );

    if( mode == AES_ENCRYPT ) {
        for( i = 1; i < rounds - 1; i += 2 ) {
            block = _mm_aesenc_si128( block, subkeys[i] );
            block = _mm_aesenc_si128( block, subkeys[i + 1] );
        }

        block = _mm_aesenc_si128( block, subkeys[rounds - 1] );
        block = _mm_aesenclast_si128( block, subkeys[rounds] );
    } else {
        for( i = 1; i < rounds - 1; i += 2 ) {
            block = _mm_aesdec_si128( block, subkeys[i] );
            block = _mm_aesdec_si128( block, subkeys[i + 1] );
        }

        block = _mm_aesdec_si128( block, subkeys[rounds - 1] );
        block = _mm_aesdeclast_si128( block, subkeys[rounds] );
    }

    _mm_storeu_si128( (__m128i *) output, block );

    return( 0 );
}
Example #23
0
/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
{
#ifdef LV_HAVE_SSE
  unsigned int number = 0;

  const unsigned int eighthPoints = len / 8;

  const float* inputVectorPtr = (const float*)x;
  int16_t* outputVectorPtr = z;

  __m128 vScalar = _mm_set_ps1(scale);
  __m128 inputVal1, inputVal2;
  __m128i intInputVal1, intInputVal2;
  __m128 ret1, ret2;

  for(;number < eighthPoints; number++){
    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

    ret1 = _mm_mul_ps(inputVal1, vScalar);
    ret2 = _mm_mul_ps(inputVal2, vScalar);

    intInputVal1 = _mm_cvtps_epi32(ret1);
    intInputVal2 = _mm_cvtps_epi32(ret2);

    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
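    // Note: _mm_packs_epi32 still saturates to the int16_t range, even though
    // the explicit clipping was removed.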

    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }

  number = eighthPoints * 8;
  for(; number < len; number++){
    z[number] = (int16_t) (x[number] * scale);
  }
#endif
}
Example #24
0
static unsigned reg_sad_sse2(const pixel * const data1, const pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128 ();
  long long int sse_inc_array[2];
  
  for (y = 0; y < height; ++y) {
    for (x = 0; x <= width-16; x+=16) {
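      // _mm_sad_epu8 yields two partial sums of absolute differences (one per
      // 8-byte half), accumulated in sse_inc and combined after the loops.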
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
    }

    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
  sad += sse_inc_array[0] + sse_inc_array[1];

  return sad;
}
Example #25
0
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  uint8_t* dest_for_ith_element;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
      xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
      xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
      xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
      xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]);
      xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]);
    }
    /* Store the result vectors */
    dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
    }
  }
}
Example #26
0
// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}
Example #27
0
//vz optimized template specialization
template<> void
cvtScale_<short, short, float>( const short* src, size_t sstep,
           short* dst, size_t dstep, Size size,
           float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_SSE2
        if(USE_SSE2)
        {
            __m128 scale128 = _mm_set1_ps (scale);
            __m128 shift128 = _mm_set1_ps (shift);
            for(; x <= size.width - 8; x += 8 )
            {
                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                r0 = _mm_cvtps_epi32(rf0);
                r1 = _mm_cvtps_epi32(rf1);
                r0 = _mm_packs_epi32(r0, r1);
                _mm_storeu_si128((__m128i*)(dst + x), r0);
            }
        }
        #endif

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<short>(src[x]*scale + shift);
    }
}
Example #28
0
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
Example #29
0
static inline void xor_into (uint8_t *src, uint8_t *dst, size_t n) {
#if defined (__nc_SSE2__)
  while (n >= 16) {
    _mm_storeu_si128 (
        (__m128i*) dst,
        _mm_xor_si128 (
          _mm_loadu_si128 ((__m128i*) src),
          _mm_loadu_si128 ((__m128i*) dst)));
    src += 16;
    dst += 16;
    n   -= 16;
  }
#endif
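  /* u_long_s is assumed to be defined elsewhere as the word size,
     e.g. sizeof (unsigned long). */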
  while (n >= u_long_s) {
    *((u_long *) dst) ^= *((u_long *) src);
    src += u_long_s;
    dst += u_long_s;
    n   -= u_long_s;
  }
  while (n-- > 0) {
    *dst = *(src ++) ^ *dst;
    dst++;
  }
}
Example #30
0
int64_t get_sum_vectorised (int64_t * vector)
{

	__m128i sum = _mm_setzero_si128();
	int64_t actualSum = 0;


	// A 128-bit register holds only two int64 lanes, so each iteration needs
	// two loads to cover the four elements it advances over.
	for (int64_t i = 0; i < g_length/4*4; i += 4)
	{
		__m128i lo = _mm_loadu_si128((__m128i *)(vector + i));
		__m128i hi = _mm_loadu_si128((__m128i *)(vector + i + 2));
		sum = _mm_add_epi64(sum, lo);
		sum = _mm_add_epi64(sum, hi);
	}

	int64_t A[2] = {0, 0};
	_mm_storeu_si128((__m128i *)A, sum);
	actualSum += A[0] + A[1];

	for (int64_t i = g_length/4*4; i < g_length; i++)
	{
		actualSum += vector[i];
	}

	return actualSum;
}