Exemplo n.º 1
0
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Compute the low 64 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 64 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}
size_t varvectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
  size_t k = 0;
  __m256i * a = (__m256i *) array;
  __m128i s = _mm_set1_epi32(shiftamount);
  for (; k + 3 < length / 8 ; k +=4, a+=4) {
    __m256i v1 = _mm256_loadu_si256(a);
    __m256i v2 = _mm256_loadu_si256(a + 1);
    __m256i v3 = _mm256_loadu_si256(a + 2);
    __m256i v4 = _mm256_loadu_si256(a + 3);

    v1 = _mm256_srl_epi32(v1,s);
    v2 = _mm256_srl_epi32(v2,s);
    v3 = _mm256_srl_epi32(v3,s);
    v4 = _mm256_srl_epi32(v4,s);

     _mm256_storeu_si256(a,v1);
     _mm256_storeu_si256(a + 1,v2);
     _mm256_storeu_si256(a + 2,v3);
     _mm256_storeu_si256(a + 3,v4);

  }
  for (; k  < length / 8 ; k ++, a++) {
    __m256i v = _mm256_loadu_si256(a);
    v = _mm256_srl_epi32(v,s);
     _mm256_storeu_si256(a,v);
  }
  k *= 8;
  for (; k < length; ++k) {
    array[k] = array[k] >> shiftamount;
  }
  return 0;
}
Exemplo n.º 3
0
static void
avx2_test (void)
{
  int i;
  int ck[8];
  int r[8];
  unsigned int imm;
  int fail = 0;

  union256i_q s1, s2, d;

  for (i = 0; i < 256; i += 16)
    for (imm = 0; imm < 100; imm++)
      {
	/* Recompute the results for 256-bits */
	compute_correct_result_256 (&vals[i + 0], &vals[i + 8], imm, ck);

	s1.x = _mm256_loadu_si256 ((__m256i *) & vals[i + 0]);
	s2.x = _mm256_loadu_si256 ((__m256i *) & vals[i + 8]);

	/* Run the 256-bit tests */
	avx2_test_palignr256 (s1.x, s2.x, imm, &d.x);

	_mm256_storeu_si256 ((__m256i *) r, d.x);

	fail += checkVi (r, ck, 8);
      }

  if (fail != 0)
    abort ();
}
Exemplo n.º 4
0
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}
Exemplo n.º 5
0
static void _mm256i_print(__m256i x) {
    int *data = new int[8];
    _mm256_storeu_si256((__m256i*)&data[0],x);
    for(size_t i =0 ; i < 8; i++) {
        std::cout << "Data[" << i << "]: " << data[i] << std::endl;
    }
    delete[] data;
}
Exemplo n.º 6
0
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l +=2) {
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k%2) == 0) l += 2;
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k%4) == 0) l += 4;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
    }
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
Exemplo n.º 7
0
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}
Exemplo n.º 8
0
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);
  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
Exemplo n.º 9
0
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
Exemplo n.º 10
0
void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len)
{
  int i;

#ifdef _OPENMP
#pragma omp parallel for private(i)
#endif
  for ( i = 0; i < len; i+=16 ) {
    __m512  vfp32  = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) );
    __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 );
    _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 );
  }
}
Exemplo n.º 11
0
// Reverse with intrinsics
// First - reverse the values in the first half and second half
// Second - copy the second half of temp array to the beginning of original one, and first half (from temp array) next to the first in original one
// (swap the halfs)
inline void reverse(char * bytes, int numChunks)
{
	static char temp[64];	
	__m256i firstHalf, secondHalf;

	for (int i = 0; i < numChunks; ++i)
	{
		// Reverse first half.
		firstHalf = _mm256_set_epi8(bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
			bytes[16], bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23], bytes[24], bytes[25], bytes[26], bytes[27], bytes[28], bytes[29], bytes[30], bytes[31]);
		
		// Reverse second half.
		secondHalf = _mm256_set_epi8(bytes[32], bytes[33], bytes[34], bytes[35], bytes[36], bytes[37], bytes[38], bytes[39], bytes[40], bytes[41], bytes[42], bytes[43], bytes[44], bytes[45], bytes[46], bytes[47], bytes[48],
			bytes[49], bytes[50], bytes[51], bytes[52], bytes[53], bytes[54], bytes[55], bytes[56], bytes[57], bytes[58], bytes[59], bytes[60], bytes[61], bytes[62], bytes[63]);

		// write the second half at the begining, and after it- first one.
		_mm256_storeu_si256((__m256i*)bytes, secondHalf);
		_mm256_storeu_si256((__m256i*)(bytes + 32), firstHalf);

		bytes += 64;
	}
}
Exemplo n.º 12
0
void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width)
{
    int x = 0;

    __m256 scale256 = _mm256_set1_ps(scale);
    __m256 shift256 = _mm256_set1_ps(shift);
    const int shuffle = 0xD8;

    for (; x <= width - 16; x += 16)
    {
        __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
        v_src = _mm256_permute4x64_epi64(v_src, shuffle);
        __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
        __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
        __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
        __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
        _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
        _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
    }

    for (; x < width; x++)
        dst[x] = saturate_cast<int>(src[x] * scale + shift);
}
Exemplo n.º 13
0
inline void reverse(char* bytes, int numChunks)
{
    static __m256i reversed1;
    static __m256i reversed2;

    for(size_t i = 0; i < numChunks; ++i)
    {
        reversed1 = _mm256_set_epi8(bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8],
                                    bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15], bytes[16],
                                    bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23], bytes[24],
                                    bytes[25], bytes[26], bytes[27], bytes[28], bytes[29], bytes[30], bytes[31]);

		reversed2 = _mm256_set_epi8(bytes[32], bytes[33], bytes[34], bytes[35], bytes[36], bytes[37], bytes[38], bytes[39],
                                    bytes[40], bytes[41], bytes[42], bytes[43], bytes[44], bytes[45], bytes[46], bytes[47],
                                    bytes[48], bytes[49], bytes[50], bytes[51], bytes[52], bytes[53], bytes[54], bytes[55],
                                    bytes[56], bytes[57], bytes[58], bytes[59], bytes[60], bytes[61], bytes[62], bytes[63]);


        _mm256_storeu_si256((__m256i*)&bytes[0], reversed2);
        _mm256_storeu_si256((__m256i*)&bytes[32], reversed1);
        bytes += 64;
    }
}
size_t vectorshift(uint32_t *array, size_t length, int shiftamount) {
  size_t k = 0;
  __m256i * a = (__m256i *) array;
  for (; k  < length / 8 ; k ++, a++) {
    __m256i v = _mm256_loadu_si256(a);
    v = _mm256_srli_epi32(v,SHIFTAMOUNT);
     _mm256_storeu_si256(a,v);
  }
  k *= 8;
  for (; k < length; ++k) {
    array[k] = array[k] >> SHIFTAMOUNT;
  }
  return 0;
}
Exemplo n.º 15
0
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
            _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
            _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
            _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
            _mm256_srai_epi32(ty1, INTER_BITS));
        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}
size_t varvectorshift(uint32_t *array, size_t length, int shiftamount) {
  size_t k = 0;
  __m256i * a = (__m256i *) array;
  __m128i s = _mm_set1_epi32(shiftamount);
  for (; k  < length / 8 ; k ++, a++) {
    __m256i v = _mm256_loadu_si256(a);
    v = _mm256_srl_epi32(v,s);
     _mm256_storeu_si256(a,v);
  }
  k *= 8;
  for (; k < length; ++k) {
    array[k] = array[k] >> shiftamount;
  }
  return 0;
}
Exemplo n.º 17
0
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}
Exemplo n.º 18
0
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                  tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}
Exemplo n.º 19
0
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l +=2) {
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]);
    }
    for(k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
Exemplo n.º 20
0
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}
Exemplo n.º 21
0
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}
Exemplo n.º 22
0
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}
Exemplo n.º 23
0
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));

  const __m256i ma = _mm256_sub_epi8(alpha_max, a);

  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);

  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);

  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
}
Exemplo n.º 24
0
void*  xmemset(void* dest, int c, size_t n)
{
    void* ret = dest;
    if (n < 16) {
        xmemset_lt16(dest, c, n);
        return ret;
    }

    __m256i mm = _mm256_set1_epi8((char)c);

    if (((unsigned long)dest & 31) == 0) {
        for ( ; n >= 256; n -= 256) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            _mm256_store_si256((__m256i*)dest + 2, mm);
            _mm256_store_si256((__m256i*)dest + 3, mm);
            _mm256_store_si256((__m256i*)dest + 4, mm);
            _mm256_store_si256((__m256i*)dest + 5, mm);
            _mm256_store_si256((__m256i*)dest + 6, mm);
            _mm256_store_si256((__m256i*)dest + 7, mm); // 8
            dest = (void*)((__m256i*)dest + 8);
        }

        if (n >= 128) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            _mm256_store_si256((__m256i*)dest + 2, mm);
            _mm256_store_si256((__m256i*)dest + 3, mm);
            dest = (void*)((__m256i*)dest + 4);
            n -= 128;
        }

        if (n >= 64) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            dest = (void*)((__m256i*)dest + 2);
            n -= 64;
        }

        if (n >= 32) {
            _mm256_store_si256((__m256i*)dest, mm);
            dest = (void*)((__m256i*)dest + 1);
            n -= 32;
        }

        if (n >= 16) {
            _mm_store_si128((__m128i*)dest, _mm_set1_epi8((char)c));
            dest = (void*)((__m128i*)dest + 1);
            n -= 16;
        }

    } else {
        for ( ; n >= 256; n -= 256) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            _mm256_storeu_si256((__m256i*)dest + 2, mm);
            _mm256_storeu_si256((__m256i*)dest + 3, mm);
            _mm256_storeu_si256((__m256i*)dest + 4, mm);
            _mm256_storeu_si256((__m256i*)dest + 5, mm);
            _mm256_storeu_si256((__m256i*)dest + 6, mm);
            _mm256_storeu_si256((__m256i*)dest + 7, mm); // 8
            dest = (void*)((__m256i*)dest + 8);
        }

        if (n >= 128) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            _mm256_storeu_si256((__m256i*)dest + 2, mm);
            _mm256_storeu_si256((__m256i*)dest + 3, mm);
            dest = (void*)((__m256i*)dest + 4);
            n -= 128;
        }

        if (n >= 64) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            dest = (void*)((__m256i*)dest + 2);
            n -= 64;
        }

        if (n >= 32) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            dest = (void*)((__m256i*)dest + 1);
            n -= 32;
        }

        if (n >= 16) {
            _mm_storeu_si128((__m128i*)dest, _mm_set1_epi8((char)c));
            dest = (void*)((__m128i*)dest + 1);
            n -= 16;
        }
    }

    xmemset_lt16(dest, c, n);
    return ret;
}
Exemplo n.º 25
0
void kvz_pixels_blit_avx2(const kvz_pixel * const orig, kvz_pixel * const dst,
                         const unsigned width, const unsigned height,
                         const unsigned orig_stride, const unsigned dst_stride)
{
  unsigned y;
  //There is absolutely no reason to have a width greater than the source or the destination stride.
  assert(width <= orig_stride);
  assert(width <= dst_stride);

#ifdef CHECKPOINTS
  char *buffer = malloc((3 * width + 1) * sizeof(char));
  for (y = 0; y < height; ++y) {
    int p;
    for (p = 0; p < width; ++p) {
      sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]);
    }
    buffer[3*width] = 0;
    CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer);
  }
  FREE_POINTER(buffer);
#endif //CHECKPOINTS

  if (width == orig_stride && width == dst_stride) {
    memcpy(dst, orig, width * height * sizeof(kvz_pixel));
    return;
  }

  int nxn_width = (width == height) ? width : 0;
  switch (nxn_width) {
    case 4:
      *(int32_t*)&dst[dst_stride*0] = *(int32_t*)&orig[orig_stride*0];
      *(int32_t*)&dst[dst_stride*1] = *(int32_t*)&orig[orig_stride*1];
      *(int32_t*)&dst[dst_stride*2] = *(int32_t*)&orig[orig_stride*2];
      *(int32_t*)&dst[dst_stride*3] = *(int32_t*)&orig[orig_stride*3];
      break;
    case 8:
      *(int64_t*)&dst[dst_stride*0] = *(int64_t*)&orig[orig_stride*0];
      *(int64_t*)&dst[dst_stride*1] = *(int64_t*)&orig[orig_stride*1];
      *(int64_t*)&dst[dst_stride*2] = *(int64_t*)&orig[orig_stride*2];
      *(int64_t*)&dst[dst_stride*3] = *(int64_t*)&orig[orig_stride*3];
      *(int64_t*)&dst[dst_stride*4] = *(int64_t*)&orig[orig_stride*4];
      *(int64_t*)&dst[dst_stride*5] = *(int64_t*)&orig[orig_stride*5];
      *(int64_t*)&dst[dst_stride*6] = *(int64_t*)&orig[orig_stride*6];
      *(int64_t*)&dst[dst_stride*7] = *(int64_t*)&orig[orig_stride*7];
       break;
    case 16:
      for (int i = 0; i < 16; ++i) {
        __m128i temp = _mm_loadu_si128((__m128i*)(orig + i * orig_stride));
        _mm_storeu_si128((__m128i*)(dst + i * dst_stride), temp);
      }
      break;
    case 32:
      for (int i = 0; i < 32; ++i) {
        __m256i temp = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride));
        _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp);
      }
    break;
    case 64:
      for (int i = 0; i < 64; ++i) {
        __m256i temp0 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride));
        _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp0);
        __m256i temp1 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride + sizeof(__m256)));
        _mm256_storeu_si256((__m256i*)(dst + i * dst_stride + sizeof(__m256)), temp1);
      }
    break;
  default:

    if (orig == dst) {
      //If we have the same array, then we should have the same stride
      assert(orig_stride == dst_stride);
      return;
    }
    assert(orig != dst || orig_stride == dst_stride);

    for (y = 0; y < height; ++y) {
      memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel));
    }
    break;
  }
}
Exemplo n.º 26
0
 /*!
  * \brief Unaligned store of the given packed vector at the
  * given memory position
  */
 ETL_STATIC_INLINE(void) storeu(int64_t* memory, avx_simd_long value) {
     _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
 }
Exemplo n.º 27
0
void nibble_sort_beekman1(uint64_t *buf) {
  // already in the right order
  //__m256i
  // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL,
                   0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL,
                   0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // use less instructions below
  //__m256i
  // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL,
                   0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL,
                   0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL,
                   0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL,
                   0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL,
                   0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL,
                   0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    r0 &= nibblemask;
    r1 ^= r0;
    r1 = _mm256_srli_epi64(r1, 4);

#define sort_and_shuffle(n)                                                    \
  r2 = _mm256_max_epi8(r0, r1);                                                \
  r0 = _mm256_min_epi8(r0, r1);                                                \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000);           \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111);           \
  r1 = _mm256_shuffle_epi8(r1, shuf##n);                                       \
  r2 = _mm256_shuffle_epi8(r2, shuf##n);                                       \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);           \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)

    sort_and_shuffle(1);
    sort_and_shuffle(2);
    { // sort_and_shuffle(3);
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);

    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
Exemplo n.º 28
0
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t hh, ii, jj, kk, mm;
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    CHECK_MULT_EIGHT(size);

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;

    if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size,
            elem_size);

    __m256i ymm_0[8];
    __m256i ymm_1[8];
    __m256i ymm_storeage[8][4];

    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
        for (ii = 0; ii + 3 < elem_size; ii += 4) {
            for (hh = 0; hh < 4; hh ++) {

                for (kk = 0; kk < 8; kk ++){
                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 2; kk ++){
                    for (mm = 0; mm < 2; mm ++){
                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                    }
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 8; kk ++){
                    ymm_storeage[kk][hh] = ymm_1[kk];
                }
            }

            for (mm = 0; mm < 8; mm ++) {

                for (kk = 0; kk < 4; kk ++){
                    ymm_0[kk] = ymm_storeage[mm][kk];
                }

                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
            }
        }
    }
    for (ii = 0; ii < nrows; ii ++ ) {
        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
        }
    }
    return size * elem_size;
}
Exemplo n.º 29
0
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}
Exemplo n.º 30
0
void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
                                    int width, int height, const uint8_t *ref8,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
  int i = 0;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *src0 = invert_mask ? pred : ref;
  const uint16_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  const __m256i zero = _mm256_setzero_si256();

  if (width == 8) {
    do {
      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);

      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));

      __m256i m = _mm256_castsi128_si256(m_l);
      m = _mm256_insertf128_si256(m, m_h, 1);
      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));

      _mm_storeu_si128((__m128i *)(comp_pred + width),
                       _mm256_extractf128_si256(comp, 1));

      src0 += (stride0 << 1);
      src1 += (stride1 << 1);
      mask += (mask_stride << 1);
      comp_pred += (width << 1);
      i += 2;
    } while (i < height);
  } else if (width == 16) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
      const __m256i m_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  } else if (width == 32) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));

      const __m256i m01_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
      const __m256i m23_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);
      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  }
}