示例#1
0
 SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m256i & index, ptrdiff_t offset, uint16_t * dst)
 {
     const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
     const __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);
     __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
     __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));
     Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); 
     Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
 }
示例#2
0
// Compare rank with all values currently in the queue.  Returns -1 if the value already exists
// or is larger than all values.
// Otherwise, returns the index of the register in which the value should be inserted.
// Mask is replicated to both lanes, so it can be used for both value and rank lane.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);		// [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }

    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);	// [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }

    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);							// [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4])); // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;       // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }

    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128-bits i double register: one register lower)
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register.  Also duplicate the same mask to both lanes.

    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);           // replicate high lane to both
    }
static inline int16_t _mm256_hmax_epi16_rpl(__m256i a) {
    a = _mm256_max_epi16(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,0,0)));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 8));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 4));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 2));
    return _mm256_extract_epi16_rpl(a, 15);
}
示例#4
0
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}
示例#5
0
文件: testimm-9.c 项目: pjump/gcc
void
test8bit (void)
{
    l1 = _mm256_mpsadbw_epu8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_alignr_epi8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    i1 = _mm_blend_epi32 (i1, i1, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi32 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi16(l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute2x128_si256 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    e1 = _mm256_permute4x64_pd (e2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute4x64_epi64 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shuffle_epi32 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflehi_epi16 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflelo_epi16 (l2, 256);  /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_slli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_srli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
示例#6
0
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}
示例#7
0
        template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer)
        {
            __m256i src[3], shuffled;
            src[0] = _mm256_load_si256(buffer + 0);
            src[1] = _mm256_load_si256(buffer + 1);
            src[2] = _mm256_load_si256(buffer + 2);

            shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00);
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01));
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02));
            _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0)));

            shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10);
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12));
            _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1)));

            shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20);
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21));
            shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22));
            _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2)));
        }        
示例#8
0
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7>
  return _mm256_permute2x128_si256(a, b, 0x31);
}
示例#9
0
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_permute2x128_si256
  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
  return _mm256_permute2x128_si256(a, b, 0x31);
}
static INLINE void update_qp(__m256i *qp) {
  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
}
示例#11
0
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t hh, ii, jj, kk, mm;
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    CHECK_MULT_EIGHT(size);

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;

    if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size,
            elem_size);

    __m256i ymm_0[8];
    __m256i ymm_1[8];
    __m256i ymm_storeage[8][4];

    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
        for (ii = 0; ii + 3 < elem_size; ii += 4) {
            for (hh = 0; hh < 4; hh ++) {

                for (kk = 0; kk < 8; kk ++){
                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 2; kk ++){
                    for (mm = 0; mm < 2; mm ++){
                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                    }
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 8; kk ++){
                    ymm_storeage[kk][hh] = ymm_1[kk];
                }
            }

            for (mm = 0; mm < 8; mm ++) {

                for (kk = 0; kk < 4; kk ++){
                    ymm_0[kk] = ymm_storeage[mm][kk];
                }

                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
            }
        }
    }
    for (ii = 0; ii < nrows; ii ++ ) {
        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
        }
    }
    return size * elem_size;
}
示例#12
0
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.vperm2i128
  return _mm256_permute2x128_si256(a, b, 0x31);
}
示例#13
0
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
{
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2)
     to optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
    offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
      }

      /* Store the result vectors in proper order */
      const uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
        (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
        (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}
示例#14
0
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}
示例#15
0
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2

#if CV_POPCNT
    {
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
    {
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}
示例#16
0
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}