template <bool srcAlign, bool dstAlign> SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m256i & index, ptrdiff_t offset, uint16_t * dst)
{
    const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
    const __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);
    __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
    __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));
    Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
    Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
}
// Compare rank with all values currently in the queue. Returns -1 if the value already exists
// or is larger than all values.
// Otherwise, returns the index of the register in which the value should be inserted.
// Mask is replicated to both lanes, so it can be used for both value and rank lane.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);    // [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }
    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);   // [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }
    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);                          // [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4]));  // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;        // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }

    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128 bits in double register: one register lower).
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register. Also duplicate the same mask to both lanes.
    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);  // replicate high lane to both
    }
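/* Illustrative sketch, not taken from PriorityQueue_AVX2: the _ASSERTE(((mask + 1) & mask) == 0)
 * checks above encode the "contiguous run of low bits" claim. Adding 1 to a value of the form
 * 000...0111...1 yields a power of two, so ANDing the two clears every bit. The helper name below
 * is ours, purely for illustration. */
static inline int is_low_contiguous(unsigned mask)
{
    /* True when mask looks like 000...0111...1 (possibly all zeros),
       i.e. mask + 1 is a power of two. */
    return ((mask + 1) & mask) == 0;
}
/* Examples:
 *   is_low_contiguous(0x00) == 1   (no element greater than the key)
 *   is_low_contiguous(0x0F) == 1   (four low bits set)
 *   is_low_contiguous(0x16) == 0   (non-contiguous: 0b10110)
 */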
static inline int16_t _mm256_hmax_epi16_rpl(__m256i a) {
    a = _mm256_max_epi16(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,0,0)));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 8));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 4));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 2));
    return _mm256_extract_epi16_rpl(a, 15);
}
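/* Illustrative scalar reference for the reduction above: horizontal max of the 16 signed 16-bit
 * lanes of a __m256i. Handy as a cross-check in unit tests; the helper name is ours, it is not
 * part of parasail. */
static inline int16_t hmax_epi16_scalar(__m256i a)
{
    int16_t lanes[16];
    _mm256_storeu_si256((__m256i *)lanes, a);
    int16_t m = lanes[0];
    for (int i = 1; i < 16; ++i)
        if (lanes[i] > m)
            m = lanes[i];
    return m;
}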
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 64 bytes */
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}
void
test8bit (void)
{
  l1 = _mm256_mpsadbw_epu8 (l2, l3, 256);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_alignr_epi8 (l2, l3, 256);        /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi32 (i1, i1, 256);           /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_blend_epi32 (l2, l3, 256);        /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_blend_epi16 (l2, l3, 256);        /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2x128_si256 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute4x64_pd (e2, 256);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute4x64_epi64 (l2, 256);      /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_shuffle_epi32 (l2, 256);          /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_shufflehi_epi16 (l2, 256);        /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_shufflelo_epi16 (l2, 256);        /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_slli_si256 (l2, 256);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_srli_si256 (l2, 256);             /* { dg-error "the last argument must be an 8-bit immediate" } */
}
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}
template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer)
{
    __m256i src[3], shuffled;
    src[0] = _mm256_load_si256(buffer + 0);
    src[1] = _mm256_load_si256(buffer + 1);
    src[2] = _mm256_load_si256(buffer + 2);

    shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00);
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01));
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02));
    _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0)));

    shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10);
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12));
    _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1)));

    shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20);
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21));
    shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22));
    _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2)));
}
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7>
  return _mm256_permute2x128_si256(a, b, 0x31);
}
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_permute2x128_si256
  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
  return _mm256_permute2x128_si256(a, b, 0x31);
}
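/* Illustrative demonstration of the imm8 values that recur throughout these snippets
 * (0x20, 0x31, 0x21, 0x11, and their decimal forms 32 and 49): bits [1:0] of the immediate
 * select the low 128-bit half of the result (0 = a.lo, 1 = a.hi, 2 = b.lo, 3 = b.hi) and
 * bits [5:4] select the high half. The test values and the main() harness are ours, not from
 * any of the projects above. */
#include <stdio.h>
#include <immintrin.h>

/* Prints the four 64-bit lanes of a vector, low to high. */
static void dump(const char *name, __m256i v)
{
    long long q[4];
    _mm256_storeu_si256((__m256i *)q, v);
    printf("%s = { %lld, %lld, %lld, %lld }\n", name, q[0], q[1], q[2], q[3]);
}

int main(void)
{
    __m256i a = _mm256_set_epi64x(3, 2, 1, 0);   /* a = { 0, 1 | 2, 3 }  (low | high halves) */
    __m256i b = _mm256_set_epi64x(7, 6, 5, 4);   /* b = { 4, 5 | 6, 7 } */

    dump("0x20 (a.lo, b.lo)", _mm256_permute2x128_si256(a, b, 0x20));  /* { 0, 1, 4, 5 } */
    dump("0x31 (a.hi, b.hi)", _mm256_permute2x128_si256(a, b, 0x31));  /* { 2, 3, 6, 7 } */
    dump("0x21 (a.hi, b.lo)", _mm256_permute2x128_si256(a, b, 0x21));  /* { 2, 3, 4, 5 } */
    dump("0x11 (a.hi, a.hi)", _mm256_permute2x128_si256(a, a, 0x11));  /* { 2, 3, 2, 3 } */
    return 0;
}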
static INLINE void update_qp(__m256i *qp) {
  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t hh, ii, jj, kk, mm;

    char* in_b = (char*) in;
    char* out_b = (char*) out;

    CHECK_MULT_EIGHT(size);

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;

    if (elem_size % 4)
        return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size);

    __m256i ymm_0[8];
    __m256i ymm_1[8];
    __m256i ymm_storeage[8][4];

    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
        for (ii = 0; ii + 3 < elem_size; ii += 4) {
            for (hh = 0; hh < 4; hh ++) {

                for (kk = 0; kk < 8; kk ++){
                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 2; kk ++){
                    for (mm = 0; mm < 2; mm ++){
                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                    }
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 8; kk ++){
                    ymm_storeage[kk][hh] = ymm_1[kk];
                }
            }

            for (mm = 0; mm < 8; mm ++) {

                for (kk = 0; kk < 4; kk ++){
                    ymm_0[kk] = ymm_storeage[mm][kk];
                }

                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
            }
        }
    }
    for (ii = 0; ii < nrows; ii ++ ) {
        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
        }
    }
    return size * elem_size;
}
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.vperm2i128
  return _mm256_permute2x128_si256(a, b, 0x31);
}
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                       const size_t vectorizable_elements, const size_t total_elements,
                       const size_t bytesoftype) {
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2)
     to optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256(
            (__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
      }

      /* Store the result vectors in proper order */
      const uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                 const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i += 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                                                 _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2

#if CV_POPCNT
    {
# if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
# endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
    {
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128

#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                  popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}
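/* Illustrative scalar sketch of the trick used in the CV_AVX2 branch above: each byte of the XOR
 * is split into its low and high nibble, each nibble indexes a 16-entry popcount table (the
 * _mm256_shuffle_epi8 lookups do this for 32 bytes at once), and the two lookups are summed.
 * The helper name below is ours, not OpenCV's. */
static inline int popcnt8_nibble_lut(uint8_t x)
{
    static const uint8_t table[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                       1, 2, 2, 3, 2, 3, 3, 4 };
    return table[x & 0x0F] + table[x >> 4];
}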
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}