/*
 * Vectorised membership probe.
 *
 * The target is broadcast to all four 64-bit lanes, hashed with
 * set->vmultiplier through avxhash(), and reduced with set->sizemask to
 * obtain four slot indices.  The four 64-bit slots of set->data at those
 * indices are gathered (byte scale 8) and compared with the target.
 *
 * Returns true when at least one gathered slot equals target.
 */
static inline bool avxcontains(hashset_t * set, uint64_t target) {
    const __m256i needle = _mm256_set1_epi64x(target);
    const __m256i hashed = avxhash(needle, set->vmultiplier);
    const __m256i slots  = _mm256_and_si256(hashed, set->sizemask);
    const __m256i stored = _mm256_i64gather_epi64((const long long int *) set->data, slots, 8);
    const __m256i hits   = _mm256_cmpeq_epi64(needle, stored);
    /* testz yields 1 only when no bit of `hits` is set, i.e. no lane matched. */
    return !_mm256_testz_si256(hits, hits);
}
int main() { hashset_t H; for(int k = 0; k < K; ++k) H.multiplier[k] = getrand64(); H.vmultiplier= _mm256_loadu_si256((__m256i const * )H.multiplier); uint64_t howmany = 100; uint64_t * keys = malloc(sizeof(uint64_t) * howmany); for(uint64_t k = 0; k < howmany; ++k) keys[k] = getrand64(); for(H.size = 1024; H.size < (UINT64_C(1) << 32) ; H.size *=2) { H.sizemask = _mm256_set1_epi64x(H.size-1); printf("alloc size = %f MB \n", H.size * sizeof(uint64_t) / (1024 * 1024.0)); H.data = calloc(H.size , sizeof(uint64_t)); for(int j = 0; j < howmany; j += 2) H.data[hash(H.multiplier[0],j) & (H.size - 1)] = j; int answer = expected(&H,howmany,keys); RDTSC_BEST(checkthemall(&H,howmany,keys), answer, cache_flush(&H,howmany,keys), 50,howmany); RDTSC_BEST(avxcheckthemall(&H,howmany,keys), answer, cache_flush(&H,howmany,keys), 50,howmany); free(H.data); } free(keys); }
/*!
 * \brief Broadcast a 64-bit integer into every element of a packed vector
 * \param value The scalar to replicate
 * \return The packed vector produced by _mm256_set1_epi64x(value)
 */
ETL_STATIC_INLINE(avx_simd_long) set(int64_t value) {
    __m256i replicated = _mm256_set1_epi64x(value);
    return replicated;
}
// Blends forward and backward reference samples for the eight positions
// starting at index w and stores the result at &pdst[w].
//
// Per 32-bit lane the function:
//   1. builds sample offsets  (w << nPelLog) + dwords_hoffsets;
//   2. fetches displaced samples from prefF / prefB through lookup_AVX2()
//      and undisplaced samples by gathering directly at the offsets;
//   3. weights them with MaskF / MaskB (bytes widened to 32 bits) and their
//      complements (255 - mask), rounding each stage with +255 then >> 8;
//   4. weights the two results with dwords_256_time256 / dwords_time256,
//      sums them, shifts right by 8, and packs with unsigned saturation.
//
// One-byte PixelType writes 8 bytes; any other PixelType writes 16 bytes
// (presumably 8 two-byte pixels - confirm PixelType is never wider).
static FORCE_INLINE void FlowInter_8px_AVX2(
  int w, PixelType *pdst,
  const PixelType *prefB, const PixelType *prefF,
  const int16_t *VXFullB, const int16_t *VXFullF,
  const int16_t *VYFullB, const int16_t *VYFullF,
  const uint8_t *MaskB, const uint8_t *MaskF,
  int nPelLog,
  const __m256i &dwords_time256, const __m256i &dwords_256_time256,
  const __m256i &dwords_ref_pitch,
  const __m256i &dwords_hoffsets)
{
  const bool is_byte_pixel = sizeof(PixelType) == 1;

  const __m256i offsets = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

  // Displaced samples, as returned by lookup_AVX2.
  const __m256i warpedF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, offsets);
  const __m256i warpedB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, offsets);

  // Undisplaced samples: each gather reads 32 bits, so everything above the
  // pixel's own bits is cleared afterwards.
  const __m256i pixel_bits = _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1);
  const __m256i plainF = _mm256_and_si256(
    _mm256_i32gather_epi32((const int *)prefF, offsets, sizeof(PixelType)), pixel_bits);
  const __m256i plainB = _mm256_and_si256(
    _mm256_i32gather_epi32((const int *)prefB, offsets, sizeof(PixelType)), pixel_bits);

  // Eight mask bytes per direction, zero-extended to 32-bit lanes.
  const __m256i weightF = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
  const __m256i weightB = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

  const __m256i all_255 = _mm256_set1_epi32(255);
  const __m256i restF = _mm256_sub_epi32(all_255, weightF);
  const __m256i restB = _mm256_sub_epi32(all_255, weightB);

  __m256i warpedF_restF, warpedB_restB, plainF_weightB, plainB_weightF;
  if (is_byte_pixel) {
    // One-byte pixels use the 16-bit multiply on the 32-bit lanes.
    warpedF_restF  = _mm256_mullo_epi16(warpedF, restF);
    warpedB_restB  = _mm256_mullo_epi16(warpedB, restB);
    plainF_weightB = _mm256_mullo_epi16(plainF, weightB);
    plainB_weightF = _mm256_mullo_epi16(plainB, weightF);
  }
  else {
    warpedF_restF  = _mm256_mullo_epi32(warpedF, restF);
    warpedB_restB  = _mm256_mullo_epi32(warpedB, restB);
    plainF_weightB = _mm256_mullo_epi32(plainF, weightB);
    plainB_weightF = _mm256_mullo_epi32(plainB, weightF);
  }

  __m256i fwd = _mm256_add_epi32(plainF_weightB, warpedB_restB);
  __m256i bwd = _mm256_add_epi32(plainB_weightF, warpedF_restF);

  if (is_byte_pixel) {
    fwd = _mm256_srai_epi32(_mm256_add_epi32(_mm256_mullo_epi32(fwd, weightF), all_255), 8);
    bwd = _mm256_srai_epi32(_mm256_add_epi32(_mm256_mullo_epi32(bwd, weightB), all_255), 8);
  }
  else {
    // Wider pixels: the multiply is carried out in 64 bits, first on the
    // even 32-bit lanes, then on the odd lanes (shifted down by 32), and
    // the two halves are recombined.
    const __m256i wide_255 = _mm256_set1_epi64x(255);

    const __m256i evenF = _mm256_srli_epi64(
      _mm256_add_epi64(_mm256_mul_epu32(fwd, weightF), wide_255), 8);
    const __m256i evenB = _mm256_srli_epi64(
      _mm256_add_epi64(_mm256_mul_epu32(bwd, weightB), wide_255), 8);

    __m256i oddF = _mm256_mul_epu32(_mm256_srli_epi64(fwd, 32), _mm256_srli_epi64(weightF, 32));
    __m256i oddB = _mm256_mul_epu32(_mm256_srli_epi64(bwd, 32), _mm256_srli_epi64(weightB, 32));
    oddF = _mm256_srli_epi64(_mm256_add_epi64(oddF, wide_255), 8);
    oddB = _mm256_srli_epi64(_mm256_add_epi64(oddB, wide_255), 8);

    fwd = _mm256_or_si256(evenF, _mm256_slli_epi64(oddF, 32));
    bwd = _mm256_or_si256(evenB, _mm256_slli_epi64(oddB, 32));
  }

  // Add the complementary term, round (+255) and scale (>> 8).
  fwd = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(fwd, warpedF_restF), all_255), 8);
  bwd = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(bwd, warpedB_restB), all_255), 8);

  // Time weighting: fwd takes dwords_256_time256, bwd takes dwords_time256.
  if (is_byte_pixel) {
    fwd = _mm256_madd_epi16(fwd, dwords_256_time256);
    bwd = _mm256_madd_epi16(bwd, dwords_time256);
  }
  else {
    fwd = _mm256_mullo_epi32(fwd, dwords_256_time256);
    bwd = _mm256_mullo_epi32(bwd, dwords_time256);
  }

  __m256i blended = _mm256_srai_epi32(_mm256_add_epi32(fwd, bwd), 8);
  blended = _mm256_packus_epi32(blended, blended);
  // 0xe8 = 0b11101000: moves the third qword into the second, so the low
  // 128 bits hold all eight packed 16-bit results.
  blended = _mm256_permute4x64_epi64(blended, 0xe8);

  __m128i packed = _mm256_castsi256_si128(blended);
  if (is_byte_pixel) {
    packed = _mm_packus_epi16(packed, packed);
    _mm_storel_epi64((__m128i *)&pdst[w], packed);
  }
  else {
    _mm_storeu_si128((__m128i *)&pdst[w], packed);
  }
}
int64_t * const restrict del_pr = _del_pr+PAD; #ifdef PARASAIL_TABLE parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len); #else #ifdef PARASAIL_ROWCOL parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len); #else parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int64_t score = NEG_INF; __m256i vNegInf = _mm256_set1_epi64x(NEG_INF); __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 8); /* shift in a 0 */ __m256i vOpen = _mm256_set1_epi64x(open); __m256i vGap = _mm256_set1_epi64x(gap); __m256i vOne = _mm256_set1_epi64x(1); __m256i vN = _mm256_set1_epi64x(N); __m256i vNegOne = _mm256_set1_epi64x(-1); __m256i vI = _mm256_set_epi64x(0,1,2,3); __m256i vJreset = _mm256_set_epi64x(0,-1,-2,-3); __m256i vMaxScore = vNegInf; __m256i vEndI = vNegInf; __m256i vEndJ = vNegInf; __m256i vILimit = _mm256_set1_epi64x(s1Len); __m256i vILimit1 = _mm256_sub_epi64(vILimit, vOne); __m256i vJLimit = _mm256_set1_epi64x(s2Len); __m256i vJLimit1 = _mm256_sub_epi64(vJLimit, vOne);
void Initialize() { /* Round constants for p_1: 01, 02, 05, 0a, 15, 0b, 17, 0e, 1d, 1b, 16, 0c Round constants for p_2: 18, 11, 03, 07, 0f, 1f Round constants for p_3: 1e, 1c, 19, 13, 06, 0d */ shuffleControlMaskFirstReg = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, //0 9, 10, 11, 12, 13, 14, 15, 8, //1 18, 19, 20, 21, 22, 23, 16, 17, //2 28, 29, 30, 31, 24, 25, 26, 27); //4 shuffleControlMaskSecondReg = _mm256_setr_epi8( 7, 0, 1, 2, 3, 4, 5, 6, //7 255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits 255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits 255, 255, 255, 255, 255, 255, 255, 255); //Setting it to 0xFF makes shuffle zero the bits invShuffleControlMaskFirstReg = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, //0 15, 8, 9, 10, 11, 12, 13, 14, //1 22, 23, 16, 17, 18, 19, 20, 21, //2 28, 29, 30, 31, 24, 25, 26, 27); //4 invShuffleControlMaskSecondReg = _mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 0, //7 255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); m256iAllOne = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF); //Set the bits to 1111'1111 in the column two, second row byte, if the roundconstant has a onebit on this indice //p1 p1_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[6] = _mm256_set_epi64x(0, 0, 
0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[5] 
= _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit2[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[4] = _mm256_set_epi64x(0, 0, 
0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[7] = _mm256_set_epi64x(0, 0, 0, 0); p1_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p1_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[6] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0, 0); 
p4_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit2[10] = _mm256_set_epi64x(0, 0, 
0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit4[4] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[7] = _mm256_set_epi64x(0, 0, 
0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0); p4_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0, 0); p4_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0); }