Example No. 1
        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));

            size_t alignedWidth = AlignLo(width, A);

            __m256i _permuteBody = _mm256_setr_epi32(0, 1, 2, 0, 3, 4, 5, 0);
            __m256i _permuteTail = _mm256_setr_epi32(2, 3, 4, 0, 5, 6, 7, 0);

            __m256i _shuffle = _mm256_setr_epi8(
                0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
                0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);

            for(size_t row = 0; row < height; ++row)
            {
                // Main pass over the aligned part of the row
                for(size_t col = 0; col < alignedWidth; col += A)
                    Store<align>((__m256i*)(gray + col), BgrToGray<align>(bgr + 3*col, _permuteBody, _permuteTail, _shuffle));
                // Ragged tail: redo the last A pixels with an unaligned store ending exactly at 'width'
                if(width != alignedWidth)
                    Store<false>((__m256i*)(gray + width - A), BgrToGray<false>(bgr + 3*(width - A), _permuteBody, _permuteTail, _shuffle));
                bgr += bgrStride;
                gray += grayStride;
            }
        }
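The loop above is the usual body-plus-overlapping-tail pattern: the aligned main loop covers AlignLo(width, A) pixels, and the remainder is handled by redoing one unaligned vector that ends exactly at width (legal because width >= A is asserted). A minimal scalar sketch of that control flow, with a hypothetical ProcessBlock() standing in for the SIMD kernel:

#include <cstddef>

enum { A = 32 }; // vector width in bytes, as in the AVX2 code above

// Placeholder for the vectorized per-block kernel (hypothetical helper).
static void ProcessBlock(const unsigned char* src, unsigned char* dst)
{
    for (size_t k = 0; k < A; ++k)
        dst[k] = src[3 * k]; // e.g. just take the blue channel
}

static void BodyPlusTail(const unsigned char* bgr, size_t width, unsigned char* gray)
{
    size_t alignedWidth = width & ~(size_t)(A - 1); // AlignLo(width, A)
    for (size_t col = 0; col < alignedWidth; col += A)
        ProcessBlock(bgr + 3 * col, gray + col);
    if (width != alignedWidth) // overlapping, unaligned tail
        ProcessBlock(bgr + 3 * (width - A), gray + width - A);
}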
static inline __m256i
enc_translate (const __m256i in)
{
	// LUT contains the absolute offset for each range:
	const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
	                                     65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input:
	// the index for range #0 is already correct; the others are one less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m256i mask = CMPGT(in, 25);

	// Subtract -1 (i.e. add 1) from the indices for ranges #[1..4]; all indices are now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

	return out;
}
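A per-byte scalar rendering of the same table (a sketch for checking single values, not part of the library; the range boundaries are those listed in the comment above):

#include <stdint.h>

static uint8_t enc_translate_scalar(uint8_t in /* 0..63 */)
{
	static const int8_t lut[16] = { 65, 71, -4, -4, -4, -4, -4, -4,
	                                -4, -4, -4, -4, -19, -16, 0, 0 };
	uint8_t index = (in > 51) ? (uint8_t)(in - 51) : 0; // saturating subtract of 51
	if (in > 25)
		index += 1;                                     // ranges #[1..4]
	return (uint8_t)(in + lut[index]);                  // e.g. 0 -> 'A', 63 -> '/'
}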
static inline __m256i
_mm256_bswap_epi32 (const __m256i in)
{
	// _mm256_shuffle_epi8() works on two 128-bit lanes separately:
	return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
		 3,  2,  1,  0,
		 7,  6,  5,  4,
		11, 10,  9,  8,
		15, 14, 13, 12,
		 3,  2,  1,  0,
		 7,  6,  5,  4,
		11, 10,  9,  8,
		15, 14, 13, 12));
}
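For reference, the same swap on a single 32-bit word in plain C (a sketch):

#include <stdint.h>

static uint32_t bswap32_scalar(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
	       ((x << 8) & 0x00FF0000u) | (x << 24);
}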
static inline __m256i
dec_reshuffle (__m256i in)
{
	// Shuffle bytes to 32-bit bigendian:
	in = _mm256_bswap_epi32(in);

	// Mask in a single byte per shift:
	__m256i mask = _mm256_set1_epi32(0x3F000000);

	// Pack bytes together:
	__m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));

	// Pack bytes together within 32-bit words, discarding words 3 and 7:
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1,
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1));

	// Pack 32-bit words together, squashing empty words 3 and 7:
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
		0, 1, 2, 4, 5, 6, -1, -1));
}
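The per-word effect of the mask-and-shift sequence above, written as a scalar sketch: four decoded 6-bit values are packed into one 24-bit group and emitted big-endian as three bytes.

#include <stdint.h>

/* in[0..3] are the four decoded 6-bit values of one 32-bit word
 * (after the bswap, in[0] is the value that came first in the input). */
static void dec_pack4_scalar(const uint8_t in[4], uint8_t out[3])
{
	uint32_t v = ((uint32_t)in[0] << 18) | ((uint32_t)in[1] << 12) |
	             ((uint32_t)in[2] << 6)  |  (uint32_t)in[3];
	out[0] = (uint8_t)(v >> 16);
	out[1] = (uint8_t)(v >> 8);
	out[2] = (uint8_t)(v);
}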
static __m256i avx2_popcount(const __m256i vec) {

    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo  = _mm256_and_si256(vec, low_mask);
    const __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);

    return _mm256_add_epi8(popcnt1, popcnt2);
}
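The same nibble-lookup idea for one byte, as a scalar sketch. Note that avx2_popcount() returns per-byte counts, so a horizontal reduction (e.g. with _mm256_sad_epu8, as in the normHamming example further down) is still needed to get a total.

#include <stdint.h>

static uint8_t popcount8_nibble_lut(uint8_t x)
{
    static const uint8_t lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                     1, 2, 2, 3, 2, 3, 3, 4 };
    return lut[x & 0x0F] + lut[x >> 4]; // low nibble + high nibble
}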
Example No. 6
void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256 dd = _mm256_set1_ps ((float) scale);
    __m256 XX0 = _mm256_set1_ps ((float) X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0);
        for (unsigned i = 0; i < SX; i += 8)	{
            __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
            __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0);
            __m256 x = x0;
            __m256 y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256 x2 = _mm256_mul_ps (x, x);
                __m256 y2 = _mm256_mul_ps (y, y);
                __m256 abs = _mm256_add_ps (x2, y2);
                __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1)); // _CMP_LT_OS: lanes with |z|^2 < 4 are still iterating
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp); // once a lane escapes it stays masked off
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) { // all lanes have escaped
                    break;
                }
                counts = _mm256_sub_epi32 (counts, cmp_mask); // subtracting the -1 mask increments the count of live lanes
                __m256 t = _mm256_add_ps (x, x);
                y = _mm256_fmadd_ps (t, y, y0);
                x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12));
            __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0), _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0);
            result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2));
            _mm_storel_epi64 ((__m128i*) out, result128);
            out += 8;
        }
    }
}
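A scalar reference for a single pixel of the escape-time loop above (a sketch; the coordinate setup X0 + i*scale and Y0 + j*scale is done by the caller here):

#include <stdint.h>

static uint8_t mandelbrot_pixel(float x0, float y0)
{
    float x = x0, y = y0;
    uint8_t count = 0;
    for (unsigned n = 0; n < 255; n++) {
        float x2 = x * x, y2 = y * y;
        if (x2 + y2 >= 4.0f)      // escaped
            break;
        count++;
        float t = x + x;
        y = t * y + y0;           // y = 2*x*y + y0
        x = (x2 - y2) + x0;       // x = x*x - y*y + x0
    }
    return count;
}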
Example No. 7
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
Example No. 8
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
                                  int maxres, ViterbiResult* result)
{
    
    // Linear topology of query (and template) HMM:
    // 1. The HMM has L+2 columns. Columns 1 to L contain
    //    a match state, a delete state and an insert state each.
    // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY)
    //    This column has only a match state, with a single transition to the next match state.
    // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY)
    //    Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0.
    // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments
    //    (as long as the gap opening penalty d is higher than the best match score S(a,b)).
    
    // Pairwise alignment of two HMMs:
    // 1. Pair-states for the alignment of two HMMs are
    //    MM (Q:Match T:Match), GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delete T:Gap), MI (Q:Match T:Insert)
    // 2. Transitions are allowed only between the MM-state and each of the four other states.
    
    // Saving space:
    // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L)
    // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory.
    // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]).
    // When the calculation has proceeded up to the point where the scores for cell (i,j) are calculated,
    //    sXY[i-1][j'] = sXY[j']   for j'>=j (A below)
    //    sXY[i][j']   = sXY[j']   for j'<j  (B below)
    //    sXY[i-1][j-1]= sXY_i_1_j_1         (C below)
    //    sXY[i][j]    = sXY_i_j             (D below)
    //                   j-1
    //                     j
    // i-1:               CAAAAAAAAAAAAAAAAAA
    //  i :   BBBBBBBBBBBBBD
    // Variable declarations

    const float smin = (this->local ? 0 : -FLT_MAX);  //used to distinguish between SW and NW algorithms in maximization
    const simd_float smin_vec    = simdf32_set(smin);
    const simd_float shift_vec   = simdf32_set(shift);
//    const simd_float one_vec     = simdf32_set(1); //   00000001
    const simd_int mm_vec        = simdi32_set(2); //MM 00000010
    const simd_int gd_vec        = simdi32_set(3); //GD 00000011
    const simd_int im_vec        = simdi32_set(4); //IM 00000100
    const simd_int dg_vec        = simdi32_set(5); //DG 00000101
    const simd_int mi_vec        = simdi32_set(6); //MI 00000110
    const simd_int gd_mm_vec     = simdi32_set(8); //   00001000
    const simd_int im_mm_vec     = simdi32_set(16);//   00010000
    const simd_int dg_mm_vec     = simdi32_set(32);//   00100000
    const simd_int mi_mm_vec     = simdi32_set(64);//   01000000

#ifdef VITERBI_SS_SCORE
    HMM * q_s = q->GetHMM(0);
    const unsigned char * t_index;
    if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED  ){
        t_index = t->pred_index;
    }else if(ss_hmm_mode == HMM::PRED_DSSP){
        t_index = t->dssp_index;
    }
    simd_float * ss_score_vec = (simd_float *) ss_score;
#endif
    
#ifdef AVX2
    const simd_int shuffle_mask_extract = _mm256_setr_epi8(0,  4,  8,  12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                                           -1, -1, -1,  -1,  0,  4,  8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
#endif
#ifdef VITERBI_CELLOFF
    const __m128i tmp_vec        = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000
#ifdef AVX2
    const simd_int co_vec               = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1);
    const simd_int float_min_vec     = (simd_int) _mm256_set1_ps(-FLT_MAX);
    const simd_int shuffle_mask_celloff = _mm256_set_epi8(
                                                          15, 14, 13, 12,
                                                          15, 14, 13, 12,
                                                          15, 14, 13, 12,
                                                          15, 14, 13, 12,
                                                          3, 2,  1, 0,
                                                          3, 2,  1, 0,
                                                          3, 2,  1, 0,
                                                          3, 2,  1, 0);
#else // SSE case
    const simd_int co_vec = tmp_vec;
    const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX);
#endif
#endif // AVX2 end
    
    int i,j;      //query and template match state indices
    simd_int i2_vec = simdi32_set(0);
    simd_int j2_vec = simdi32_set(0);
    simd_float sMM_i_j = simdf32_set(0);
    simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j;

    
    simd_float Si_vec;
    simd_float sMM_i_1_j_1;
    simd_float sMI_i_1_j_1;
    simd_float sIM_i_1_j_1;
    simd_float sGD_i_1_j_1;
    simd_float sDG_i_1_j_1;
    
    simd_float score_vec     = simdf32_set(-FLT_MAX);
    simd_int byte_result_vec = simdi32_set(0);

    // Initialization of top row, i.e. cells (0,j)
    for (j=0; j <= t->L; ++j)
    {
        const unsigned int index_pos_j = j * 5;
        sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template);
        sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX);
    }
    // Viterbi algorithm
    const int queryLength = q->L;
    for (i=1; i <= queryLength; ++i) // Loop through query positions i
    {

        // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues
        // Initialize cells
        sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query);  // initialize at (i-1,0)
        sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1)
        sMI_i_1_j_1 = simdf32_set(-FLT_MAX);
        sDG_i_1_j_1 = simdf32_set(-FLT_MAX);
        sGD_i_1_j_1 = simdf32_set(-FLT_MAX);

        // initialize at (i,jmin-1)
        const unsigned int index_pos_i = 0 * 5;
        sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query);           // initialize at (i,0)
        sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX);
        sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX);
#ifdef AVX2
        unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i);
#else
        unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i);
#endif

        const unsigned int start_pos_tr_i_1 = (i - 1) * 7;
        const unsigned int start_pos_tr_i = (i) * 7;
        const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M
        const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D
        const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M
        const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D
        const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m
        const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I
        const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I


        // Find maximum score; global alignment: maximize only over last row and last column
        const bool findMaxInnerLoop = (local || i == queryLength);
        const int targetLength = t->L;
#ifdef VITERBI_SS_SCORE
        if(ss_hmm_mode == HMM::NO_SS_INFORMATION){
            // set all to log(1.0) = 0.0
            memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float));
        }else {
            const float * score;
            if(ss_hmm_mode == HMM::PRED_PRED){
                score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0];
            }else if (ss_hmm_mode == HMM::DSSP_PRED){
                score = &S73[ (int)q_s->ss_dssp[i]][0][0];
            }else{
                score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0];
            }
            // access SS scores and write them to the ss_score array
            for (j = 0; j <= (targetLength*VECSIZE_FLOAT); j++) // Loop through template positions j
            {
                ss_score[j] = ssw * score[t_index[j]];
            }
        }
#endif
        for (j=1; j <= targetLength; ++j) // Loop through template positions j
        {
            simd_int index_vec;
            simd_int res_gt_vec;
            // cache line optimized reading
            const unsigned int start_pos_tr_j_1 = (j-1) * 7;
            const unsigned int start_pos_tr_j = (j) * 7;

            const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M
            const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D
            const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M
            const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D
            const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m
            const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j));   // I2i
            const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1));     // M2I
            
            // Find max value
            // CALCULATE_MAX6( sMM_i_j,
            //                 smin,
            //                 sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M],
            //                 sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M],
            //                 sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M],
            //                 sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M],
            //                 sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
            //                 bMM[i][j]
            //                 );
            // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M]
            simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m);
            // if mm > min { 2 }
            res_gt_vec       = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec);
            byte_result_vec  = simdi_and(res_gt_vec, mm_vec);
            sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec);
            
            // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M]
            simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m);
            // if gd > max { 3 }
            res_gt_vec       = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j);
            index_vec        = simdi_and( res_gt_vec, gd_vec);
            byte_result_vec  = simdi_or(  index_vec,  byte_result_vec);
            
            sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec);
            
            
            // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M]
            simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m);
            // if im > max { 4 }
            MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec);
            sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec);
            
            // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M]
            simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m);
            // if dg > max { 5 }
            MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec);
            sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec);
            
            // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
            simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m);
            // if mi > max { 6 }
            MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec);
            sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec);
            
            // TODO add secondary structure score
            // calculate amino acid profile-profile scores
            Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j]));
#ifdef VITERBI_SS_SCORE
            Si_vec = simdf32_add(ss_score_vec[j], Si_vec);
#endif
            Si_vec = simdf32_add(Si_vec, shift_vec);
            
            sMM_i_j = simdf32_add(sMM_i_j, Si_vec);
            //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]);
            
            const unsigned int index_pos_j   = (j * 5);
            const unsigned int index_pos_j_1 = (j - 1) * 5;
            const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0));
            const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3));
            const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4));
            const simd_float sMM_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
            const simd_float sDG_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
            const simd_float sMI_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
            sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
            sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
            sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
            sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3));
            sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4));
            
            //            sGD_i_j = max2
            //            (
            //             sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query
            //             sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query
            //             bGD[i][j]
            //             );
            //sMM_DG_GD_MI_IM_vec
            simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query
            simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query
            // if mm_gd > gd_gd { 8 }
            MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec);
            
            sGD_i_j = simdf32_max(
                                 mm_gd_vec,
                                 gd_gd_vec
                                 );
            //            sIM_i_j = max2
            //            (
            //             sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] ,
            //             sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query
            //             bIM[i][j]
            //             );
            
            
            simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m);
            simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query
            // if mm_mm > im_im { 16 }
            MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec);
            
            sIM_i_j = simdf32_max(
                                  mm_mm_vec,
                                  im_im_vec
                                  );
            
            //            sDG_i_j = max2
            //            (
            //             sMM[j] + q->tr[i-1][M2D],
            //             sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query
            //             bDG[i][j]
            //             );
            simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d);
            simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query
            // if mm_dg > dg_dg { 32 }
            MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec);
            
            sDG_i_j = simdf32_max( mm_dg_vec
                                  ,
                                  dg_dg_vec
                                  );
            

            
            //            sMI_i_j = max2
            //            (
            //             sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template
            //             sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template
            //             bMI[i][j]
            //             );
            simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i);  // MM->MI gap opening M2I in template
            simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i);  // MI->MI gap extension I2I in template
            // if mm_mi > mi_mi { 64 }
            MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec);
            
            sMI_i_j = simdf32_max(
                                  mm_mi_vec,
                                  mi_mi_vec
                                  );

            
            // Cell-off logic: if (cell_off[i][j]) exclude this cell from the alignment
            // The stored flag is shifted right by one (10000000... -> 01000000...)
            // because 10000000000000000000000000000000 = -2147483648 would break the signed compare below
#ifdef VITERBI_CELLOFF
#ifdef AVX2
            simd_int matrix_vec    = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1);
            matrix_vec             = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff);
#else
//            if(((sCO_MI_DG_IM_GD_MM_vec[j]  >>1) & 0x40404040) > 0){
//                std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j]  >>1) & 0x40404040   ) << std::endl;
//            }
            simd_int matrix_vec    = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1);

#endif
            simd_int cell_off_vec  = simdi_and(matrix_vec, co_vec);
            simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec    ); // shift is because signed can't be checked here
            simd_float  cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse
            sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec);    // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off
            sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec);
            sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec);
            sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec);
            sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec);
#endif
            
            
            
            simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j);
            simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j);
            simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j);
            simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j);
            simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j);

            // write values back to ViterbiMatrix
#ifdef AVX2
            /* byte_result_vec        000H  000G  000F  000E   000D  000C  000B  000A */
            /* abcdefgh               0000  0000  HGFE  0000   0000  0000  0000  DCBA */
            const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract);
            /* abcd                                            0000  0000  0000  DCBA */
            const __m128i abcd     = _mm256_castsi256_si128(abcdefgh);
            /* efgh                                            0000  0000  HGFE  0000 */
            const __m128i efgh     = _mm256_extracti128_si256(abcdefgh, 1);
            _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh));
#else
            byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec);
            byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec);
            int int_result  = _mm_cvtsi128_si32(byte_result_vec);
            sCO_MI_DG_IM_GD_MM_vec[j] = int_result;
#endif
            

            
            // Find maximum score; global alignment: maximize only over last row and last column
            // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; }
            if (findMaxInnerLoop){
                
                // new score is higher
                // output
                //  0   0   0   MAX
                simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
                // old score is higher
                // output
                //  MAX MAX MAX 0
                simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec);
                
                
                simd_int curr_pos_j   = simdi32_set(j);
                simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
                simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
                j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
                simd_int curr_pos_i   = simdi32_set(i);
                simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
                simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
                i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
                
                score_vec=simdf32_max(sMM_i_j,score_vec);
            }
            
            
            
        } //end for j
        
        // if global alignment: look for best cell in last column
        if (!local){
            
            // new score is higher
            // output
            //  0   0   0   MAX
            simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
            // old score is higher
            // output
            //  MAX MAX MAX 0
            simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec);

            
            simd_int curr_pos_j   = simdi32_set(j);
            simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
            simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
            j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
            simd_int curr_pos_i   = simdi32_set(i);
            simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
            simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
            i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
            
            score_vec = simdf32_max(sMM_i_j,score_vec);
        }    // end if (!local)
    }     // end for i
    
    for(int seq_index=0; seq_index < maxres; seq_index++){
        result->score[seq_index]=((float*)&score_vec)[seq_index];
        result->i[seq_index] = ((int*)&i2_vec)[seq_index];
        result->j[seq_index] = ((int*)&j2_vec)[seq_index];
//        std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl;
    }
    
    //   printf("Template=%-12.12s  i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score);
}
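For orientation, the per-cell MM update that the vectorized block above performs (see the commented-out CALCULATE_MAX6 call) looks roughly like this in scalar form. The state codes 2..6 are the same ones OR-ed into byte_result_vec; the exact backtrace bit packing is simplified here, so treat this as a sketch rather than the library's implementation:

// Scalar sketch of the MM recurrence: maximum over smin (local-alignment start)
// and the five pair-state predecessors, plus the per-cell column score Si.
static float max6_mm(float smin,
                     float mm, float gd, float im, float dg, float mi,
                     float Si, int* backtraceState)
{
    float best = smin;
    int state = 0;                              // 0 = STOP (only relevant for local alignment)
    if (mm > best) { best = mm; state = 2; }    // MM -> MM
    if (gd > best) { best = gd; state = 3; }    // GD -> MM
    if (im > best) { best = im; state = 4; }    // IM -> MM
    if (dg > best) { best = dg; state = 5; }    // DG -> MM
    if (mi > best) { best = mi; state = 6; }    // MI -> MM
    *backtraceState = state;
    return best + Si;                           // add profile-profile (+ SS) score
}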
Example No. 9
	/* Shift bits by 6, mask in only the third byte: */
	res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6) , mask));
	mask = _mm256_srli_epi32(mask, 8);

	/* No shift necessary for the fourth byte because we duplicated
	 * the third byte to this position; just mask: */
	res = _mm256_or_si256(res, _mm256_and_si256(str, mask));

	/* Reorder to 32-bit little-endian: */
	res = _mm256_shuffle_epi8(res,
	      _mm256_setr_epi8(
			 3,  2,  1,  0,
			 7,  6,  5,  4,
			11, 10,  9,  8,
			15, 14, 13, 12,
			 3,  2,  1,  0,
			 7,  6,  5,  4,
			11, 10,  9,  8,
			15, 14, 13, 12));

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Because AVX2 can only compare 'greater than', start from end of alphabet: */

	/* set 5: 63, "/" */
	s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63));
	blockmask = s5mask;

	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
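A scalar sketch of the range-by-range translation these masks build up, checking from the end of the alphabet first as the comment above notes (the set numbering for the remaining ranges is an assumption made for illustration):

#include <stdint.h>

static char b64_char_scalar(uint8_t v /* 0..63 */)
{
	if (v == 63) return '/';                      /* set 5 */
	if (v == 62) return '+';                      /* set 4 */
	if (v >= 52) return (char)('0' + (v - 52));   /* digits */
	if (v >= 26) return (char)('a' + (v - 26));   /* lowercase */
	return (char)('A' + v);                       /* uppercase */
}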
Example No. 10
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2

#if CV_POPCNT
    {
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
    {
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}
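A plain scalar reference for the same distance, useful for validating the vectorized paths above on small inputs (a sketch, not OpenCV code):

#include <stdint.h>

static int normHammingRef(const uint8_t* a, const uint8_t* b, int n)
{
    int result = 0;
    for (int i = 0; i < n; i++)
    {
        uint8_t x = (uint8_t)(a[i] ^ b[i]);
        while (x)
        {
            result += x & 1;  // count set bits one at a time
            x >>= 1;
        }
    }
    return result;
}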
Example No. 11
void Initialize() {
	/*
	Round constants for p_1:
	01, 02, 05, 0a, 15, 0b, 17, 0e, 1d, 1b, 16, 0c

	Round constants for p_2:
	18, 11, 03, 07, 0f, 1f

	Round constants for p_3:
	1e, 1c, 19, 13, 06, 0d
	*/

	shuffleControlMaskFirstReg = _mm256_setr_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, //0
		9, 10, 11, 12, 13, 14, 15, 8, //1 
		18, 19, 20, 21, 22, 23, 16, 17, //2
		28, 29, 30, 31, 24, 25, 26, 27); //4
	shuffleControlMaskSecondReg = _mm256_setr_epi8(
		7, 0, 1, 2, 3, 4, 5, 6, //7
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255); //Setting it to 0xFF makes shuffle zero the bits

	invShuffleControlMaskFirstReg = _mm256_setr_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, //0
		15, 8, 9, 10, 11, 12, 13, 14, //1 
		22, 23, 16, 17, 18, 19, 20, 21, //2
		28, 29, 30, 31, 24, 25, 26, 27); //4
	invShuffleControlMaskSecondReg = _mm256_setr_epi8(
		1, 2, 3, 4, 5, 6, 7, 0, //7
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255,
		255, 255, 255, 255, 255, 255, 255, 255);

	m256iAllOne = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF);

	//Set the second-row byte in column two to 1111'1111 if the round constant has a one-bit at this index
	//p1
	p1_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0, 0);

	p1_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0, 0);

	p1_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[5] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit2[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p1_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p1_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit4[7] = _mm256_set_epi64x(0, 0, 0, 0);
	p1_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p1_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0);

	p4_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[6] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p4_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p4_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit2[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p4_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);

	p4_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit4[4] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	p4_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0, 0);
	p4_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0);
}