/*
 * Horizontal maximum over the 32 signed 8-bit lanes of `a`.
 *
 * The reduction folds the vector onto its top byte:
 *   1. the low 128-bit half is broadcast to both halves and max'ed in, so
 *      the upper half holds the element-wise max of both halves;
 *   2. byte shifts of 8, 4, 2 and 1 (performed per 128-bit half) are max'ed
 *      in turn, so lane 31 ends up holding the max of the whole upper half.
 * Lane 31 only ever receives real data from lower lanes, never the zero
 * bytes that the shifts introduce, so the result is exact for negative
 * inputs as well.
 *
 * Returns the value extracted from lane 31.
 */
static inline int8_t _mm256_hmax_epi8_rpl(__m256i a)
{
    const __m256i low_half_dup =
        _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,0,0));
    __m256i folded = _mm256_max_epi8(a, low_half_dup);

    folded = _mm256_max_epi8(folded, _mm256_slli_si256(folded, 8));
    folded = _mm256_max_epi8(folded, _mm256_slli_si256(folded, 4));
    folded = _mm256_max_epi8(folded, _mm256_slli_si256(folded, 2));
    folded = _mm256_max_epi8(folded, _mm256_slli_si256(folded, 1));

    return _mm256_extract_epi8_rpl(folded, 31);
}
/*
 * Store one guarded lane of an anti-diagonal vector.
 *
 * Offset `k` (0..31) addresses cell (i+k, j-k) and reads vector lane 31-k.
 * The cell is written only when it lies inside [0, s1Len) x [0, s2Len).
 * This is a macro rather than a loop so that the lane index stays a
 * compile-time constant for _mm256_extract_epi8_rpl.
 */
#define DIAG_STORE_LANE(k)                                                  \
    do {                                                                    \
        const int32_t r_ = i + (k);                                         \
        const int32_t c_ = j - (k);                                         \
        if (0 <= r_ && r_ < s1Len && 0 <= c_ && c_ < s2Len) {               \
            array[1LL*r_*s2Len + c_] =                                      \
                (int8_t)_mm256_extract_epi8_rpl(vWH, 31 - (k));             \
        }                                                                   \
    } while (0)

/*
 * Scatter the 32 signed 8-bit lanes of `vWH` along an anti-diagonal of
 * `array`, which is indexed as a row-major table with `s2Len` columns.
 *
 * Lane 31 maps to cell (i, j), lane 30 to (i+1, j-1), and so on down to
 * lane 0 at (i+31, j-31). Cells falling outside the s1Len x s2Len bounds
 * are skipped. Each stored value is the lane sign-extended to int.
 */
static inline void arr_store_si256(
        int *array,
        __m256i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    DIAG_STORE_LANE(0);  DIAG_STORE_LANE(1);  DIAG_STORE_LANE(2);  DIAG_STORE_LANE(3);
    DIAG_STORE_LANE(4);  DIAG_STORE_LANE(5);  DIAG_STORE_LANE(6);  DIAG_STORE_LANE(7);
    DIAG_STORE_LANE(8);  DIAG_STORE_LANE(9);  DIAG_STORE_LANE(10); DIAG_STORE_LANE(11);
    DIAG_STORE_LANE(12); DIAG_STORE_LANE(13); DIAG_STORE_LANE(14); DIAG_STORE_LANE(15);
    DIAG_STORE_LANE(16); DIAG_STORE_LANE(17); DIAG_STORE_LANE(18); DIAG_STORE_LANE(19);
    DIAG_STORE_LANE(20); DIAG_STORE_LANE(21); DIAG_STORE_LANE(22); DIAG_STORE_LANE(23);
    DIAG_STORE_LANE(24); DIAG_STORE_LANE(25); DIAG_STORE_LANE(26); DIAG_STORE_LANE(27);
    DIAG_STORE_LANE(28); DIAG_STORE_LANE(29); DIAG_STORE_LANE(30); DIAG_STORE_LANE(31);
}

#undef DIAG_STORE_LANE
/*
 * Handle one lane of an anti-diagonal vector for the last-row/last-column
 * capture.
 *
 * Offset `k` (0..31) addresses cell (i+k, j-k) and reads vector lane 31-k.
 * If the cell is on the final row (i+k == s1Len-1) and its column is in
 * range, the lane goes to row[j-k]. If the cell is on the final column
 * (j-k == s2Len-1) and its row is in range, the lane goes to col[i+k].
 * Both stores happen for the bottom-right corner cell. This is a macro
 * rather than a loop so that the lane index stays a compile-time constant
 * for _mm256_extract_epi8_rpl.
 */
#define ROWCOL_STORE_LANE(k)                                                \
    do {                                                                    \
        const int32_t r_ = i + (k);                                         \
        const int32_t c_ = j - (k);                                         \
        if (r_ == s1Len-1 && 0 <= c_ && c_ < s2Len) {                       \
            row[c_] = (int8_t)_mm256_extract_epi8_rpl(vWH, 31 - (k));       \
        }                                                                   \
        if (c_ == s2Len-1 && 0 <= r_ && r_ < s1Len) {                       \
            col[r_] = (int8_t)_mm256_extract_epi8_rpl(vWH, 31 - (k));       \
        }                                                                   \
    } while (0)

/*
 * Copy those lanes of the anti-diagonal vector `vWH` that land on the last
 * row or last column of an s1Len x s2Len grid into `row` and `col`.
 *
 * Lane 31 corresponds to cell (i, j), lane 30 to (i+1, j-1), and so on down
 * to lane 0 at (i+31, j-31). `row` is indexed by column number and `col` by
 * row number. Each stored value is the lane sign-extended to int.
 */
static inline void arr_store_rowcol(
        int *row,
        int *col,
        __m256i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    ROWCOL_STORE_LANE(0);  ROWCOL_STORE_LANE(1);  ROWCOL_STORE_LANE(2);  ROWCOL_STORE_LANE(3);
    ROWCOL_STORE_LANE(4);  ROWCOL_STORE_LANE(5);  ROWCOL_STORE_LANE(6);  ROWCOL_STORE_LANE(7);
    ROWCOL_STORE_LANE(8);  ROWCOL_STORE_LANE(9);  ROWCOL_STORE_LANE(10); ROWCOL_STORE_LANE(11);
    ROWCOL_STORE_LANE(12); ROWCOL_STORE_LANE(13); ROWCOL_STORE_LANE(14); ROWCOL_STORE_LANE(15);
    ROWCOL_STORE_LANE(16); ROWCOL_STORE_LANE(17); ROWCOL_STORE_LANE(18); ROWCOL_STORE_LANE(19);
    ROWCOL_STORE_LANE(20); ROWCOL_STORE_LANE(21); ROWCOL_STORE_LANE(22); ROWCOL_STORE_LANE(23);
    ROWCOL_STORE_LANE(24); ROWCOL_STORE_LANE(25); ROWCOL_STORE_LANE(26); ROWCOL_STORE_LANE(27);
    ROWCOL_STORE_LANE(28); ROWCOL_STORE_LANE(29); ROWCOL_STORE_LANE(30); ROWCOL_STORE_LANE(31);
}

#undef ROWCOL_STORE_LANE
/*
 * Write lane `k` of vH to col[k*seglen + t].
 * This is a macro rather than a loop so that the lane index stays a
 * compile-time constant for _mm256_extract_epi8_rpl.
 */
#define COL_STORE_LANE(k) \
    (col[(k)*seglen + t] = (int8_t)_mm256_extract_epi8_rpl(vH, (k)))

/*
 * Scatter the 32 signed 8-bit lanes of `vH` into `col` with a stride of
 * `seglen` elements: lane k is written to col[k*seglen + t].
 *
 * No bounds checks are performed; the caller must guarantee that
 * col[31*seglen + t] is a valid element. Each stored value is the lane
 * sign-extended to int.
 */
static inline void arr_store_col(
        int *col,
        __m256i vH,
        int32_t t,
        int32_t seglen)
{
    COL_STORE_LANE(0);  COL_STORE_LANE(1);  COL_STORE_LANE(2);  COL_STORE_LANE(3);
    COL_STORE_LANE(4);  COL_STORE_LANE(5);  COL_STORE_LANE(6);  COL_STORE_LANE(7);
    COL_STORE_LANE(8);  COL_STORE_LANE(9);  COL_STORE_LANE(10); COL_STORE_LANE(11);
    COL_STORE_LANE(12); COL_STORE_LANE(13); COL_STORE_LANE(14); COL_STORE_LANE(15);
    COL_STORE_LANE(16); COL_STORE_LANE(17); COL_STORE_LANE(18); COL_STORE_LANE(19);
    COL_STORE_LANE(20); COL_STORE_LANE(21); COL_STORE_LANE(22); COL_STORE_LANE(23);
    COL_STORE_LANE(24); COL_STORE_LANE(25); COL_STORE_LANE(26); COL_STORE_LANE(27);
    COL_STORE_LANE(28); COL_STORE_LANE(29); COL_STORE_LANE(30); COL_STORE_LANE(31);
}

#undef COL_STORE_LANE
/*
 * Write lane `k` of vH to array[(k*seglen + t)*dlen + d].
 *
 * The row number is widened to 64 bits (1LL*) BEFORE it is multiplied by
 * dlen: with 32-bit arithmetic the product overflows `int` (undefined
 * behaviour) once the table has more than INT_MAX cells. This is a macro
 * rather than a loop so that the lane index stays a compile-time constant
 * for _mm256_extract_epi8_rpl.
 */
#define STRIPED_STORE_LANE(k) \
    (array[1LL*((k)*seglen + t)*dlen + d] = \
        (int8_t)_mm256_extract_epi8_rpl(vH, (k)))

/*
 * Scatter the 32 signed 8-bit lanes of `vH` into column `d` of `array`,
 * which is indexed as a row-major table with `dlen` columns.
 *
 * Lane k is written to row (k*seglen + t), i.e. the lanes are spaced
 * `seglen` rows apart starting at row `t`. No bounds checks are performed;
 * the caller must guarantee that row (31*seglen + t) exists. Each stored
 * value is the lane sign-extended to int.
 */
static inline void arr_store_si256(
        int *array,
        __m256i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen)
{
    STRIPED_STORE_LANE(0);  STRIPED_STORE_LANE(1);  STRIPED_STORE_LANE(2);  STRIPED_STORE_LANE(3);
    STRIPED_STORE_LANE(4);  STRIPED_STORE_LANE(5);  STRIPED_STORE_LANE(6);  STRIPED_STORE_LANE(7);
    STRIPED_STORE_LANE(8);  STRIPED_STORE_LANE(9);  STRIPED_STORE_LANE(10); STRIPED_STORE_LANE(11);
    STRIPED_STORE_LANE(12); STRIPED_STORE_LANE(13); STRIPED_STORE_LANE(14); STRIPED_STORE_LANE(15);
    STRIPED_STORE_LANE(16); STRIPED_STORE_LANE(17); STRIPED_STORE_LANE(18); STRIPED_STORE_LANE(19);
    STRIPED_STORE_LANE(20); STRIPED_STORE_LANE(21); STRIPED_STORE_LANE(22); STRIPED_STORE_LANE(23);
    STRIPED_STORE_LANE(24); STRIPED_STORE_LANE(25); STRIPED_STORE_LANE(26); STRIPED_STORE_LANE(27);
    STRIPED_STORE_LANE(28); STRIPED_STORE_LANE(29); STRIPED_STORE_LANE(30); STRIPED_STORE_LANE(31);
}

#undef STRIPED_STORE_LANE
/*
 * Write lane `k` of vH, minus `bias`, to array[(k*seglen + t)*dlen + d].
 * The flat index is computed in 64-bit arithmetic (1LL*). This is a macro
 * rather than a loop so that the lane index stays a compile-time constant
 * for _mm256_extract_epi8_rpl.
 */
#define BIASED_STORE_LANE(k) \
    (array[1LL*((k)*seglen + t)*dlen + d] = \
        (int8_t)_mm256_extract_epi8_rpl(vH, (k)) - bias)

/*
 * Scatter the 32 signed 8-bit lanes of `vH` into column `d` of `array`,
 * which is indexed as a row-major table with `dlen` columns, subtracting
 * `bias` from every value on the way out.
 *
 * Lane k is written to row (k*seglen + t). The lane is sign-extended to
 * int before the subtraction, so the result is not confined to the 8-bit
 * range. No bounds checks are performed; the caller must guarantee that
 * row (31*seglen + t) exists.
 */
static inline void arr_store(
        int *array,
        __m256i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen,
        int32_t bias)
{
    BIASED_STORE_LANE(0);  BIASED_STORE_LANE(1);  BIASED_STORE_LANE(2);  BIASED_STORE_LANE(3);
    BIASED_STORE_LANE(4);  BIASED_STORE_LANE(5);  BIASED_STORE_LANE(6);  BIASED_STORE_LANE(7);
    BIASED_STORE_LANE(8);  BIASED_STORE_LANE(9);  BIASED_STORE_LANE(10); BIASED_STORE_LANE(11);
    BIASED_STORE_LANE(12); BIASED_STORE_LANE(13); BIASED_STORE_LANE(14); BIASED_STORE_LANE(15);
    BIASED_STORE_LANE(16); BIASED_STORE_LANE(17); BIASED_STORE_LANE(18); BIASED_STORE_LANE(19);
    BIASED_STORE_LANE(20); BIASED_STORE_LANE(21); BIASED_STORE_LANE(22); BIASED_STORE_LANE(23);
    BIASED_STORE_LANE(24); BIASED_STORE_LANE(25); BIASED_STORE_LANE(26); BIASED_STORE_LANE(27);
    BIASED_STORE_LANE(28); BIASED_STORE_LANE(29); BIASED_STORE_LANE(30); BIASED_STORE_LANE(31);
}

#undef BIASED_STORE_LANE