static inline int8_t _mm_hmax_epi8_rpl(__m128i a) { a = _mm_max_epi8_rpl(a, _mm_srli_si128(a, 8)); a = _mm_max_epi8_rpl(a, _mm_srli_si128(a, 4)); a = _mm_max_epi8_rpl(a, _mm_srli_si128(a, 2)); a = _mm_max_epi8_rpl(a, _mm_srli_si128(a, 1)); return _mm_extract_epi8_rpl(a, 0); }
/*
 * Scatter the 16 byte lanes of vH into `col`.
 *
 * Lane k (k = 0..15) is converted to int8_t, widened to int, and written to
 * col[k*seglen + t].
 *
 * Every lane is read with a literal lane index so the index remains a
 * compile-time constant at each _mm_extract_epi8_rpl call site, in case that
 * helper (defined outside this block) requires an immediate operand.
 */
static inline void arr_store_col(
        int *col,
        __m128i vH,
        int32_t t,
        int32_t seglen)
{
    const int8_t lanes[16] = {
        (int8_t)_mm_extract_epi8_rpl(vH, 0),
        (int8_t)_mm_extract_epi8_rpl(vH, 1),
        (int8_t)_mm_extract_epi8_rpl(vH, 2),
        (int8_t)_mm_extract_epi8_rpl(vH, 3),
        (int8_t)_mm_extract_epi8_rpl(vH, 4),
        (int8_t)_mm_extract_epi8_rpl(vH, 5),
        (int8_t)_mm_extract_epi8_rpl(vH, 6),
        (int8_t)_mm_extract_epi8_rpl(vH, 7),
        (int8_t)_mm_extract_epi8_rpl(vH, 8),
        (int8_t)_mm_extract_epi8_rpl(vH, 9),
        (int8_t)_mm_extract_epi8_rpl(vH, 10),
        (int8_t)_mm_extract_epi8_rpl(vH, 11),
        (int8_t)_mm_extract_epi8_rpl(vH, 12),
        (int8_t)_mm_extract_epi8_rpl(vH, 13),
        (int8_t)_mm_extract_epi8_rpl(vH, 14),
        (int8_t)_mm_extract_epi8_rpl(vH, 15)
    };
    int32_t lane;

    for (lane = 0; lane < 16; ++lane) {
        col[lane * seglen + t] = lanes[lane];
    }
}
/*
 * Scatter the 16 byte lanes of vH into a flat 2-D table.
 *
 * Lane k (k = 0..15) is converted to int8_t, widened to int, and written to
 * array[(k*seglen + t)*dlen + d].
 *
 * The flat index is computed in 64-bit arithmetic: the product
 * (k*seglen + t)*dlen can exceed INT_MAX for large tables, and signed 32-bit
 * overflow is undefined behavior.
 *
 * Every lane is read with a literal lane index so the index remains a
 * compile-time constant at each _mm_extract_epi8_rpl call site, in case that
 * helper (defined outside this block) requires an immediate operand.
 *
 * No bounds checking is performed; the caller must guarantee every target
 * index lies inside `array`.
 */
static inline void arr_store_si128(
        int *array,
        __m128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen)
{
    const int8_t lanes[16] = {
        (int8_t)_mm_extract_epi8_rpl(vH, 0),
        (int8_t)_mm_extract_epi8_rpl(vH, 1),
        (int8_t)_mm_extract_epi8_rpl(vH, 2),
        (int8_t)_mm_extract_epi8_rpl(vH, 3),
        (int8_t)_mm_extract_epi8_rpl(vH, 4),
        (int8_t)_mm_extract_epi8_rpl(vH, 5),
        (int8_t)_mm_extract_epi8_rpl(vH, 6),
        (int8_t)_mm_extract_epi8_rpl(vH, 7),
        (int8_t)_mm_extract_epi8_rpl(vH, 8),
        (int8_t)_mm_extract_epi8_rpl(vH, 9),
        (int8_t)_mm_extract_epi8_rpl(vH, 10),
        (int8_t)_mm_extract_epi8_rpl(vH, 11),
        (int8_t)_mm_extract_epi8_rpl(vH, 12),
        (int8_t)_mm_extract_epi8_rpl(vH, 13),
        (int8_t)_mm_extract_epi8_rpl(vH, 14),
        (int8_t)_mm_extract_epi8_rpl(vH, 15)
    };
    int32_t lane;

    for (lane = 0; lane < 16; ++lane) {
        /* Widen before multiplying by dlen so the index cannot wrap. */
        const int64_t index = (int64_t)(lane * seglen + t) * dlen + d;
        array[index] = lanes[lane];
    }
}
/*
 * Store the 16 byte lanes of vWscore along an anti-diagonal of a row-major
 * s1Len x s2Len table.
 *
 * For k = 0..15, lane (15 - k) is converted to int8_t, widened to int, and
 * written to array[(i+k)*s2Len + (j-k)], but only when the cell is inside
 * the table, i.e. 0 <= i+k < s1Len and 0 <= j-k < s2Len. Cells outside the
 * table are skipped.
 *
 * The flat index is computed in 64-bit arithmetic: (i+k)*s2Len can exceed
 * INT_MAX once the table holds more than INT_MAX cells, and signed 32-bit
 * overflow is undefined behavior.
 *
 * Every lane is read with a literal lane index so the index remains a
 * compile-time constant at each _mm_extract_epi8_rpl call site, in case that
 * helper (defined outside this block) requires an immediate operand.
 */
static inline void arr_store_si128(
        int *array,
        __m128i vWscore,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    const int8_t lanes[16] = {
        (int8_t)_mm_extract_epi8_rpl(vWscore, 0),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 1),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 2),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 3),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 4),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 5),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 6),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 7),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 8),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 9),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 10),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 11),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 12),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 13),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 14),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 15)
    };
    int32_t k;

    for (k = 0; k < 16; ++k) {
        const int32_t r = i + k;   /* row of the k-th diagonal cell */
        const int32_t c = j - k;   /* column of the k-th diagonal cell */

        if (0 <= r && r < s1Len && 0 <= c && c < s2Len) {
            /* Lane order is reversed: the highest lane maps to (i, j). */
            array[(int64_t)r * s2Len + c] = lanes[15 - k];
        }
    }
}
/*
 * Copy anti-diagonal lanes of vWscore into the `row` and `col` output
 * vectors.
 *
 * For k = 0..15, lane (15 - k) is associated with cell (i+k, j-k) and is
 * converted to int8_t before being widened to int:
 *   - if i+k == s1Len-1 and 0 <= j-k < s2Len, it is written to row[j-k];
 *   - if j-k == s2Len-1 and 0 <= i+k < s1Len, it is written to col[i+k].
 * Both conditions are tested independently, so a single lane may be written
 * to both outputs. For each k the `row` store precedes the `col` store.
 *
 * Every lane is read with a literal lane index so the index remains a
 * compile-time constant at each _mm_extract_epi8_rpl call site, in case that
 * helper (defined outside this block) requires an immediate operand.
 */
static inline void arr_store_rowcol(
        int *row,
        int *col,
        __m128i vWscore,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    const int8_t lanes[16] = {
        (int8_t)_mm_extract_epi8_rpl(vWscore, 0),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 1),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 2),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 3),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 4),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 5),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 6),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 7),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 8),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 9),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 10),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 11),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 12),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 13),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 14),
        (int8_t)_mm_extract_epi8_rpl(vWscore, 15)
    };
    int32_t k;

    for (k = 0; k < 16; ++k) {
        const int32_t r = i + k;
        const int32_t c = j - k;
        const int8_t score = lanes[15 - k];

        if (r == s1Len-1 && 0 <= c && c < s2Len) {
            row[c] = score;
        }
        if (c == s2Len-1 && 0 <= r && r < s1Len) {
            col[r] = score;
        }
    }
}
/*
 * Scatter the 16 byte lanes of vH, minus `bias`, into a flat 2-D table.
 *
 * Lane k (k = 0..15) is converted to int8_t, widened to int, reduced by
 * `bias`, and written to array[(k*seglen + t)*dlen + d]. The multiplication
 * by dlen and the addition of d are carried out in long long so the flat
 * index is not limited to the int range.
 *
 * Every lane is read with a literal lane index so the index remains a
 * compile-time constant at each _mm_extract_epi8_rpl call site, in case that
 * helper (defined outside this block) requires an immediate operand.
 *
 * No bounds checking is performed here.
 */
static inline void arr_store(
        int *array,
        __m128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen,
        int32_t bias)
{
    const int8_t lanes[16] = {
        (int8_t)_mm_extract_epi8_rpl(vH, 0),
        (int8_t)_mm_extract_epi8_rpl(vH, 1),
        (int8_t)_mm_extract_epi8_rpl(vH, 2),
        (int8_t)_mm_extract_epi8_rpl(vH, 3),
        (int8_t)_mm_extract_epi8_rpl(vH, 4),
        (int8_t)_mm_extract_epi8_rpl(vH, 5),
        (int8_t)_mm_extract_epi8_rpl(vH, 6),
        (int8_t)_mm_extract_epi8_rpl(vH, 7),
        (int8_t)_mm_extract_epi8_rpl(vH, 8),
        (int8_t)_mm_extract_epi8_rpl(vH, 9),
        (int8_t)_mm_extract_epi8_rpl(vH, 10),
        (int8_t)_mm_extract_epi8_rpl(vH, 11),
        (int8_t)_mm_extract_epi8_rpl(vH, 12),
        (int8_t)_mm_extract_epi8_rpl(vH, 13),
        (int8_t)_mm_extract_epi8_rpl(vH, 14),
        (int8_t)_mm_extract_epi8_rpl(vH, 15)
    };
    int32_t lane;

    for (lane = 0; lane < 16; ++lane) {
        const long long index = 1LL * (lane * seglen + t) * dlen + d;
        array[index] = lanes[lane] - bias;
    }
}