void mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length)
{
    uint8_t *end;
    register __m256i in, out, t1, t2, m1, m2, l, h;
    register __m128i bc;

    if (constant == 0) {
        memset(region, 0, length);
        return;
    }

    if (constant == 1)
        return;

    /* Broadcast the 16-byte low- and high-nibble lookup tables for this
     * constant into both 128-bit lanes. */
    bc = _mm_load_si128((void *)tl[constant]);
    t1 = __builtin_ia32_vbroadcastsi256(bc);
    bc = _mm_load_si128((void *)th[constant]);
    t2 = __builtin_ia32_vbroadcastsi256(bc);
    m1 = _mm256_set1_epi8(0x0f);
    m2 = _mm256_set1_epi8(0xf0);

    for (end = region + length; region < end; region += 32) {
        in = _mm256_load_si256((void *)region);
        /* Look up the low nibble of every byte ... */
        l = _mm256_and_si256(in, m1);
        l = _mm256_shuffle_epi8(t1, l);
        /* ... and the high nibble, then combine the two with XOR. */
        h = _mm256_and_si256(in, m2);
        h = _mm256_srli_epi64(h, 4);
        h = _mm256_shuffle_epi8(t2, h);
        out = _mm256_xor_si256(h, l);
        _mm256_store_si256((void *)region, out);
    }
}
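/*
 * For reference, a scalar sketch of what the shuffle-based kernel above
 * computes per byte. It assumes tl[c]/th[c] hold the 16-entry lookup results
 * for the low and high nibble of each input byte (the field arithmetic lives
 * entirely inside those tables); names and array shapes here are assumptions,
 * not part of the original code.
 */
#include <stddef.h>
#include <stdint.h>

static void mulrc16_scalar_ref(uint8_t *region, uint8_t constant, size_t length,
                               const uint8_t tl[256][16], const uint8_t th[256][16])
{
    for (size_t i = 0; i < length; i++) {
        uint8_t b = region[i];
        /* low-nibble lookup XOR high-nibble lookup, exactly what the two
         * _mm256_shuffle_epi8 calls do for 32 bytes at a time */
        region[i] = tl[constant][b & 0x0f] ^ th[constant][b >> 4];
    }
}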
void key_schedule(const unsigned char *k, u256 rk[40][16]) { int i, j; u256 tk1[32], tmp[32]; unsigned char *tmp_key = malloc(32); for(i = 0; i < 2; i++) memcpy(tmp_key + 16*i, k, 16); pack_key(tk1, tmp_key); for(j = 0; j < 40; j++) { //Extract round key for(i = 0; i < 16; i++){ rk[j][i] = tk1[i]; } //Add constant into key u256 rc = _mm256_set_epi64x(0x000000FF000000FFull, 0x000000FF000000FFull, 0x000000FF000000FFull, 0x000000FF000000FFull); if(RC[j]>>5 & 1) rk[j][14] = XOR(rk[j][14], rc); if(RC[j]>>4 & 1) rk[j][15] = XOR(rk[j][15], rc); if(RC[j]>>3 & 1) rk[j][4] = XOR(rk[j][4], rc); if(RC[j]>>2 & 1) rk[j][5] = XOR(rk[j][5], rc); if(RC[j]>>1 & 1) rk[j][6] = XOR(rk[j][6], rc); if(RC[j]>>0 & 1) rk[j][7] = XOR(rk[j][7], rc); //Update TK1 for(i = 0; i < 16; i++){ tmp[16 + i] = tk1[0 + i]; } //Apply bit permutation for(i = 0; i < 8; i++){ tmp[0 + i] = XOR(_mm256_shuffle_epi8(tk1[16 + i], _mm256_set_epi8(0xff,28,0xff,29,0xff,24,0xff,25,0xff,20,0xff,21,0xff,16,0xff,17,0xff,12,0xff,13,0xff,8,0xff,9,0xff,4,0xff,5,0xff,0,0xff,1)), _mm256_shuffle_epi8(tk1[24 + i], _mm256_set_epi8(29,0xff,31,0xff,25,0xff,27,0xff,21,0xff,23,0xff,17,0xff,19,0xff,13,0xff,15,0xff,9,0xff,11,0xff,5,0xff,7,0xff,1,0xff,3,0xff))); tmp[8 + i] = XOR(_mm256_shuffle_epi8(tk1[16 + i], _mm256_set_epi8(31,0xff,0xff,30,27,0xff,0xff,26,23,0xff,0xff,22,19,0xff,0xff,18,15,0xff,0xff,14,11,0xff,0xff,10,7,0xff,0xff,6,3,0xff,0xff,2)), _mm256_shuffle_epi8(tk1[24 + i], _mm256_set_epi8(0xff,28,30,0xff,0xff,24,26,0xff,0xff,20,22,0xff,0xff,16,18,0xff,0xff,12,14,0xff,0xff,8,10,0xff,0xff,4,6,0xff,0xff,0,2,0xff))); } for(i = 0; i < 32; i++){ tk1[i] = tmp[i]; } } free(tmp_key); }
static inline __m256i enc_translate (const __m256i in)
{
    // The LUT contains the absolute offset for each range:
    const __m256i lut = _mm256_setr_epi8(
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

    // Translate values 0..63 to the Base64 alphabet. There are five sets:
    // #  From      To         Abs    Index    Characters
    // 0  [0..25]   [65..90]   +65    0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // 1  [26..51]  [97..122]  +71    1        abcdefghijklmnopqrstuvwxyz
    // 2  [52..61]  [48..57]    -4    [2..11]  0123456789
    // 3  [62]      [43]       -19    12       +
    // 4  [63]      [47]       -16    13       /

    // Create LUT indices from the input:
    // the index for range #0 is right, the others are 1 less than expected:
    __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

    // mask is 0xFF (-1) for ranges #[1..4] and 0x00 for range #0:
    __m256i mask = CMPGT(in, 25);

    // Subtract -1, i.e. add 1, to the indices for ranges #[1..4];
    // all indices are now correct:
    indices = _mm256_sub_epi8(indices, mask);

    // Add the offsets to the input values:
    __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

    return out;
}
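/*
 * A minimal scalar sketch of the same translation, handy for verifying the
 * range/offset table in the comment above (hypothetical helper, not part of
 * the original code):
 */
static inline char enc_translate_scalar(unsigned char v) /* v in 0..63 */
{
    if (v <= 25) return (char)(v + 65);   /* 'A'..'Z' */
    if (v <= 51) return (char)(v + 71);   /* 'a'..'z' */
    if (v <= 61) return (char)(v - 4);    /* '0'..'9' */
    return (v == 62) ? '+' : '/';         /* v - 19 and v - 16 */
}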
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ static void shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t j; int k, l; __m256i ymm0[16], ymm1[16]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ for (k = 0; k < 16; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); } for (k = 0; k < 16; k++) { ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
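/*
 * Scalar model of the byte-shuffle that shuffle16_avx2() above (and the
 * related kernels for other type sizes) implements: byte k of element i is
 * gathered into plane k of the output. A simplified sketch that assumes every
 * element is vectorizable; the real routines only handle the vectorizable
 * part and use total_elements as the plane stride.
 */
#include <stddef.h>
#include <stdint.h>

static void shuffle_scalar_ref(uint8_t *dest, const uint8_t *src,
                               size_t nelems, size_t typesize)
{
    for (size_t i = 0; i < nelems; i++)
        for (size_t k = 0; k < typesize; k++)
            dest[k * nelems + i] = src[i * typesize + k];
}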
void inline Write8(unsigned char* out, int offset, __m256i v) {
    v = _mm256_shuffle_epi8(v, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
                                                0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
    WriteLE32(out + 0 + offset, _mm256_extract_epi32(v, 7));
    WriteLE32(out + 32 + offset, _mm256_extract_epi32(v, 6));
    WriteLE32(out + 64 + offset, _mm256_extract_epi32(v, 5));
    WriteLE32(out + 96 + offset, _mm256_extract_epi32(v, 4));
    WriteLE32(out + 128 + offset, _mm256_extract_epi32(v, 3));
    WriteLE32(out + 160 + offset, _mm256_extract_epi32(v, 2));
    WriteLE32(out + 192 + offset, _mm256_extract_epi32(v, 1));
    WriteLE32(out + 224 + offset, _mm256_extract_epi32(v, 0));
}
__m256i inline Read8(const unsigned char* chunk, int offset) {
    __m256i ret = _mm256_set_epi32(
        ReadLE32(chunk + 0 + offset),
        ReadLE32(chunk + 64 + offset),
        ReadLE32(chunk + 128 + offset),
        ReadLE32(chunk + 192 + offset),
        ReadLE32(chunk + 256 + offset),
        ReadLE32(chunk + 320 + offset),
        ReadLE32(chunk + 384 + offset),
        ReadLE32(chunk + 448 + offset)
    );
    return _mm256_shuffle_epi8(ret, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
                                                     0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
}
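/*
 * Read8/Write8 above rely on ReadLE32/WriteLE32 helpers. A portable sketch of
 * what such helpers commonly look like (an assumption; the project these
 * routines come from may define them differently, e.g. with an explicit
 * byte-swap on big-endian hosts):
 */
#include <stdint.h>
#include <string.h>

static inline uint32_t ReadLE32_sketch(const unsigned char* ptr)
{
    uint32_t x;
    memcpy(&x, ptr, 4);   /* unaligned-safe load; assumes a little-endian host */
    return x;
}

static inline void WriteLE32_sketch(unsigned char* ptr, uint32_t x)
{
    memcpy(ptr, &x, 4);   /* assumes a little-endian host */
}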
void p1_inv(YMM(*state)[2]) {
    for (int round = 0; round < p1_rounds; round++) {
        //Constant Addition
        state[0][0] = XOR(state[0][0], p1_constants_bit0[(p1_rounds - 1) - round]);
        state[1][0] = XOR(state[1][0], p1_constants_bit1[(p1_rounds - 1) - round]);
        state[2][0] = XOR(state[2][0], p1_constants_bit2[(p1_rounds - 1) - round]);
        state[3][0] = XOR(state[3][0], p1_constants_bit3[(p1_rounds - 1) - round]);
        state[4][0] = XOR(state[4][0], p1_constants_bit4[(p1_rounds - 1) - round]);

        //Mix Columns
        mixcolumns_inv_80bit(state);

        //Shift Rows
        for (int reg = 0; reg < 5; reg++) {
            state[reg][0] = _mm256_shuffle_epi8(state[reg][0], invShuffleControlMaskFirstReg);
            state[reg][1] = _mm256_shuffle_epi8(state[reg][1], invShuffleControlMaskSecondReg);
        }

        //Sub Bytes
        sbox_inv(state);
    }
}
static inline __m256i _mm256_bswap_epi32 (const __m256i in)
{
    // _mm256_shuffle_epi8() works on two 128-bit lanes separately:
    return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
         3,  2,  1,  0,
         7,  6,  5,  4,
        11, 10,  9,  8,
        15, 14, 13, 12,
         3,  2,  1,  0,
         7,  6,  5,  4,
        11, 10,  9,  8,
        15, 14, 13, 12));
}
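/*
 * A small self-contained check of the byte-swap helper above against a scalar
 * 32-bit swap (test scaffolding only; the function names below are
 * hypothetical and not part of the original code):
 */
#include <immintrin.h>
#include <stdint.h>

static uint32_t bswap32_scalar(uint32_t x)
{
    return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
           ((x << 8) & 0x00FF0000u) | (x << 24);
}

int bswap_epi32_selftest(void)
{
    uint32_t in[8] = {0x01020304u, 0xAABBCCDDu, 0u, 1u,
                      0x80000000u, 0x12345678u, 0xDEADBEEFu, 0xCAFEBABEu};
    uint32_t out[8];
    __m256i v = _mm256_loadu_si256((const __m256i *)in);
    _mm256_storeu_si256((__m256i *)out, _mm256_bswap_epi32(v));
    for (int i = 0; i < 8; i++)
        if (out[i] != bswap32_scalar(in[i]))
            return 0;
    return 1; /* every 32-bit lane was byte-reversed as expected */
}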
static __m256i avx2_popcount(const __m256i vec) {

    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo      = _mm256_and_si256(vec, low_mask);
    const __m256i hi      = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);

    return _mm256_add_epi8(popcnt1, popcnt2);
}
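/*
 * Typical use of the per-byte counts returned above: reduce them to a single
 * population count with _mm256_sad_epu8. A sketch with hypothetical names,
 * assuming a buffer whose length is a multiple of 32 bytes:
 */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

uint64_t popcount_avx2_buffer(const uint8_t *data, size_t len /* multiple of 32 */)
{
    __m256i acc = _mm256_setzero_si256();
    for (size_t i = 0; i < len; i += 32) {
        __m256i v = _mm256_loadu_si256((const __m256i *)(data + i));
        /* sum groups of 8 byte-counts into four 64-bit lanes and accumulate */
        acc = _mm256_add_epi64(acc,
                _mm256_sad_epu8(avx2_popcount(v), _mm256_setzero_si256()));
    }
    uint64_t lanes[4];
    _mm256_storeu_si256((__m256i *)lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}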
void maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length) { uint8_t *end; register __m256i in1, in2, out, t1, t2, m1, m2, l, h; register __m128i bc; if (constant == 0) return; if (constant == 1) { xorr_avx2(region1, region2, length); return; } bc = _mm_load_si128((void *)tl[constant]); t1 = __builtin_ia32_vbroadcastsi256(bc); bc = _mm_load_si128((void *)th[constant]); t2 = __builtin_ia32_vbroadcastsi256(bc); m1 = _mm256_set1_epi8(0x0f); m2 = _mm256_set1_epi8(0xf0); for (end=region1+length; region1<end; region1+=32, region2+=32) { in2 = _mm256_load_si256((void *)region2); in1 = _mm256_load_si256((void *)region1); l = _mm256_and_si256(in2, m1); l = _mm256_shuffle_epi8(t1, l); h = _mm256_and_si256(in2, m2); h = _mm256_srli_epi64(h, 4); h = _mm256_shuffle_epi8(t2, h); out = _mm256_xor_si256(h,l); out = _mm256_xor_si256(out, in1); _mm256_store_si256((void *)region1, out); } }
template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer) { __m256i src[3], shuffled; src[0] = _mm256_load_si256(buffer + 0); src[1] = _mm256_load_si256(buffer + 1); src[2] = _mm256_load_si256(buffer + 2); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02)); _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12)); _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22)); _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2))); }
static inline __m256i enc_reshuffle (__m256i in) { // Spread out 32-bit words over both halves of the input register: in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32( 0, 1, 2, -1, 3, 4, 5, -1)); // Slice into 32-bit chunks and operate on all chunks in parallel. // All processing is done within the 32-bit chunk. First, shuffle: // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb] // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd] in = _mm256_shuffle_epi8(in, _mm256_set_epi8( -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2, -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2)); // cd = [00000000|00000000|0000cccc|ccdddddd] const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF)); // ab = [0000aaaa|aabbbbbb|00000000|00000000] const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000)); // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd] const __m256i merged = _mm256_or_si256(ab, cd); // bd = [00000000|00bbbbbb|00000000|00dddddd] const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F)); // ac = [00aaaaaa|00000000|00cccccc|00000000] const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00)); // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] const __m256i indices = _mm256_or_si256(ac, bd); // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] return _mm256_bswap_epi32(indices); }
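/*
 * Scalar reference for what enc_reshuffle() prepares per 3-byte group: split
 * 24 input bits into four 6-bit indices, most significant group first
 * (illustrative helper, not part of the original code):
 */
static inline void enc_split_scalar(const unsigned char src[3], unsigned char idx[4])
{
    idx[0] = src[0] >> 2;                            /* top 6 bits of byte 0 */
    idx[1] = ((src[0] & 0x03) << 4) | (src[1] >> 4); /* 2 bits of byte 0, 4 of byte 1 */
    idx[2] = ((src[1] & 0x0f) << 2) | (src[2] >> 6); /* 4 bits of byte 1, 2 of byte 2 */
    idx[3] = src[2] & 0x3f;                          /* low 6 bits of byte 2 */
}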
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ static void shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 2; size_t j; int k; __m256i ymm0[2], ymm1[2]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ for (k = 0; k < 2; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 2; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); } } }
static inline __m256i dec_reshuffle (__m256i in) { // Shuffle bytes to 32-bit bigendian: in = _mm256_bswap_epi32(in); // Mask in a single byte per shift: __m256i mask = _mm256_set1_epi32(0x3F000000); // Pack bytes together: __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8)); // Pack bytes together within 32-bit words, discarding words 3 and 7: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1)); // Pack 32-bit words together, squashing empty words 3 and 7: return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32( 0, 1, 2, 4, 5, 6, -1, -1)); }
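/*
 * Scalar counterpart of dec_reshuffle(): pack four 6-bit values back into
 * 3 bytes (illustrative helper, not part of the original code):
 */
static inline void dec_pack_scalar(const unsigned char idx[4], unsigned char dst[3])
{
    dst[0] = (unsigned char)((idx[0] << 2) | (idx[1] >> 4));
    dst[1] = (unsigned char)((idx[1] << 4) | (idx[2] >> 2));
    dst[2] = (unsigned char)((idx[2] << 6) |  idx[3]);
}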
void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY) { __m256 dd = _mm256_set1_ps ((float) scale); __m256 XX0 = _mm256_set1_ps ((float) X0); for (unsigned j = YSTART; j < SY; j++) { __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0); for (unsigned i = 0; i < SX; i += 8) { __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0); __m256 x = x0; __m256 y = y0; __m256i counts = _mm256_setzero_si256 (); __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu); for (unsigned n = 0; n < 255; n++) { __m256 x2 = _mm256_mul_ps (x, x); __m256 y2 = _mm256_mul_ps (y, y); __m256 abs = _mm256_add_ps (x2, y2); __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1)); cmp_mask = _mm256_and_si256 (cmp_mask, cmp); if (_mm256_testz_si256 (cmp_mask, cmp_mask)) { break; } counts = _mm256_sub_epi32 (counts, cmp_mask); __m256 t = _mm256_add_ps (x, x); y = _mm256_fmadd_ps (t, y, y0); x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0); } __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12)); __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0), _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0); result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2)); _mm_storel_epi64 ((__m128i*) out, result128); out += 8; } } }
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk) { const __m256i shufflemask = _mm256_set_epi8( -1, 9, 10, 11, -1, 9, 10, 11, -1, 6, 7, 8, -1, 6, 7, 8, -1, 3, 4, 5, -1, 3, 4, 5, -1, 0, 1, 2, -1, 0, 1, 2 ); const __m256i shifts = _mm256_set_epi32(0, 12, 0, 12, 0, 12, 0, 12); const __m256i masks = _mm256_set1_epi32(4095); // convert from big endian and rearrange the bytes chunk = _mm256_shuffle_epi8(chunk, shufflemask); chunk = _mm256_srlv_epi32(chunk, shifts); chunk = _mm256_and_si256(chunk, masks); // write the two halves to memory do_encode_6bytes(alphabet, out + 0, _mm256_extracti128_si256(chunk, 0)); do_encode_6bytes(alphabet, out + 8, _mm256_extracti128_si256(chunk, 1)); }
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY) { __m256d dd = _mm256_set1_pd (scale); __m256d XX0 = _mm256_set1_pd (X0); for (unsigned j = YSTART; j < SY; j++) { __m256d y0 = _mm256_set1_pd (j*scale + Y0); for (unsigned i = 0; i < SX; i += 4) { __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3); __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0); __m256d x = x0; __m256d y = y0; __m256i counts = _mm256_setzero_si256 (); __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu); for (unsigned n = 0; n < 255; n++) { __m256d x2 = _mm256_mul_pd (x, x); __m256d y2 = _mm256_mul_pd (y, y); __m256d abs = _mm256_add_pd (x2, y2); __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1)); cmp_mask = _mm256_and_si256 (cmp_mask, cmp); if (_mm256_testz_si256 (cmp_mask, cmp_mask)) { break; } counts = _mm256_sub_epi64 (counts, cmp_mask); __m256d t = _mm256_add_pd (x, x); y = _mm256_fmadd_pd (t, y, y0); x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0); } __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8)); *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16); out += 4; } } }
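/*
 * Scalar reference for the per-pixel escape-time loop that the two FMA
 * kernels above vectorize (a sketch; the 255-iteration cap and the
 * |z|^2 < 4 test mirror the vector code, the function name is hypothetical):
 */
static unsigned char mandel_point_scalar(double x0, double y0)
{
    double x = x0, y = y0;
    unsigned count = 0;
    for (unsigned n = 0; n < 255; n++) {
        double x2 = x * x, y2 = y * y;
        if (x2 + y2 >= 4.0)
            break;                    /* this lane has escaped */
        count++;
        y = 2.0 * x * y + y0;         /* matches _mm256_fmadd_(ps|pd)(x+x, y, y0) */
        x = (x2 - y2) + x0;
    }
    return (unsigned char)count;
}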
template <bool align> SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst) { __m256i _src = Load<align>((__m256i*)src); Store<align>((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_64)); }
__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_shuffle_epi8 // CHECK: call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_shuffle_epi8(a, b); }
__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) { // CHECK: @llvm.x86.avx2.pshuf.b return _mm256_shuffle_epi8(a, b); }
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i-=2) { // 
load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); // shift by 7 bit each 16 bit srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcReg32b1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+out_pitch), _mm256_extractf128_si256(srcReg32b1, 1)); output_ptr+=dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = 
_mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } }
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm_loadu_si128((const __m128i *) 
(src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+output_pitch), _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); output_ptr+=dst_stride; } // if the number of strides is odd. 
// process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); } }
SIMD_INLINE __m256i PermuteAndShiffle(__m256i bgr, __m256i permute, __m256i shuffle) { return _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(bgr, permute), shuffle); }
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; const __m128i thresh = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); const __m128i blimit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); p2 = _mm256_castsi256_si128(p256_2); p1 = _mm256_castsi256_si128(p256_1); p0 = _mm256_castsi256_si128(p256_0); q0 = _mm256_castsi256_si128(q256_0); q1 = _mm256_castsi256_si128(q256_1); q2 = _mm256_castsi256_si128(q256_2); q3 = _mm256_castsi256_si128(q256_3); q4 = _mm256_castsi256_si128(q256_4); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i te0 = _mm_set1_epi8(0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); 
const __m128i t7f = _mm_set1_epi8(0x7f); __m128i ps1 = _mm_xor_si128(p1, t80); __m128i ps0 = _mm_xor_si128(p0, t80); __m128i qs0 = _mm_xor_si128(q0, t80); __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); /* Filter1 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); /* Filter2 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); /* filt >> 1 */ filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); // loopfilter done { __m128i work; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); q256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); q256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); q256_7 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); flat2 = 
_mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations { const __m256i eight = _mm256_set1_epi16(8); const __m256i four = _mm256_set1_epi16(4); __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; const __m256i filter = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); p256_7 = _mm256_shuffle_epi8(p256_7, filter); p256_6 = _mm256_shuffle_epi8(p256_6, filter); p256_5 = _mm256_shuffle_epi8(p256_5, filter); p256_4 = _mm256_shuffle_epi8(p256_4, filter); p256_3 = _mm256_shuffle_epi8(p256_3, filter); p256_2 = _mm256_shuffle_epi8(p256_2, filter); p256_1 = _mm256_shuffle_epi8(p256_1, filter); p256_0 = _mm256_shuffle_epi8(p256_0, filter); q256_0 = _mm256_shuffle_epi8(q256_0, filter); q256_1 = _mm256_shuffle_epi8(q256_1, filter); q256_2 = _mm256_shuffle_epi8(q256_2, filter); q256_3 = _mm256_shuffle_epi8(q256_3, filter); q256_4 = _mm256_shuffle_epi8(q256_4, filter); q256_5 = _mm256_shuffle_epi8(q256_5, filter); q256_6 = _mm256_shuffle_epi8(q256_6, filter); q256_7 = _mm256_shuffle_epi8(q256_7, filter); pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), _mm256_add_epi16(p256_4, p256_3)); pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), _mm256_add_epi16(q256_4, q256_3)); pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm256_add_epi16( eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm256_add_epi16( four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); flat2_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); flat2_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(p256_3, p256_0)), 3); flat_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(q256_3, q256_0)), 3); flat_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(p256_7, p256_7); sum_q7 = _mm256_add_epi16(q256_7, q256_7); sum_p3 = _mm256_add_epi16(p256_3, p256_3); sum_q3 = _mm256_add_epi16(q256_3, q256_3); pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); flat2_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); flat2_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); pixetFilter_p2p1p0 
= _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(sum_p3, p256_1)), 3); flat_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, _mm256_add_epi16(sum_q3, q256_1)), 3); flat_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); sum_p3 = _mm256_add_epi16(sum_p3, p256_3); sum_q3 = _mm256_add_epi16(sum_q3, q256_3); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); flat2_p2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); flat2_q2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(sum_p3, p256_2)), 3); flat_p2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, _mm256_add_epi16(sum_q3, q256_2)), 3); flat_q2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); flat2_p3 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); flat2_q3 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); flat2_p4 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); flat2_q4 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); flat2_p5 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); flat2_q5 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = 
_mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); flat2_p6 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); flat2_q6 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ p2 = _mm_andnot_si128(flat, p2); flat_p2 = _mm_and_si128(flat, flat_p2); p2 = _mm_or_si128(flat_p2, p2); p1 = _mm_andnot_si128(flat, ps1); flat_p1 = _mm_and_si128(flat, flat_p1); p1 = _mm_or_si128(flat_p1, p1); p0 = _mm_andnot_si128(flat, ps0); flat_p0 = _mm_and_si128(flat, flat_p0); p0 = _mm_or_si128(flat_p0, p0); q0 = _mm_andnot_si128(flat, qs0); flat_q0 = _mm_and_si128(flat, flat_q0); q0 = _mm_or_si128(flat_q0, q0); q1 = _mm_andnot_si128(flat, qs1); flat_q1 = _mm_and_si128(flat, flat_q1); q1 = _mm_or_si128(flat_q1, q1); q2 = _mm_andnot_si128(flat, q2); flat_q2 = _mm_and_si128(flat, flat_q2); q2 = _mm_or_si128(flat_q2, q2); p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); _mm_storeu_si128((__m128i *)(s - 7 * p), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); _mm_storeu_si128((__m128i *)(s - 6 * p), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); _mm_storeu_si128((__m128i *)(s - 5 * p), p4); p3 = _mm_andnot_si128(flat2, p3); flat2_p3 = _mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); _mm_storeu_si128((__m128i *)(s - 4 * p), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); _mm_storeu_si128((__m128i *)(s + 3 * p), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); _mm_storeu_si128((__m128i *)(s + 4 * p), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); _mm_storeu_si128((__m128i *)(s + 5 * p), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); _mm_storeu_si128((__m128i *)(s + 6 * p), q6); } }
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t, ViterbiMatrix * viterbiMatrix,
                                  int maxres, ViterbiResult* result)
#endif
#endif
{
    // Linear topology of query (and template) HMM:
    // 1. The HMM has L+2 columns. Columns 1 to L contain
    //    a match state, a delete state and an insert state each.
    // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY)
    //    This column has only a match state and it has only a transition to the next match state.
    // 3. The End state is M(L+1), the virtual match state in column i=L+1 (j=L+1). (Therefore X[k][L+1]=ANY)
    //    Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0.
    // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments
    //    (as long as the gap opening penalty d is higher than the best match score S(a,b)).
    //
    // Pairwise alignment of two HMMs:
    // 1. Pair-states for the alignment of two HMMs are
    //    MM (Q:Match T:Match), GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delete T:Gap), MI (Q:Match T:Insert)
    // 2. Transitions are allowed only between the MM-state and each of the four other states.
    //
    // Saving space:
    // The best score ending in pair state XY, sXY[i][j], is calculated from left to right (j=1..t->L)
    // and top to bottom (i=1..q->L). To save space, only the last row of scores calculated is kept in memory.
    // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]).
    // When the calculation has proceeded up to the point where the scores for cell (i,j) are calculated,
    //    sXY[i-1][j'] = sXY[j'] for j'>=j (A below)
    //    sXY[i][j']   = sXY[j'] for j'<j  (B below)
    //    sXY[i-1][j-1]= sXY_i_1_j_1       (C below)
    //    sXY[i][j]    = sXY_i_j           (D below)
    //                  j-1
    //                   j
    //  i-1:  CAAAAAAAAAAAAAAAAAA
    //   i :  BBBBBBBBBBBBBD

    // Variable declarations
    const float smin = (this->local ?
0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization const simd_float smin_vec = simdf32_set(smin); const simd_float shift_vec = simdf32_set(shift); // const simd_float one_vec = simdf32_set(1); // 00000001 const simd_int mm_vec = simdi32_set(2); //MM 00000010 const simd_int gd_vec = simdi32_set(3); //GD 00000011 const simd_int im_vec = simdi32_set(4); //IM 00000100 const simd_int dg_vec = simdi32_set(5); //DG 00000101 const simd_int mi_vec = simdi32_set(6); //MI 00000110 const simd_int gd_mm_vec = simdi32_set(8); // 00001000 const simd_int im_mm_vec = simdi32_set(16);// 00010000 const simd_int dg_mm_vec = simdi32_set(32);// 00100000 const simd_int mi_mm_vec = simdi32_set(64);// 01000000 #ifdef VITERBI_SS_SCORE HMM * q_s = q->GetHMM(0); const unsigned char * t_index; if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){ t_index = t->pred_index; }else if(ss_hmm_mode == HMM::PRED_DSSP){ t_index = t->dssp_index; } simd_float * ss_score_vec = (simd_float *) ss_score; #endif #ifdef AVX2 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); #endif #ifdef VITERBI_CELLOFF const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000 #ifdef AVX2 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1); const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX); const simd_int shuffle_mask_celloff = _mm256_set_epi8( 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); #else // SSE case const simd_int co_vec = tmp_vec; const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX); #endif #endif // AVX2 end int i,j; //query and template match state indices simd_int i2_vec = simdi32_set(0); simd_int j2_vec = simdi32_set(0); simd_float sMM_i_j = simdf32_set(0); simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j; simd_float Si_vec; simd_float sMM_i_1_j_1; simd_float sMI_i_1_j_1; simd_float sIM_i_1_j_1; simd_float sGD_i_1_j_1; simd_float sDG_i_1_j_1; simd_float score_vec = simdf32_set(-FLT_MAX); simd_int byte_result_vec = simdi32_set(0); // Initialization of top row, i.e. 
cells (0,j) for (j=0; j <= t->L; ++j) { const unsigned int index_pos_j = j * 5; sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template); sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX); } // Viterbi algorithm const int queryLength = q->L; for (i=1; i <= queryLength; ++i) // Loop through query positions i { // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues // Initialize cells sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0) sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1) sMI_i_1_j_1 = simdf32_set(-FLT_MAX); sDG_i_1_j_1 = simdf32_set(-FLT_MAX); sGD_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i,jmin-1) const unsigned int index_pos_i = 0 * 5; sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0) sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX); #ifdef AVX2 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i); #else unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i); #endif const unsigned int start_pos_tr_i_1 = (i - 1) * 7; const unsigned int start_pos_tr_i = (i) * 7; const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2M const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I // Find maximum score; global alignment: maximize only over last row and last column const bool findMaxInnerLoop = (local || i == queryLength); const int targetLength = t->L; #ifdef VITERBI_SS_SCORE if(ss_hmm_mode == HMM::NO_SS_INFORMATION){ // set all to log(1.0) = 0.0 memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float)); }else { const float * score; if(ss_hmm_mode == HMM::PRED_PRED){ score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0]; }else if (ss_hmm_mode == HMM::DSSP_PRED){ score = &S73[ (int)q_s->ss_dssp[i]][0][0]; }else{ score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0]; } // access SS scores and write them to the ss_score array for (j = 0; j <= (targetLength*VECSIZE_FLOAT); j++) // Loop through template positions j { ss_score[j] = ssw * score[t_index[j]]; } } #endif for (j=1; j <= targetLength; ++j) // Loop through template positions j { simd_int index_vec; simd_int res_gt_vec; // cache line optimized reading const unsigned int start_pos_tr_j_1 = (j-1) * 7; const unsigned int start_pos_tr_j = (j) * 7; const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M const simd_float t_d2d
= simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I // Find max value // CALCULATE_MAX6( sMM_i_j, // smin, // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M], // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M], // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M], // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M], // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], // bMM[i][j] // ); // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M] simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m); // if mm > min { 2 } res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec); byte_result_vec = simdi_and(res_gt_vec, mm_vec); sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec); // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M] simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m); // if gd > max { 3 } res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j); index_vec = simdi_and( res_gt_vec, gd_vec); byte_result_vec = simdi_or( index_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec); // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M] simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m); // if im > max { 4 } MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec); // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M] simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m); // if dg > max { 5 } MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec); // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m); // if mi > max { 6 } MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec); // TODO add secondary structure score // calculate amino acid profile-profile scores Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j])); #ifdef VITERBI_SS_SCORE Si_vec = simdf32_add(ss_score_vec[j], Si_vec); #endif Si_vec = simdf32_add(Si_vec, shift_vec); sMM_i_j = simdf32_add(sMM_i_j, Si_vec); //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 
0: Sstruc[i][j]); const unsigned int index_pos_j = (j * 5); const unsigned int index_pos_j_1 = (j - 1) * 5; const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0)); const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3)); const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4)); const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3)); sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4)); // sGD_i_j = max2 // ( // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query // bGD[i][j] // ); //sMM_DG_MI_GD_IM_vec simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query // if mm_gd > gd_gd { 8 } MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec); sGD_i_j = simdf32_max( mm_gd_vec, gd_gd_vec ); // sIM_i_j = max2 // ( // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] , // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query // bIM[i][j] // ); simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m); simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query // if mm_mm > im_im { 16 } MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec); sIM_i_j = simdf32_max( mm_mm_vec, im_im_vec ); // sDG_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2D], // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query // bDG[i][j] // ); simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d); simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query // if mm_dg > dg_dg { 32 } MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec); sDG_i_j = simdf32_max( mm_dg_vec , dg_dg_vec ); // sMI_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template // bMI[i][j] // ); simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template // if mm_mi > mi_mi { 64 } MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec); sMI_i_j = simdf32_max( mm_mi_vec, mi_mi_vec ); // Cell off logic // if (cell_off[i][j]) //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000 //because 10000000000000000000000000000000 = -2147483648 kills cmplt #ifdef VITERBI_CELLOFF #ifdef AVX2 simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1); matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff); #else // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl; // } simd_int matrix_vec =
simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1); #endif simd_int cell_off_vec = simdi_and(matrix_vec, co_vec); simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell-off vec to sMM_i_j: switched-off cells are set to -FLT_MAX sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec); sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec); sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec); sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec); #endif simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j); // write values back to ViterbiMatrix #ifdef AVX2 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */ /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */ const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract); /* abcd 0000 0000 0000 DCBA */ const __m128i abcd = _mm256_castsi256_si128(abcdefgh); /* efgh 0000 0000 HGFE 0000 */ const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1); _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh)); #else byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec); byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec); int int_result = _mm_cvtsi128_si32(byte_result_vec); sCO_MI_DG_IM_GD_MM_vec[j] = int_result; #endif // Find maximum score; global alignment: maximize only over last row and last column // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; } if (findMaxInnerLoop){ // new score is higher // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec=simdf32_max(sMM_i_j,score_vec); } } //end for j // if global alignment: look for best cell in last column if (!local){ // new score is higher // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec = simdf32_max(sMM_i_j,score_vec); } // end if (!local) } // end for i for(int seq_index=0; seq_index < maxres; seq_index++){
result->score[seq_index]=((float*)&score_vec)[seq_index]; result->i[seq_index] = ((int*)&i2_vec)[seq_index]; result->j[seq_index] = ((int*)&j2_vec)[seq_index]; // std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl; } // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score); }
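/*
 * Standalone sketch (not hh-suite code; function and variable names are illustrative)
 * of the backtrace-byte packing used above: _mm256_shuffle_epi8 with the same mask as
 * shuffle_mask_extract gathers the low byte of every 32-bit lane so that OR-ing the
 * two 128-bit halves leaves all eight backtrace codes in one quadword, which a single
 * _mm_storel_epi64 can write out.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t pack_low_bytes_epi32(__m256i v)
{
    const __m256i pick = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i g  = _mm256_shuffle_epi8(v, pick);    /* low half: DCBA 0...; high half: 0000 HGFE 0... */
    const __m128i lo = _mm256_castsi256_si128(g);       /* bytes 0..3 hold A..D */
    const __m128i hi = _mm256_extracti128_si256(g, 1);  /* bytes 4..7 hold E..H */
    uint64_t out;
    _mm_storel_epi64((__m128i *)&out, _mm_or_si128(lo, hi));
    return out;
}

int main(void)
{
    const __m256i v = _mm256_setr_epi32(0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88);
    printf("%016llx\n", (unsigned long long)pack_low_bytes_epi32(v)); /* prints 8877665544332211 */
    return 0;
}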
mask = _mm256_srli_epi32(mask, 8); /* Shift bits by 6, mask in only the third byte: */ res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6) , mask)); mask = _mm256_srli_epi32(mask, 8); /* No shift necessary for the fourth byte because we duplicated * the third byte to this position; just mask: */ res = _mm256_or_si256(res, _mm256_and_si256(str, mask)); /* Reorder to 32-bit little-endian: */ res = _mm256_shuffle_epi8(res, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); /* The bits have now been shifted to the right locations; * translate their values 0..63 to the Base64 alphabet. * Because AVX2 can only compare 'greater than', start from end of alphabet: */ /* set 5: 63, "/" */ s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63)); blockmask = s5mask; /* set 4: 62, "+" */ s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
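/*
 * Scalar reference (illustrative only; split_24bit_group is not part of the base64
 * library) for what the shift/mask/or sequence above computes per 32-bit lane: each
 * group of three input bytes is cut into four 6-bit alphabet indices, which the
 * subsequent compare/mask steps then translate to ASCII.
 */
#include <stdint.h>
#include <stdio.h>

static void split_24bit_group(const uint8_t in[3], uint8_t idx[4])
{
    const uint32_t g = ((uint32_t)in[0] << 16) | ((uint32_t)in[1] << 8) | in[2];
    idx[0] = (g >> 18) & 0x3f;  /* bits 23..18 */
    idx[1] = (g >> 12) & 0x3f;  /* bits 17..12 */
    idx[2] = (g >> 6)  & 0x3f;  /* bits 11..6  */
    idx[3] =  g        & 0x3f;  /* bits 5..0   */
}

int main(void)
{
    const uint8_t man[3] = { 'M', 'a', 'n' };
    uint8_t idx[4];
    split_24bit_group(man, idx);
    printf("%u %u %u %u\n", idx[0], idx[1], idx[2], idx[3]); /* 19 22 5 46 -> "TWFu" */
    return 0;
}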
SIMD_INLINE void InterpolateX4(const __m256i * alpha, __m256i * buffer) { __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X4); _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); }
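/*
 * Minimal sketch (not from the Simd library; the values are illustrative) of the
 * _mm256_maddubs_epi16 weighting that InterpolateX4 relies on after the K8_SHUFFLE_X4
 * byte shuffle: for every pair of adjacent unsigned source bytes it computes
 * src[2k]*alpha[2k] + src[2k+1]*alpha[2k+1] as a signed 16-bit value, i.e. one
 * fixed-point interpolated sample per pixel pair.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* every byte pair holds the two neighbouring samples 10 and 40 */
    const __m256i src   = _mm256_set1_epi16((short)((40 << 8) | 10)); /* bytes 10,40,10,40,... */
    /* weights 96 and 32 sum to 128, a 7-bit fixed-point blend factor */
    const __m256i alpha = _mm256_set1_epi16((short)((32 << 8) | 96)); /* bytes 96,32,96,32,... */
    const __m256i acc   = _mm256_maddubs_epi16(src, alpha);           /* 10*96 + 40*32 = 2240 per 16-bit lane */
    short out[16];
    _mm256_storeu_si256((__m256i *)out, acc);
    printf("%d\n", out[0]); /* 2240; a final >> 7 would give the blended sample 17 */
    return 0;
}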
int normHamming(const uchar* a, const uchar* b, int n) { CV_AVX_GUARD; int i = 0; int result = 0; #if CV_AVX2 { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); for(; i <= n - 32; i+= 32) { __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i)); __m256i _xor = _mm256_xor_si256(_a0, _b0); __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask)); __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask)); _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); } _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } #endif // CV_AVX2 #if CV_POPCNT { # if defined CV_POPCNT_U64 for(; i <= n - 8; i += 8) { result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); } # endif for(; i <= n - 4; i += 4) { result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); } } #endif // CV_POPCNT #if CV_SIMD128 { v_uint32x4 t = v_setzero_u32(); for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) { t += v_popcount(v_load(a + i) ^ v_load(b + i)); } result += v_reduce_sum(t); } #endif // CV_SIMD128 #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) { result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; } #endif for(; i < n; i++) { result += popCountTable[a[i] ^ b[i]]; } return result; }
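/*
 * Standalone sketch (not OpenCV code; popcount_256 is an illustrative name) of the
 * nibble-LUT population count used in the AVX2 branch above: _mm256_shuffle_epi8 looks
 * up the popcount of every low and high nibble in a 16-entry table replicated across
 * both 128-bit lanes, and _mm256_sad_epu8 sums the per-byte counts into four 64-bit
 * partial sums.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t popcount_256(__m256i v)
{
    const __m256i table = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i nib = _mm256_set1_epi8(0x0F);
    const __m256i lo  = _mm256_shuffle_epi8(table, _mm256_and_si256(v, nib));
    const __m256i hi  = _mm256_shuffle_epi8(table, _mm256_and_si256(_mm256_srli_epi16(v, 4), nib));
    const __m256i sad = _mm256_sad_epu8(_mm256_add_epi8(lo, hi), _mm256_setzero_si256());
    uint64_t part[4];
    _mm256_storeu_si256((__m256i *)part, sad);
    return (uint32_t)(part[0] + part[1] + part[2] + part[3]);
}

int main(void)
{
    uint8_t buf[32];
    memset(buf, 0xFF, sizeof(buf)); /* 32 bytes * 8 = 256 set bits */
    printf("%u\n", popcount_256(_mm256_loadu_si256((const __m256i *)buf))); /* prints 256 */
    return 0;
}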
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ static void shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) { size_t j; int k, l; __m256i ymm0[16], ymm1[16]; const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Advance the offset into the type by the vector size (in bytes), unless this is the initial iteration and the type size is not a multiple of the vector size. In that case, only advance by the number of bytes necessary so that the number of remaining bytes in the type will be a multiple of the vector size. */ size_t offset_into_type; for (offset_into_type = 0; offset_into_type < bytesoftype; offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) { /* Fetch elements in groups of 512 bytes */ const uint8_t* const src_with_offset = src + offset_into_type; for (k = 0; k < 16; k++) { ymm0[k] = _mm256_loadu2_m128i( (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); } for (k = 0; k < 16; k++) { ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); } } } }
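/*
 * Small illustration (not part of c-blosc) of the cross-lane fix-up both shuffle16
 * routines above end with: the _mm256_unpack* steps interleave only within each
 * 128-bit lane, so _mm256_permute4x64_epi64 with immediate 0xd8 (element order
 * 0,2,1,3) swaps the middle 64-bit quarters before the final _mm256_shuffle_epi8 pass.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    const __m256i q = _mm256_setr_epi64x(0, 1, 2, 3);    /* quarters: 0 1 2 3 */
    const __m256i p = _mm256_permute4x64_epi64(q, 0xd8); /* quarters: 0 2 1 3 */
    long long out[4];
    _mm256_storeu_si256((__m256i *)out, p);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]); /* prints 0 2 1 3 */
    return 0;
}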