meave::vec::AVX avx1(const ::uint8_t *$, ::size_t len) noexcept {
    assert(0 == len % 4);
    meave::vec::AVX hash{ .i8_ = _mm256_set1_epi64x(aux::fnv_offset_basis()) };
    meave::vec::AVX prime{ .i8_ = _mm256_set1_epi64x(aux::fnv_prime()) };
    for (const ::uint8_t *p = $; len; p += 4, len -= 4) {
        // FNV-1: multiply by the prime first, then XOR in the next four bytes.
        // Note: _mm256_mul_epi64 is not a standard AVX2 intrinsic; it is
        // presumably a project-local 64-bit multiply helper (AVX-512DQ's
        // _mm256_mullo_epi64 would be the native equivalent).
        hash.i8_ = _mm256_mul_epi64(hash.i8_, prime.i8_);
        meave::vec::AVX x{ .i8_ = _mm256_set_epi64x(p[0], p[1], p[2], p[3]) };
        hash.i8_ = _mm256_xor_si256(hash.i8_, x.i8_);
    }
    return hash;
}

meave::vec::AVX avx1a(const ::uint8_t *$, ::size_t len) noexcept {
    assert(0 == len % 4);
    meave::vec::AVX hash{ .i8_ = _mm256_set1_epi64x(aux::fnv_offset_basis()) };
    meave::vec::AVX prime{ .i8_ = _mm256_set1_epi64x(aux::fnv_prime()) };
    for (const ::uint8_t *p = $; len; p += 4, len -= 4) {
        // FNV-1a: XOR in the next four bytes first, then multiply by the prime.
        meave::vec::AVX x{ .i8_ = _mm256_set_epi64x(p[0], p[1], p[2], p[3]) };
        hash.i8_ = _mm256_xor_si256(hash.i8_, x.i8_);
        hash.i8_ = _mm256_mul_epi64(hash.i8_, prime.i8_);
    }
    return hash;
}
#endif

} } /* namespace ::meave::fnv */
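/*
 * For reference, a minimal scalar sketch of the two FNV variants that avx1
 * and avx1a vectorize above (each 64-bit lane runs an independent FNV stream
 * over every fourth input byte). The constants are the standard 64-bit FNV
 * values, which aux::fnv_offset_basis() and aux::fnv_prime() are assumed to
 * supply.
 */
#include <stdint.h>
#include <stddef.h>

static uint64_t fnv1_64(const uint8_t *p, size_t len) {
    uint64_t hash = 14695981039346656037ULL;   /* FNV-64 offset basis */
    for (size_t i = 0; i < len; ++i) {
        hash *= 1099511628211ULL;              /* FNV-1: multiply first... */
        hash ^= p[i];                          /* ...then XOR */
    }
    return hash;
}

static uint64_t fnv1a_64(const uint8_t *p, size_t len) {
    uint64_t hash = 14695981039346656037ULL;
    for (size_t i = 0; i < len; ++i) {
        hash ^= p[i];                          /* FNV-1a: XOR first... */
        hash *= 1099511628211ULL;              /* ...then multiply */
    }
    return hash;
}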
/** * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 * words of type uint64_t), using Blake2b's G function as the internal permutation * * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state #if defined __AVX2__ __m256i state_v[2], in_v[2]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); _mm256_store_si256( (__m256i*)(&state[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); _mm256_store_si256( (__m256i*)(&state[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); #elif defined __AVX__ __m128i state_v[4], in_v[4]; state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) ); in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) ); in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) ); in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) ); _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); #else state[0] ^= in[0]; state[1] ^= in[1]; state[2] ^= in[2]; state[3] ^= in[3]; state[4] ^= in[4]; state[5] ^= in[5]; state[6] ^= in[6]; state[7] ^= in[7]; #endif //Applies the transformation f to the sponge's state blake2bLyra(state); }
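/*
 * For context, the "internal permutation" referenced above is built from
 * Blake2b's G function; blake2bLyra / reducedBlake2bLyra apply rounds of it
 * to the 16-word sponge state. A scalar sketch of one G application to four
 * state words, using Blake2b's rotation constants (Lyra2's permutation
 * injects no message words):
 */
#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

static inline void blake2b_g(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d) {
    *a += *b; *d = ROTR64(*d ^ *a, 32);
    *c += *d; *b = ROTR64(*b ^ *c, 24);
    *a += *b; *d = ROTR64(*d ^ *a, 16);
    *c += *d; *b = ROTR64(*b ^ *c, 63);
}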
static void TestAVX2MT2(MTTest *t) { long long int count = t->count; void* mem = t->mem; int size = t->size; __m256i dummy = _mm256_set1_epi32(0); for (int i = 0; i < count; ++i) { // AVX2 load & xor: const __m256i* data = (const __m256i*)mem; const __m256i* end = (const __m256i*)(((byte*)mem) + size * 1024); // We're attempting to get the compiler to make dummy2 a register. We need it because // otherwise the complete loop will get eliminated. __m256i dummy2 = _mm256_set1_epi32(0); for (; data != end; ++data) { dummy2 = _mm256_load_si256(data); } dummy = _mm256_xor_si256(dummy, dummy2); } t->dummy ^= dummy.m256i_i32[0]; }
int TestAVX2(long long int size) { long long int bytes = 1024ll * 1024ll * 4096ll; long long int count = bytes / (size * 1024ll); std::ostringstream oss; oss << "Size: " << size << "KB; speed:"; void* mem = _aligned_malloc((size * 1024), 32); int limit = (size * 1024) / 32; __m256i dummy = _mm256_set1_epi32(0); { Util::Timer timer(oss.str().c_str(), bytes); for (int i = 0; i < count; ++i) { // AVX2 load & xor: const __m256i* data = (const __m256i*)mem; const __m256i* end = (const __m256i*)(((byte*)mem) + size * 1024); __m256i dummy2 = _mm256_set1_epi32(0); for (; data != end; ++data) { dummy2 = _mm256_load_si256(data); } dummy = _mm256_xor_si256(dummy2, dummy); } } _aligned_free(mem); return (int)(dummy.m256i_i32[0]); }
void mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length) { uint8_t *end; register __m256i in, out, t1, t2, m1, m2, l, h; register __m128i bc; if (constant == 0) { memset(region, 0, length); return; } if (constant == 1) return; bc = _mm_load_si128((void *)tl[constant]); t1 = __builtin_ia32_vbroadcastsi256(bc); bc = _mm_load_si128((void *)th[constant]); t2 = __builtin_ia32_vbroadcastsi256(bc); m1 = _mm256_set1_epi8(0x0f); m2 = _mm256_set1_epi8(0xf0); for (end=region+length; region<end; region+=32) { in = _mm256_load_si256((void *)region); l = _mm256_and_si256(in, m1); l = _mm256_shuffle_epi8(t1, l); h = _mm256_and_si256(in, m2); h = _mm256_srli_epi64(h, 4); h = _mm256_shuffle_epi8(t2, h); out = _mm256_xor_si256(h, l); _mm256_store_si256((void *)region, out); } }
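/*
 * A scalar sketch of what each shuffle pair above computes, under the
 * assumption that tl[c][v] holds the GF(16) product c*v for a low nibble v
 * and th[c][v] holds the same product pre-shifted into the high nibble (two
 * GF(16) elements are packed per byte):
 */
static inline uint8_t mulrc16_scalar_byte(uint8_t constant, uint8_t x) {
    return tl[constant][x & 0x0f] ^ th[constant][x >> 4];
}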
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v) { // Convert X,Y first into U,V space then round to nearest // integer. That gets us close to correct answer, mapping XY to a // lozenge-shaped space rather than hexagonal. We then correct the // four regions that lie outside the hexagonal cell assigning them // to their correct neighboring cell. // Writer's note: see ~/Google Drive/Work/calin // double dv = y*c_vy_inv; // double du = x-dv*c_vx; // u = std::lround(du); // v = std::lround(dv); // du -= u; // dv -= v; y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv)); x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x); u = _mm256_cvtps_epi32(x); v = _mm256_cvtps_epi32(y); x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u)); y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v)); // double c3 = dv-du; const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x)); __m256i uvshift; __m256i mask; // double c1 = du+0.5*dv; // double c2 = dv+0.5*du; // if(c3<0) { // if(c1>=1) u++; // else if(c2<-1) v--; // } else { // if(c2>=1) v++; // else if(c1<-1) u--; // } uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x)); mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31); u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask); uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y)); mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31); v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask); }
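/*
 * A scalar sketch of the same mapping, following the commented pseudo-code
 * above; c_vx and c_vy_inv stand in for the scalar values behind the
 * broadcast constants _c_m256_vx and _c_m256_vy_inv.
 */
#include <cmath>

inline void xy_to_uv_scalar(double x, double y, double c_vx, double c_vy_inv,
                            int& u, int& v)
{
  double dv = y * c_vy_inv;        // map X,Y into lozenge-shaped U,V space
  double du = x - dv * c_vx;
  u = int(std::lround(du));        // round to the nearest lattice point
  v = int(std::lround(dv));
  du -= u;                         // fractional remainders
  dv -= v;
  double c1 = du + 0.5 * dv;
  double c2 = dv + 0.5 * du;
  double c3 = dv - du;
  if (c3 < 0) {                    // reassign the four regions that fall
    if (c1 >= 1) u++;              // outside the hexagonal cell
    else if (c2 < -1) v--;
  } else {
    if (c2 >= 1) v++;
    else if (c1 < -1) u--;
  }
}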
void maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length) { uint8_t *end; register __m256i reg1, reg2, ri[4], sp[4], mi[4]; const uint8_t *p = pt[constant]; if (constant == 0) return; if (constant == 1) { xorr_avx2(region1, region2, length); return; } mi[0] = _mm256_set1_epi8(0x11); mi[1] = _mm256_set1_epi8(0x22); mi[2] = _mm256_set1_epi8(0x44); mi[3] = _mm256_set1_epi8(0x88); sp[0] = _mm256_set1_epi16(p[0]); sp[1] = _mm256_set1_epi16(p[1]); sp[2] = _mm256_set1_epi16(p[2]); sp[3] = _mm256_set1_epi16(p[3]); for (end=region1+length; region1<end; region1+=32, region2+=32) { reg2 = _mm256_load_si256((void *)region2); reg1 = _mm256_load_si256((void *)region1); ri[0] = _mm256_and_si256(reg2, mi[0]); ri[1] = _mm256_and_si256(reg2, mi[1]); ri[2] = _mm256_and_si256(reg2, mi[2]); ri[3] = _mm256_and_si256(reg2, mi[3]); ri[1] = _mm256_srli_epi16(ri[1], 1); ri[2] = _mm256_srli_epi16(ri[2], 2); ri[3] = _mm256_srli_epi16(ri[3], 3); ri[0] = _mm256_mullo_epi16(ri[0], sp[0]); ri[1] = _mm256_mullo_epi16(ri[1], sp[1]); ri[2] = _mm256_mullo_epi16(ri[2], sp[2]); ri[3] = _mm256_mullo_epi16(ri[3], sp[3]); ri[0] = _mm256_xor_si256(ri[0], ri[1]); ri[2] = _mm256_xor_si256(ri[2], ri[3]); ri[0] = _mm256_xor_si256(ri[0], ri[2]); ri[0] = _mm256_xor_si256(ri[0], reg1); _mm256_store_si256((void *)region1, ri[0]); } }
void maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length) { uint8_t *end; register __m256i in1, in2, out, t1, t2, m1, m2, l, h; register __m128i bc; if (constant == 0) return; if (constant == 1) { xorr_avx2(region1, region2, length); return; } bc = _mm_load_si128((void *)tl[constant]); t1 = __builtin_ia32_vbroadcastsi256(bc); bc = _mm_load_si128((void *)th[constant]); t2 = __builtin_ia32_vbroadcastsi256(bc); m1 = _mm256_set1_epi8(0x0f); m2 = _mm256_set1_epi8(0xf0); for (end=region1+length; region1<end; region1+=32, region2+=32) { in2 = _mm256_load_si256((void *)region2); in1 = _mm256_load_si256((void *)region1); l = _mm256_and_si256(in2, m1); l = _mm256_shuffle_epi8(t1, l); h = _mm256_and_si256(in2, m2); h = _mm256_srli_epi64(h, 4); h = _mm256_shuffle_epi8(t2, h); out = _mm256_xor_si256(h,l); out = _mm256_xor_si256(out, in1); _mm256_store_si256((void *)region1, out); } }
size_t __FASTCALL strlen_fast_v2_avx(const char * str) {
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Set the zero mask (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);
    // Get the misalignment from the last 6 bits of the address.
    size_t misalignment = (size_t)cur & 0x3F;
    // Is the misalignment less than 32 bytes?
    if (misalignment < 0x20) {
        if (misalignment == 0) {
            // Already 64-byte aligned: skip this step.
            goto main_loop;
        }
        // Align the address down to 64 bytes for the main loop.
        cur = (const char *)((size_t)cur & ((size_t)~(size_t)0x3F));
        // Load 2 x 32 bytes from the target string into YMM registers.
        src32_low = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare against the zero mask, byte by byte.
        src32_low = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack each 32-byte compare result into 32 bits.
        zero_mask_low = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
        // Clear the bits belonging to the misaligned prefix (before str).
        zero_mask_low >>= misalignment;
        zero_mask_low <<= misalignment;
        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            goto strlen_exit;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }
        // Advance to the next 64-byte block for the main loop.
        cur += 64;
    } else {
void extern avx2_test (void) { x = _mm256_xor_si256 (x, x); }
int normHamming(const uchar* a, const uchar* b, int n) { CV_AVX_GUARD; int i = 0; int result = 0; #if CV_AVX2 { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); for(; i <= n - 32; i+= 32) { __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i)); __m256i _xor = _mm256_xor_si256(_a0, _b0); __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask)); __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask)); _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); } _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } #endif // CV_AVX2 #if CV_POPCNT { # if defined CV_POPCNT_U64 for(; i <= n - 8; i += 8) { result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); } # endif for(; i <= n - 4; i += 4) { result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); } } #endif // CV_POPCNT #if CV_SIMD128 { v_uint32x4 t = v_setzero_u32(); for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) { t += v_popcount(v_load(a + i) ^ v_load(b + i)); } result += v_reduce_sum(t); } #endif // CV_SIMD128 #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) { result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; } #endif for(; i < n; i++) { result += popCountTable[a[i] ^ b[i]]; } return result; }
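/*
 * The AVX2 branch above uses the classic nibble-LUT population count: each
 * byte of a ^ b is split into 4-bit halves, each half is looked up in a
 * 16-entry bit-count table via _mm256_shuffle_epi8, and the per-byte counts
 * are summed horizontally with _mm256_sad_epu8. The same kernel in
 * isolation, for a single vector:
 */
#include <immintrin.h>
#include <stdint.h>

static inline uint64_t popcount256(__m256i v)
{
    const __m256i table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                           1, 2, 2, 3, 2, 3, 3, 4,
                                           0, 1, 1, 2, 1, 2, 2, 3,
                                           1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i mask = _mm256_set1_epi8(0x0F);
    __m256i lo = _mm256_shuffle_epi8(table, _mm256_and_si256(v, mask));
    __m256i hi = _mm256_shuffle_epi8(table,
                     _mm256_and_si256(_mm256_srli_epi16(v, 4), mask));
    /* sad_epu8 against zero sums each group of 8 byte-counts into 64 bits */
    __m256i sums = _mm256_sad_epu8(_mm256_add_epi8(lo, hi),
                                   _mm256_setzero_si256());
    uint64_t lanes[4];
    _mm256_storeu_si256((__m256i *)lanes, sums);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}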
/** * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., * the wordwise addition of two columns, ignoring carries between words). The * output of this operation, "rand", is then used to make * "M[rowOut][col] = M[rowOut][col] XOR rand" and * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. * * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output * */ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row unsigned int i; for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" #if defined __AVX2__ __m256i state_v[3], in_v[3], inout_v[3]; #define out_v in_v // reuse register in next code block state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); _mm256_store_si256( (__m256i*)(&state[0]), _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], inout_v[0] ) ) ); _mm256_store_si256( (__m256i*)(&state[4]), _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], inout_v[1] ) ) ); _mm256_store_si256( (__m256i*)(&state[8]), _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], inout_v[2] ) ) ); #elif defined __AVX__ __m128i state_v[6], in_v[6], inout_v[6]; #define out_v in_v // reuse register in next code block state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); inout_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) ); inout_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) ); inout_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) ); inout_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) ); inout_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) ); inout_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) ); in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], _mm_add_epi64( in_v[0], inout_v[0] ) ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], _mm_add_epi64( in_v[1], inout_v[1] ) ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], _mm_add_epi64( in_v[2], inout_v[2] ) ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], _mm_add_epi64( in_v[3], inout_v[3] 
) ) ); _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], _mm_add_epi64( in_v[4], inout_v[4] ) ) ); _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], _mm_add_epi64( in_v[5], inout_v[5] ) ) ); #else state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); #endif //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); //M[rowOut][col] = M[rowOut][col] XOR rand #if defined __AVX2__ state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); out_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); out_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); out_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], out_v[0] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], out_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], out_v[2] ) ); #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); out_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[0]) ); out_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[2]) ); out_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[4]) ); out_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[6]) ); out_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[8]) ); out_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[10]) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), _mm_xor_si128( state_v[0], out_v[0] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), _mm_xor_si128( state_v[1], out_v[1] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), _mm_xor_si128( state_v[2], out_v[2] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), _mm_xor_si128( state_v[3], out_v[3] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), _mm_xor_si128( state_v[4], out_v[4] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], out_v[5] ) ); #else ptrWordOut[0] ^= state[0]; ptrWordOut[1] ^= state[1]; ptrWordOut[2] ^= state[2]; ptrWordOut[3] ^= state[3]; ptrWordOut[4] ^= state[4]; ptrWordOut[5] ^= state[5]; ptrWordOut[6] ^= state[6]; ptrWordOut[7] ^= state[7]; ptrWordOut[8] ^= state[8]; ptrWordOut[9] ^= state[9]; ptrWordOut[10] ^= state[10]; ptrWordOut[11] ^= state[11]; #endif //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= 
state[10]; //Goes to next block ptrWordOut += BLOCK_LEN_INT64; ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; } }
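/*
 * The fixed index pattern used above for "M[rowInOut][col] ^= rotW(rand)" is
 * the scalar spelling of rotW: the 12-word sponge output rotated left by one
 * 64-bit word before being XORed in. Equivalently:
 */
static inline void xor_rotW(uint64_t *dst, const uint64_t *state)
{
    int i;
    for (i = 0; i < 12; i++)
        dst[i] ^= state[(i + 11) % 12];    /* dst[0] ^= state[11], etc. */
}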
/**
 * Performs a reduced duplex operation for a single row, from the highest to
 * the lowest index, using the reduced-round Blake2b's G function as the
 * internal permutation
 *
 * @param state The current state of the sponge
 * @param rowIn Row to feed the sponge
 * @param rowOut Row to receive the sponge's output
 */
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols)
{
    uint64_t* ptrWordIn = rowIn;                                //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64;  //In Lyra2: pointer to row
    unsigned int i;

    for (i = 0; i < nCols; i++) {
        //Absorbing "M[prev][col]"
#if defined __AVX2__
        __m256i state_v[3], in_v[3];

        state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
        in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
        state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
        in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
        state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
        in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );

        _mm256_store_si256( (__m256i*)(&state[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
        _mm256_store_si256( (__m256i*)(&state[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
        _mm256_store_si256( (__m256i*)(&state[8]), _mm256_xor_si256( state_v[2], in_v[2] ) );
#elif defined __AVX__
        __m128i state_v[6], in_v[6];

        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );
        in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) );
        _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) );
        _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) );
        _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) );
        _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], in_v[4] ) );
        _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) );
#else
        state[0] ^= (ptrWordIn[0]);
        state[1] ^= (ptrWordIn[1]);
        state[2] ^= (ptrWordIn[2]);
        state[3] ^= (ptrWordIn[3]);
        state[4] ^= (ptrWordIn[4]);
        state[5] ^= (ptrWordIn[5]);
        state[6] ^= (ptrWordIn[6]);
        state[7] ^= (ptrWordIn[7]);
        state[8] ^= (ptrWordIn[8]);
        state[9] ^= (ptrWordIn[9]);
        state[10] ^= (ptrWordIn[10]);
        state[11] ^= (ptrWordIn[11]);
#endif

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);

        //M[row][C-1-col] = M[prev][col] XOR rand
#if defined __AVX2__
        // in_v should not need to be reloaded, but it does, and it segfaults
        // when loaded with aligned loads
        state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
        in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
        state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
        in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
        state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
        in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) );
#elif defined __AVX__
        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );
        in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), _mm_xor_si128( state_v[0], in_v[0] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), _mm_xor_si128( state_v[1], in_v[1] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), _mm_xor_si128( state_v[2], in_v[2] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), _mm_xor_si128( state_v[3], in_v[3] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), _mm_xor_si128( state_v[4], in_v[4] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], in_v[5] ) );
#else
        ptrWordOut[0] = ptrWordIn[0] ^ state[0];
        ptrWordOut[1] = ptrWordIn[1] ^ state[1];
        ptrWordOut[2] = ptrWordIn[2] ^ state[2];
        ptrWordOut[3] = ptrWordIn[3] ^ state[3];
        ptrWordOut[4] = ptrWordIn[4] ^ state[4];
        ptrWordOut[5] = ptrWordIn[5] ^ state[5];
        ptrWordOut[6] = ptrWordIn[6] ^ state[6];
        ptrWordOut[7] = ptrWordIn[7] ^ state[7];
        ptrWordOut[8] = ptrWordIn[8] ^ state[8];
        ptrWordOut[9] = ptrWordIn[9] ^ state[9];
        ptrWordOut[10] = ptrWordIn[10] ^ state[10];
        ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

        //Input: next column (i.e., next block in sequence)
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
    }
}
/** * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., * the wordwise addition of two columns, ignoring carries between words). The * output of this operation, "rand", is then used to make * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left and N_COLS is a system parameter. * * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output * */ inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; #if defined __AVX2__ __m256i state_v[4], in_v[3], inout_v[3]; #define t_state in_v state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); for ( i = 0; i < nCols; i++ ) { in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], inout_v[0] ) ); state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], inout_v[1] ) ); state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], inout_v[2] ) ); LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); //M[row*][col] = M[row*][col] XOR rotW(rand) t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); inout_v[0] = _mm256_xor_si256( inout_v[0], _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); inout_v[1] = _mm256_xor_si256( inout_v[1], _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); inout_v[2] = _mm256_xor_si256( inout_v[2], _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] ); _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] ); _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] ); //Inputs: next column (i.e., next block in sequence) ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; //Output: goes to previous column ptrWordOut -= BLOCK_LEN_INT64; } _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #undef t_state #elif defined __AVX__ __m128i state_v[6], in_v[6], inout_v[6]; for ( i = 0; i < nCols; i++ ) { state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); 
state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); inout_v[0] = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) ); inout_v[1] = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) ); inout_v[2] = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) ); inout_v[3] = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) ); inout_v[4] = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) ); inout_v[5] = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) ); in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) ); in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) ); in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) ); in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) ); in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) ); _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], _mm_add_epi64( in_v[0], inout_v[0] ) ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], _mm_add_epi64( in_v[1], inout_v[1] ) ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], _mm_add_epi64( in_v[2], inout_v[2] ) ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], _mm_add_epi64( in_v[3], inout_v[3] ) ) ); _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], _mm_add_epi64( in_v[4], inout_v[4] ) ) ); _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], _mm_add_epi64( in_v[5], inout_v[5] ) ) ); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); #else for ( i = 0; i < nCols; i++ ) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); //M[row][col] = M[prev][col] XOR rand #endif #if defined __AVX2__ #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); _mm_store_si128( (__m128i*)(&ptrWordOut[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[2]), _mm_xor_si128( state_v[1], in_v[1] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[4]), _mm_xor_si128( state_v[2], in_v[2] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[8]), _mm_xor_si128( state_v[4], in_v[4] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); #else ptrWordOut[0] = ptrWordIn[0] ^ state[0]; ptrWordOut[1] = ptrWordIn[1] ^ state[1]; ptrWordOut[2] = ptrWordIn[2] ^ state[2]; ptrWordOut[3] = ptrWordIn[3] ^ state[3]; ptrWordOut[4] = 
ptrWordIn[4] ^ state[4]; ptrWordOut[5] = ptrWordIn[5] ^ state[5]; ptrWordOut[6] = ptrWordIn[6] ^ state[6]; ptrWordOut[7] = ptrWordIn[7] ^ state[7]; ptrWordOut[8] = ptrWordIn[8] ^ state[8]; ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; #endif //M[row*][col] = M[row*][col] XOR rotW(rand) // Need to fix this before taking state load/store out of loop #ifdef __AVX2__ #else ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= state[10]; //Inputs: next column (i.e., next block in sequence) ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; //Output: goes to previous column ptrWordOut -= BLOCK_LEN_INT64; } #endif } /** * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., * the wordwise addition of two columns, ignoring carries between words). The * output of this operation, "rand", is then used to make * "M[rowOut][col] = M[rowOut][col] XOR rand" and * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. * * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output * */ inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; #if defined __AVX2__ for ( i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" __m256i state_v[4], in_v[3], inout_v[3]; #define out_v in_v // reuse register in next code block #define t_state in_v state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], inout_v[0] ) ); state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], inout_v[1] ) ); state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], inout_v[2] ) ); out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) ); out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], out_v[0] ) ); _mm256_storeu_si256( 
(__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], out_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], out_v[2] ) ); /* t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); inout_v[0] = _mm256_xor_si256( inout_v[0], _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); inout_v[1] = _mm256_xor_si256( inout_v[1], _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); inout_v[2] = _mm256_xor_si256( inout_v[2], _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] ); _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] ); _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] ); _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); */ #undef out_v #undef t_state //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= state[10]; //Goes to next block ptrWordOut += BLOCK_LEN_INT64; ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; } #elif defined __AVX__ for ( i = 0; i < nCols; i++) { __m128i state_v[6], in_v[6], inout_v[6]; #define out_v in_v // reuse register in next code block state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); inout_v[0] = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) ); inout_v[1] = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) ); inout_v[2] = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) ); inout_v[3] = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) ); inout_v[4] = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) ); inout_v[5] = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) ); in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) ); in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) ); in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) ); in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) ); in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) ); _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], _mm_add_epi64( in_v[0], inout_v[0] ) ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], _mm_add_epi64( in_v[1], inout_v[1] ) ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], _mm_add_epi64( in_v[2], inout_v[2] ) ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], _mm_add_epi64( in_v[3], inout_v[3] ) ) ); _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], _mm_add_epi64( in_v[4], inout_v[4] ) ) ); _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], _mm_add_epi64( in_v[5], inout_v[5] ) ) ); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); #else for ( i = 0; i < 
nCols; i++) { state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); #endif //M[rowOut][col] = M[rowOut][col] XOR rand #if defined __AVX2__ /* state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); out_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); out_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); out_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], out_v[0] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], out_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], out_v[2] ) ); */ #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); out_v[0] = _mm_load_si128( (__m128i*)(&ptrWordOut[0]) ); out_v[1] = _mm_load_si128( (__m128i*)(&ptrWordOut[2]) ); out_v[2] = _mm_load_si128( (__m128i*)(&ptrWordOut[4]) ); out_v[3] = _mm_load_si128( (__m128i*)(&ptrWordOut[6]) ); out_v[4] = _mm_load_si128( (__m128i*)(&ptrWordOut[8]) ); out_v[5] = _mm_load_si128( (__m128i*)(&ptrWordOut[10]) ); _mm_store_si128( (__m128i*)(&ptrWordOut[0]), _mm_xor_si128( state_v[0], out_v[0] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[2]), _mm_xor_si128( state_v[1], out_v[1] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[4]), _mm_xor_si128( state_v[2], out_v[2] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[6]), _mm_xor_si128( state_v[3], out_v[3] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[8]), _mm_xor_si128( state_v[4], out_v[4] ) ); _mm_store_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], out_v[5] ) ); //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= state[10]; //Goes to next block ptrWordOut += BLOCK_LEN_INT64; ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; } #else ptrWordOut[0] ^= state[0]; ptrWordOut[1] ^= state[1]; ptrWordOut[2] ^= state[2]; ptrWordOut[3] ^= state[3]; ptrWordOut[4] ^= state[4]; ptrWordOut[5] ^= state[5]; ptrWordOut[6] ^= state[6]; ptrWordOut[7] ^= state[7]; ptrWordOut[8] ^= state[8]; ptrWordOut[9] ^= state[9]; ptrWordOut[10] ^= state[10]; ptrWordOut[11] ^= state[11]; //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; 
ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= state[10]; //Goes to next block ptrWordOut += BLOCK_LEN_INT64; ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; } #endif }
/** * Performs a reduced duplex operation for a single row, from the highest to * the lowest index, using the reduced-round Blake2b's G function as the * internal permutation * * @param state The current state of the sponge * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols ) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; #if defined __AVX2__ __m256i state_v[4], in_v[3]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); #endif for ( i = 0; i < nCols; i++ ) { #if defined __AVX2__ in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); #elif defined __AVX__ __m128i state_v[6], in_v[6]; state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) ); in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) ); in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) ); in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) ); in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) ); _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], in_v[4] ) ); _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); #else //Absorbing "M[prev][col]" state[0] ^= (ptrWordIn[0]); state[1] ^= (ptrWordIn[1]); state[2] ^= (ptrWordIn[2]); state[3] ^= (ptrWordIn[3]); state[4] ^= (ptrWordIn[4]); state[5] ^= (ptrWordIn[5]); state[6] ^= (ptrWordIn[6]); state[7] ^= (ptrWordIn[7]); state[8] ^= (ptrWordIn[8]); state[9] ^= (ptrWordIn[9]); state[10] ^= (ptrWordIn[10]); state[11] ^= (ptrWordIn[11]); //Applies the reduced-round transformation f to the sponge's state reducedBlake2bLyra(state); #endif #if defined __AVX2__ /* state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); 
_mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); */ #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), _mm_xor_si128( state_v[1], in_v[1] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), _mm_xor_si128( state_v[2], in_v[2] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), _mm_xor_si128( state_v[4], in_v[4] ) ); _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); #else //M[row][C-1-col] = M[prev][col] XOR rand ptrWordOut[0] = ptrWordIn[0] ^ state[0]; ptrWordOut[1] = ptrWordIn[1] ^ state[1]; ptrWordOut[2] = ptrWordIn[2] ^ state[2]; ptrWordOut[3] = ptrWordIn[3] ^ state[3]; ptrWordOut[4] = ptrWordIn[4] ^ state[4]; ptrWordOut[5] = ptrWordIn[5] ^ state[5]; ptrWordOut[6] = ptrWordIn[6] ^ state[6]; ptrWordOut[7] = ptrWordIn[7] ^ state[7]; ptrWordOut[8] = ptrWordIn[8] ^ state[8]; ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; #endif //Input: next column (i.e., next block in sequence) ptrWordIn += BLOCK_LEN_INT64; //Output: goes to previous column ptrWordOut -= BLOCK_LEN_INT64; } #if defined __AVX2__ _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #endif }
__m256i inline Xor(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
size_t __FASTCALL strlen_fast_v1b_avx(const char * str) {
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Get the misalignment from the last 6 bits of the address.
    size_t misalignment = (size_t)str & 0x3F;
    if (misalignment != 0) {
        misalignment = (size_t)str & 0x0F;
        // Scan for the null terminator in the misaligned leading bytes.
        register const char * end = cur + ((size_t)16UL - misalignment);
        while (cur < end) {
            // Check for the null terminator.
            if (*cur == '\0') {
                return (size_t)(cur - str);
            }
            cur++;
        }
        // Align the end address to 64 bytes for the main loop.
        end = (const char *)((size_t)str & ((size_t)~(size_t)0x3F)) + 64;
        register __m128i zero16, src16;
        register uint32_t zero_mask16;
        // Set the zero mask (16 bytes).
        INIT_ZERO_16(zero16);
        zero16 = _mm_xor_si128(zero16, zero16);
        // Inner 16-byte loop
        while (cur < end) {
            // Load 16 bytes of the source string into an XMM register.
            src16 = _mm_load_si128((__m128i *)(cur));
            // Compare against the zero mask, byte by byte.
            src16 = _mm_cmpeq_epi8(src16, zero16);
            // Pack the 16-byte compare result into 16 bits.
            zero_mask16 = (uint32_t)_mm_movemask_epi8(src16);
            // If any bit is set, there is a null terminator somewhere
            // inside this scanned chunk (16 bytes).
            if (zero_mask16 != 0) {
                // Get the index of the first bit set to 1.
                __BitScanForward(zero_index, zero_mask16);
                goto strlen_exit;
            }
            // Each inner iteration scans 16 bytes.
            cur += 16;
        }
    }
    // Set the zero mask (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);
    // Main loop
    do {
        // Load 2 x 32 bytes of the source string into YMM registers.
        src32_low = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare against the zero mask, byte by byte.
        src32_low = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack each 32-byte compare result into 32 bits.
        zero_mask_low = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
#if defined(_WIN64) || defined(WIN64) || defined(_M_X64) || defined(_M_AMD64) \
 || defined(_M_IA64) || defined(__amd64__) || defined(__x86_64__)
        // Combine the low and high 32-bit masks into one 64-bit mask.
        zero_mask = (zero_mask_high << 32) | zero_mask_low;
        // If any bit is set, there is a null terminator somewhere
        // inside this scanned chunk (64 bytes).
        if (zero_mask != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward64(zero_index, zero_mask);
            break;
        }
#else
        (void)zero_mask;
        // If any bit is set, there is a null terminator somewhere
        // inside this scanned chunk (64 bytes).
        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            break;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            break;
        }
#endif // _M_X64 || __x86_64__
        // Each iteration scans 64 bytes.
        cur += 64;
    } while (1);

strlen_exit:
    len = cur - str;
    len += zero_index;
    return len;
}
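/*
 * The 32-byte kernel both strlen variants are built around, in isolation:
 * compare against zero, compress the byte-wise result to a bit mask, and
 * bit-scan for the first hit. This sketch uses GCC's __builtin_ctz where the
 * code above uses its __BitScanForward macro, and assumes p is 32-byte
 * aligned.
 */
#include <immintrin.h>

static inline int first_nul_in_32(const char *p)
{
    __m256i v = _mm256_load_si256((const __m256i *)p);
    unsigned mask = (unsigned)_mm256_movemask_epi8(
        _mm256_cmpeq_epi8(v, _mm256_setzero_si256()));
    return mask ? __builtin_ctz(mask) : -1;  /* byte index of the NUL, or -1 */
}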
static void mshabal256_compress(mshabal256_context *sc, const unsigned char *buf0, const unsigned char *buf1, const unsigned char *buf2, const unsigned char *buf3, const unsigned char *buf4, const unsigned char *buf5, const unsigned char *buf6, const unsigned char *buf7, size_t num) { union { u32 words[64 * MSHABAL256_FACTOR]; __m256i data[16]; } u; size_t j; __m256i A[12], B[16], C[16]; __m256i one; for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); for (j = 0; j < 16; j++) { B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); } one = _mm256_set1_epi32(C32(0xFFFFFFFF)); #define M(i) _mm256_load_si256(u.data + (i)) while (num-- > 0) { for (j = 0; j < 64 * MSHABAL256_FACTOR; j += 4 * MSHABAL256_FACTOR) { size_t o = j / MSHABAL256_FACTOR; u.words[j + 0] = *(u32 *)(buf0 + o); u.words[j + 1] = *(u32 *)(buf1 + o); u.words[j + 2] = *(u32 *)(buf2 + o); u.words[j + 3] = *(u32 *)(buf3 + o); u.words[j + 4] = *(u32 *)(buf4 + o); u.words[j + 5] = *(u32 *)(buf5 + o); u.words[j + 6] = *(u32 *)(buf6 + o); u.words[j + 7] = *(u32 *)(buf7 + o); } for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); for (j = 0; j < 16; j++) B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); #define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ __m256i tt; \ tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), \ _mm256_srli_epi32(xa1, 17)); \ tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ tt = _mm256_xor_si256(\ _mm256_xor_si256(tt, xb1), \ _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ xa0 = tt; \ tt = xb0; \ tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), \ _mm256_srli_epi32(tt, 31)); \ xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ } while (0) PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0xA], A[0x9], B[0x6], 
B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); #define SWAP_AND_SUB(xb, xc, xm) do { \ __m256i tmp; \ tmp = xb; \ xb = _mm256_sub_epi32(xc, xm); \ xc = tmp; \ } while (0) SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); SWAP_AND_SUB(B[0x3], 
C[0x3], M(0x3)); SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); buf0 += 64; buf1 += 64; buf2 += 64; buf3 += 64; buf4 += 64; buf5 += 64; buf6 += 64; buf7 += 64; if (++sc->Wlow == 0) sc->Whigh++; } for (j = 0; j < 12; j++) _mm256_storeu_si256((__m256i *)sc->state + j, A[j]); for (j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)sc->state + j + 12, B[j]); _mm256_storeu_si256((__m256i *)sc->state + j + 28, C[j]); } #undef M }
__m256i test_mm256_xor_si256(__m256i a, __m256i b) { // CHECK: xor <4 x i64> return _mm256_xor_si256(a, b); }
/** * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words * of type uint64_t), using Blake2b's G function as the internal permutation * * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ inline void absorbBlock(uint64_t *state, const uint64_t *in) { #if defined __AVX2__ __m256i state_v[4], in_v[3]; // only state is guaranteed aligned 256 state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #elif defined __AVX__ __m128i state_v[6], in_v[6]; state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); in_v[0] = _mm_load_si128( (__m128i*)(&in[0]) ); in_v[1] = _mm_load_si128( (__m128i*)(&in[2]) ); in_v[2] = _mm_load_si128( (__m128i*)(&in[4]) ); in_v[3] = _mm_load_si128( (__m128i*)(&in[6]) ); in_v[4] = _mm_load_si128( (__m128i*)(&in[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&in[10]) ); // do blake2bLyra without init // LYRA_ROUND_AVX2( state_v ) _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) ); _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) ); _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); _mm_store_si128( (__m128i*)(&state[8]), _mm_xor_si128( state_v[4], in_v[4] ) ); _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); //Applies the transformation f to the sponge's state blake2bLyra(state); #else //XORs the first BLOCK_LEN_INT64 words of "in" with the current state state[0] ^= in[0]; state[1] ^= in[1]; state[2] ^= in[2]; state[3] ^= in[3]; state[4] ^= in[4]; state[5] ^= in[5]; state[6] ^= in[6]; state[7] ^= in[7]; state[8] ^= in[8]; state[9] ^= in[9]; state[10] ^= in[10]; state[11] ^= in[11]; //Applies the transformation f to the sponge's state blake2bLyra(state); #endif }