/* Index of the lowest set bit of a (0-based).
 * NOTE(review): result is undefined for a == 0 — the GCC/MSVC intrinsics
 * leave it unspecified and the software fallback would loop forever.
 * Callers must guarantee a != 0. */
Size findFirstBit(Size a) {
#ifdef __GNUC__
#ifdef __X64__
	return __builtin_ctzl(a);
#else
	return __builtin_ctz(a);
#endif
#elif defined(_MSC_VER)
	unsigned long pos;
#ifdef __X64__
	_BitScanForward64(&pos, a);
#else
	_BitScanForward(&pos, a);
#endif
	return pos;
#else
	//Very naive implementation.
	Size c = 0;
	while(!(a & 1)) {
		a >>= 1;
		c++;
	}
	return c;
#endif
}
/******************************** Common functions ********************************/ static unsigned LZ4_NbCommonBytes (register size_t val) { if (LZ4_isLittleEndian()) { if (LZ4_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; _BitScanForward( &r, (U32)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } }
/*-******************************************************** * Dictionary training functions **********************************************************/ static unsigned ZDICT_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { if (MEM_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else { /* 32 bits */ # if defined(_MSC_VER) unsigned long r=0; _BitScanForward( &r, (U32)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } } else { /* Big Endian CPU */
inline uint countTrailingZeros(uint64_t value) { unsigned long index; if (_BitScanForward64(&index, value)) return index; else return 64; }
/* ffs for long long: 1-based index of the least significant set bit of
 * value, or 0 when value == 0 (matches the POSIX/glibc ffsll contract).
 * Fix: the original compiled only under MSVC; a __builtin_ctzll branch
 * makes the shim portable with identical results. */
int ffsll(long long value)
{
#if defined(_MSC_VER)
	unsigned long index = 0;
	unsigned char isNonZero = _BitScanForward64(&index, value);
	return isNonZero ? (int)index + 1 : 0;
#else
	/* Cast to unsigned so LLONG_MIN's sign bit is scanned safely. */
	return value ? __builtin_ctzll((unsigned long long)value) + 1 : 0;
#endif
}
/* NOTE(review): despite the name `clz`, this returns the index of the
 * *lowest* set bit (a count of trailing zeros) — _BitScanForward scans from
 * the LSB.  A true count-leading-zeros would use _BitScanReverse.  Behavior
 * is kept as-is for existing callers.  Result is undefined for x == 0. */
static unsigned __inline clz (unsigned long x)
{
#if defined(_MSC_VER)
  unsigned long r;
# if defined (WORDSIZE) && (WORDSIZE == 64)
  _BitScanForward64 (&r, x);
# else
  /* BUG FIX: `_BitScanForward32` is not a real intrinsic and fails to
   * compile; the 32-bit form is plain _BitScanForward. */
  _BitScanForward (&r, x);
# endif
  return (r);
#elif defined(__GNUC__) || defined(__clang__)
  /* Portability fix: the bit-scan intrinsics exist only under MSVC. */
  return (unsigned) __builtin_ctzl (x);
#else
  /* Generic fallback. */
  unsigned n = 0;
  while (!(x & 1UL)) {
    x >>= 1;
    n++;
  }
  return n;
#endif
}
// Index of the least significant set bit of v.
// NOTE(review): result is undefined for v == 0 — the intrinsic leaves the
// output unspecified and the 32-bit fallback would recurse forever.
inline bitcount_t trailingzeros(uint64_t v) {
#if defined(_M_X64) || defined(_M_ARM64)
    // 64-bit targets: the 64-bit scan intrinsic is available.
    // BUG FIX: _M_ARM (32-bit ARM) was listed here, but MSVC provides
    // _BitScanForward64 only on 64-bit targets.
    unsigned long i;
    _BitScanForward64(&i, v);
    return i;
#else // 32-bit targets (x86, ARM)
    // Split into halves and scan 32 bits at a time.
    uint32_t high = v >> 32;
    uint32_t low = uint32_t(v);
    return low ? trailingzeros(low) : trailingzeros(high)+32;
#endif
}
// Index of the least significant set bit of word.
// NOTE(review): word must be non-zero — __builtin_ctzl and the bit-scan
// intrinsics are undefined for 0.
static inline int count_trailing_zeros(word_t word)
{
#if defined(__GNUC__)
    return __builtin_ctzl(word);
#elif defined(_MSC_VER)
    unsigned long index;
    // BUG FIX: the intrinsic call must not live inside assert() — under
    // NDEBUG the assert expression is discarded, the scan never runs, and
    // `index` is read uninitialized.
# if defined(_M_AMD64)
    unsigned char found = _BitScanForward64(&index, word);
# else
    unsigned char found = _BitScanForward(&index, word);
# endif
    assert(found != 0);
    (void)found;  // silence unused warning under NDEBUG
    return static_cast<int>(index);
#else
    // BUG FIX: the original #else branch was empty, falling off the end of a
    // non-void function (undefined behavior). Fail at compile time instead.
# error count_trailing_zeros: unsupported compiler
#endif
}
// Pop the lowest set bit of *bits and translate it into a prime:
// the bit's index selects an offset from bitValuesRaw_, added to base.
// Precondition: *bits != 0 (bit-scan of zero is undefined).
static FORCEINLINE uint64_t getNextPrime(uint64_t* bits, uint64_t base)
{
  unsigned long lowBitIndex;
  _BitScanForward64(&lowBitIndex, *bits);
  const uint64_t nextPrime = base + bitValuesRaw_[lowBitIndex];
  *bits &= *bits - 1;  // clear the lowest set bit
  return nextPrime;
}
/*-************************************ * Common functions **************************************/ static inline unsigned LZ4_NbCommonBytes (register reg_t val) { if (LZ4_isLittleEndian()) { if (sizeof(val)==8) { # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (int)(r>>3); # elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else /* 32 bits */ {
/* Bit-scan-forward over a 64-bit mask: on success stores the index of the
 * lowest set bit into *index and returns true; returns false when mask is 0
 * (mirroring _BitScanForward64's return convention). */
__INTRIN_INLINE bool bsf64(unsigned long* const index, const uint64_t mask)
{
#if defined(__GNUC__) || defined(__clang__)
    if (mask == 0)
        return false;
    *index = (unsigned long)__builtin_ctzll(mask);
    return true;
#elif defined(_MSC_VER)
    return _BitScanForward64(index, mask) != 0;
#else
#error Unsupported platform
#endif
}
/* Count and strip the trailing zero bits of *x: returns the number of
 * trailing zeros and right-shifts *x by that amount (leaving *x odd).
 * NOTE(review): *x must be non-zero — every branch (ctz of 0, scan of 0,
 * or the shift loop) misbehaves on zero.
 * Fixes: explicit `int` return type (implicit int is invalid since C99);
 * the _BitScan* intrinsics take `unsigned long *`, not `int *`. */
int
count_zeroes(size_t *x)
{
	int result;

#if defined(HAVE_BUILTIN_CTZL)
	result = __builtin_ctzl(*x);
	*x >>= result;
#elif defined(HAVE_BITSCANFORWARD64)
	unsigned long index;
	_BitScanForward64(&index, *x);
	result = (int)index;
	*x >>= result;
#elif defined(HAVE_BITSCANFORWARD)
	unsigned long index;
	_BitScanForward(&index, *x);
	result = (int)index;
	*x >>= result;
#else
	result = 0;
	while ((*x & 1) == 0) {
		++result;
		*x >>= 1;
	}
#endif
	return result;
}
/* ================== ================== */

// Shades every queued fragment bucket, then rebuilds the hierarchical
// depth pyramid (4x4 -> 16x16 -> 64x64) for the tiles touched this pass.
// NOTE(review): the SIMD helpers (set_all, load_u, min_vec, rotate_left,
// store_s), the Process_Fragment_* routines, pixel_shader and one_bit_64
// are defined elsewhere — comments below describe only what is visible here.
void Process_Fragments( raster_output_& raster_output, shader_input_& shader_input ) {

	const __m128 zero = set_all(0.0f);	// NOTE(review): unused in this function — confirm before removing
	shader_input.tile_mask_16x16 = 0x0;
	shader_input.tile_mask_64x64 = 0x0;
	//===============================================================================================
	// Fragments whose whole 64x64 tile was trivially accepted.
	{
		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_64x64];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {
			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_64x64][i_fragment];
			// buffer_mask_packed layout: buffer index in the high 16 bits,
			// 16-bit coverage mask in the low bits (same packing throughout).
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
			Process_Fragment_64x64( raster_fragment.w, i_buffer, coverage_mask, raster_output, shader_input );
		}
	}
	//===============================================================================================
	// Fragments whose 16x16 tile was trivially accepted.
	{
		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_16x16];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {
			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_16x16][i_fragment];
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
			Process_Fragment_16x16( raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input );
		}
	}
	//===============================================================================================
	// Fragments whose 4x4 tile was trivially accepted.
	{
		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_4x4];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {
			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_4x4][i_fragment];
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
			Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
		}
	}
	//===============================================================================================
	// Partially-covered 4x4 tiles — currently disabled.
	{
		//const __int32 start = raster_output_::MAX_FRAGMENTS - 1;
		//const __int32 end = raster_output.n_fragments[raster_output_::PARTIAL_ACCEPT_4x4];
		//for (__int32 i_fragment = start; i_fragment > end; i_fragment--) {
		//	raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::PARTIAL_ACCEPT_4x4][i_fragment];
		//	const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
		//	const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
		//	Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
		//}
	}
	//===============================================================================================
	// Completed fragments: run the pixel shader, publish shader_input.z_max
	// into the fragment's 4x4 depth tile, and mark the enclosing 16x16/64x64
	// tiles dirty for the pyramid rebuild passes below.
	{
		const __int32 n_fragments = raster_output.n_fragments_COMPLETE;
		__int32 n_depth_fragments = 0;	// NOTE(review): initialized but never used
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {
			raster_fragment_complete_& raster_fragment = raster_output.raster_fragment_complete[i_fragment];
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
			pixel_shader(i_buffer, coverage_mask, raster_fragment.bazza, shader_input);
			const __int32 i_buffer_depth_4x4 = i_buffer / (4 * 4);
			const __int32 i_buffer_depth_16x16 = i_buffer / (16 * 16);
			const __int32 i_buffer_depth_64x64 = i_buffer / (64 * 64);
			shader_input.depth_tiles_4x4[i_buffer_depth_4x4] = shader_input.z_max;
			shader_input.tile_mask_16x16 |= one_bit_64 << i_buffer_depth_16x16;
			shader_input.tile_mask_64x64 |= one_bit_64 << i_buffer_depth_64x64;
		}
	}
	//===============================================================================================
	// For each dirty 16x16 tile, reduce its sixteen 4x4 depth values with
	// min_vec + three lane rotations into a single value.
	// NOTE(review): the reduction uses min_vec although the variables are
	// named z_max — presumably a reversed-depth convention; confirm.
	{
		//printf_s(" %llu ", shader_input.tile_mask_16x16);
		__int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_16x16);
		for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {
			// Pop the lowest dirty tile index off the mask.
			unsigned long i_tile_16x16;
			_BitScanForward64(&i_tile_16x16, shader_input.tile_mask_16x16);
			shader_input.tile_mask_16x16 ^= one_bit_64 << i_tile_16x16;
			const __int32 i_tile_4x4 = i_tile_16x16 * (4 * 4);
			__m128 depth_4x4[4];
			depth_4x4[0] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (0 * 4));
			depth_4x4[1] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (1 * 4));
			depth_4x4[2] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (2 * 4));
			depth_4x4[3] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (3 * 4));
			__m128 z_max;
			z_max = depth_4x4[0];
			z_max = min_vec(depth_4x4[1], z_max);
			z_max = min_vec(depth_4x4[2], z_max);
			z_max = min_vec(depth_4x4[3], z_max);
			// Horizontal reduction across the four lanes.
			__m128 z_out = z_max;
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			shader_input.depth_tiles_16x16[i_tile_16x16] = store_s(z_out);
		}
	}
	// Same reduction one level up: 16x16 depths -> one value per 64x64 tile.
	{
		__int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_64x64);
		//printf_s(" %llu ", n_tiles);
		for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {
			unsigned long i_tile_64x64;
			_BitScanForward64(&i_tile_64x64, shader_input.tile_mask_64x64);
			shader_input.tile_mask_64x64 ^= one_bit_64 << i_tile_64x64;
			const __int32 i_tile_16x16 = i_tile_64x64 * (4 * 4);
			__m128 depth_16x16[4];
			depth_16x16[0] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (0 * 4));
			depth_16x16[1] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (1 * 4));
			depth_16x16[2] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (2 * 4));
			depth_16x16[3] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (3 * 4));
			__m128 z_max;
			z_max = depth_16x16[0];
			z_max = min_vec(depth_16x16[1], z_max);
			z_max = min_vec(depth_16x16[2], z_max);
			z_max = min_vec(depth_16x16[3], z_max);
			__m128 z_out = z_max;
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out);
			shader_input.depth_tiles_64x64[i_tile_64x64] = store_s(z_out);
		}
	}
}
//! 64-bit overload (tag-dispatched on mpl::int_<2>): index of the least
//! significant set bit of mask.
//! NOTE(review): result is undefined when mask == 0 (_BitScanForward64
//! leaves its output unspecified in that case).
BOOST_FORCEINLINE unsigned find_lsb(unsigned __int64 mask, const mpl::int_<2>&)
{
    unsigned long pos;
    _BitScanForward64(&pos, mask);
    return static_cast<unsigned>(pos);
}
// Thin wrapper over _BitScanForward64: stores the index of the lowest set
// bit of mask into *index.
// NOTE(review): when mask == 0 the intrinsic returns 0 and *index is left
// undefined — callers must guarantee a non-zero mask.  `size_t& mask` is
// only read; presumably non-const for interface-uniformity — confirm.
void inline BSF( unsigned long* index, size_t& mask ) { _BitScanForward64( index, mask ); }
DWORD WINAPI find_nonce(void* data) { #else void* find_nonce(void* data) { #endif bc_trit_t midStateCopyLow[STATE_LENGTH], midStateCopyHigh[STATE_LENGTH]; int i, shift; bc_trit_t nonce_probe, nonce_output; PDThread* my_thread = (PDThread*)data; char* trits = my_thread->trits; memset(midStateCopyLow, 0, STATE_LENGTH * sizeof(bc_trit_t)); memset(midStateCopyHigh, 0, STATE_LENGTH * sizeof(bc_trit_t)); PearlDiver* ctx = my_thread->ctx; memcpy(midStateCopyLow, my_thread->states->mid_low, STATE_LENGTH * sizeof(bc_trit_t)); memcpy(midStateCopyHigh, my_thread->states->mid_high, STATE_LENGTH * sizeof(bc_trit_t)); for (i = my_thread->threadIndex; i-- > 0;) { pd_increment(midStateCopyLow, midStateCopyHigh, NONCE_INIT_START, NONCE_INCREMENT_START); } bc_trit_t scratchpadLow[STATE_LENGTH], scratchpadHigh[STATE_LENGTH], stateLow[STATE_LENGTH], stateHigh[STATE_LENGTH]; memset(stateLow, 0, STATE_LENGTH * sizeof(bc_trit_t)); memset(stateHigh, 0, STATE_LENGTH * sizeof(bc_trit_t)); memset(scratchpadLow, 0, STATE_LENGTH * sizeof(bc_trit_t)); memset(scratchpadHigh, 0, STATE_LENGTH * sizeof(bc_trit_t)); while (ctxStatusEq(my_thread, ctx, PD_SEARCHING)) { pd_increment(midStateCopyLow, midStateCopyHigh, NONCE_INCREMENT_START, HASH_LENGTH); memcpy(stateLow, midStateCopyLow, STATE_LENGTH * sizeof(bc_trit_t)); memcpy(stateHigh, midStateCopyHigh, STATE_LENGTH * sizeof(bc_trit_t)); pd_transform(stateLow, stateHigh, scratchpadLow, scratchpadHigh); if ((nonce_probe = is_found_fast(stateLow, stateHigh, my_thread->min_weight_magnitude)) == 0) continue; #if defined(_WIN32) && !defined(__MINGW32__) #ifdef _WIN64 _BitScanForward64(&shift, nonce_probe); #else _BitScanForward(&shift, nonce_probe); #endif nonce_output = 1 << shift; EnterCriticalSection(&my_thread->ctx->new_thread_search); #else shift = __builtin_ctzll(nonce_probe); nonce_output = 1 << shift; pthread_mutex_lock(&my_thread->ctx->new_thread_search); #endif if (ctx->status != PD_FOUND) { ctx->status = PD_FOUND; for (i = 0; i < HASH_LENGTH; 
i++) { trits[i] = (((bc_trit_t)(midStateCopyLow[i]) & nonce_output) == 0) ? 1 : ((((bc_trit_t)(midStateCopyHigh[i]) & nonce_output) == 0) ? -1 : 0); } } #if defined(_WIN32) && !defined(__MINGW32__) LeaveCriticalSection(&my_thread->ctx->new_thread_search); #else pthread_mutex_unlock(&my_thread->ctx->new_thread_search); #endif return 0; } return 0; }
/* Intrinsic smoke test: forwards _BitScanForward64's result — non-zero iff
 * some bit of Mask is set, in which case *Index receives the 0-based
 * position of the lowest set bit.
 * NOTE(review): `unsigned LONG` presumably expands to a type compatible
 * with the intrinsic's `unsigned long *` parameter — confirm the LONG
 * macro/typedef in this translation unit. */
unsigned char test_BitScanForward64(unsigned LONG *Index, unsigned __int64 Mask) { return _BitScanForward64(Index, Mask); }
//! Find least significant one bit in 64-bit number //! @param data 64-bit number to scan //! @return index (0..63) of least significant one bit //! This algorithm uses _BitScanForward64 intrinsic to find LS1B. constexpr static unsigned ls1b(std::uint64_t data) { unsigned long result; _BitScanForward64(&result, mask); return result; }