/*
 * Try to evenly balance the affinities for an array of num_threads threads,
 * according to the number of virtual cores the current process may run on.
 *
 * Each of the first num_threads-1 entries receives floor(pc / num_threads)
 * core bits (pc = number of usable cores); the last entry receives whatever
 * remains, so every usable core is assigned to exactly one thread.
 *
 * Returns TRUE on success, FALSE if the arguments are invalid, the process
 * affinity mask cannot be read, or there are fewer cores than threads.
 */
BOOL SetThreadAffinity(DWORD_PTR* thread_affinity, size_t num_threads)
{
	size_t i, j, pc;
	DWORD_PTR affinity, dummy;

	// Guard against an empty/NULL array: with num_threads == 0 the code
	// below would index thread_affinity[num_threads - 1] == [SIZE_MAX].
	if (thread_affinity == NULL || num_threads == 0)
		return FALSE;

	memset(thread_affinity, 0, num_threads * sizeof(DWORD_PTR));
	if (!GetProcessAffinityMask(GetCurrentProcess(), &affinity, &dummy))
		return FALSE;
	uuprintf("\r\nThread affinities:");
	uuprintf(" avail:\t%s", printbitslz(affinity));

	// If we don't have enough virtual cores to evenly spread our load forget it
	pc = popcnt64(affinity);
	if (pc < num_threads)
		return FALSE;

	// Spread the affinity as evenly as we can
	thread_affinity[num_threads - 1] = affinity;
	for (i = 0; i < num_threads - 1; i++) {
		for (j = 0; j < pc / num_threads; j++) {
			// affinity & (-affinity) isolates the lowest set bit;
			// grant it to thread i, then clear it from the pool.
			thread_affinity[i] |= affinity & (-1LL * affinity);
			affinity ^= affinity & (-1LL * affinity);
		}
		// Cast to int: %d must not be fed a 64-bit size_t through varargs.
		uuprintf(" thr_%d:\t%s", (int)i, printbitslz(thread_affinity[i]));
		// Remove thread i's cores from the last thread's catch-all mask.
		thread_affinity[num_threads - 1] ^= thread_affinity[i];
	}
	uuprintf(" thr_%d:\t%s", (int)i, printbitslz(thread_affinity[i]));
	return TRUE;
}
/// Returns the Hamming distance (number of differing bits) between two
/// binary vectors of `size` bytes. The widest word size that evenly divides
/// the byte count is used for popcounting; otherwise a per-byte lookup
/// table handles the general case.
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
    // Windows & generic platforms:
#ifdef PLATFORM_64_BIT
    if (size % sizeof(uint64_t) == 0) {
        // Byte count is a multiple of 8: popcount 64 bits at a time.
        const uint64_t* wa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* wb = reinterpret_cast<const uint64_t*>(b);
        const uint64_t* wa_end = wa + size / (sizeof(uint64_t) / sizeof(unsigned char));
        while (wa != wa_end) {
            result += popcnt64(*wa++ ^ *wb++);
        }
    }
    else if (size % sizeof(uint32_t) == 0) {
        // Byte count is a multiple of 4: popcount 32 bits at a time.
        const uint32_t* wa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* wb = reinterpret_cast<const uint32_t*>(b);
        const uint32_t* wa_end = wa + size / (sizeof(uint32_t) / sizeof(unsigned char));
        while (wa != wa_end) {
            result += popcnt32(*wa++ ^ *wb++);
        }
    }
    else {
        // Arbitrary byte count: fall back to the per-byte popcount table.
        const ElementType* ea = reinterpret_cast<const ElementType*>(a);
        const ElementType* eb = reinterpret_cast<const ElementType*>(b);
        for (size_t k = 0; k < size / (sizeof(unsigned char)); ++k) {
            result += pop_count_LUT[ea[k] ^ eb[k]];
        }
    }
#else // PLATFORM_64_BIT
    if (size % sizeof(uint32_t) == 0) {
        // Byte count is a multiple of 4: popcount 32 bits at a time.
        const uint32_t* wa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* wb = reinterpret_cast<const uint32_t*>(b);
        const uint32_t* wa_end = wa + size / (sizeof(uint32_t) / sizeof(unsigned char));
        while (wa != wa_end) {
            result += popcnt32(*wa++ ^ *wb++);
        }
    }
    else {
        // Arbitrary byte count: fall back to the per-byte popcount table.
        const ElementType* ea = reinterpret_cast<const ElementType*>(a);
        const ElementType* eb = reinterpret_cast<const ElementType*>(b);
        for (size_t k = 0; k < size / (sizeof(unsigned char)); ++k) {
            result += pop_count_LUT[ea[k] ^ eb[k]];
        }
    }
#endif // PLATFORM_64_BIT
    return result;
}
/// Build a compressed prime-counting table covering [0, max].
/// Each entry represents a segment of 64 consecutive integers and stores a
/// bitmap of the primes inside it plus the count of primes below the
/// segment, so pi(x) can later be answered with one array access and one
/// popcount.
PiTable::PiTable(uint64_t max) : max_(max)
{
  pi_.resize(max / 64 + 1);

  // Set the bit for every prime <= max in its segment's bitmap.
  primesieve::iterator iter(0, max);
  for (uint64_t p = iter.next_prime(); p <= max; p = iter.next_prime())
    pi_[p / 64].bits |= 1ull << (p % 64);

  // Record the exclusive prefix sum of per-segment prime counts.
  uint64_t count = 0;
  for (auto& entry : pi_)
  {
    entry.prime_count = count;
    count += popcnt64(entry.bits);
  }
}
/*
 * fw_sync_timebase() - runs on every active hardware thread of the node and
 * synchronizes the timebase with the other nodes via a global-interrupt (GI)
 * barrier on classroute 14/15, then re-enables the timebase on all threads.
 * Requires both the Messaging Unit (MU) and the Network Device (ND) to be
 * enabled in the personality; otherwise it is a no-op.
 * Returns 0 on success, -1 if the GI barrier initialization failed.
 */
int fw_sync_timebase( void )
{
    uint64_t numloops = 10;                 // iterations passed to doTimeSync/createSendGIPulseThread
    uint64_t value;
    uint64_t rc;                            // NOTE(review): only assigned when ProcessorID() == 0;
                                            // the later "rc != 0" test is guarded by the same condition
    Personality_t *pers = &FW_Personality;
    uint64_t numthreads;                    // active hw threads on this node
    uint64_t msr;                           // saved machine state register
    uint64_t geamap8 = 0;                   // saved GEA interrupt map 8, restored at the end

    // Nothing to do unless both MU and ND are enabled in the personality.
    if(!PERS_ENABLED(PERS_ENABLE_MU))
        return 0;
    if(!PERS_ENABLED(PERS_ENABLE_ND))
        return 0;

    // Mask external, critical, and machine-check interrupts for the duration.
    msr = mfmsr();
    mtmsr(msr & ~(MSR_EE | MSR_CE | MSR_ME));
    isync();

    // Count active hardware threads from the two THREAD_ACTIVE DCR words.
    numthreads = popcnt64(DCRReadPriv(TESTINT_DCR(THREAD_ACTIVE0))) + popcnt64(DCRReadPriv(TESTINT_DCR(THREAD_ACTIVE1)));

    if(PhysicalThreadID() == 0)
    {
        // Wakeup-unit privileged MMIO base and per-thread "event set" offset.
        // (Note: #defines are file-scope; they remain visible below.)
#define WU_MMIO_PRIV_BASE ((volatile unsigned long *)0x3ffe8001c00)
#define SET_THREAD(i)     ((0x300 + (i)*0x40) / sizeof (unsigned long))

        // Enable the GEA wakeup event for hardware thread 0.
        WU_MMIO_PRIV_BASE[SET_THREAD(0)] = WU_DCR__THREAD0_WU_EVENT_SET__GEA_WU_EN_set(0x8);

        if(ProcessorID() == 0)
        {
            // Setup classroute 14. Identical to classroute 15.
            value = DCRReadPriv(ND_500_DCR(CTRL_GI_CLASS_14_15));
            ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_I_insert(value, ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS15_UP_PORT_I_get(value));
            ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_O_insert(value, ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS15_UP_PORT_O_get(value));
            DCRWritePriv(ND_500_DCR(CTRL_GI_CLASS_14_15), value);
            ppc_msync();

            // Initialize GI pulse
            MUSPI_GIInit (&GI, 14, 0);

            // Initialize the GI barrier interrupt on classroute 14
            DCRWritePriv(MU_DCR(BARRIER_INT_EN), MU_DCR__BARRIER_INT_EN__CLASS14_set(4));

            // Route MU MAP4 interrupt to GEA lane 12 (wakeup unit bit 0),
            // saving the previous mapping so it can be restored on exit.
            geamap8 = DCRReadPriv(GEA_DCR(GEA_INTERRUPT_MAP8));
            DCRWritePriv(GEA_DCR(GEA_INTERRUPT_MAP8), GEA_DCR__GEA_INTERRUPT_MAP8__MU_MAP4_set(12));

            rc = MUSPI_GIBarrierInit(&GIBarrier, 15);
        }

        // do local barrier: every thread bumps the counter, spin until all arrive
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads)
        {
        }

        if(ProcessorID() == 0)
        {
            // Perform a barrier across all nodes.
            MUSPI_GIBarrierEnterAndWait(&GIBarrier);
            // NOTE(review): rc from MUSPI_GIBarrierInit is tested only AFTER
            // entering the barrier — confirm this ordering is intentional.
            if ( rc != 0 )
            {
                FW_Warning("MUSPI_GIBarrierInit for class route 15 returned rc = %ld.", rc);
                return -1;
            }

            // Start gsync counter (for debug)
            DCRWritePriv(TESTINT_DCR(GSYNC_CTR), -1);
        }

        // Run the timebase synchronization loop, then enable all thread timebases.
        doTimeSync(numloops);
        mtspr(SPRN_TENS, 0xf);
    }
    else if((ProcessorID() == 1) && (pers->Network_Config.PrimordialClassRoute.GlobIntUpPortOutputs == 0))
    {
        // Root node, thread 1: contribute to the local barrier, then spawn
        // the thread that sends the GI pulses driving the sync rounds.
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        createSendGIPulseThread(numloops);
    }
    else
    {
        // All other threads: contribute to the local barrier and enable this
        // thread's timebase bit.
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        mtspr(SPRN_TENC, 1 << ProcessorThreadID());
        isync();
    }

    // Wait for all hwthreads on node (second rendezvous: counter reaches 2x)
    BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
    while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads * 2)
    {
    }

    if(ProcessorID() == 0)
    {
        // Tear down classroute 14 (clear the GI up-port routing used above).
        value = DCRReadPriv(ND_500_DCR(CTRL_GI_CLASS_14_15));
        ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_I_insert(value, 0);
        ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_O_insert(value, 0);
        DCRWritePriv(ND_500_DCR(CTRL_GI_CLASS_14_15), value);
        ppc_msync();

        // Initialize the barrier structure.
        DCRWritePriv(MU_DCR(BARRIER_INT_EN), MU_DCR__BARRIER_INT_EN__CLASS14_set(0));
        // Restore the GEA interrupt mapping saved earlier.
        DCRWritePriv(GEA_DCR(GEA_INTERRUPT_MAP8), geamap8);
    }

    // Disable the GEA wakeup event enabled at the start.
    WU_MMIO_PRIV_BASE[SET_THREAD(0)] = WU_DCR__THREAD0_WU_EVENT_SET__GEA_WU_EN_set(0);

    // Final rendezvous (counter reaches 3x) before restoring interrupts.
    BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
    while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads * 3)
    {
    }

    // Restore the saved MSR (re-enables the masked interrupts).
    mtmsr(msr);
    isync();
    return 0;
}
/**
 * Computes the Hamming distance (number of differing bits) between two
 * binary descriptors of `size` bytes.
 *
 * On GCC/Clang with USE_SSE defined, either a NEON popcount kernel or a
 * portable __builtin_popcountll loop is used and the function returns early.
 * Otherwise the generic word-at-a-time paths below run.
 */
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
        // NOTE(review): this path reads 16 bytes per iteration (vld1q_u8),
        // so it assumes size is a multiple of 16 — confirm against callers.
        uint32x4_t bits = vmovq_n_u32(0);
        for (size_t i = 0; i < size; i += 16) {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            uint8x16_t B_vec = vld1q_u8 (b + i);
            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8 (AxorB);      // per-byte popcount
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);  // pairwise widen + add
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        // Fold the four 32-bit lanes into two 64-bit lanes, then extract the
        // low 32 bits of each (s32 lanes 0 and 2 of the reinterpreted vector).
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
        // for portability just use unsigned long long -- and use the
        // __builtin_popcountll (see docs for __builtin_popcountll)
        typedef unsigned long long pop_t;
        const size_t modulo = size % sizeof(pop_t);
        const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
        const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
        const pop_t* a2_end = a2 + (size / sizeof(pop_t));

        for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

        if (modulo) {
            // in the case where size is not dividable by sizeof(pop_t)
            // need to mask off the bits at the end
            pop_t a_final = 0, b_final = 0;
            memcpy(&a_final, a2, modulo);
            memcpy(&b_final, b2, modulo);
            result += __builtin_popcountll(a_final ^ b_final);
        }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    // BUGFIX: was `size % 64 == 0`, which tested divisibility by 64 *bytes*
    // instead of sizeof(uint64_t) (8 bytes), so common descriptor sizes
    // (8, 16, ..., 56 bytes) were needlessly routed to the 32-bit path.
    if(size % sizeof(uint64_t) == 0) {
        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
        size /= (sizeof(uint64_t)/sizeof(unsigned char));
        for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
            result += popcnt64(*pa ^ *pb);
        }
    }
    else {
        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
        // BUGFIX: the tail bytes (size % 4) were previously dropped; fold
        // them in via a zero-padded word, as the popcountll branch does.
        const size_t tail = size % sizeof(uint32_t);
        size /= (sizeof(uint32_t)/sizeof(unsigned char));
        for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
            result += popcnt32(*pa ^ *pb);
        }
        if (tail) {
            uint32_t a_final = 0, b_final = 0;
            memcpy(&a_final, pa, tail);
            memcpy(&b_final, pb, tail);
            result += popcnt32(a_final ^ b_final);
        }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    // BUGFIX: the tail bytes (size % 4) were previously dropped; fold them
    // in via a zero-padded word, as the popcountll branch does.
    const size_t tail = size % sizeof(uint32_t);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
    }
    if (tail) {
        uint32_t a_final = 0, b_final = 0;
        memcpy(&a_final, pa, tail);
        memcpy(&b_final, pb, tail);
        result += popcnt32(a_final ^ b_final);
    }
#endif
    return result;
}