Ejemplo n.º 1
0
Archivo: stdfn.c Proyecto: ahe01/rufus
/*
 * This call tries to evenly balance the affinities for an array of
 * num_threads, according to the number of cores at our disposal...
 */
BOOL SetThreadAffinity(DWORD_PTR* thread_affinity, size_t num_threads)
{
	size_t i, j, pc;
	DWORD_PTR affinity, dummy;

	memset(thread_affinity, 0, num_threads * sizeof(DWORD_PTR));
	if (!GetProcessAffinityMask(GetCurrentProcess(), &affinity, &dummy))
		return FALSE;
	uuprintf("\r\nThread affinities:");
	uuprintf("  avail:\t%s", printbitslz(affinity));

	// If we don't have enough virtual cores to evenly spread our load forget it
	pc = popcnt64(affinity);
	if (pc < num_threads)
		return FALSE;

	// Spread the affinity as evenly as we can
	thread_affinity[num_threads - 1] = affinity;
	for (i = 0; i < num_threads - 1; i++) {
		for (j = 0; j < pc / num_threads; j++) {
			thread_affinity[i] |= affinity & (-1LL * affinity);
			affinity ^= affinity & (-1LL * affinity);
		}
		uuprintf("  thr_%d:\t%s", i, printbitslz(thread_affinity[i]));
		thread_affinity[num_threads - 1] ^= thread_affinity[i];
	}
	uuprintf("  thr_%d:\t%s", i, printbitslz(thread_affinity[i]));
	return TRUE;
}
Ejemplo n.º 2
0
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
// Windows & generic platforms:

#ifdef PLATFORM_64_BIT
    if(size%sizeof(uint64_t) == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else if(size%sizeof(uint32_t) == 0)
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
    else
    {
      const ElementType * a2 = reinterpret_cast<const ElementType*> (a);
      const ElementType * b2 = reinterpret_cast<const ElementType*> (b);
      for (size_t i = 0;
           i < size / (sizeof(unsigned char)); ++i) {
        result += pop_count_LUT[a2[i] ^ b2[i]];
      }
    }
#else // PLATFORM_64_BIT
    if(size%sizeof(uint32_t) == 0)
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
    else
    {
      const ElementType * a2 = reinterpret_cast<const ElementType*> (a);
      const ElementType * b2 = reinterpret_cast<const ElementType*> (b);
      for (size_t i = 0;
           i < size / (sizeof(unsigned char)); ++i) {
        result += pop_count_LUT[a2[i] ^ b2[i]];
      }
    }
#endif // PLATFORM_64_BIT
    return result;
  }
Ejemplo n.º 3
0
PiTable::PiTable(uint64_t max) :
  max_(max)
{
  pi_.resize(max / 64 + 1);
  primesieve::iterator it(0, max);

  uint64_t pix = 0;
  uint64_t prime = 0;

  while ((prime = it.next_prime()) <= max)
    pi_[prime / 64].bits |= 1ull << (prime % 64);

  for (auto& i : pi_)
  {
    i.prime_count = pix;
    pix += popcnt64(i.bits);
  }
}
Ejemplo n.º 4
0
int fw_sync_timebase( void ) 
{

    uint64_t numloops = 10;
    uint64_t value;
    uint64_t rc;
    Personality_t *pers = &FW_Personality;    
    uint64_t numthreads;
    uint64_t msr;
    uint64_t geamap8 = 0;
    
    if(!PERS_ENABLED(PERS_ENABLE_MU))
        return 0;    

    if(!PERS_ENABLED(PERS_ENABLE_ND))
        return 0;    
    
    msr = mfmsr();
    mtmsr(msr & ~(MSR_EE | MSR_CE | MSR_ME));
    isync();
    
    numthreads = popcnt64(DCRReadPriv(TESTINT_DCR(THREAD_ACTIVE0))) + popcnt64(DCRReadPriv(TESTINT_DCR(THREAD_ACTIVE1)));
    if(PhysicalThreadID() == 0)
    {
#define WU_MMIO_PRIV_BASE ((volatile unsigned long *)0x3ffe8001c00)
#define SET_THREAD(i)           ((0x300 + (i)*0x40) / sizeof (unsigned long))
        WU_MMIO_PRIV_BASE[SET_THREAD(0)] = WU_DCR__THREAD0_WU_EVENT_SET__GEA_WU_EN_set(0x8);
        
        if(ProcessorID() == 0)
        {
            // Setup classroute 14.  Identical to classroute 15.
            value = DCRReadPriv(ND_500_DCR(CTRL_GI_CLASS_14_15));
            ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_I_insert(value, ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS15_UP_PORT_I_get(value));
            ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_O_insert(value, ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS15_UP_PORT_O_get(value));
            DCRWritePriv(ND_500_DCR(CTRL_GI_CLASS_14_15), value);
            ppc_msync();
            
            // Initialize GI pulse
            MUSPI_GIInit (&GI, 14, 0);
            
            // Initialize the GI barrier interrupt on classroute 14
            DCRWritePriv(MU_DCR(BARRIER_INT_EN), MU_DCR__BARRIER_INT_EN__CLASS14_set(4));
            
            // Route MU MAP4 interrupt to GEA lane 12 (wakeup unit bit 0)
            geamap8 = DCRReadPriv(GEA_DCR(GEA_INTERRUPT_MAP8));
            DCRWritePriv(GEA_DCR(GEA_INTERRUPT_MAP8), GEA_DCR__GEA_INTERRUPT_MAP8__MU_MAP4_set(12));
            rc = MUSPI_GIBarrierInit(&GIBarrier, 15);
        }
        
        // do local barrier
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads)
        {
        }
        
        if(ProcessorID() == 0)
        {
            // Perform a barrier across all nodes.
            MUSPI_GIBarrierEnterAndWait(&GIBarrier);
            if ( rc != 0 )
            {
                FW_Warning("MUSPI_GIBarrierInit for class route 15 returned rc = %ld.", rc);
                return -1;
            }
            
            // Start gsync counter (for debug)
            DCRWritePriv(TESTINT_DCR(GSYNC_CTR), -1);
        }
        doTimeSync(numloops);
        mtspr(SPRN_TENS, 0xf);
    }
    else if((ProcessorID() == 1) && (pers->Network_Config.PrimordialClassRoute.GlobIntUpPortOutputs == 0))
    {
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        createSendGIPulseThread(numloops);
    }
    else
    {
        BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
        mtspr(SPRN_TENC, 1 << ProcessorThreadID());
        isync();
    }
    
    // Wait for all hwthreads on node
    BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
    while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads * 2)
    {
    }
    
    if(ProcessorID() == 0)
    {
        value = DCRReadPriv(ND_500_DCR(CTRL_GI_CLASS_14_15));
        ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_I_insert(value, 0);
        ND_500_DCR__CTRL_GI_CLASS_14_15__CLASS14_UP_PORT_O_insert(value, 0);
        DCRWritePriv(ND_500_DCR(CTRL_GI_CLASS_14_15), value);
        ppc_msync();
        
        // Initialize the barrier structure.  
        DCRWritePriv(MU_DCR(BARRIER_INT_EN), MU_DCR__BARRIER_INT_EN__CLASS14_set(0));
        DCRWritePriv(GEA_DCR(GEA_INTERRUPT_MAP8), geamap8);
    }
    WU_MMIO_PRIV_BASE[SET_THREAD(0)] = WU_DCR__THREAD0_WU_EVENT_SET__GEA_WU_EN_set(0);
    
    BeDRAM_ReadIncSat(BeDRAM_LOCKNUM_TIMESYNC_BARRIER);
    while(BeDRAM_Read(BeDRAM_LOCKNUM_TIMESYNC_BARRIER) != numthreads * 3)
    {
    }
    
    mtmsr(msr);
    isync();
    
    return 0;
}
Ejemplo n.º 5
0
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
      uint32x4_t bits = vmovq_n_u32(0);
      for (size_t i = 0; i < size; i += 16) {
        uint8x16_t A_vec = vld1q_u8 (a + i);
        uint8x16_t B_vec = vld1q_u8 (b + i);
        uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
        uint8x16_t bitsSet = vcntq_u8 (AxorB);
        uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
        uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
        bits = vaddq_u32(bits, bitSet4);
      }
      uint64x2_t bitSet2 = vpaddlq_u32 (bits);
      result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
      result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
      //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
      typedef unsigned long long pop_t;
      const size_t modulo = size % sizeof(pop_t);
      const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
      const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
      const pop_t* a2_end = a2 + (size / sizeof(pop_t));

      for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

      if (modulo) {
        //in the case where size is not dividable by sizeof(pop_t)
        //need to mask off the bits at the end
        pop_t a_final = 0, b_final = 0;
        memcpy(&a_final, a2, modulo);
        memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
      }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if(size%64 == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
      result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
  }