void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
{
   // An alignment of zero makes no sense for this function -
   // use a plain allocation instead.
   WALBERLA_ASSERT_GREATER( alignment, 0 );
   // alignment has to be a power of two (bit trick is valid since alignment > 0)
   WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
   WALBERLA_ASSERT_LESS( offset, alignment );

   if( offset == 0 )
   {
      // No shifting needed: hand out the raw allocation directly.
      void *allocated = nullptr;
      WALBERLA_CUDA_CHECK( cudaMalloc( &allocated, size ) );
      freePointers_[allocated] = allocated;
      return allocated;
   }

   // Over-allocate by one alignment unit so the returned pointer can be
   // shifted forward such that (returned pointer + offset) lies on an
   // alignment boundary.
   void *allocated = nullptr;   // raw allocation - this is what must be freed later
   WALBERLA_CUDA_CHECK( cudaMalloc( &allocated, size + alignment ));
   WALBERLA_CHECK_EQUAL(size_t(allocated) % alignment, 0 , "CUDA malloc did not return memory with requested alignment");

   // Shift by (alignment - offset): since 'allocated' is alignment-aligned,
   // the shifted pointer plus 'offset' is alignment-aligned again.
   void *shifted = (void *) ((char *) (allocated) + alignment - offset);

   // Remember the raw pointer under the user-visible one so it can be freed.
   freePointers_[shifted] = allocated;
   WALBERLA_ASSERT_EQUAL(((size_t) shifted + offset) % alignment, 0 );
   return shifted;
}
/// Distributes the blocks of 'forest' onto 'numberOfProcesses' processes,
/// balancing workload separately on every level along a space-filling curve
/// (Hilbert or Morton order, depending on 'hilbert_').
/// Returns the number of processes that actually received blocks.
/// NOTE(review): the per-process memory limit parameter is currently ignored.
uint_t StaticLevelwiseCurveBalanceWeighted::operator()( SetupBlockForest & forest, const uint_t numberOfProcesses, const memory_t /*perProcessMemoryLimit*/ )
{
   // TODO: take per process memory limit into account?

   // Fetch all blocks in space-filling-curve order so that consecutive
   // blocks (= blocks assigned to the same process) are spatially close.
   std::vector< SetupBlock * > blocks;
   if( hilbert_ )
      forest.getHilbertOrder( blocks );
   else
      forest.getMortonOrder( blocks );

   uint_t usedProcesses( uint_t(0) );

   // Balance each level independently: every process should receive roughly
   // the same share of this level's total workload.
   for( uint_t level = uint_t(0); level < forest.getNumberOfLevels(); ++level )
   {
      // Collect the blocks of this level, preserving curve order.
      std::vector< SetupBlock * > blocksOnLevel;

      for( auto block = blocks.begin(); block != blocks.end(); ++block )
         if( (*block)->getLevel() == level )
            blocksOnLevel.push_back( *block );

      // Sum of all (non-negative) block workloads on this level.
      workload_t totalWeight( 0 );
      for( auto block = blocksOnLevel.begin(); block != blocksOnLevel.end(); ++block )
      {
         WALBERLA_ASSERT( !( (*block)->getWorkload() < workload_t(0) ) );
         totalWeight += (*block)->getWorkload();
      }

      // Greedily walk the curve: give each process consecutive blocks until
      // its share ('minWeight') of the REMAINING weight is reached. Recomputing
      // minWeight from the remaining weight/processes keeps the distribution
      // fair even when individual block weights are lumpy.
      uint_t c( uint_t(0) ); // index of the next unassigned block on this level
      for( uint_t p = uint_t(0); p != numberOfProcesses; ++p )
      {
         const workload_t minWeight = totalWeight / workload_c( numberOfProcesses - p );
         workload_t weight( 0 );
         while( weight < minWeight && c < blocksOnLevel.size() )
         {
            blocksOnLevel[c]->assignTargetProcess(p);

            WALBERLA_ASSERT_LESS_EQUAL( p, usedProcesses );
            usedProcesses = p + uint_t(1);

            const workload_t addedWeight = blocksOnLevel[c]->getWorkload();
            weight += addedWeight;
            totalWeight -= addedWeight;
            ++c;
         }
      }
      // Any blocks left over (e.g. due to floating point rounding of the
      // weight shares) go to the last process.
      while( c < blocksOnLevel.size() )
      {
         blocksOnLevel[c]->assignTargetProcess( numberOfProcesses - uint_t(1) );

         WALBERLA_ASSERT_LESS_EQUAL( numberOfProcesses - uint_t(1), usedProcesses );
         usedProcesses = numberOfProcesses;

         ++c;
      }
   }

   return usedProcesses;
}
// Initializes MPI (if not already initialized by someone else) and caches
// the world communicator's size and this process' rank.
WALBERLA_MPI_SECTION()
{
   WALBERLA_ASSERT( !isMPIInitialized_ );

   // Check first that MPI was not initialized before
   // f.e. when using Python, MPI could have been initialized by
   // a different MPI module like mpi4py
   int mpiAlreadyInitialized=0;
   MPI_Initialized( &mpiAlreadyInitialized );
   if ( ! mpiAlreadyInitialized )
   {
      MPI_Init( argc, argv );
      // Only finalize MPI on destruction if WE initialized it - never
      // finalize an MPI runtime owned by external code (e.g. mpi4py).
      finalizeOnDestruction_ = true;
   }

   isMPIInitialized_ = true;

   // Cache world size and own rank for later queries.
   MPI_Comm_size( MPI_COMM_WORLD, &numProcesses_ );
   MPI_Comm_rank( MPI_COMM_WORLD, &worldRank_ );

   // Install a custom terminate handler so uncaught exceptions can abort
   // the whole MPI job instead of deadlocking the other ranks.
   if( abortOnException )
      std::set_terminate( customTerminateHandler );
}
/// Complexity is O(N), where N == this->size() CellInterval CellSet::boundingBox() const { WALBERLA_ASSERT( !empty() ); Set<Cell>::const_iterator beginIt = Set<Cell>::begin(); Set<Cell>::const_iterator endIt = Set<Cell>::end(); CellInterval interval( beginIt->x(), beginIt->y(), beginIt->z(), beginIt->x(), beginIt->y(), beginIt->z() ); for( Set<Cell>::const_iterator cellIt = ++beginIt; cellIt != endIt; ++cellIt ) { if( cellIt->x() < interval.xMin() ) interval.xMin() = cellIt->x(); if( cellIt->y() < interval.yMin() ) interval.yMin() = cellIt->y(); if( cellIt->z() < interval.zMin() ) interval.zMin() = cellIt->z(); if( cellIt->x() > interval.xMax() ) interval.xMax() = cellIt->x(); if( cellIt->y() > interval.yMax() ) interval.yMax() = cellIt->y(); if( cellIt->z() > interval.zMax() ) interval.zMax() = cellIt->z(); } return interval; }
void Block::resetNeighborhood( const PhantomBlock & phantom ) { std::map< BlockID, uint_t > neighborhoodMapping; neighborhood_.clear(); for( uint_t i = 0; i != phantom.getNeighborhoodSize(); ++i ) { neighborhood_.emplace_back( forest_, phantom.getNeighborId(i), phantom.getNeighborProcess(i), phantom.getNeighborState(i) ); neighborhoodMapping[ phantom.getNeighborId(i) ] = i; } for( uint_t i = 0; i != 26; ++i ) { neighborhoodSection_[i].clear(); for( uint_t j = 0; j != phantom.getNeighborhoodSectionSize(i); ++j ) { WALBERLA_ASSERT( neighborhoodMapping.find( phantom.getNeighborId(i,j) ) != neighborhoodMapping.end() ); neighborhoodSection_[i].push_back( &(neighborhood_[ neighborhoodMapping[phantom.getNeighborId(i,j)] ]) ); } } }
/*******************************************************************************************************************//** * \brief Gets all prime factors of a number. * * Uses trial division algorithm. * See http://en.wikipedia.org/w/index.php?title=Trial_division&oldid=518625973. * * \param n The number to be factorized. * * \pre n > 0 * * \return The prime factors in ascending order. **********************************************************************************************************************/ std::vector<uint_t> getPrimeFactors( const uint_t n ) { WALBERLA_ASSERT( n != 0 ); auto primes = getPrimes(n); std::vector<uint_t> primeFactors; uint_t n_rest = n; for(auto primeIt = primes.begin(); primeIt != primes.end(); ++primeIt) { if( *primeIt * *primeIt > n ) break; while( n_rest % *primeIt == 0) { n_rest /= *primeIt; primeFactors.push_back(*primeIt); } } if( n_rest != 1 ) primeFactors.push_back(n_rest); return primeFactors; }