int main(){ //calculate median of rdtsc call overhead - we will subtract it from all the tests we do int64_t overhead = calc_rdtsc_overhead(); printf("Clock Measurement Overhead: %ld Cycles\n",overhead); uint64_t testDigit = 1334782398988024; uint32_t TRIALS=1000000; //measure 1M times - set it to lower value if your function is slow - the faster your function, more measurements you need - in my experiments, for fastest functions, values seem to converge within 1M iterations measure_time(Slow Digits ,TRIALS,digits10_slow(testDigit),overhead); //default slow digits10 measure_time(Fast Digits ,TRIALS,digits10_fast(testDigit),overhead); //fast digits10 measure_time(Faster Digits,TRIALS,digits10_faster(testDigit),overhead); //faster digits10 measure_time(Small Function,TRIALS,smallFunction(i),overhead); //an example of measuring a very low-overhead function - it should be about 3 cycles on x86_64 (1 push, 1 mov, 1 ret) - we pass it a non-static argument (using loop variable i in measure_time macro body) so that it is not optimized away by the compiler from n calls to 1 call }
int main(){ //calculate median of rdtsc call overhead - we will subtract it from all the tests we do int64_t overhead = calc_rdtsc_overhead(); printf("Median Clock Measurement Overhead: %ld Cycles\n",overhead); uint64_t SIZE = 1024*1024; int64_t* array = malloc(SIZE*sizeof(int64_t)); for(int i=0; i < SIZE; ++i) array[i] = i; uint32_t TRIALS=1000000; //measure 1M times - set it to lower value if your function is slow - the faster your function, more measurements you need - in my experiments, for fastest functions, values seem to converge within 1M iterations measure_time(CPP Binary Search,TRIALS,bin_long_cpp(array,SIZE,23456),overhead); //CPP binary search measure_time(C Binary Search,TRIALS,bin_long_c(array,SIZE,23456),overhead); //C binary search }
// Appends one row per grid coordinate to the simulation's "diffusion" data
// table. Every row carries the iteration number, the total simulated time
// and the x/y coordinate, followed by one column per signal holding that
// signal's value at the coordinate.
//
// `dt` is not used by this function.
void StoreState::update(simulator::Simulation& simulation, units::Time dt)
{
    // Kept alive until the end of the function (presumably a scoped timer).
    auto timer = measure_time("diffusion.store-state", simulator::TimeMeasurementIterationOutput(simulation));

    auto& diffusionTable = simulation.getDataTable("diffusion");

    for (auto&& coord : range(m_diffusionModule->getGridSize()))
    {
        // Fixed columns shared by every row
        auto rowId = diffusionTable.addRow(
            makePair("iteration", simulation.getIteration()),
            makePair("totalTime", simulation.getTotalTime().value()),
            makePair("x", coord.getX()),
            makePair("y", coord.getY())
        );

        // One additional column per signal, named after the signal
        for (auto id : m_diffusionModule->getSignalIds())
        {
            diffusionTable.setData(rowId,
                makePair(
                    m_diffusionModule->getSignalName(id),
                    m_diffusionModule->getSignal(id, coord).value()
                )
            );
        }
    }
}
void Simulation::deleteObjects() { auto _ = measure_time("sim.delete", TimeMeasurementIterationOutput(this)); // Remove deleted objects m_objects.removeDeleted(); }
// Performs one agglutination step:
//   1. remembers the time step,
//   2. creates a weld joint for every pending body pair in m_toJoin,
//   3. walks all joints of the physics world and, for those tagged as ours,
//      decides at random (Bernoulli trial with the disassociation propensity)
//      whether the bond is released,
//   4. destroys the released joints together with their user data.
void Module::update(simulator::Simulation& simulation, units::Time dt)
{
    // Store time step
    m_step = dt;

    auto _ = measure_time("agglutination", simulator::TimeMeasurementIterationOutput(simulation));

    // Get physics world
    auto& world = simulation.getWorld();

    // Foreach pending bodies
    for (const auto& p : m_toJoin)
    {
        b2WeldJointDef joint;
        joint.Initialize(p.bodyA, p.bodyB, p.bodyA->GetWorldCenter());

        // The user data is owned by this module; it is released below,
        // right before the joint that carries it is destroyed.
        JointUserData* jUserData = new JointUserData();
        jUserData->module = this;
        jUserData->Kd = p.dConst;
        joint.userData = jUserData;

        world.CreateJoint(&joint);
    }

    m_toJoin.clear();

    // Joints to remove
    DynamicArray<b2Joint*> toRemove;

    // Foreach active joints
    for (auto joint = world.GetJointList(); joint != nullptr; joint = joint->GetNext())
    {
        // The user data travels through Box2D as an untyped pointer;
        // static_cast is the appropriate cast for converting it back.
        const auto* jUserData = static_cast<const JointUserData*>(joint->GetUserData());

        // Not our joint
        if (jUserData == nullptr)
            continue;

        if (jUserData->guard != '@')
            continue;

        std::bernoulli_distribution dist(
            getDisassociationPropensity(
                m_step,
                jUserData->Kd
            )
        );

        if (dist(g_gen))
        {
            Log::debug("Released: ", joint->GetBodyA(), ", ", joint->GetBodyB());
            toRemove.push_back(joint);
        }
    }

    // Destroy joints. The user data is freed here rather than during the scan
    // so that a live joint never carries a dangling user-data pointer.
    for (auto joint : toRemove)
    {
        delete static_cast<JointUserData*>(joint->GetUserData());
        world.DestroyJoint(joint);
    }
}
// Times `repeat` uncontended write-lock/unlock cycles on an sfrlock_t, a
// pthread_rwlock_t and a pthread_mutex_t, and prints each result together
// with its relative difference to the sfrlock_t figure.
TEST(sfrlock, uncontended_write_cost)
{
    sfrlock_t sfr;
    pthread_rwlock_t rw;
    pthread_mutex_t mtx;

    sfrlock_init(&sfr);
    pthread_rwlock_init(&rw, nullptr);
    pthread_mutex_init(&mtx, nullptr);

    // Reference measurement: sfrlock
    const double baseline = measure_time([&] () {
        const unsigned rounds = repeat;
        for (unsigned i = 0; i != rounds; ++i) {
            sfrlock_wrlock(&sfr);
            sfrlock_wrunlock(&sfr);
        }
    });
    printf("sfrlock_t time: %lf ms\n", baseline / 1e6);

    // Signed percentage by which a measurement differs from the baseline
    auto versus_baseline = [&] (double elapsed) {
        return -(1 - (elapsed / baseline)) * 100;
    };

    const double rw_elapsed = measure_time([&] () {
        const unsigned rounds = repeat;
        for (unsigned i = 0; i != rounds; ++i) {
            pthread_rwlock_wrlock(&rw);
            pthread_rwlock_unlock(&rw);
        }
    });
    printf("pthread_rwlock_t time: %lf ms (%+.2lf%%)\n",
           rw_elapsed / 1e6, versus_baseline(rw_elapsed));

    const double mtx_elapsed = measure_time([&] () {
        const unsigned rounds = repeat;
        for (unsigned i = 0; i != rounds; ++i) {
            pthread_mutex_lock(&mtx);
            pthread_mutex_unlock(&mtx);
        }
    });
    printf("pthread_mutex_t time: %lf ms (%+.2lf%%)\n",
           mtx_elapsed / 1e6, versus_baseline(mtx_elapsed));

    pthread_rwlock_destroy(&rw);
    pthread_mutex_destroy(&mtx);
}
static void test_swap_double() { union { double d; uint64_t u; } ud = { .u = 0x7856341283C0F33F }; double x = 1.2344999991522893623141499119810760021209716796875; ud.u = BSWAP_64(ud.u); double r = ud.d; if (r == x) { printf("%.64f\n", ud.d); printf("swap okay\n"); } else { printf("swap failed\n"); printf("%.64f\n", ud.d); } double td = 1.2344999991522893623141499119810760021209716796875; if (memcmp(&td, "\x78\x56\x34\x12\x83\xC0\xF3\x3F", 8) == 0) { printf("little endian double\n"); } else if (memcmp(&td, "\x3F\xF3\xC0\x83\x12\x34\x56\x78", 8) == 0) { printf("big endian double\n"); } else { printf("not support number format to dump!"); } } int main(int argc, char const* argv[]) { test_swap_double(); test_b32(); union_test(); uint64_t i = 0x123456789abcdeff; measure_time(test_mc, i, "memory copy"); measure_time(test_mc2, i, "memory copy 2"); measure_time(test_bo, i, "bitwise operation"); return 0; }
void execute(std::function<void()> f) { if(mpi_mode && !master_instance) { // Setup data pipes bool firstJob = true; while(true) { std::string local_config = send_command(RQJ); if(local_config.empty()) break; if(firstJob) { for(action & a : actions) if(a.t == pipe) setup_pipe(a); firstJob = false; } parse_raw_config_str(local_config.c_str()); f(); send_command(DNE); } } else { if(actions.size() == 0) f(); else { int action_size = 1; for(auto& a : actions) action_size *= a.size(); int cur = 0; execute_action(0, action_size, cur, f); } if(!mpi_mode) { measure_time(); store_timing(); } } }
// Calls update(dt) on every simulation object, measured under the
// "sim.objects" label.
void Simulation::updateObjects(units::Time dt)
{
    // Kept alive until the end of the function (presumably a scoped timer).
    auto timer = measure_time("sim.objects", TimeMeasurementIterationOutput(this));

    // Index-based walk instead of range-for: an object's update can add a
    // new object, so the count is re-read on every pass.
    object::Container::SizeType index = 0u;
    while (index < m_objects.getCount())
    {
        auto obj = m_objects[index];
        Assert(obj);

        obj->update(dt);
        ++index;
    }
}
int main( int argc, char * argv[]) { try { bool preserve = false, unwind = true, bind = false; boost::program_options::options_description desc("allowed options"); desc.add_options() ("help", "help message") ("bind,b", boost::program_options::value< bool >( & bind), "bind thread to CPU") ("fpu,f", boost::program_options::value< bool >( & preserve), "preserve FPU registers") ("unwind,u", boost::program_options::value< bool >( & unwind), "unwind coroutine-stack") ("jobs,j", boost::program_options::value< boost::uint64_t >( & jobs), "jobs to run"); boost::program_options::variables_map vm; boost::program_options::store( boost::program_options::parse_command_line( argc, argv, desc), vm); boost::program_options::notify( vm); if ( vm.count("help") ) { std::cout << desc << std::endl; return EXIT_SUCCESS; } if ( preserve) preserve_fpu = boost::coroutines::fpu_preserved; if ( ! unwind) unwind_stack = boost::coroutines::no_stack_unwind; if ( bind) bind_to_processor( 0); duration_type overhead_c = overhead_clock(); std::cout << "overhead " << overhead_c.count() << " nano seconds" << std::endl; boost::uint64_t res = measure_time( overhead_c).count(); std::cout << "average of " << res << " nano seconds" << std::endl; #ifdef BOOST_CONTEXT_CYCLE cycle_type overhead_y = overhead_cycle(); std::cout << "overhead " << overhead_y << " cpu cycles" << std::endl; res = measure_cycles( overhead_y); std::cout << "average of " << res << " cpu cycles" << std::endl; #endif return EXIT_SUCCESS; } catch ( std::exception const& e) { std::cerr << "exception: " << e.what() << std::endl; } catch (...) { std::cerr << "unhandled exception" << std::endl; } return EXIT_FAILURE; }
// Times `repeat` uncontended acquire attempts on a cwlock_t (unlocking after
// every successful acquire) and prints the result.
TEST(cwlock, uncontended_acquire)
{
    cwlock_t lock;
    cwlock_init(&lock);

    const double elapsed = measure_time([&] () {
        const unsigned rounds = repeat;
        for (unsigned i = 0; i != rounds; ++i) {
            // Only a successful acquire has to be paired with an unlock
            if (!cwlock_lock(&lock))
                continue;
            cwlock_unlock(&lock);
        }
    });

    printf("cwlock_t time: %lf ms\n", elapsed / 1e6);
}
int main( int argc, char * argv[]) { try { bind_to_processor( 0); boost::program_options::options_description desc("allowed options"); desc.add_options() ("help", "help message") ("fpu,f", boost::program_options::value< bool >( & preserve_fpu), "preserve FPU registers") ("jobs,j", boost::program_options::value< boost::uint64_t >( & jobs), "jobs to run"); boost::program_options::variables_map vm; boost::program_options::store( boost::program_options::parse_command_line( argc, argv, desc), vm); boost::program_options::notify( vm); if ( vm.count("help") ) { std::cout << desc << std::endl; return EXIT_SUCCESS; } stack_allocator stack_alloc; fc = boost::context::make_fcontext( stack_alloc.allocate( stack_allocator::default_stacksize() ), stack_allocator::default_stacksize(), fn); boost::uint64_t res = measure_time().count(); std::cout << "average of " << res << " nano seconds" << std::endl; #ifdef BOOST_CONTEXT_CYCLE res = measure_cycles(); std::cout << "average of " << res << " cpu cycles" << std::endl; #endif return EXIT_SUCCESS; } catch ( std::exception const& e) { std::cerr << "exception: " << e.what() << std::endl; } catch (...) { std::cerr << "unhandled exception" << std::endl; } return EXIT_FAILURE; }
/**
 * Objective function evaluation, used by the optimization algorithm.
 *
 * The solution vector is broadcast to all workers; afterwards one partial
 * result (two doubles: squared error, field-measurement count) is received
 * per transmitter, and the partial results are summed up.
 *
 * params           a structure holding configuration parameters which are
 *                  common to all transmitters;
 * tx_params        a structure holding transmitter-specific configuration
 *                  parameters (not used by this function);
 * radio_zone       radio zone for which the objective function is calculated
 *                  (not used by this function);
 * sol_vector       solution vector over which the objective function is
 *                  calculated; `params->clutter_category_count' doubles
 *                  are broadcast;
 * comm             the object used to communicate with the workers;
 *
 * Returns the aggregated squared error divided by the aggregated
 * field-measurement count. The process exits with status 1 if receiving a
 * partial value fails.
 *
 */
static double
obj_func (Parameters    *params,
          Tx_parameters *tx_params,
          const char     radio_zone,
          double        *sol_vector,
          MPI_Comm      *comm)
{
    double      score [params->ntx][2];
    double      ret_value [2];
    MPI_Status  status;

#ifdef _PERFORMANCE_METRICS_
    measure_time ("Send solution to all workers");
#endif
    //
    // broadcast the new solution to all workers
    //
    MPI_Bcast (sol_vector,
               params->clutter_category_count,
               MPI_DOUBLE,
               _COVERAGE_MASTER_RANK_,
               *comm);
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
    measure_time ("Gather partial objective-function values");
#endif
    //
    // receive the partial objective-function values from all workers,
    // aggregating the partial values before calculating the total
    //
    ret_value[0] = 0;
    ret_value[1] = 0;

    int workers_evaluating = 0;
    while (workers_evaluating < params->ntx)
    {
        //
        // make sure the source reported on failure is never an
        // indeterminate value
        //
        status.MPI_SOURCE = MPI_ANY_SOURCE;

        //
        // receive the partial objective-function value from this worker
        //
        // the MPI_ERROR field of a status is not set by single-completion
        // calls such as MPI_Recv, so the return code is what has to be
        // checked (it is only ever different from MPI_SUCCESS if the
        // communicator's error handler lets errors return)
        //
        int recv_rc = MPI_Recv (&(score[workers_evaluating][0]),
                                2,
                                MPI_DOUBLE,
                                MPI_ANY_SOURCE,
                                MPI_ANY_TAG,
                                *comm,
                                &status);
        if (recv_rc != MPI_SUCCESS)
        {
            int worker_rank = status.MPI_SOURCE;
            fprintf (stderr,
                     "*** ERROR: Objective-function value incorrectly received from %d. worker\n",
                     worker_rank);
            fflush (stderr);
            exit (1);
        }
        //
        // aggregate the received squared error
        //
        ret_value[0] += score[workers_evaluating][0];
        //
        // aggregate the received field-measurement count
        //
        ret_value[1] += score[workers_evaluating][1];

        workers_evaluating ++;
    }
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
    measure_time ("Build complete objective-function value");
#endif
    //
    // the total mean-squared error
    //
    ret_value[0] /= ret_value[1];

#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif
    return ret_value[0];
}
// Advances the simulation by one step of length `dt`.
//
// Initializes the simulation on first use, bumps the iteration counter and
// total time, resets the force on every object, then updates modules and
// objects, detects deserters and deletes unused objects. When an object data
// output is set, one ';'-separated record per object is written. With Box2D
// physics enabled the physics world is stepped afterwards.
//
// Returns true while the simulation has unlimited iterations or the current
// iteration has not passed the configured number of iterations.
bool Simulation::update(units::Duration dt)
{
    if (!isInitialized())
        initialize();

    // Advance the step counter and the simulated time
    m_iteration++;
    m_totalTime += dt;

    // Forces are accumulated anew every step
    for (auto& object : m_objects)
        object->setForce(Zero);

    updateModules(dt);
    updateObjects(dt);

    // Objects that left the scene are detected before unused ones are deleted
    detectDeserters();
    deleteObjects();

    if (m_dataOutObjects)
    {
        auto& out = *m_dataOutObjects;

        for (const auto& object : m_objects)
        {
            const auto position = object->getPosition();
            const auto velocity = object->getVelocity();

            // iteration;totalTime;id;typeName;posX;posY;velX;velY
            out << getIteration() << ";"
                << getTotalTime() << ";"
                << object->getId() << ";"
                << object->getTypeName() << ";"
                << position.getX() << ";"
                << position.getY() << ";"
                << velocity.getX() << ";"
                << velocity.getY() << "\n";
        }
    }

#ifdef CECE_ENABLE_BOX2D_PHYSICS
    {
        auto timer = measure_time("sim.physics", TimeMeasurementIterationOutput(this));

        m_world.Step(getPhysicsEngineTimeStep().value(), 10, 10);
    }
#endif

    if (hasUnlimitedIterations())
        return true;

    return getIteration() <= getIterations();
}
// Forwards the time step to the module container, measured under the
// "sim.modules" label.
void Simulation::updateModules(units::Time dt)
{
    // Kept alive until the end of the function (presumably a scoped timer).
    auto timer = measure_time("sim.modules", TimeMeasurementIterationOutput(this));

    m_modules.update(*this, dt);
}
/* Map function passed to kmr_map_once() in main().
   Creates the initial data file for this rank via create_file(), records
   the number of values in the shared state (IO_COUNT * file_size) and emits
   one key-value pair: key = common->key, value = the created file name.
   `p` must point to a common_t.  `kv`, `kvi` and `i_` are not used.
   Returns MPI_SUCCESS. */
static int
add_initial_data(const struct kmr_kv_box kv,
                 const KMR_KVS *kvi, KMR_KVS *kvo, void *p, long i_)
{
    common_t *common = (common_t *)p;
    char filename[FILENAME_LEN];
    create_file(common->rank, common->iteration, common->file_size,
                filename, FILENAME_LEN);
    common->val_count = IO_COUNT * common->file_size;

    /* Key and value lengths include the terminating NUL. */
    struct kmr_kv_box nkv = { .klen = sizeof(char) * (strlen(common->key) + 1),
                              .k.p = common->key,
                              .vlen = sizeof(char) * (strlen(filename) + 1),
                              .v.p = (void *)filename };
    kmr_add_kv(kvo, nkv);
    return MPI_SUCCESS;
}

/* Map function passed to kmr_map_multiprocess_by_key() in main().
   The value of `kv` is the name of an input file.  The file is read in
   chunks of IO_COUNT longs, every long is incremented by one, and the
   result is written to "./<rank>-<iteration + 1>.dat".  A key-value pair
   (key = common->key, value = the output file name) is emitted, and the
   file of the current iteration is removed via delete_file().
   `p` must point to a common_t.  `kvi` and `i_` are not used.
   Returns MPI_SUCCESS.
   NOTE(review): failures of fopen/fread/fwrite are only caught by assert(),
   and the result of malloc() is not checked. */
static int
increment_in_file_value(const struct kmr_kv_box kv,
                        const KMR_KVS *kvi, KMR_KVS *kvo, void *p, long i_)
{
    common_t *common = (common_t *)p;
    char *infile = (char *)kv.v.p;
    char outfile[FILENAME_LEN];
    snprintf(outfile, FILENAME_LEN, "./%06d-%02d.dat", common->rank,
             common->iteration + 1);

    FILE *ifp = fopen(infile, "r");
    FILE *ofp = fopen(outfile, "w+");
    assert(ifp != 0 && ofp != 0);
    /* read/write 1MB at once */
    long *buf = (long *)malloc(sizeof(long) * IO_COUNT);
    /* file_size chunks of IO_COUNT longs each */
    for (int i = 0; i < common->file_size; i++) {
        size_t cc = fread(buf, sizeof(long), IO_COUNT, ifp);
        assert(cc == IO_COUNT);
        for (int j = 0; j < IO_COUNT; j++) {
            buf[j] += 1;
        }
        cc = fwrite(buf, sizeof(long), IO_COUNT, ofp);
        assert(cc == IO_COUNT);
    }
    free(buf);
    fclose(ofp);

    /* Key and value lengths include the terminating NUL. */
    struct kmr_kv_box nkv = { .klen = sizeof(char) * (strlen(common->key) + 1),
                              .k.p = common->key,
                              .vlen = sizeof(char) * (strlen(outfile) + 1),
                              .v.p = (void *)outfile };
    kmr_add_kv(kvo, nkv);

#ifdef DEBUG
    /* Re-read the first value of the input file for the trace message. */
    fseek(ifp, 0, SEEK_SET);
    long val;
    fread(&val, sizeof(long), 1, ifp);
    fprintf(stderr, "Rank[%d]: process key[%s]-val[%ld]\n",
            common->rank, (char *)kv.k.p, val);
#endif
    fclose(ifp);
    /* The input file of this iteration is no longer needed. */
    delete_file(common->rank, common->iteration);

    return MPI_SUCCESS;
}

/* Benchmark driver.
   Initializes MPI and KMR, creates the initial data with add_initial_data(),
   then runs ITERATIONS rounds of kmr_map_multiprocess_by_key() with
   increment_in_file_value(), alternating the key between the "odd" and the
   "even" key, and records the time of every round.  Finally the last output
   file is removed, the timings are printed, and KMR and MPI are shut down.
   Returns 0. */
int
main(int argc, char **argv)
{
    int thlv;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thlv);
    int nprocs, rank, task_nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    check_nprocs(nprocs, rank, &task_nprocs);

    kmr_init();
    KMR *mr = kmr_create_context(MPI_COMM_WORLD, MPI_INFO_NULL, 0);
    mr->verbosity = 5;
    mr->trace_map_mp = 1;

    /* Two keys per rank: one derived from rank / task_nprocs and one from
       rank % task_nprocs. */
    char even_key[KEY_LEN];
    char odd_key[KEY_LEN];
    snprintf(even_key, KEY_LEN, "even%06d", (rank / task_nprocs + 1));
    snprintf(odd_key, KEY_LEN, "odd%06d", (rank % task_nprocs + 1));

    /* Shared state handed to the map functions. */
    common_t common0;
    common0.key = even_key;
    parse_param_file(argc, argv, &(common0.file_size));
    common0.rank = rank;
    common0.iteration = 0;

    KMR_KVS *kvs0 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE);
    kmr_map_once(kvs0, &common0, kmr_noopt, 0, add_initial_data);

    double itr_times[ITERATIONS];
    for (int i = 0; i < ITERATIONS; i++) {
        /* Even-numbered rounds emit the odd key, odd-numbered rounds the
           even key. */
        common0.key = (i % 2 == 0)? odd_key : even_key;
        common0.iteration = i;
        KMR_KVS *kvs1 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE);

        /* Only the multiprocess map call is timed. */
        struct timeval ts;
        measure_time(&ts);
        kmr_map_multiprocess_by_key(kvs0, kvs1, &common0, kmr_noopt, rank,
                                    increment_in_file_value);
        struct timeval te;
        measure_time(&te);
        itr_times[i] = calc_time_diff(&ts, &te);

        /* The output of this round is the input of the next one. */
        kvs0 = kvs1;
    }
    kmr_free_kvs(kvs0);
    /* Remove the file written by the last round. */
    delete_file(common0.rank, common0.iteration + 1);

    print_time(itr_times, ITERATIONS, rank);

    kmr_free_context(mr);
    kmr_fin();
    MPI_Finalize();
    return 0;
}
/**
 * Calculates the coverage prediction for one transmitter, using the E/// model.
 *
 * The terrain profile is computed first (simulated on the GPU path), then the
 * E/// path loss on GPU or CPU depending on `params->use_gpu', and finally
 * the antenna influence. On the GPU path the path-loss matrix is read back
 * into host memory at the end.
 *
 * params           a structure holding configuration parameters which are
 *                  common to all transmitters;
 * tx_params        a structure holding transmitter-specific configuration
 *                  parameters;
 * rank             rank of the calling process; `rank % 2' is passed on to
 *                  the GPU initialization.-
 *
 */
void
coverage (Parameters    *params,
          Tx_parameters *tx_params,
          const int      rank)
{
    //
    // execute the path-loss calculation on CPU or GPU?
    //
    if (params->use_gpu)
    {
        //
        // initialize the OpenCL environment
        //
        init_gpu (params, tx_params, rank % 2);

        //
        // SIMULATE the LOS calculation on GPU
        //
        DoProfile_gpu (tx_params->m_obst_height,
                       tx_params->m_obst_dist,
                       tx_params->m_obst_offset,
                       1.0,
                       tx_params->m_dem,
                       tx_params->tx_north_coord_idx,
                       tx_params->tx_east_coord_idx,
                       tx_params->total_tx_height,
                       tx_params->nrows,
                       tx_params->ncols,
                       params->map_ew_res,
                       params->radius);
#ifdef _PERFORMANCE_METRICS_
        measure_time ("E/// on GPU");
#endif
        eric_pathloss_on_gpu (params,
                              tx_params);
    }
    else
    {
        //
        // calculate the terrain profile from the top of the transmitter,
        // i.e. line-of-sight, only once per transmitter
        //
        DoProfile (tx_params->m_obst_height,
                   tx_params->m_obst_dist,
                   tx_params->m_obst_offset,
                   1.0,
                   tx_params->m_dem,
                   tx_params->tx_north_coord_idx,
                   tx_params->tx_east_coord_idx,
                   tx_params->total_tx_height,
                   tx_params->nrows,
                   tx_params->ncols,
                   params->map_ew_res,
                   params->radius);
#ifdef _PERFORMANCE_METRICS_
        measure_time ("E/// on CPU");
#endif
        eric_pathloss_on_cpu (params,
                              tx_params);
    }
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif

    //
    // calculate the antenna influence,
    // overwriting the isotrophic path-loss
    //
#ifdef _PERFORMANCE_METRICS_
    measure_time ("Antenna influence");
#endif
    calculate_antenna_influence (params,
                                 tx_params);
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif

    //
    // if the coverage calculation happened on the GPU,
    // we need to refresh the memory buffers on the host
    //
    if (params->use_gpu)
    {
        //
        // widen before multiplying, so that the element count cannot
        // overflow a narrower integer type for large rasters
        //
        size_t buff_size = (size_t) tx_params->nrows *
                           (size_t) tx_params->ncols *
                           sizeof (tx_params->m_loss[0][0]);
        read_buffer_blocking (tx_params->ocl_obj,
                              0,
                              tx_params->m_loss_dev,
                              buff_size,
                              tx_params->m_loss[0]);
    }
}
/**
 * Simulates the line-of-sight calculation on GPU.-
 *
 * Only the first quadrant is processed: for every column index `ix' in
 * [0, xN) a direction (dx, dy) is derived from the transmitter position and
 * handed to `calc_profile' together with all matrices. The loops for the
 * remaining quadrants are commented out.
 *
 * Obst_high, Obst_dist, Offset, Raster
 *                  matrices forwarded unchanged to `calc_profile';
 * ResDist          not used by this function;
 * xBS, yBS         transmitter position; the floor of each value is used
 *                  when deriving the direction;
 * ZoTransBS, scale, radius
 *                  forwarded unchanged to `calc_profile';
 * xN, yN           number of iterations of the quadrant loop (xN) and
 *                  dimensions forwarded to `calc_profile'.
 *
 * Always returns 0.
 *
 */
static int
DoProfile_gpu (double **Obst_high,
               double **Obst_dist,
               double **Offset,
               double ResDist,
               double **Raster,
               double xBS,
               double yBS,
               double ZoTransBS,
               int xN,
               int yN,
               double scale,
               double radius)
{
#ifdef _PERFORMANCE_METRICS_
    measure_time ("Simulating Line-of-sight on GPU");
#endif
    double AZI;
    int ix, iy;     // iy is only referenced by the commented-out loops below
    double dx, dy;

    //
    // LOS and obstacle height calculation is executed only once,
    // because its results are constant throughout the optimization
    //

    /* Offset ini
    for (ix = 0; ix < xN; ix++)
    {
        for (iy = 0; iy < yN; iy++)
        {
            Offset[ix][iy]=999;
        }
    }*/

    // Quadrant I
    for (ix = 0; ix < xN; ix++)
    {
        //Patrik AZI = atan((ix - xBS) / yBS);
        AZI = atan((ix - floor(xBS)) / floor(yBS));

        // the component with the larger magnitude is normalized to 1
        if (cos(AZI) > sin(AZI))
        {
            //Patrik dx = sin(AZI) / cos(AZI);
            //Patrik dy = -cos(AZI) / cos(AZI);
            dx = (ix - floor(xBS)) / floor(yBS); // tan(AZI)
            dy = -1;
        }
        else
        {
            //Patrik dx = sin(AZI) / sin(AZI);
            //Patrik dy = -cos(AZI) / sin(AZI);
            dx = 1;
            dy = -floor(yBS)/(ix - floor(xBS)); // ctan(AZI)
        }
        calc_profile (Obst_high,
                      Obst_dist,
                      Raster,
                      Offset,
                      dx,
                      dy,
                      xBS,
                      yBS,
                      ZoTransBS,
                      xN,
                      yN,
                      scale,
                      radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 1st quadrant: %d\n", ix);
#endif
    }

    /* Quadrant III
    for (ix = 0; ix < xN; ix++)
    {
        //Patrik AZI = atan((ix - xBS) / (yN - yBS));
        AZI = atan((ix - floor(xBS)) / (yN - floor(yBS)));

        if (cos(AZI) > sin(AZI))
        {
            //Patrik dx = sin(AZI) / cos(AZI);
            //Patrik dy = cos(AZI) / cos(AZI);
            dx = (ix - floor(xBS)) / (yN - floor(yBS)); // tan(AZI)
            dy = 1;
        }
        else
        {
            //Patrik dx = sin(AZI) / sin(AZI);
            //Patrik dy = cos(AZI) / sin(AZI);
            dx = 1;
            dy = (yN - floor(yBS)) / (ix - floor(xBS)); // ctan(AZI)
        }
        calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 3rd quadrant: %d\n", ix);
#endif
    }

    // Quadrant II
    for (iy = 0; iy < yN; iy++)
    {
        //Patrik AZI = atan((iy - yBS) / (xN - xBS));
        AZI = atan((iy - floor(yBS)) / (xN - floor(xBS)));

        if (cos(AZI) > sin(AZI))
        {
            //Patrik dx = cos(AZI) / cos(AZI);
            //Patrik dy = sin(AZI) / cos(AZI);
            dx = 1;
            dy = (iy - floor(yBS)) / (xN - floor(xBS)); // tan(AZI)
        }
        else
        {
            //Patrik dx = cos(AZI) / sin(AZI);
            //Patrik dy = sin(AZI) / sin(AZI);
            dx = (xN - floor(xBS)) / (iy - floor(yBS)); // ctan(AZI)
            dy = 1;
        }
        calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 2nd quadrant: %d\n", ix);
#endif
    }

    // Quadrant IV
    for (iy = 0; iy < yN; iy++)
    {
        //Patrik AZI = atan((iy - yBS) / xBS);
        AZI = atan((iy - floor(yBS)) / floor(xBS));

        if (cos(AZI) > sin(AZI))
        {
            //Patrik dx = -cos(AZI) / cos(AZI);
            //Patrik dy = sin(AZI) / cos(AZI);
            dx = -1;
            dy = (iy - floor(yBS)) / floor(xBS); // tan(AZI)
        }
        else
        {
            //Patrik dx = -cos(AZI) / sin(AZI);
            //Patrik dy = sin(AZI) / sin(AZI);
            dx = -floor(xBS) / (iy - floor(yBS)); // ctan(AZI)
            dy = 1;
        }
        calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 4th quadrant: %d\n", ix);
#endif
    }*/

#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif
    return 0;
}
/*! Reads the initial conditions whose base file name is `fname'.
 *
 *  The number of files is determined with find_files(). As long as more
 *  files than tasks remain, every task reads its own file; the remaining
 *  files are then assigned to groups of tasks with distribute_file(). In
 *  both phases the reads are staggered by MPI barriers, so that only a
 *  subset of the tasks reads at any one time.
 *
 *  After reading, masses are filled in from All.MassTable where applicable,
 *  optional compile-time blocks adjust the particle set and the mass table,
 *  and the initial thermal energy derived from All.InitGasTemp is assigned
 *  to gas particles whose entropy field is still zero (fresh start only).
 *
 *  The time spent before the call is accounted to CPU_MISC, the time spent
 *  inside to CPU_SNAPSHOT.
 */
void read_ic(char *fname)
{
  int i, num_files, rest_files, ngroups, gr, filenr, masterTask, lastTask, groupMaster;
  /* dmax1/dmax2 are not referenced directly in this function; presumably
     they are scratch variables required by the DMAX macro - TODO confirm */
  double u_init, molecular_weight, dmax1, dmax2;
  char buf[500];

  CPU_Step[CPU_MISC] += measure_time();

#ifdef RESCALEVINI
  if(ThisTask == 0 && RestartFlag == 0)
    {
      fprintf(stdout, "\nRescaling v_ini !\n\n");
      fflush(stdout);
    }
#endif

  NumPart = 0;
  N_gas = 0;
  All.TotNumPart = 0;

  num_files = find_files(fname);

#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
  NumPartPerFile = (long long *) mymalloc(num_files * sizeof(long long));

  /* task 0 determines the particle numbers, then shares them with all tasks */
  if(ThisTask == 0)
    get_particle_numbers(fname, num_files);

  MPI_Bcast(NumPartPerFile, num_files * sizeof(long long), MPI_BYTE, 0, MPI_COMM_WORLD);
#endif

  rest_files = num_files;

  /* phase 1: more files than tasks are left, so every task reads one file
     on its own; NOTE(review): buf has a fixed size of 500 bytes and is
     filled with sprintf, so overly long file names are not guarded against */
  while(rest_files > NTask)
    {
      sprintf(buf, "%s.%d", fname, ThisTask + (rest_files - NTask));
      if(All.ICFormat == 3)
        sprintf(buf, "%s.%d.hdf5", fname, ThisTask + (rest_files - NTask));
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
      FileNr = ThisTask + (rest_files - NTask);
#endif

      /* number of rounds: NTask / NumFilesWrittenInParallel, rounded up */
      ngroups = NTask / All.NumFilesWrittenInParallel;
      if((NTask % All.NumFilesWrittenInParallel))
        ngroups++;
      groupMaster = (ThisTask / ngroups) * ngroups;

      for(gr = 0; gr < ngroups; gr++)
        {
          if(ThisTask == (groupMaster + gr))    /* ok, it's this processor's turn */
            read_file(buf, ThisTask, ThisTask);
          MPI_Barrier(MPI_COMM_WORLD);
        }

      rest_files -= NTask;
    }

  /* phase 2: the remaining files are shared among groups of tasks */
  if(rest_files > 0)
    {
      distribute_file(rest_files, 0, 0, NTask - 1, &filenr, &masterTask, &lastTask);

      if(num_files > 1)
        {
          sprintf(buf, "%s.%d", fname, filenr);
          if(All.ICFormat == 3)
            sprintf(buf, "%s.%d.hdf5", fname, filenr);
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
          FileNr = filenr;
#endif
        }
      else
        {
          /* a single file carries no numeric suffix */
          sprintf(buf, "%s", fname);
          if(All.ICFormat == 3)
            sprintf(buf, "%s.hdf5", fname);
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
          FileNr = 0;
#endif
        }

      /* number of rounds: rest_files / NumFilesWrittenInParallel, rounded up */
      ngroups = rest_files / All.NumFilesWrittenInParallel;
      if((rest_files % All.NumFilesWrittenInParallel))
        ngroups++;

      for(gr = 0; gr < ngroups; gr++)
        {
          if((filenr / All.NumFilesWrittenInParallel) == gr)    /* ok, it's this processor's turn */
            read_file(buf, masterTask, lastTask);
          MPI_Barrier(MPI_COMM_WORLD);
        }
    }

#if defined(SUBFIND_RESHUFFLE_CATALOGUE)
  subfind_reshuffle_free();
#endif

  myfree_msg(CommBuffer, "CommBuffer");

  if(header.flag_ic_info != FLAG_SECOND_ORDER_ICS)
    {
      /* this makes sure that masses are initialized in the case that the mass-block
         is empty for this particle type */
      for(i = 0; i < NumPart; i++)
        {
          if(All.MassTable[P[i].Type] != 0)
            P[i].Mass = All.MassTable[P[i].Type];
        }
    }

#ifdef GENERATE_GAS_IN_ICS
  int count, j;
  double fac, d, a, b, rho;

  /* on a fresh start, every type-1 particle is split into a gas particle
     (type 0) and the original particle with reduced mass */
  if(RestartFlag == 0)
    {
      header.flag_entropy_instead_u = 0;

      for(i = 0, count = 0; i < NumPart; i++)
        if(P[i].Type == 1)
          count++;

      /* shift all particles up by `count' slots to make room for the new
         gas particles at the front of the array */
      memmove(P + count, P, sizeof(struct particle_data) * NumPart);

      NumPart += count;
      N_gas += count;

      if(N_gas > All.MaxPartSph)
        {
          printf("Task=%d ends up getting more SPH particles (%d) than allowed (%d)\n",
                 ThisTask, N_gas, All.MaxPartSph);
          endrun(111);
        }

      fac = All.OmegaBaryon / All.Omega0;
      rho = All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);

      for(i = count, j = 0; i < NumPart; i++)
        if(P[i].Type == 1)
          {
            /* the new gas particle starts as a copy of its parent */
            P[j] = P[i];

            d = pow(P[i].Mass / rho, 1.0 / 3);
            a = 0.5 * All.OmegaBaryon / All.Omega0 * d;
            b = 0.5 * (All.Omega0 - All.OmegaBaryon) / All.Omega0 * d;

            /* the mass is divided between the two particles */
            P[j].Mass *= fac;
            P[i].Mass *= (1 - fac);
            P[j].Type = 0;
            P[j].ID += 1000000000;

            /* the two particles are displaced in opposite directions */
            P[i].Pos[0] += a;
            P[i].Pos[1] += a;
            P[i].Pos[2] += a;
            P[j].Pos[0] -= b;
            P[j].Pos[1] -= b;
            P[j].Pos[2] -= b;

            j++;
          }

      All.MassTable[0] = fac * All.MassTable[1];
      All.MassTable[1] *= (1 - fac);
    }
#endif

#if defined(BLACK_HOLES) && defined(SWALLOWGAS)
  if(RestartFlag == 0)
    {
      All.MassTable[5] = 0;
    }
#endif

#ifdef SFR
  if(RestartFlag == 0)
    {
      if(All.MassTable[4] == 0 && All.MassTable[0] > 0)
        {
          All.MassTable[0] = 0;
          All.MassTable[4] = 0;
        }
    }
#endif

  /* initial thermal energy per unit mass that corresponds to All.InitGasTemp */
  u_init = (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * All.InitGasTemp;
  u_init *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;  /* unit conversion */

  if(All.InitGasTemp > 1.0e4)   /* assuming FULL ionization */
    molecular_weight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC));
  else                          /* assuming NEUTRAL GAS */
    molecular_weight = 4 / (1 + 3 * HYDROGEN_MASSFRAC);

  u_init /= molecular_weight;

  All.InitGasU = u_init;

  if(RestartFlag == 0)
    {
      if(All.InitGasTemp > 0)
        {
          for(i = 0; i < N_gas; i++)
            {
              /* the message is printed by task 0 only, and only if its
                 first gas particle is affected */
              if(ThisTask == 0 && i == 0 && SphP[i].Entropy == 0)
                printf("Initializing u from InitGasTemp !\n");

              if(SphP[i].Entropy == 0)
                SphP[i].Entropy = All.InitGasU;

              /* Note: the conversion to entropy will be done in the function init(),
                 after the densities have been computed */
            }
        }
    }

  /* enforce the lower bound All.MinEgySpec */
  for(i = 0; i < N_gas; i++)
    SphP[i].Entropy = DMAX(All.MinEgySpec, SphP[i].Entropy);

#ifdef EOS_DEGENERATE
  for(i = 0; i < N_gas; i++)
    SphP[i].u = 0;
#endif

  MPI_Barrier(MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      printf("reading done.\n");
      fflush(stdout);
    }

  if(ThisTask == 0)
    {
      /* the total is printed as two int-sized pieces: the part above 10^9
         and the zero-padded nine-digit remainder */
      printf("Total number of particles : %d%09d\n\n",
             (int) (All.TotNumPart / 1000000000), (int) (All.TotNumPart % 1000000000));
      fflush(stdout);
    }

  CPU_Step[CPU_SNAPSHOT] += measure_time();
}
/*! This function computes the gravitational potential for ALL the particles.
 *  First, the (short-range) tree potential is computed, and then, if needed,
 *  the long range PM potential is added.
 *
 *  The tree part works in rounds: every task evaluates as many of its local
 *  particles as fit into the export buffer, exchanges the exported particles
 *  pairwise with the other tasks, evaluates the imported particles, and
 *  sends the partial results back. The rounds are repeated until all tasks
 *  report that they have processed all their particles.
 *
 *  If NOGRAVITY is defined, the potential of all particles is set to zero
 *  instead. In both cases the time spent is accounted to CPU_POTENTIAL.
 */
void compute_potential(void)
{
  int i;

#ifndef NOGRAVITY
  /* NOTE(review): sendTask is assigned below but never read */
  int j, k, ret, sendTask, recvTask;
  int ndone, ndone_flag, dummy;
  int ngrp, place, nexport, nimport;
  double fac;
  MPI_Status status;
  double r2;

  if(All.ComovingIntegrationOn)
    set_softenings();

  if(ThisTask == 0)
    {
      printf("Start computation of potential for all particles...\n");
      fflush(stdout);
    }

  CPU_Step[CPU_MISC] += measure_time();

  if(TreeReconstructFlag)
    {
      if(ThisTask == 0)
        printf("Tree construction.\n");

      CPU_Step[CPU_MISC] += measure_time();

#if defined(SFR) || defined(BLACK_HOLES)
      rearrange_particle_sequence();
#endif

      force_treebuild(NumPart, NULL);

      CPU_Step[CPU_TREEBUILD] += measure_time();

      TreeReconstructFlag = 0;

      if(ThisTask == 0)
        printf("Tree construction done.\n");
    }


  /* allocate buffers to arrange communication */

  /* number of particles that fit into the communication buffer
     (All.BufferSize is given in megabytes) */
  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
                                             sizeof(struct gravdata_in) + sizeof(struct potdata_out) +
                                             sizemax(sizeof(struct gravdata_in),
                                                     sizeof(struct potdata_out))));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));

  /* bring all particles to the current time */
  for(i = 0; i < NumPart; i++)
    if(P[i].Ti_current != All.Ti_Current)
      drift_particle(i, All.Ti_Current);

  i = 0;                        /* begin with this index */

  do
    {
      for(j = 0; j < NTask; j++)
        {
          Send_count[j] = 0;
          Exportflag[j] = -1;
        }

      /* do local particles and prepare export list */
      /* i is deliberately not reset here: every round continues with the
         first particle that did not fit into the previous round */
      for(nexport = 0; i < NumPart; i++)
        {
#ifndef PMGRID
          ret = force_treeevaluate_potential(i, 0, &nexport, Send_count);
#else
          ret = force_treeevaluate_potential_shortrange(i, 0, &nexport, Send_count);
#endif
          if(ret < 0)
            break;              /* export buffer has filled up */
        }

#ifdef MYSORT
      mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
      qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

      /* every task learns how many particles each task sends to each other task */
      MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

      /* derive the receive counts and the send/receive offsets (prefix sums) */
      for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
        {
          Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
          nimport += Recv_count[j];

          if(j > 0)
            {
              Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
              Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
            }
        }

      GravDataGet = (struct gravdata_in *) mymalloc(nimport * sizeof(struct gravdata_in));
      GravDataIn = (struct gravdata_in *) mymalloc(nexport * sizeof(struct gravdata_in));

      /* prepare particle data for export */
      for(j = 0; j < nexport; j++)
        {
          place = DataIndexTable[j].Index;

          for(k = 0; k < 3; k++)
            GravDataIn[j].Pos[k] = P[place].Pos[k];

#ifdef UNEQUALSOFTENINGS
          GravDataIn[j].Type = P[place].Type;
#ifdef ADAPTIVE_GRAVSOFT_FORGAS
          if(P[place].Type == 0)
            GravDataIn[j].Soft = SphP[place].Hsml;
#endif
#endif
          GravDataIn[j].OldAcc = P[place].OldAcc;

          for(k = 0; k < NODELISTLENGTH; k++)
            GravDataIn[j].NodeList[k] = DataNodeList[DataIndexTable[j].IndexGet].NodeList[k];
        }


      /* exchange particle data */

      /* the partner of each step is obtained by XOR-ing the own rank with
         the step number; partners beyond NTask are skipped */
      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
        {
          sendTask = ThisTask;
          recvTask = ThisTask ^ ngrp;

          if(recvTask < NTask)
            {
              if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                {
                  /* get the particles */
                  MPI_Sendrecv(&GravDataIn[Send_offset[recvTask]],
                               Send_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
                               recvTask, TAG_POTENTIAL_A,
                               &GravDataGet[Recv_offset[recvTask]],
                               Recv_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
                               recvTask, TAG_POTENTIAL_A, MPI_COMM_WORLD, &status);
                }
            }
        }

      myfree(GravDataIn);
      PotDataResult = (struct potdata_out *) mymalloc(nimport * sizeof(struct potdata_out));
      PotDataOut = (struct potdata_out *) mymalloc(nexport * sizeof(struct potdata_out));


      /* now do the particles that were sent to us */
      for(j = 0; j < nimport; j++)
        {
#ifndef PMGRID
          force_treeevaluate_potential(j, 1, &dummy, &dummy);
#else
          force_treeevaluate_potential_shortrange(j, 1, &dummy, &dummy);
#endif
        }

      /* this task is done once all its local particles have been processed */
      if(i >= NumPart)
        ndone_flag = 1;
      else
        ndone_flag = 0;

      /* ndone becomes the number of tasks that are done */
      MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      /* get the result */
      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
        {
          sendTask = ThisTask;
          recvTask = ThisTask ^ ngrp;
          if(recvTask < NTask)
            {
              if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                {
                  /* send the results */
                  MPI_Sendrecv(&PotDataResult[Recv_offset[recvTask]],
                               Recv_count[recvTask] * sizeof(struct potdata_out), MPI_BYTE,
                               recvTask, TAG_POTENTIAL_B,
                               &PotDataOut[Send_offset[recvTask]],
                               Send_count[recvTask] * sizeof(struct potdata_out), MPI_BYTE,
                               recvTask, TAG_POTENTIAL_B, MPI_COMM_WORLD, &status);
                }
            }
        }

      /* add the results to the local particles */
      for(j = 0; j < nexport; j++)
        {
          place = DataIndexTable[j].Index;

          P[place].p.dPotential += PotDataOut[j].Potential;
        }

      myfree(PotDataOut);
      myfree(PotDataResult);
      myfree(GravDataGet);
    }
  while(ndone < NTask);         /* repeat until every task is done */

  myfree(DataNodeList);
  myfree(DataIndexTable);

  /* add correction to exclude self-potential */

  for(i = 0; i < NumPart; i++)
    {
#ifdef FLTROUNDOFFREDUCTION
      P[i].p.Potential = FLT(P[i].p.dPotential);
#endif
      /* NOTE(review): without FLTROUNDOFFREDUCTION the sum accumulated in
         p.dPotential is never copied explicitly; presumably p.Potential and
         p.dPotential share storage in that configuration - TODO confirm */

      /* remove self-potential */
      P[i].p.Potential += P[i].Mass / All.SofteningTable[P[i].Type];

      if(All.ComovingIntegrationOn)
        if(All.PeriodicBoundariesOn)
          P[i].p.Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) *
            pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3);
    }


  /* multiply with the gravitational constant */

  for(i = 0; i < NumPart; i++)
    P[i].p.Potential *= All.G;


#ifdef PMGRID

#ifdef PERIODIC
  pmpotential_periodic();
#ifdef PLACEHIGHRESREGION
  i = pmpotential_nonperiodic(1);
  if(i == 1)                    /* this is returned if a particle lay outside allowed range */
    {
      pm_init_regionsize();
      pm_setup_nonperiodic_kernel();
      i = pmpotential_nonperiodic(1);   /* try again */
    }
  if(i == 1)
    endrun(88686);
#endif
#else
  i = pmpotential_nonperiodic(0);
  if(i == 1)                    /* this is returned if a particle lay outside allowed range */
    {
      pm_init_regionsize();
      pm_setup_nonperiodic_kernel();
      i = pmpotential_nonperiodic(0);   /* try again */
    }
  if(i == 1)
    endrun(88687);
#ifdef PLACEHIGHRESREGION
  i = pmpotential_nonperiodic(1);
  if(i == 1)                    /* this is returned if a particle lay outside allowed range */
    {
      pm_init_regionsize();

      i = pmpotential_nonperiodic(1);
    }
  if(i != 0)
    endrun(88688);
#endif
#endif

#endif


  /* add a term proportional to the squared distance from the origin:
     with comoving integration only for non-periodic runs, otherwise only
     if the prefactor built from All.OmegaLambda is non-zero */
  if(All.ComovingIntegrationOn)
    {
#ifndef PERIODIC
      fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble;

      for(i = 0; i < NumPart; i++)
        {
          for(k = 0, r2 = 0; k < 3; k++)
            r2 += P[i].Pos[k] * P[i].Pos[k];

          P[i].p.Potential += fac * r2;
        }
#endif
    }
  else
    {
      fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble;
      if(fac != 0)
        {
          for(i = 0; i < NumPart; i++)
            {
              for(k = 0, r2 = 0; k < 3; k++)
                r2 += P[i].Pos[k] * P[i].Pos[k];

              P[i].p.Potential += fac * r2;
            }
        }
    }


  if(ThisTask == 0)
    {
      printf("potential done.\n");
      fflush(stdout);
    }


#else
  /* NOTE(review): this branch writes P[i].Potential, whereas the branch
     above uses P[i].p.Potential - verify that the member exists under
     NOGRAVITY */
  for(i = 0; i < NumPart; i++)
    P[i].Potential = 0;
#endif

  CPU_Step[CPU_POTENTIAL] += measure_time();
}
int main (int argc, const char * argv[]) { /* for(int single=-1; single<3; single++){ for(int pair=-2; pair<4; pair++){ measure_loop(4, 4, 100, 300, single, pair); } } */ /* for(int k=3; k<500; k++){ printf("%d\n", k); measure_loop(k, k, 10, 1000, 2, 3); measure_loop(k, k, 10, 1000, 2, -2); measure_loop(k, k, 10, 1000, -1, 1); } */ // measure_loop(630, 630, 10, 300, 2, -2); int gpu = 1; /*------------------------------------------------------------------------------------------*/ cl_device_id device_id; int err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } char* KernelSource=read_kernel("kernel.cl"); cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); printf("%i %i\n", CL_INVALID_VALUE, err); return EXIT_FAILURE; } cl_program program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } cl_kernel kernelInf = clCreateKernel(program, "updateFactor", &err); if (!kernelInf || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); return 1; } cl_kernel kernelMarg = clCreateKernel(program, "updateMarginals", &err); if (!kernelMarg || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel marg!\n"); return 1; } 
/*---------------------------------------------------------------------------------------*/ for(int i=3; i<500; i++){ //int s = (int) 100*pow(10, i/25.); //int s=i; //printf("%d %d\n", i, s); //measure_loop(s, s, 10, 100, 0, 0); //measure_loop(s, s, 10, 100, 2, -2); printf("[%d, ", i); measure_time(kernelInf, kernelMarg, commands, context, device_id, i, i, 2+ceil(900/(i*i))); } }
int main (int argc, char **argv) { int N,MxInt,i,Nthread; int *data, *check_data, *unsorted_data; printf("\n COMP4300 Ass2 Sorting Program\n"); printf(" Input Total Number of Data Items\n"); printf(" Maximum Integer Value\n"); printf(" and Number of Threads to Create\n"); scanf("%d%d%d",&N,&MxInt,&Nthread); assert(MxInt<MAX_INTS); assert(N>0 && N<MAX_INTS); assert(Nthread > 0 && Nthread < MAX_THRDS); printf("\n-------------------------------------\n"); printf(" Total Number of Data Items %-12d\n",N); printf(" Maximum Integer Value %-12d\n",MxInt); printf(" Number of Threads to Use %-12d\n",Nthread); printf("-------------------------------------\n\n"); /* Allocate data array and generate random numbers */ unsorted_data = (int *) malloc (N * sizeof(int)); data = (int*) malloc( N*sizeof(int) ); check_data = (int*) malloc( N*sizeof(int) ); init_data (unsorted_data, N, MxInt); prtvec(unsorted_data,N,"Unsorted Data"); /* Take copy and sort using radix sort, then use to verify */ for (i = 0; i < N; i++) check_data[i] = unsorted_data[i]; /* RADIX SORT */ measure_time(START_TIME, NULL); radixsort(check_data, N, MxInt); measure_time(STOP_TIME, "RadixSort"); for (i = 0; i < N; i++) data[i] = unsorted_data[i]; // RECURSIVE SORT measure_time(START_TIME, NULL); recur_qsort(data, 0, N-1); measure_time(STOP_TIME, "Recursive QuickSort"); check_results(check_data, data, N); for (i = 0; i < N; i++) data[i] = unsorted_data[i]; // ROUTINE 1 - ITERATIVE SORT measure_time(START_TIME, NULL); qsort_1(data, N); measure_time(STOP_TIME, "Routine 1"); check_results(check_data, data, N); for (i = 0; i < N; i++) data[i] = unsorted_data[i]; // ROUTINE 2 - RECURSIVE PTHREAD QUICKSORT measure_time(START_TIME, NULL); int numBusyThreads = 1; struct recur_pthread_qsort_args args = { data, 0, N-1, &numBusyThreads, Nthread}; qsort_2 (&args); measure_time(STOP_TIME, "Routine 2"); check_results(check_data, data, N); for (i = 0; i < N; i++) data[i] = unsorted_data[i]; // ROUTINE 3 - ITERATIVE BUSY WAITING 
PTHREAD QUICKSORT measure_time (START_TIME, NULL); qsort_3 (data, N, Nthread); measure_time (STOP_TIME, "Routine 3"); check_results(check_data, data, N); for (i = 0; i < N; i++) data[i] = unsorted_data[i]; // ROUTINE 4 - ITERATIVE CV PTHREAD QUICKSORT measure_time (START_TIME, NULL); qsort_4 (data, N, Nthread); measure_time (STOP_TIME, "Routine 4"); // PRINT "SORTED" DATA prtvec(data,N,"Sorted Data"); /* Sequential check that the results are correct */ check_results(check_data, data, N); printf("Execution completed successfully\n"); return 0; }
/*
 * Iteratively adjusts SphP[].HotHsml for a selected set of active particles
 * until the number of "hot" neighbours found for each of them
 * (SphP[].HotNgbNum) lies within
 * All.DesNumNgb +/- All.MaxNumHotNgbDeviation, and stores the resulting
 * neighbour averages in SphP[].da.DensityAvg and SphP[].ea.EntropyAvg.
 *
 * Outline:
 *  1. Active particles of Type 0 with EnergySN > 0, a density above
 *     All.PhysDensThresh * All.DensFrac_Phase and a temperature below
 *     All.Tcrit_Phase are temporarily tagged with Type = 10.
 *  2. For the tagged particles, cs_hotngbs_evaluate() is run locally and,
 *     through pairwise MPI_Sendrecv exchanges, on the other tasks; the
 *     returned partial sums are added to the local particles.
 *  3. Particles whose neighbour count is out of range get a new HotHsml from
 *     the bracket [Left, Right]; steps 2-3 repeat until no task has such
 *     particles left. More than MAXITER_HOT repetitions abort via endrun().
 *  4. The temporary Type / TimeBin markings are undone.
 *
 * Takes no arguments and returns nothing; it works on the global particle
 * and communication arrays. Elapsed time is accumulated into
 * CPU_Step[CPU_MISC] (setup) and CPU_Step[CPU_HOTNGBS] (rest).
 */
void cs_find_hot_neighbours(void)
{
  MyFloat *Left, *Right;        /* per-particle lower / upper bracket for HotHsml; 0 means "not set yet" */
  int nimport;
  int i, j, n, ndone_flag, dummy;
  int ndone, ntot, npleft;
  int iter = 0;
  int ngrp, sendTask, recvTask;
  int place, nexport;
  double dmax1, dmax2;          /* NOTE(review): not referenced directly here; presumably scratch variables of the DMAX macro - confirm */
  double xhyd, yhel, ne, mu, energy, temp;
  double a3inv;

  /* factor applied to SphP[].d.Density below: 1/Time^3 for comoving integration, 1 otherwise */
  if(All.ComovingIntegrationOn)
    a3inv = 1 / (All.Time * All.Time * All.Time);
  else
    a3inv = 1;

  /* allocate buffers to arrange communication */
  Left = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));
  Right = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));

  Ngblist = (int *) mymalloc(NumPart * sizeof(int));

  /* number of export entries that fit into All.BufferSize MiB, given the per-entry size of all tables involved */
  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
                                             sizeof(struct hotngbs_in) + sizeof(struct hotngbs_out) +
                                             sizemax(sizeof(struct hotngbs_in), sizeof(struct hotngbs_out))));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));

  CPU_Step[CPU_MISC] += measure_time();

  /* step 1: tag the particles for which hot neighbours are to be searched */
  for(n = FirstActiveParticle; n >= 0; n = NextActiveParticle[n])
    {
      if(P[n].Type == 0)
        {
          /* select reservoir and cold phase particles */
          if(P[n].EnergySN > 0 && SphP[n].d.Density * a3inv > All.PhysDensThresh * All.DensFrac_Phase)
            {
              /* NOTE(review): Zm[6] / Mass is presumably the hydrogen mass fraction - confirm the Zm[] layout */
              xhyd = P[n].Zm[6] / P[n].Mass;
              yhel = (1 - xhyd) / (4. * xhyd);

              ne = SphP[n].Ne;
              mu = (1 + 4 * yhel) / (1 + yhel + ne);
              energy = SphP[n].Entropy * P[n].Mass / GAMMA_MINUS1 * pow(SphP[n].d.Density * a3inv, GAMMA_MINUS1);       /* total energy */
              temp = GAMMA_MINUS1 / BOLTZMANN * energy / P[n].Mass * PROTONMASS * mu;
              temp *= All.UnitEnergy_in_cgs / All.UnitMass_in_g;        /* Temperature in Kelvin */

              if(temp < All.Tcrit_Phase)
                {
                  /* no bracket known yet */
                  Left[n] = Right[n] = 0;

                  /* only if HotHsml is not positive yet: seed it from the particle's Hsml */
                  if(!(SphP[n].HotHsml > 0.))
                    SphP[n].HotHsml = All.InitialHotHsmlFactor * PPP[n].Hsml;   /* Estimation of HotHsml : ONLY first step */

                  P[n].Type = 10;       /* temporarily mark particles of interest with this number */
                }
            }
        }
    }

  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
  do
    {
      i = FirstActiveParticle;  /* begin with this index */

      /* inner loop: one communication round per pass, until every task has walked its whole active list */
      do
        {
          for(j = 0; j < NTask; j++)
            {
              Send_count[j] = 0;
              Exportflag[j] = -1;
            }

          /* do local particles and prepare export list */
          for(nexport = 0; i >= 0; i = NextActiveParticle[i])
            if(P[i].Type == 10 && P[i].TimeBin >= 0)
              {
                /* a negative return stops this round early; i keeps its value so the next round resumes here
                   (presumably the export buffer is full - confirm in cs_hotngbs_evaluate) */
                if(cs_hotngbs_evaluate(i, 0, &nexport, Send_count) < 0)
                  break;
              }

#ifdef MYSORT
          mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
          qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

          /* every task learns the send counts of all tasks */
          MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

          /* derive what this task receives from each task j, plus prefix-sum offsets into the send/receive buffers */
          for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
            {
              Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
              nimport += Recv_count[j];

              if(j > 0)
                {
                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
                }
            }

          HotNgbsGet = (struct hotngbs_in *) mymalloc(nimport * sizeof(struct hotngbs_in));
          HotNgbsIn = (struct hotngbs_in *) mymalloc(nexport * sizeof(struct hotngbs_in));

          /* prepare particle data for export */
          for(j = 0; j < nexport; j++)
            {
              place = DataIndexTable[j].Index;

              HotNgbsIn[j].Pos[0] = P[place].Pos[0];
              HotNgbsIn[j].Pos[1] = P[place].Pos[1];
              HotNgbsIn[j].Pos[2] = P[place].Pos[2];
              HotNgbsIn[j].HotHsml = SphP[place].HotHsml;
              HotNgbsIn[j].Entropy = SphP[place].Entropy;

              memcpy(HotNgbsIn[j].NodeList,
                     DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
            }

          /* pairwise exchange: in step ngrp this task talks to the partner ThisTask ^ ngrp */
          for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
            {
              sendTask = ThisTask;
              recvTask = ThisTask ^ ngrp;

              if(recvTask < NTask)
                {
                  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                    {
                      /* get the particles */
                      MPI_Sendrecv(&HotNgbsIn[Send_offset[recvTask]],
                                   Send_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
                                   recvTask, TAG_DENS_A,
                                   &HotNgbsGet[Recv_offset[recvTask]],
                                   Recv_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
                                   recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    }
                }
            }

          myfree(HotNgbsIn);
          HotNgbsResult = (struct hotngbs_out *) mymalloc(nimport * sizeof(struct hotngbs_out));
          HotNgbsOut = (struct hotngbs_out *) mymalloc(nexport * sizeof(struct hotngbs_out));

          /* now evaluate the nimport particles received from the other tasks (second argument 1) */
          for(j = 0; j < nimport; j++)
            cs_hotngbs_evaluate(j, 1, &dummy, &dummy);

          /* this task is done once its walk over the active list has reached the end */
          if(i < 0)
            ndone_flag = 1;
          else
            ndone_flag = 0;

          /* ndone = number of tasks that are done */
          MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

          /* get the result */
          for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
            {
              sendTask = ThisTask;
              recvTask = ThisTask ^ ngrp;
              if(recvTask < NTask)
                {
                  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                    {
                      /* send the results */
                      MPI_Sendrecv(&HotNgbsResult[Recv_offset[recvTask]],
                                   Recv_count[recvTask] * sizeof(struct hotngbs_out),
                                   MPI_BYTE, recvTask, TAG_DENS_B,
                                   &HotNgbsOut[Send_offset[recvTask]],
                                   Send_count[recvTask] * sizeof(struct hotngbs_out),
                                   MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    }
                }
            }

          /* add the result to the local particles */
          for(j = 0; j < nexport; j++)
            {
              place = DataIndexTable[j].Index;

              SphP[place].da.dDensityAvg += HotNgbsOut[j].DensitySum;
              SphP[place].ea.dEntropyAvg += HotNgbsOut[j].EntropySum;
              SphP[place].HotNgbNum += HotNgbsOut[j].HotNgbNum;
            }

          myfree(HotNgbsOut);
          myfree(HotNgbsResult);
          myfree(HotNgbsGet);
        }
      while(ndone < NTask);

      /* do final operations on results */
      for(i = FirstActiveParticle, npleft = 0; i >= 0; i = NextActiveParticle[i])
        {
          if(P[i].Type == 10 && P[i].TimeBin >= 0)
            {
#ifdef FLTROUNDOFFREDUCTION
              SphP[i].da.DensityAvg = FLT(SphP[i].da.dDensityAvg);
              SphP[i].ea.EntropyAvg = FLT(SphP[i].ea.dEntropyAvg);
#endif
              /* turn the accumulated sums into averages over the neighbours found */
              if(SphP[i].HotNgbNum > 0)
                {
                  SphP[i].da.DensityAvg /= SphP[i].HotNgbNum;
                  SphP[i].ea.EntropyAvg /= SphP[i].HotNgbNum;
                }
              else
                {
                  SphP[i].da.DensityAvg = 0;
                  SphP[i].ea.EntropyAvg = 0;
                }

              /* now check whether we had enough neighbours */
              if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation) ||
                 (SphP[i].HotNgbNum > (All.DesNumNgb + All.MaxNumHotNgbDeviation)))
                {
                  /* need to redo this particle */
                  npleft++;

                  /* bracket already narrower than 0.1% of Left: accept the current value */
                  if(Left[i] > 0 && Right[i] > 0)
                    if((Right[i] - Left[i]) < 1.0e-3 * Left[i])
                      {
                        /* this one should be ok */
                        npleft--;
                        P[i].TimeBin = -P[i].TimeBin - 1;       /* Mark as inactive */
                        continue;
                      }

                  /* too few neighbours: raise the lower bound; too many: lower (or first set) the upper bound */
                  if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation))
                    Left[i] = DMAX(SphP[i].HotHsml, Left[i]);
                  else
                    {
                      if(Right[i] != 0)
                        {
                          if(SphP[i].HotHsml < Right[i])
                            Right[i] = SphP[i].HotHsml;
                        }
                      else
                        Right[i] = SphP[i].HotHsml;
                    }

                  if(Left[i] > All.MaxHotHsmlParam * PPP[i].Hsml)       /* prevent us from searching too far */
                    {
                      npleft--;
                      P[i].TimeBin = -P[i].TimeBin - 1; /* Mark as inactive */

                      /* Ad-hoc definition of SAvg and RhoAvg when there are no hot neighbours */
                      /* Note that a minimum number of hot neighbours are required for promotion, see c_enrichment.c */
                      if(SphP[i].HotNgbNum == 0)
                        {
                          SphP[i].da.DensityAvg = SphP[i].d.Density / 100;
                          SphP[i].ea.EntropyAvg = SphP[i].Entropy * 1000;
                          printf("WARNING: Used ad-hoc values for SAvg and RhoAvg, No hot neighbours\n");
                        }
                      continue;
                    }

                  /* diagnostics once the iteration count gets within 10 of MAXITER_HOT */
                  if(iter >= MAXITER_HOT - 10)
                    {
                      printf
                        ("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n pos=(%g|%g|%g)\n",
                         i, ThisTask, P[i].ID, SphP[i].HotHsml, Left[i], Right[i],
                         (float) SphP[i].HotNgbNum, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
                      fflush(stdout);
                    }

                  /* both bounds known: take the cube root of the mean of the cubed bounds;
                     only one bound known: scale HotHsml by 1.26 (1.26^3 is about 2) towards the missing side */
                  if(Right[i] > 0 && Left[i] > 0)
                    SphP[i].HotHsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
                  else
                    {
                      if(Right[i] == 0 && Left[i] == 0)
                        endrun(8188);   /* can't occur */

                      if(Right[i] == 0 && Left[i] > 0)
                        SphP[i].HotHsml *= 1.26;

                      if(Right[i] > 0 && Left[i] == 0)
                        SphP[i].HotHsml /= 1.26;
                    }
                }
              else
                P[i].TimeBin = -P[i].TimeBin - 1;       /* Mark as inactive */
            }
        }

      /* ntot = number of particles, summed over all tasks, that still need another pass */
      MPI_Allreduce(&npleft, &ntot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      if(ntot > 0)
        {
          iter++;

          if(iter > 0 && ThisTask == 0)
            {
              printf("hotngb iteration %d: need to repeat for %d particles.\n", iter, ntot);
              fflush(stdout);
            }

          if(iter > MAXITER_HOT)
            {
              printf("failed to converge in hot-neighbour iteration\n");
              fflush(stdout);
              endrun(1155);
            }
        }
    }
  while(ntot > 0);

  myfree(DataNodeList);
  myfree(DataIndexTable);
  myfree(Ngblist);
  myfree(Right);
  myfree(Left);

  /* step 4: undo the temporary markings */
  for(i = FirstActiveParticle; i >= 0; i = NextActiveParticle[i])
    if(P[i].Type == 10)
      {
        P[i].Type = 0;          /* restore the type the particle had before it was tagged */
        if(P[i].TimeBin < 0)
          P[i].TimeBin = -P[i].TimeBin - 1;     /* inverse of the "mark as inactive" mapping: restores the original TimeBin */
      }

  CPU_Step[CPU_HOTNGBS] += measure_time();
}