Пример #1
0
// Benchmark driver for the digits10 variants and a tiny reference function.
// Obtains the rdtsc measurement overhead once, prints it, and hands it to
// every measure_time invocation together with the number of trials.
int main(){
  //calculate median of rdtsc call overhead - we will subtract it from all the tests we do
  int64_t overhead = calc_rdtsc_overhead();
  //int64_t is not `long` on every platform, so convert explicitly instead of printing it with %ld
  printf("Clock Measurement Overhead: %lld Cycles\n",(long long)overhead);
  uint64_t testDigit = 1334782398988024;
  uint32_t TRIALS=1000000; //measure 1M times - set it to lower value if your function is slow - the faster your function, more measurements you need - in my experiments, for fastest functions, values seem to converge within 1M iterations
  measure_time(Slow Digits  ,TRIALS,digits10_slow(testDigit),overhead); //default slow digits10
  measure_time(Fast Digits  ,TRIALS,digits10_fast(testDigit),overhead); //fast  digits10
  measure_time(Faster Digits,TRIALS,digits10_faster(testDigit),overhead); //faster digits10 
  measure_time(Small Function,TRIALS,smallFunction(i),overhead); //an example of measuring a very low-overhead function - it should be about 3 cycles on x86_64 (1 push, 1 mov, 1 ret) - we pass it a non-static argument (using loop variable i in measure_time macro body) so that it is not optimized away by the compiler from n calls to 1 call
}
Пример #2
0
// Benchmark driver comparing two binary-search implementations.
// Builds a sorted array holding 0..SIZE-1, then times a lookup of the value
// 23456 with each implementation via measure_time.
// Returns 0 on success and 1 when the array cannot be allocated.
int main(){
  //calculate median of rdtsc call overhead - we will subtract it from all the tests we do
  int64_t overhead = calc_rdtsc_overhead();
  //int64_t is not `long` on every platform, so convert explicitly instead of printing it with %ld
  printf("Median Clock Measurement Overhead: %lld Cycles\n",(long long)overhead);
  uint64_t SIZE = 1024*1024;
  int64_t* array = (int64_t*)malloc(SIZE*sizeof(int64_t));
  if(array == NULL){
    //without the buffer there is nothing to search - report it and stop
    fprintf(stderr,"Failed to allocate %llu bytes for the search array\n",(unsigned long long)(SIZE*sizeof(int64_t)));
    return 1;
  }
  //the index uses the same unsigned 64-bit type as SIZE so the comparison does not mix signedness
  for(uint64_t idx=0; idx < SIZE; ++idx) array[idx] = (int64_t)idx;
  uint32_t TRIALS=1000000; //measure 1M times - set it to lower value if your function is slow - the faster your function, more measurements you need - in my experiments, for fastest functions, values seem to converge within 1M iterations
  measure_time(CPP Binary Search,TRIALS,bin_long_cpp(array,SIZE,23456),overhead); //CPP binary search
  measure_time(C Binary Search,TRIALS,bin_long_c(array,SIZE,23456),overhead); //C binary search
  free(array);
  return 0;
}
Пример #3
0
void StoreState::update(simulator::Simulation& simulation, units::Time dt)
{
    // Keep the measurement object alive for the whole call
    auto _ = measure_time("diffusion.store-state", simulator::TimeMeasurementIterationOutput(simulation));

    // All rows go into the "diffusion" data table
    auto& diffusionTable = simulation.getDataTable("diffusion");

    // One row per grid coordinate
    for (auto&& coord : range(m_diffusionModule->getGridSize()))
    {
        // Row header: iteration, total time and the coordinate itself
        auto rowId = diffusionTable.addRow(
            makePair("iteration", simulation.getIteration()),
            makePair("totalTime", simulation.getTotalTime().value()),
            makePair("x", coord.getX()),
            makePair("y", coord.getY())
        );

        // One column per signal, named after the signal
        for (auto id : m_diffusionModule->getSignalIds())
        {
            diffusionTable.setData(
                rowId,
                makePair(m_diffusionModule->getSignalName(id), m_diffusionModule->getSignal(id, coord).value())
            );
        }
    }
}
Пример #4
0
void Simulation::deleteObjects()
{
    // Measurement object for "sim.delete"; lives until the function returns
    auto timeGuard = measure_time("sim.delete", TimeMeasurementIterationOutput(this));

    // Drop every object that has been flagged as deleted
    m_objects.removeDeleted();
}
Пример #5
0
void Module::update(simulator::Simulation& simulation, units::Time dt)
{
    // The step length is needed below when computing the release propensity
    m_step = dt;

    auto _ = measure_time("agglutination", simulator::TimeMeasurementIterationOutput(simulation));

    // Physics world that owns all joints
    auto& physicsWorld = simulation.getWorld();

    // Weld together every pair of bodies queued in m_toJoin
    for (const auto& pending : m_toJoin)
    {
        b2WeldJointDef weldDef;
        weldDef.Initialize(pending.bodyA, pending.bodyB, pending.bodyA->GetWorldCenter());

        // User data attached to the joint: owning module and the pair's constant
        JointUserData* attached = new JointUserData();
        attached->module = this;
        attached->Kd = pending.dConst;
        weldDef.userData = attached;

        physicsWorld.CreateJoint(&weldDef);
    }

    m_toJoin.clear();

    // Joints selected for destruction; destroyed only after the walk below
    DynamicArray<b2Joint*> released;

    // Walk all joints currently in the world
    for (auto current = physicsWorld.GetJointList(); current != nullptr; current = current->GetNext())
    {
        const JointUserData* info = reinterpret_cast<const JointUserData*>(current->GetUserData());

        // Skip joints without user data or without the '@' guard marker
        if (info == nullptr || info->guard != '@')
            continue;

        // Draw whether this joint is released during the current step
        std::bernoulli_distribution releaseDraw(getDisassociationPropensity(m_step, info->Kd));

        if (!releaseDraw(g_gen))
            continue;

        Log::debug("Released: ", current->GetBodyA(), ", ", current->GetBodyB());
        released.push_back(current);
        delete info;
    }

    // Destroy the selected joints
    for (auto doomed : released)
        physicsWorld.DestroyJoint(doomed);
}
Пример #6
0
// Times `repeat` uncontended write-lock/unlock pairs on an sfrlock_t, a
// pthread_rwlock_t and a pthread_mutex_t, and prints each result together
// with its relative difference to the sfrlock_t time.
TEST(sfrlock, uncontended_write_cost) {
  sfrlock_t sfrlock;
  pthread_rwlock_t rwlock;
  pthread_mutex_t mutex;

  sfrlock_init(&sfrlock);
  pthread_rwlock_init(&rwlock, nullptr);
  pthread_mutex_init(&mutex, nullptr);

  // Reference measurement: sfrlock write lock
  const double r = measure_time([&] () {
        for (unsigned left = repeat; left != 0; --left) {
          sfrlock_wrlock(&sfrlock);
          sfrlock_wrunlock(&sfrlock);
        }
      });
  printf("sfrlock_t time: %lf ms\n", r / 1e6);

  // pthread read-write lock taken for writing
  const double rwlockTime = measure_time([&] () {
        for (unsigned left = repeat; left != 0; --left) {
          pthread_rwlock_wrlock(&rwlock);
          pthread_rwlock_unlock(&rwlock);
        }
      });
  printf("pthread_rwlock_t time: %lf ms (%+.2lf%%)\n", rwlockTime / 1e6,
         -(1 - (rwlockTime / r)) * 100);

  // plain pthread mutex
  const double mutexTime = measure_time([&] () {
        for (unsigned left = repeat; left != 0; --left) {
          pthread_mutex_lock(&mutex);
          pthread_mutex_unlock(&mutex);
        }
      });
  printf("pthread_mutex_t time: %lf ms (%+.2lf%%)\n", mutexTime / 1e6,
         -(1 - (mutexTime / r)) * 100);

  pthread_rwlock_destroy(&rwlock);
  pthread_mutex_destroy(&mutex);
}
Пример #7
0
/*
 * Checks that BSWAP_64 turns the byte-reversed bit pattern of a known double
 * back into that double, then reports which of two known byte layouts the
 * same double has in memory on this machine.
 */
static void test_swap_double() {
    union {
        double d;
        uint64_t u;
    } ud = {
        .u = 0x7856341283C0F33F
    };
    double expected = 1.2344999991522893623141499119810760021209716796875;

    /* Swap the bytes and read the result back through the double member. */
    ud.u = BSWAP_64(ud.u);
    double swapped = ud.d;
    int matches = (swapped == expected);

    /* Same output order as before: the failure notice precedes the value,
       the success notice follows it. */
    if (!matches) {
        printf("swap failed\n");
    }
    printf("%.64f\n", ud.d);
    if (matches) {
        printf("swap okay\n");
    }

    /* Compare the in-memory bytes of the double against both layouts. */
    double probe = 1.2344999991522893623141499119810760021209716796875;
    int is_little = (memcmp(&probe, "\x78\x56\x34\x12\x83\xC0\xF3\x3F", 8) == 0);
    int is_big = !is_little && (memcmp(&probe, "\x3F\xF3\xC0\x83\x12\x34\x56\x78", 8) == 0);

    if (is_little) {
        printf("little endian double\n");
    } else if (is_big) {
        printf("big endian double\n");
    } else {
        printf("not support number format to dump!");
    }

}

/*
 * Test driver: calls test_swap_double, test_b32 and union_test, then passes
 * test_mc, test_mc2 and test_bo to measure_time together with a sample
 * 64-bit value and a label. argc/argv are accepted but not used.
 * Always returns 0.
 */
int main(int argc, char const* argv[]) {
    test_swap_double();
    test_b32();
    union_test();
    /* Sample value handed to every measured routine. */
    uint64_t i = 0x123456789abcdeff;
    /* NOTE(review): measure_time is not defined in this view; presumably it
       runs the given routine on `i` and reports the time under the label. */
    measure_time(test_mc, i, "memory copy");
    measure_time(test_mc2, i, "memory copy 2");
    measure_time(test_bo, i, "bitwise operation");
    return 0;
}
Пример #8
0
Файл: xxp.hpp Проект: mahrz/xxp
    /// Runs the experiment callback `f`.
    ///
    /// On a non-master instance in MPI mode, commands are exchanged in a loop:
    /// RQJ is sent and the reply is used as the job configuration; an empty
    /// reply ends the loop. Before the first job, pipes are set up for every
    /// action of type `pipe`. After each run of `f`, DNE is sent.
    ///
    /// Otherwise `f` is run directly when there are no actions, or through
    /// execute_action with the product of all action sizes. Outside MPI mode
    /// the timing is then measured and stored.
    void execute(std::function<void()> f)
    {
      const bool is_mpi_worker = mpi_mode && !master_instance;

      if(is_mpi_worker)
      {
        // Pipes are created lazily, once the first job has arrived
        bool pipes_ready = false;

        for(;;)
        {
          std::string job_config = send_command(RQJ);

          // An empty reply ends the loop
          if(job_config.empty())
            break;

          if(!pipes_ready)
          {
            for(action & a : actions)
            {
              if(a.t == pipe)
                setup_pipe(a);
            }
            pipes_ready = true;
          }

          parse_raw_config_str(job_config.c_str());
          f();

          send_command(DNE);
        }
        return;
      }

      // Master instance or plain (non-MPI) run
      if(actions.size() == 0)
      {
        f();
      }
      else
      {
        // Product of the sizes of all actions
        int action_size = 1;
        for(auto& a : actions)
          action_size *= a.size();

        int cur = 0;

        execute_action(0, action_size, cur, f);
      }

      if(!mpi_mode)
      {
        measure_time();
        store_timing();
      }
    }
Пример #9
0
void Simulation::updateObjects(units::Time dt)
{
    auto timeGuard = measure_time("sim.objects", TimeMeasurementIterationOutput(this));

    // Index-based walk on purpose: an object's update may append new objects,
    // so the count is re-read on every pass instead of using a range-for.
    object::Container::SizeType index = 0u;
    while (index < m_objects.getCount())
    {
        auto obj = m_objects[index];

        Assert(obj);
        obj->update(dt);

        ++index;
    }
}
int main( int argc, char * argv[])
{
    try
    {
        bool preserve = false, unwind = true, bind = false;
        boost::program_options::options_description desc("allowed options");
        desc.add_options()
            ("help", "help message")
            ("bind,b", boost::program_options::value< bool >( & bind), "bind thread to CPU")
            ("fpu,f", boost::program_options::value< bool >( & preserve), "preserve FPU registers")
            ("unwind,u", boost::program_options::value< bool >( & unwind), "unwind coroutine-stack")
            ("jobs,j", boost::program_options::value< boost::uint64_t >( & jobs), "jobs to run");

        boost::program_options::variables_map vm;
        boost::program_options::store(
                boost::program_options::parse_command_line(
                    argc,
                    argv,
                    desc),
                vm);
        boost::program_options::notify( vm);

        if ( vm.count("help") ) {
            std::cout << desc << std::endl;
            return EXIT_SUCCESS;
        }

        if ( preserve) preserve_fpu = boost::coroutines::fpu_preserved;
        if ( ! unwind) unwind_stack = boost::coroutines::no_stack_unwind;
        if ( bind) bind_to_processor( 0);

        duration_type overhead_c = overhead_clock();
        std::cout << "overhead " << overhead_c.count() << " nano seconds" << std::endl;
        boost::uint64_t res = measure_time( overhead_c).count();
        std::cout << "average of " << res << " nano seconds" << std::endl;
#ifdef BOOST_CONTEXT_CYCLE
        cycle_type overhead_y = overhead_cycle();
        std::cout << "overhead " << overhead_y << " cpu cycles" << std::endl;
        res = measure_cycles( overhead_y);
        std::cout << "average of " << res << " cpu cycles" << std::endl;
#endif

        return EXIT_SUCCESS;
    }
    catch ( std::exception const& e)
    { std::cerr << "exception: " << e.what() << std::endl; }
    catch (...)
    { std::cerr << "unhandled exception" << std::endl; }
    return EXIT_FAILURE;
}
Пример #11
0
// Times `repeat` uncontended lock attempts on a cwlock_t (unlocking whenever
// the lock call reports success) and prints the measured time.
TEST(cwlock, uncontended_acquire) {
  cwlock_t cwlock;
  cwlock_init(&cwlock);

  const double elapsed = measure_time([&] () {
      for (unsigned left = repeat; left != 0; --left) {
        // Only unlock when the lock call returned a truthy value
        if (!cwlock_lock(&cwlock)) {
          continue;
        }
        cwlock_unlock(&cwlock);
      }
    });

  printf("cwlock_t time: %lf ms\n", elapsed / 1e6);
}
Пример #12
0
int main( int argc, char * argv[])
{
    try
    {
        bind_to_processor( 0);

        boost::program_options::options_description desc("allowed options");
        desc.add_options()
            ("help", "help message")
            ("fpu,f", boost::program_options::value< bool >( & preserve_fpu), "preserve FPU registers")
            ("jobs,j", boost::program_options::value< boost::uint64_t >( & jobs), "jobs to run");

        boost::program_options::variables_map vm;
        boost::program_options::store(
                boost::program_options::parse_command_line(
                    argc,
                    argv,
                    desc),
                vm);
        boost::program_options::notify( vm);

        if ( vm.count("help") ) {
            std::cout << desc << std::endl;
            return EXIT_SUCCESS;
        }
 
        stack_allocator stack_alloc;
        fc = boost::context::make_fcontext(
                stack_alloc.allocate( stack_allocator::default_stacksize() ),
                stack_allocator::default_stacksize(),
                fn);

        boost::uint64_t res = measure_time().count();
        std::cout << "average of " << res << " nano seconds" << std::endl;
#ifdef BOOST_CONTEXT_CYCLE
        res = measure_cycles();
        std::cout << "average of " << res << " cpu cycles" << std::endl;
#endif

        return EXIT_SUCCESS;
    }
    catch ( std::exception const& e)
    { std::cerr << "exception: " << e.what() << std::endl; }
    catch (...)
    { std::cerr << "unhandled exception" << std::endl; }
    return EXIT_FAILURE;
}
Пример #13
0
/**
 * Objective function evaluation, used by the optimization algorithm.
 *
 * Broadcasts the solution vector to the workers, receives one pair of
 * partial values (squared error, field-measurement count) per transmitter
 * and returns the summed squared error divided by the summed count.
 *
 * params       a structure holding configuration parameters which are 
 *              common to all transmitters; `ntx` gives the number of partial
 *              results to collect and `clutter_category_count` the length of
 *              the solution vector;
 * tx_params    a structure holding transmitter-specific configuration 
 *              parameters (not used by this function);
 * radio_zone   radio zone for which the objective function is calculated
 *              (not used by this function);
 * sol_vector   solution vector over which the objective function is calculated;
 * comm         the object used to communicate with the workers;
 *
 * Returns the total mean-squared error. If the summed measurement count is
 * zero, the result of the floating-point division by zero is returned as is.
 * The process exits with status 1 if a receive reports an error.
 *
 */
static double
obj_func (Parameters    *params,
          Tx_parameters *tx_params,
          const char     radio_zone,
          double        *sol_vector,
          MPI_Comm      *comm)
{
    double score [params->ntx][2];
    double ret_value [2];
    MPI_Status status;

#ifdef _PERFORMANCE_METRICS_
    measure_time ("Send solution to all workers");
#endif
    //
    // broadcast the new solution to all workers
    //
    MPI_Bcast (sol_vector,
               params->clutter_category_count,
               MPI_DOUBLE,
               _COVERAGE_MASTER_RANK_,
               *comm);
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
    measure_time ("Gather partial objective-function values");
#endif
    //
    // receive the partial objective-function values from all workers,
    // aggregating the partial values before calculating the total 
    //
    ret_value[0] = 0;
    ret_value[1] = 0;
    int workers_evaluating = 0;
    while (workers_evaluating < params->ntx)
    {
        //
        // receive the partial objective-function value from this worker;
        // the outcome is taken from the return code, because MPI_Recv does
        // not fill in the MPI_ERROR field of the status object
        //
        int recv_result = MPI_Recv (&(score[workers_evaluating][0]),
                                    2,
                                    MPI_DOUBLE,
                                    MPI_ANY_SOURCE,
                                    MPI_ANY_TAG,
                                    *comm,
                                    &status);
        if (recv_result != MPI_SUCCESS)
        {
            //
            // the status object is not reliable after a failed receive,
            // so the error code is reported instead of the sender's rank
            //
            fprintf (stderr, 
                     "*** ERROR: Objective-function value incorrectly received from a worker (MPI error code %d)\n",
                     recv_result);
            fflush (stderr);
            exit (1);
        }
        //
        // aggregate the received squared error
        //
        ret_value[0] += score[workers_evaluating][0];
        
        //
        // aggregate the received field-measurement count
        //
        ret_value[1] += score[workers_evaluating][1];

        workers_evaluating ++;
    }
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
    measure_time ("Build complete objective-function value");
#endif
    //
    // the total mean-squared error
    //
    ret_value[0] /= ret_value[1];

#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif
    return ret_value[0];
}
Пример #14
0
bool Simulation::update(units::Duration dt)
{
    // Make sure the simulation has been initialized before stepping
    if (!isInitialized())
        initialize();

    // Advance the iteration counter and the accumulated simulation time
    m_iteration++;
    m_totalTime += dt;

    // Start the step with zero force on every object
    for (auto& entry : m_objects)
        entry->setForce(Zero);

    // Step pipeline: modules, objects, deserter detection, deletion
    updateModules(dt);
    updateObjects(dt);
    detectDeserters();
    deleteObjects();

    // Optional per-object dump, one ';'-separated line per object:
    // iteration;totalTime;id;typeName;posX;posY;velX;velY
    if (m_dataOutObjects)
    {
        for (const auto& entity : m_objects)
        {
            const auto position = entity->getPosition();
            const auto velocity = entity->getVelocity();

            *m_dataOutObjects
                << getIteration() << ";"
                << getTotalTime() << ";"
                << entity->getId() << ";"
                << entity->getTypeName() << ";"
                << position.getX() << ";"
                << position.getY() << ";"
                << velocity.getX() << ";"
                << velocity.getY() << "\n";
        }
    }

#ifdef CECE_ENABLE_BOX2D_PHYSICS
    {
        // Physics step, measured separately as "sim.physics"
        auto physicsTimer = measure_time("sim.physics", TimeMeasurementIterationOutput(this));

        m_world.Step(getPhysicsEngineTimeStep().value(), 10, 10);
    }
#endif

    // Keep going while iterations are unlimited or the limit is not exceeded
    if (hasUnlimitedIterations())
        return true;

    return getIteration() <= getIterations();
}
Пример #15
0
void Simulation::updateModules(units::Time dt)
{
    // Measurement object for "sim.modules"; lives until the function returns
    auto timeGuard = measure_time("sim.modules", TimeMeasurementIterationOutput(this));

    // Forward the step to the module container
    m_modules.update(*this, dt);
}
Пример #16
0
/* Map function that creates the initial data file for this rank and emits
   one key-value pair: the key stored in the shared state and the name of the
   created file. `p` must point to a common_t. Always returns MPI_SUCCESS. */
static int
add_initial_data(const struct kmr_kv_box kv,
		 const KMR_KVS *kvi, KMR_KVS *kvo, void *p, long i_)
{
    common_t *ctx = (common_t *)p;
    char path[FILENAME_LEN];

    create_file(ctx->rank, ctx->iteration, ctx->file_size,
		path, FILENAME_LEN);
    ctx->val_count = IO_COUNT * ctx->file_size;

    /* Both lengths include the terminating NUL. */
    struct kmr_kv_box emitted = {
	.klen = sizeof(char) * (strlen(ctx->key) + 1),
	.k.p = ctx->key,
	.vlen = sizeof(char) * (strlen(path) + 1),
	.v.p = (void *)path
    };
    kmr_add_kv(kvo, emitted);

    return MPI_SUCCESS;
}

/* Map function that copies the file named by the incoming value into a new
   file for the next iteration, adding 1 to every long on the way, emits the
   new file name under the key stored in the shared state, and deletes the
   file of the current iteration. `p` must point to a common_t.
   Always returns MPI_SUCCESS. */
static int
increment_in_file_value(const struct kmr_kv_box kv,
			const KMR_KVS *kvi, KMR_KVS *kvo, void *p, long i_)
{
    common_t *ctx = (common_t *)p;
    char *src_path = (char *)kv.v.p;
    char dst_path[FILENAME_LEN];
    snprintf(dst_path, FILENAME_LEN, "./%06d-%02d.dat", ctx->rank,
	     ctx->iteration + 1);

    FILE *ifp = fopen(src_path, "r");
    FILE *ofp = fopen(dst_path, "w+");
    assert(ifp != 0 && ofp != 0);

    /* Process the file in chunks of IO_COUNT longs. */
    long *chunk = (long *)malloc(sizeof(long) * IO_COUNT);
    int pass = 0;
    while (pass < ctx->file_size) {
	size_t cc = fread(chunk, sizeof(long), IO_COUNT, ifp);
	assert(cc == IO_COUNT);

	int k;
	for (k = 0; k < IO_COUNT; k++) {
	    chunk[k] += 1;
	}

	cc = fwrite(chunk, sizeof(long), IO_COUNT, ofp);
	assert(cc == IO_COUNT);
	pass++;
    }
    free(chunk);
    fclose(ofp);

    /* Emit the new file name; both lengths include the terminating NUL. */
    struct kmr_kv_box emitted = {
	.klen = sizeof(char) * (strlen(ctx->key) + 1),
	.k.p = ctx->key,
	.vlen = sizeof(char) * (strlen(dst_path) + 1),
	.v.p = (void *)dst_path
    };
    kmr_add_kv(kvo, emitted);
#ifdef DEBUG
    /* Report the first value of the input file for tracing. */
    fseek(ifp, 0, SEEK_SET);
    long first_value;
    fread(&first_value, sizeof(long), 1, ifp);
    fprintf(stderr, "Rank[%d]: process key[%s]-val[%ld]\n",
	    ctx->rank, (char *)kv.k.p, first_value);
#endif
    fclose(ifp);
    delete_file(ctx->rank, ctx->iteration);

    return MPI_SUCCESS;
}


/* Benchmark driver: initializes MPI and KMR, creates the initial data with
   add_initial_data, then runs ITERATIONS rounds of
   kmr_map_multiprocess_by_key with increment_in_file_value, timing each
   round, and finally prints the collected times. Returns 0. */
int
main(int argc, char **argv)
{
    int provided_level;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided_level);

    int nprocs, rank, task_nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    check_nprocs(nprocs, rank, &task_nprocs);

    kmr_init();
    KMR *mr = kmr_create_context(MPI_COMM_WORLD, MPI_INFO_NULL, 0);
    mr->verbosity = 5;
    mr->trace_map_mp = 1;

    /* Two keys per rank; the rounds below alternate between them. */
    char even_key[KEY_LEN];
    char odd_key[KEY_LEN];
    snprintf(even_key, KEY_LEN, "even%06d", (rank / task_nprocs + 1));
    snprintf(odd_key,  KEY_LEN, "odd%06d",  (rank % task_nprocs + 1));

    /* State shared with the map functions. */
    common_t common0;
    common0.key = even_key;
    parse_param_file(argc, argv, &(common0.file_size));
    common0.rank = rank;
    common0.iteration = 0;

    KMR_KVS *kvs0 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE);
    kmr_map_once(kvs0, &common0, kmr_noopt, 0, add_initial_data);

    double itr_times[ITERATIONS];
    int round;
    for (round = 0; round < ITERATIONS; round++) {
	/* Even rounds use the odd key, odd rounds the even key. */
	if (round % 2 == 0) {
	    common0.key = odd_key;
	} else {
	    common0.key = even_key;
	}
	common0.iteration = round;
	KMR_KVS *kvs1 = kmr_create_kvs(mr, KMR_KV_OPAQUE, KMR_KV_OPAQUE);

	struct timeval started;
	struct timeval finished;
	measure_time(&started);
	kmr_map_multiprocess_by_key(kvs0, kvs1, &common0, kmr_noopt, rank,
				    increment_in_file_value);
	measure_time(&finished);
	itr_times[round] = calc_time_diff(&started, &finished);

	/* The output of this round is the input of the next one. */
	kvs0 = kvs1;
    }
    kmr_free_kvs(kvs0);
    delete_file(common0.rank, common0.iteration + 1);

    print_time(itr_times, ITERATIONS, rank);

    kmr_free_context(mr);
    kmr_fin();
    MPI_Finalize();
    return 0;
}
Пример #17
0
/**
 * Calculates the coverage prediction for one transmitter, using the E/// model.
 *
 * The line-of-sight profile is calculated first (on CPU, or simulated on GPU),
 * then the path loss, then the antenna influence. When the GPU is used, the
 * path-loss matrix is read back into host memory at the end.
 *
 * params           a structure holding configuration parameters which are 
 *                  common to all transmitters;
 * tx_params        a structure holding transmitter-specific configuration
 *                  parameters;
 * rank             rank of the calling process; only `rank % 2` is used, as
 *                  the last argument of init_gpu (presumably to select one
 *                  of two devices - confirm against init_gpu).-
 *
 */
void 
coverage (Parameters    *params,
          Tx_parameters *tx_params,
          const int     rank)
{
    //
    // execute the path-loss calculation on CPU or GPU?
    //
    if (params->use_gpu)
    {
        //
        // initialize the OpenCL environment
        //
        init_gpu (params,
                  tx_params,
                  rank % 2);
        //
        // SIMULATE the LOS calculation on GPU
        //
        DoProfile_gpu (tx_params->m_obst_height,
                       tx_params->m_obst_dist,
                       tx_params->m_obst_offset,
                       1.0,
                       tx_params->m_dem,
                       tx_params->tx_north_coord_idx,
                       tx_params->tx_east_coord_idx,
                       tx_params->total_tx_height,
                       tx_params->nrows,
                       tx_params->ncols,
                       params->map_ew_res,
                       params->radius);
#ifdef _PERFORMANCE_METRICS_
        measure_time ("E/// on GPU");
#endif
        eric_pathloss_on_gpu (params,
                              tx_params);
    }
    else
    {
        //
        // calculate the terrain profile from the top of the transmitter,
        // i.e. line-of-sight, only once per transmitter
        // 
        DoProfile (tx_params->m_obst_height,
                   tx_params->m_obst_dist,
                   tx_params->m_obst_offset,
                   1.0,
                   tx_params->m_dem,
                   tx_params->tx_north_coord_idx,
                   tx_params->tx_east_coord_idx,
                   tx_params->total_tx_height,
                   tx_params->nrows,
                   tx_params->ncols,
                   params->map_ew_res,
                   params->radius);
#ifdef _PERFORMANCE_METRICS_
        measure_time ("E/// on CPU");
#endif
        eric_pathloss_on_cpu (params,
                              tx_params);
    }
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif

    //
    // calculate the antenna influence, 
    // overwriting the isotropic path-loss
    //
#ifdef _PERFORMANCE_METRICS_
    measure_time ("Antenna influence");
#endif
    calculate_antenna_influence (params,
                                 tx_params);
#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif
    //
    // if the coverage calculation happened on the GPU,
    // we need to refresh the memory buffers on the host
    //
    if (params->use_gpu)
    {
        //
        // widen before multiplying, so that the row-by-column product
        // is not computed (and possibly overflowed) in a narrower type
        //
        size_t buff_size = (size_t) tx_params->nrows * 
                           (size_t) tx_params->ncols * 
                           sizeof (tx_params->m_loss[0][0]);
        read_buffer_blocking (tx_params->ocl_obj,
                              0,
                              tx_params->m_loss_dev,
                              buff_size,
                              tx_params->m_loss[0]);
    }
}
Пример #18
0
/**
 * Simulates the line-of-sight calculation on GPU.-
 *
 * For every column index ix in [0, xN) a direction step (dx, dy) is derived
 * from the azimuth between ix and the transmitter position (xBS, yBS), and
 * calc_profile is called with that step. Only the loop for quadrant I is
 * active; the loops for quadrants II, III and IV are commented out below.
 *
 * Obst_high, Obst_dist, Offset, Raster   matrices forwarded unchanged to
 *                                        calc_profile;
 * ResDist                                not used by the active code;
 * xBS, yBS                               transmitter position; floor() of
 *                                        both is used for the azimuth;
 * ZoTransBS, xN, yN, scale, radius       forwarded unchanged to calc_profile
 *                                        (xN also bounds the loop).
 *
 * Always returns 0.
 *
 */
static int 
DoProfile_gpu (double **Obst_high,
               double **Obst_dist,
               double **Offset,
               double ResDist, 
               double **Raster, 
               double xBS, 
               double yBS, 
               double ZoTransBS, 
               int xN, 
               int yN, 
               double scale, 
               double radius)
{
#ifdef _PERFORMANCE_METRICS_
    measure_time ("Simulating Line-of-sight on GPU");
#endif
	double AZI;
	int ix, iy;
	double dx, dy;
	
    //
    // LOS and obstacle height calculation is executed only once, 
    // because its results are constant throughout the optimization
    //

    /* Offset initialization (disabled)
	for (ix = 0; ix < xN; ix++)
    {
		for (iy = 0; iy < yN; iy++)
        {
			Offset[ix][iy]=999;
		}
	}*/
	
	// Quadrant I
	for (ix = 0; ix < xN; ix++)
	{
		//Patrik AZI = atan((ix - xBS) / yBS);
		AZI = atan((ix - floor(xBS)) / floor(yBS));
		
		// the larger step component is normalized to magnitude 1,
		// the other one becomes the tangent or cotangent of the azimuth
		if (cos(AZI) > sin(AZI))
        {
			//Patrik dx = sin(AZI) / cos(AZI);
			//Patrik dy = -cos(AZI) / cos(AZI);
			dx = (ix - floor(xBS)) / floor(yBS);			// tan(AZI)
			dy = -1;
		}
		else
        {
			//Patrik dx = sin(AZI) / sin(AZI);
			//Patrik dy = -cos(AZI) / sin(AZI);
			dx = 1;
			dy = -floor(yBS)/(ix - floor(xBS));				// ctan(AZI)
		}
		calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 1st quadrant: %d\n", ix);
#endif
	}

	/* Quadrant III (disabled, as are quadrants II and IV below)
	for (ix = 0; ix < xN; ix++)
	{
		//Patrik AZI = atan((ix - xBS) / (yN - yBS));
		AZI = atan((ix - floor(xBS)) / (yN - floor(yBS)));
		
		if (cos(AZI) > sin(AZI))
        {
			//Patrik dx = sin(AZI) / cos(AZI);
			//Patrik dy = cos(AZI) / cos(AZI);
			dx = (ix - floor(xBS)) / (yN - floor(yBS));			// tan(AZI)
			dy = 1;
		}
		else
        {
			//Patrik dx = sin(AZI) / sin(AZI);
			//Patrik dy = cos(AZI) / sin(AZI);
			dx = 1;
			dy = (yN - floor(yBS)) / (ix - floor(xBS));				// ctan(AZI)
		}
				
		calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 3rd quadrant: %d\n", ix);
#endif
	} 
	
	// Quadrant II
	for (iy = 0; iy < yN; iy++)
	{
		//Patrik AZI = atan((iy - yBS) / (xN - xBS));
		AZI = atan((iy - floor(yBS)) / (xN - floor(xBS)));
			
		if (cos(AZI) > sin(AZI))
        {
			//Patrik dx = cos(AZI) / cos(AZI);
			//Patrik dy = sin(AZI) / cos(AZI);
			dx = 1;			
			dy = (iy - floor(yBS)) / (xN - floor(xBS));		// tan(AZI)
		}
		else
        {
			//Patrik dx = cos(AZI) / sin(AZI);
			//Patrik dy = sin(AZI) / sin(AZI);
			dx = (xN - floor(xBS)) / (iy - floor(yBS));				// ctan(AZI)
			dy = 1;
		}
		
		calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);	
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 2nd quadrant: %d\n", ix);
#endif
	}

	// Quadrant IV
	for (iy = 0; iy < yN; iy++)
	{
		//Patrik AZI = atan((iy - yBS) / xBS);
		AZI = atan((iy - floor(yBS)) / floor(xBS));
		
		if (cos(AZI) > sin(AZI))
        {
			//Patrik dx = -cos(AZI) / cos(AZI);
			//Patrik dy = sin(AZI) / cos(AZI);
			dx = -1;			
			dy = (iy - floor(yBS)) / floor(xBS);		// tan(AZI)
		}
		else
        {
			//Patrik dx = -cos(AZI) / sin(AZI);
			//Patrik dy = sin(AZI) / sin(AZI);
			dx = -floor(xBS) / (iy - floor(yBS));				// ctan(AZI)
			dy = 1;
		}
		
		calc_profile (Obst_high, Obst_dist, Raster, Offset, dx, dy, xBS, yBS, ZoTransBS, xN, yN, scale, radius);			
#ifdef _DEBUG_INFO_
        printf ("DoProfile -> 4th quadrant: %d\n", ix);
#endif
	}*/

#ifdef _PERFORMANCE_METRICS_
    measure_time (NULL);
#endif
	return 0;
}
Пример #19
0
/*! Reads the initial conditions from the file(s) whose base name is given
 *  by fname.
 *
 *  find_files() reports how many files belong to the set. While more files
 *  remain than there are tasks, every task reads one file of its own, in
 *  groups separated by barriers. The remaining files are then distributed
 *  over the tasks with distribute_file(). Afterwards particle masses are
 *  filled in from All.MassTable where a table entry is set, optional
 *  compile-time features adjust the particle data, and the initial gas
 *  energy is derived from All.InitGasTemp.
 *
 *  File names are built in a fixed 500-byte buffer; they are formatted with
 *  snprintf so that an overlong fname is truncated instead of overrunning
 *  the buffer.
 *
 *  The time elapsed since the previous measure_time() call is booked under
 *  CPU_MISC on entry, and the time spent in this function under
 *  CPU_SNAPSHOT on exit.
 */
void read_ic(char *fname)
{
  int i, num_files, rest_files, ngroups, gr, filenr, masterTask, lastTask, groupMaster;
  double u_init, molecular_weight, dmax1, dmax2;
  char buf[500];

  CPU_Step[CPU_MISC] += measure_time();

#ifdef RESCALEVINI
  if(ThisTask == 0 && RestartFlag == 0)
    {
      fprintf(stdout, "\nRescaling v_ini !\n\n");
      fflush(stdout);
    }
#endif

  NumPart = 0;
  N_gas = 0;
  All.TotNumPart = 0;

  num_files = find_files(fname);

#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
  NumPartPerFile = (long long *) mymalloc(num_files * sizeof(long long));

  if(ThisTask == 0)
    get_particle_numbers(fname, num_files);

  MPI_Bcast(NumPartPerFile, num_files * sizeof(long long), MPI_BYTE, 0, MPI_COMM_WORLD);
#endif

  rest_files = num_files;

  /* more files than tasks: every task reads one file per round */
  while(rest_files > NTask)
    {
      snprintf(buf, sizeof(buf), "%s.%d", fname, ThisTask + (rest_files - NTask));
      if(All.ICFormat == 3)
	snprintf(buf, sizeof(buf), "%s.%d.hdf5", fname, ThisTask + (rest_files - NTask));
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
      FileNr = ThisTask + (rest_files - NTask);
#endif

      ngroups = NTask / All.NumFilesWrittenInParallel;
      if((NTask % All.NumFilesWrittenInParallel))
	ngroups++;
      groupMaster = (ThisTask / ngroups) * ngroups;

      for(gr = 0; gr < ngroups; gr++)
	{
	  if(ThisTask == (groupMaster + gr))	/* ok, it's this processor's turn */
	    read_file(buf, ThisTask, ThisTask);
	  MPI_Barrier(MPI_COMM_WORLD);
	}

      rest_files -= NTask;
    }


  /* remaining files are shared among the tasks */
  if(rest_files > 0)
    {
      distribute_file(rest_files, 0, 0, NTask - 1, &filenr, &masterTask, &lastTask);

      if(num_files > 1)
	{
	  snprintf(buf, sizeof(buf), "%s.%d", fname, filenr);
	  if(All.ICFormat == 3)
	    snprintf(buf, sizeof(buf), "%s.%d.hdf5", fname, filenr);
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
	  FileNr = filenr;
#endif
	}
      else
	{
	  snprintf(buf, sizeof(buf), "%s", fname);
	  if(All.ICFormat == 3)
	    snprintf(buf, sizeof(buf), "%s.hdf5", fname);
#if defined(SAVE_HSML_IN_IC_ORDER) || defined(SUBFIND_RESHUFFLE_CATALOGUE)
	  FileNr = 0;
#endif
	}

      ngroups = rest_files / All.NumFilesWrittenInParallel;
      if((rest_files % All.NumFilesWrittenInParallel))
	ngroups++;

      for(gr = 0; gr < ngroups; gr++)
	{
	  if((filenr / All.NumFilesWrittenInParallel) == gr)	/* ok, it's this processor's turn */
	    read_file(buf, masterTask, lastTask);
	  MPI_Barrier(MPI_COMM_WORLD);
	}
    }

#if defined(SUBFIND_RESHUFFLE_CATALOGUE)
  subfind_reshuffle_free();
#endif

  myfree_msg(CommBuffer, "CommBuffer");


  if(header.flag_ic_info != FLAG_SECOND_ORDER_ICS)
    {
      /* this makes sure that masses are initialized in the case that the mass-block
         is empty for this particle type */
      for(i = 0; i < NumPart; i++)
	{
	  if(All.MassTable[P[i].Type] != 0)
	    P[i].Mass = All.MassTable[P[i].Type];
	}
    }


#ifdef GENERATE_GAS_IN_ICS
  int count, j;
  double fac, d, a, b, rho;

  if(RestartFlag == 0)
    {
      header.flag_entropy_instead_u = 0;

      /* count the type-1 particles; each of them gets a gas companion */
      for(i = 0, count = 0; i < NumPart; i++)
	if(P[i].Type == 1)
	  count++;

      /* shift the existing particles up to make room for the gas at the front */
      memmove(P + count, P, sizeof(struct particle_data) * NumPart);

      NumPart += count;
      N_gas += count;

      if(N_gas > All.MaxPartSph)
        {
          printf("Task=%d ends up getting more SPH particles (%d) than allowed (%d)\n",
                 ThisTask, N_gas, All.MaxPartSph);
          endrun(111);
       }

      fac = All.OmegaBaryon / All.Omega0;
      rho = All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G);

      for(i = count, j = 0; i < NumPart; i++)
	if(P[i].Type == 1)
	  {
	    P[j] = P[i];

	    d = pow(P[i].Mass / rho, 1.0 / 3);
	    a = 0.5 * All.OmegaBaryon / All.Omega0 * d;
	    b = 0.5 * (All.Omega0 - All.OmegaBaryon) / All.Omega0 * d;

	    /* split the mass between the new gas particle and the original one */
	    P[j].Mass *= fac;
	    P[i].Mass *= (1 - fac);
	    P[j].Type = 0;
	    P[j].ID += 1000000000;

	    /* displace the two particles in opposite directions */
	    P[i].Pos[0] += a;
	    P[i].Pos[1] += a;
	    P[i].Pos[2] += a;
	    P[j].Pos[0] -= b;
	    P[j].Pos[1] -= b;
	    P[j].Pos[2] -= b;

	    j++;
	  }

      All.MassTable[0] = fac * All.MassTable[1];
      All.MassTable[1] *= (1 - fac);
    }
#endif



#if defined(BLACK_HOLES) && defined(SWALLOWGAS)
  if(RestartFlag == 0)
    {
      All.MassTable[5] = 0;
    }
#endif

#ifdef SFR
  if(RestartFlag == 0)
    {
      if(All.MassTable[4] == 0 && All.MassTable[0] > 0)
	{
	  All.MassTable[0] = 0;
	  All.MassTable[4] = 0;
	}
    }
#endif


  u_init = (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * All.InitGasTemp;
  u_init *= All.UnitMass_in_g / All.UnitEnergy_in_cgs;	/* unit conversion */

  if(All.InitGasTemp > 1.0e4)	/* assuming FULL ionization */
    molecular_weight = 4 / (8 - 5 * (1 - HYDROGEN_MASSFRAC));
  else				/* assuming NEUTRAL GAS */
    molecular_weight = 4 / (1 + 3 * HYDROGEN_MASSFRAC);

  u_init /= molecular_weight;

  All.InitGasU = u_init;



  if(RestartFlag == 0)
    {
      if(All.InitGasTemp > 0)
	{
	  for(i = 0; i < N_gas; i++)
	    {
	      if(ThisTask == 0 && i == 0 && SphP[i].Entropy == 0)
		printf("Initializing u from InitGasTemp !\n");

	      if(SphP[i].Entropy == 0)
		SphP[i].Entropy = All.InitGasU;

	      /* Note: the conversion to entropy will be done in the function init(),
	         after the densities have been computed */
	    }
	}
    }

  /* enforce the lower bound All.MinEgySpec */
  for(i = 0; i < N_gas; i++)
    SphP[i].Entropy = DMAX(All.MinEgySpec, SphP[i].Entropy);

#ifdef EOS_DEGENERATE
  for(i = 0; i < N_gas; i++)
    SphP[i].u = 0;
#endif

  MPI_Barrier(MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      printf("reading done.\n");
      fflush(stdout);
    }

  if(ThisTask == 0)
    {
      /* the total is printed as two parts of at most nine decimal digits */
      printf("Total number of particles :  %d%09d\n\n",
	     (int) (All.TotNumPart / 1000000000), (int) (All.TotNumPart % 1000000000));
      fflush(stdout);
    }

  CPU_Step[CPU_SNAPSHOT] += measure_time();
}
Пример #20
0
/*! This function computes the gravitational potential for ALL the particles.
 *  First, the (short-range) tree potential is computed, and then, if needed,
 *  the long range PM potential is added.
 *
 *  Outline of the gravity-enabled path (NOGRAVITY not defined):
 *   - rebuild the tree if TreeReconstructFlag is set,
 *   - drift every particle whose Ti_current differs from All.Ti_Current,
 *   - repeatedly evaluate local particles, export the ones that also need
 *     other tasks, evaluate the particles imported from other tasks and add
 *     the returned values to P[].p.dPotential, until all tasks are done,
 *   - apply the self-potential correction, multiply by All.G, add the PM
 *     contribution (PMGRID only) and the background term.
 *
 *  If NOGRAVITY is defined, the potential of every particle is set to zero.
 *  Elapsed time is accumulated into the CPU_Step[] counters.
 */
void compute_potential(void)
{
  int i;

#ifndef NOGRAVITY
  int j, k, ret, sendTask, recvTask;
  int ndone, ndone_flag, dummy;
  int ngrp, place, nexport, nimport;
  double fac;
  MPI_Status status;
  double r2;

  if(All.ComovingIntegrationOn)
    set_softenings();

  if(ThisTask == 0)
    {
      printf("Start computation of potential for all particles...\n");
      fflush(stdout);
    }

  /* time spent up to here is booked as miscellaneous */
  CPU_Step[CPU_MISC] += measure_time();


  if(TreeReconstructFlag)
    {
      if(ThisTask == 0)
	printf("Tree construction.\n");

      CPU_Step[CPU_MISC] += measure_time();

#if defined(SFR) || defined(BLACK_HOLES)
      rearrange_particle_sequence();
#endif

      force_treebuild(NumPart, NULL);

      CPU_Step[CPU_TREEBUILD] += measure_time();

      /* the tree is up to date now; clear the request */
      TreeReconstructFlag = 0;

      if(ThisTask == 0)
	printf("Tree construction done.\n");
    }


  /* allocate buffers to arrange communication */
  /* BunchSize = number of export entries that fit into All.BufferSize * 1024 * 1024
     bytes, given the summed size of all per-entry structures listed below */
  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
					     sizeof(struct gravdata_in) + sizeof(struct potdata_out) +
					     sizemax(sizeof(struct gravdata_in),
						     sizeof(struct potdata_out))));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));

  /* drift every particle that is not yet at the current time */
  for(i = 0; i < NumPart; i++)
    if(P[i].Ti_current != All.Ti_Current)
      drift_particle(i, All.Ti_Current);

  i = 0;			/* begin with this index */

  /* Each pass handles as many local particles as the export buffer allows;
     the loop repeats until all NTask tasks report that they are finished. */
  do
    {
      for(j = 0; j < NTask; j++)
	{
	  Send_count[j] = 0;
	  Exportflag[j] = -1;
	}

      /* do local particles and prepare export list */
      /* note: i is NOT reset here, so a later pass resumes where the buffer filled up */
      for(nexport = 0; i < NumPart; i++)
	{
#ifndef PMGRID
	  ret = force_treeevaluate_potential(i, 0, &nexport, Send_count);
#else
	  ret = force_treeevaluate_potential_shortrange(i, 0, &nexport, Send_count);
#endif
	  if(ret < 0)
	    break;		/* export buffer has filled up */
	}

      /* order the export entries with data_index_compare (presumably by
         destination task, so that they match Send_offset[] -- TODO confirm) */
#ifdef MYSORT
      mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
      qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

      /* every task obtains the send counts of all tasks */
      MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

      /* what task j sends to us is what we receive from j; build prefix-sum offsets */
      for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
	{
	  Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
	  nimport += Recv_count[j];

	  if(j > 0)
	    {
	      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
	      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
	    }
	}

      GravDataGet = (struct gravdata_in *) mymalloc(nimport * sizeof(struct gravdata_in));
      GravDataIn = (struct gravdata_in *) mymalloc(nexport * sizeof(struct gravdata_in));

      /* prepare particle data for export */
      for(j = 0; j < nexport; j++)
	{
	  place = DataIndexTable[j].Index;

	  for(k = 0; k < 3; k++)
	    GravDataIn[j].Pos[k] = P[place].Pos[k];

#ifdef UNEQUALSOFTENINGS
	  GravDataIn[j].Type = P[place].Type;
#ifdef ADAPTIVE_GRAVSOFT_FORGAS
	  if(P[place].Type == 0)
	    GravDataIn[j].Soft = SphP[place].Hsml;
#endif
#endif
	  GravDataIn[j].OldAcc = P[place].OldAcc;

	  for(k = 0; k < NODELISTLENGTH; k++)
	    GravDataIn[j].NodeList[k] = DataNodeList[DataIndexTable[j].IndexGet].NodeList[k];
	}


      /* exchange particle data */
      /* pairwise exchange: the partner of this task in round ngrp is ThisTask XOR ngrp;
         partners with rank >= NTask do not exist and are skipped */

      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	{
	  sendTask = ThisTask;	/* NOTE(review): assigned but never read in this function */
	  recvTask = ThisTask ^ ngrp;

	  if(recvTask < NTask)
	    {
	      if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		{
		  /* get the particles */
		  MPI_Sendrecv(&GravDataIn[Send_offset[recvTask]],
			       Send_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
			       recvTask, TAG_POTENTIAL_A,
			       &GravDataGet[Recv_offset[recvTask]],
			       Recv_count[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
			       recvTask, TAG_POTENTIAL_A, MPI_COMM_WORLD, &status);
		}
	    }
	}

      /* the export copy is no longer needed once it has been sent */
      myfree(GravDataIn);
      PotDataResult = (struct potdata_out *) mymalloc(nimport * sizeof(struct potdata_out));
      PotDataOut = (struct potdata_out *) mymalloc(nexport * sizeof(struct potdata_out));


      /* now do the particles that were sent to us */
      /* second argument 1 selects the imported-particle mode; the dummy pointers
         are presumably ignored in that mode -- TODO confirm against the callee */
      for(j = 0; j < nimport; j++)
	{
#ifndef PMGRID
	  force_treeevaluate_potential(j, 1, &dummy, &dummy);
#else
	  force_treeevaluate_potential_shortrange(j, 1, &dummy, &dummy);
#endif
	}

      /* this task is finished once the local loop above has reached NumPart */
      if(i >= NumPart)
	ndone_flag = 1;
      else
	ndone_flag = 0;

      /* ndone = number of tasks that are finished */
      MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      /* get the result */
      /* same pairing as above, with the roles of the send/receive counts swapped */
      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	{
	  sendTask = ThisTask;
	  recvTask = ThisTask ^ ngrp;
	  if(recvTask < NTask)
	    {
	      if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		{
		  /* send the results */
		  MPI_Sendrecv(&PotDataResult[Recv_offset[recvTask]],
			       Recv_count[recvTask] * sizeof(struct potdata_out),
			       MPI_BYTE, recvTask, TAG_POTENTIAL_B,
			       &PotDataOut[Send_offset[recvTask]],
			       Send_count[recvTask] * sizeof(struct potdata_out),
			       MPI_BYTE, recvTask, TAG_POTENTIAL_B, MPI_COMM_WORLD, &status);
		}
	    }

	}

      /* add the results to the local particles */
      for(j = 0; j < nexport; j++)
	{
	  place = DataIndexTable[j].Index;

	  P[place].p.dPotential += PotDataOut[j].Potential;
	}

      /* buffers are released in reverse order of their allocation */
      myfree(PotDataOut);
      myfree(PotDataResult);
      myfree(GravDataGet);
    }
  while(ndone < NTask);

  myfree(DataNodeList);
  myfree(DataIndexTable);

  /* add correction to exclude self-potential */

  for(i = 0; i < NumPart; i++)
    {
#ifdef FLTROUNDOFFREDUCTION
      /* take over the value accumulated in dPotential */
      P[i].p.Potential = FLT(P[i].p.dPotential);
#endif
      /* remove self-potential */
      P[i].p.Potential += P[i].Mass / All.SofteningTable[P[i].Type];

      /* extra term applied only for comoving integration with periodic boundaries */
      if(All.ComovingIntegrationOn)
	if(All.PeriodicBoundariesOn)
	  P[i].p.Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) *
	    pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3);
    }


  /* multiply with the gravitational constant */

  for(i = 0; i < NumPart; i++)
    P[i].p.Potential *= All.G;


#ifdef PMGRID

  /* Long-range part. A return value of 1 from pmpotential_nonperiodic() triggers
     one retry after the region size has been re-initialised; a second failure
     terminates the run. */
#ifdef PERIODIC
  pmpotential_periodic();
#ifdef PLACEHIGHRESREGION
  i = pmpotential_nonperiodic(1);
  if(i == 1)			/* this is returned if a particle lay outside the allowed range */
    {
      pm_init_regionsize();
      pm_setup_nonperiodic_kernel();
      i = pmpotential_nonperiodic(1);	/* try again */
    }
  if(i == 1)
    endrun(88686);
#endif
#else
  i = pmpotential_nonperiodic(0);
  if(i == 1)			/* this is returned if a particle lay outside the allowed range */
    {
      pm_init_regionsize();
      pm_setup_nonperiodic_kernel();
      i = pmpotential_nonperiodic(0);	/* try again */
    }
  if(i == 1)
    endrun(88687);
#ifdef PLACEHIGHRESREGION
  i = pmpotential_nonperiodic(1);
  if(i == 1)			/* this is returned if a particle lay outside the allowed range */
    {
      /* NOTE(review): unlike the two retry paths above, pm_setup_nonperiodic_kernel()
         is not called here before retrying -- confirm whether this is intended */
      pm_init_regionsize();

      i = pmpotential_nonperiodic(1);
    }
  if(i != 0)
    endrun(88688);
#endif
#endif

#endif



  /* background term proportional to the squared distance from the origin */
  if(All.ComovingIntegrationOn)
    {
#ifndef PERIODIC
      fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble;

      for(i = 0; i < NumPart; i++)
	{
	  for(k = 0, r2 = 0; k < 3; k++)
	    r2 += P[i].Pos[k] * P[i].Pos[k];

	  P[i].p.Potential += fac * r2;
	}
#endif
    }
  else
    {
      /* non-comoving case: the term is built from OmegaLambda and skipped if it vanishes */
      fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble;
      if(fac != 0)
	{
	  for(i = 0; i < NumPart; i++)
	    {
	      for(k = 0, r2 = 0; k < 3; k++)
		r2 += P[i].Pos[k] * P[i].Pos[k];

	      P[i].p.Potential += fac * r2;
	    }
	}
    }


  if(ThisTask == 0)
    {
      printf("potential done.\n");
      fflush(stdout);
    }


#else
  /* gravity disabled: all potentials are zero */
  /* NOTE(review): this branch writes P[i].Potential while the rest of the function
     uses P[i].p.Potential -- confirm the member exists when NOGRAVITY is defined */
  for(i = 0; i < NumPart; i++)
    P[i].Potential = 0;
#endif

  CPU_Step[CPU_POTENTIAL] += measure_time();
}
Пример #21
0
Файл: main.c Проект: dlwh/cs252
/* Benchmark driver: creates an OpenCL device, context, command queue and the
 * two kernels "updateFactor" and "updateMarginals" from kernel.cl, then calls
 * measure_time() for problem sizes i = 3 .. 499.
 *
 * Returns EXIT_SUCCESS after the sweep, or EXIT_FAILURE as soon as any
 * OpenCL set-up step fails (an error message is printed to stdout).
 */
int main (int argc, const char * argv[]) {
/*	for(int single=-1; single<3; single++){
		for(int pair=-2; pair<4; pair++){
			measure_loop(4, 4, 100, 300, single, pair);
		}
	}
*/	
/*	for(int k=3; k<500; k++){
		printf("%d\n", k);
		measure_loop(k, k, 10, 1000, 2, 3);
		measure_loop(k, k, 10, 1000, 2, -2);
		measure_loop(k, k, 10, 1000, -1, 1);
	}
*/	
//	measure_loop(630, 630, 10, 300, 2, -2);
	
	int gpu = 1;	/* 1: run on a GPU device, 0: run on a CPU device */
	/*------------------------------------------------------------------------------------------*/
	cl_device_id device_id;
	
	int err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS){
		printf("Error: Failed to create a device group!\n");
		return EXIT_FAILURE;
	}
	
	cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context){
		printf("Error: Failed to create a compute context!\n");
		return EXIT_FAILURE;
	}
	
	char* KernelSource=read_kernel("kernel.cl");
	if (!KernelSource){
		/* do not hand a NULL source pointer to clCreateProgramWithSource */
		printf("Error: Failed to read kernel source!\n");
		return EXIT_FAILURE;
	}
	
	
	cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands){
		printf("Error: Failed to create a command commands!\n");
		printf("%i %i\n", CL_INVALID_VALUE, err);
		return EXIT_FAILURE;
	}
	
	
	cl_program program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
	if (!program){
		printf("Error: Failed to create compute program!\n");
		return EXIT_FAILURE;
	}
	
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS){
		size_t len = 0;
		char *build_log;
		
		printf("Error: Failed to build program executable!\n");
		/* Query the log size first: a fixed-size buffer makes the call fail for
		   long logs and would leave the buffer uninitialised when printed. */
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
		build_log = len > 0 ? (char *) malloc(len + 1) : NULL;
		if (build_log != NULL &&
		    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, build_log, NULL) == CL_SUCCESS){
			build_log[len] = '\0';
			printf("%s\n", build_log);
		}
		free(build_log);
		return EXIT_FAILURE;
	}
	
	cl_kernel kernelInf = clCreateKernel(program, "updateFactor", &err);
	if (!kernelInf || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel!\n");
		return EXIT_FAILURE;
	}
	cl_kernel kernelMarg = clCreateKernel(program, "updateMarginals", &err);
	if (!kernelMarg || err != CL_SUCCESS){
		printf("Error: Failed to create compute kernel marg!\n");
		return EXIT_FAILURE;
	}
	/*---------------------------------------------------------------------------------------*/
	
	
	for(int i=3; i<500; i++){
		//int s = (int) 100*pow(10, i/25.);
		//int s=i;
		
		//printf("%d %d\n", i, s);
		//measure_loop(s, s, 10, 100, 0, 0);
		//measure_loop(s, s, 10, 100, 2, -2);
		
		printf("[%d, ", i);
		/* 900.0 forces floating-point division; with integer division the
		   quotient is already truncated and ceil() has no effect */
		measure_time(kernelInf, kernelMarg, commands, context, device_id, i, i, 2+ceil(900.0/(i*i)));
	}
	
	/* release the OpenCL objects in reverse order of creation */
	clReleaseKernel(kernelMarg);
	clReleaseKernel(kernelInf);
	clReleaseProgram(program);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	/* NOTE(review): KernelSource is not freed because the ownership contract of
	   read_kernel() is not visible here -- confirm and free it if heap-allocated */
	return EXIT_SUCCESS;
}
Пример #22
0
/* Driver for the sorting assignment.
 *
 * Reads three integers from stdin (number of items N, maximum value MxInt,
 * number of threads Nthread), generates N random values and sorts copies of
 * them with radix sort (used as the reference), the recursive quicksort and
 * routines 1-4, timing each run with measure_time() and comparing every
 * result against the reference with check_results().
 *
 * Returns 0 on success and 1 if the input cannot be parsed or memory cannot
 * be allocated; out-of-range parameters trip the assertions below.
 */
int main (int argc, char **argv) {
  int N,MxInt,i,Nthread;
  int *data, *check_data, *unsorted_data;

  printf("\n COMP4300 Ass2 Sorting Program\n");

  printf(" Input Total Number of Data Items\n");
  printf(" Maximum Integer Value\n");
  printf(" and Number of Threads to Create\n");
  /* without this check a short or malformed input leaves N, MxInt or
     Nthread uninitialised */
  if (scanf("%d%d%d",&N,&MxInt,&Nthread) != 3) {
    fprintf(stderr, " Error: expected three integers on input\n");
    return 1;
  }
  assert(MxInt<MAX_INTS);
  assert(N>0 && N<MAX_INTS);
  assert(Nthread > 0 && Nthread < MAX_THRDS);
  printf("\n-------------------------------------\n");
  printf(" Total Number of Data Items %-12d\n",N);
  printf(" Maximum Integer Value      %-12d\n",MxInt);
  printf(" Number of Threads to Use   %-12d\n",Nthread);
  printf("-------------------------------------\n\n");

  /* Allocate data array and generate random numbers */

  unsorted_data = (int *) malloc (N * sizeof(int));
  data         = (int*) malloc( N*sizeof(int) );
  check_data   = (int*) malloc( N*sizeof(int) );
  if (unsorted_data == NULL || data == NULL || check_data == NULL) {
    fprintf(stderr, " Error: could not allocate memory for %d items\n", N);
    free(unsorted_data);   /* free(NULL) is a no-op */
    free(data);
    free(check_data);
    return 1;
  }
  init_data (unsorted_data, N, MxInt);
  prtvec(unsorted_data,N,"Unsorted Data");

  /* Take copy and sort using radix sort, then use to verify */
  for (i = 0; i < N; i++) check_data[i] = unsorted_data[i];

  /* RADIX SORT */
  measure_time(START_TIME, NULL);
  radixsort(check_data, N, MxInt);
  measure_time(STOP_TIME, "RadixSort");
  
  /* every routine below starts from a fresh copy of the unsorted input */
  for (i = 0; i < N; i++) data[i] = unsorted_data[i];

  // RECURSIVE SORT
  measure_time(START_TIME, NULL);
  recur_qsort(data, 0, N-1);
  measure_time(STOP_TIME, "Recursive QuickSort");

  check_results(check_data, data, N);
  for (i = 0; i < N; i++) data[i] = unsorted_data[i];

  // ROUTINE 1 - ITERATIVE SORT
  measure_time(START_TIME, NULL);
  qsort_1(data, N);
  measure_time(STOP_TIME, "Routine 1");
  
  check_results(check_data, data, N);
  for (i = 0; i < N; i++) data[i] = unsorted_data[i];

  // ROUTINE 2 - RECURSIVE PTHREAD QUICKSORT
  measure_time(START_TIME, NULL);
  int numBusyThreads = 1;
  struct recur_pthread_qsort_args args = {
    data, 0, N-1, &numBusyThreads, Nthread};
  qsort_2 (&args);
  measure_time(STOP_TIME, "Routine 2");
    
  check_results(check_data, data, N);
  for (i = 0; i < N; i++) data[i] = unsorted_data[i];

  // ROUTINE 3 - ITERATIVE BUSY WAITING PTHREAD QUICKSORT
  measure_time (START_TIME, NULL);
  qsort_3 (data, N, Nthread);
  measure_time (STOP_TIME, "Routine 3");

  check_results(check_data, data, N);
  for (i = 0; i < N; i++) data[i] = unsorted_data[i];

  // ROUTINE 4 - ITERATIVE CV PTHREAD QUICKSORT
  measure_time (START_TIME, NULL);
  qsort_4 (data, N, Nthread);
  measure_time (STOP_TIME, "Routine 4");

  // PRINT "SORTED" DATA
  prtvec(data,N,"Sorted Data");

  /* Sequential check that the results are correct */
  check_results(check_data, data, N);
  printf("Execution completed successfully\n");

  free(check_data);
  free(data);
  free(unsorted_data);
  return 0;
}
Пример #23
0
/* Determines, for a selected subset of the active gas particles, a search
 * radius SphP[].HotHsml for which the neighbour count SphP[].HotNgbNum
 * returned by cs_hotngbs_evaluate() lies within
 * All.DesNumNgb +/- All.MaxNumHotNgbDeviation, and stores the resulting
 * averages in SphP[].da.DensityAvg and SphP[].ea.EntropyAvg.
 *
 * Selected are active particles of Type 0 with EnergySN > 0, a physical
 * density above All.PhysDensThresh * All.DensFrac_Phase and a temperature
 * below All.Tcrit_Phase. While the routine runs they are marked with
 * Type == 10; a particle that is finished is flagged by making its TimeBin
 * negative. Both markers are undone before the function returns.
 *
 * The radius is found iteratively: Left[] and Right[] bracket HotHsml from
 * below and above. The run is terminated via endrun() if more than
 * MAXITER_HOT iterations are needed.
 */
void cs_find_hot_neighbours(void)
{
  MyFloat *Left, *Right;
  int nimport;
  int i, j, n, ndone_flag, dummy;
  int ndone, ntot, npleft;
  int iter = 0;
  int ngrp, sendTask, recvTask;
  int place, nexport;
  double dmax1, dmax2;
  double xhyd, yhel, ne, mu, energy, temp;
  double a3inv;


  /* factor applied to SphP[].d.Density below to obtain the physical density */
  if(All.ComovingIntegrationOn)
    a3inv = 1 / (All.Time * All.Time * All.Time);
  else
    a3inv = 1;

  /* allocate buffers to arrange communication */

  Left = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));
  Right = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));

  /* Ngblist is not accessed in this function; presumably it is used inside
     cs_hotngbs_evaluate() -- TODO confirm */
  Ngblist = (int *) mymalloc(NumPart * sizeof(int));

  /* number of export entries that fit into All.BufferSize * 1024 * 1024 bytes */
  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
					     sizeof(struct hotngbs_in) + sizeof(struct hotngbs_out) +
					     sizemax(sizeof(struct hotngbs_in), sizeof(struct hotngbs_out))));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));


  CPU_Step[CPU_MISC] += measure_time();




  /* selection pass over the active particles */
  for(n = FirstActiveParticle; n >= 0; n = NextActiveParticle[n])
    {
      if(P[n].Type == 0)
	{
	  /* select reservoir and cold phase particles */
	  if(P[n].EnergySN > 0 && SphP[n].d.Density * a3inv > All.PhysDensThresh * All.DensFrac_Phase)
	    {
	      /* NOTE(review): P[n].Zm[6] / P[n].Mass is used as the hydrogen mass
	         fraction; xhyd == 0 would divide by zero below -- confirm it cannot occur */
	      xhyd = P[n].Zm[6] / P[n].Mass;
	      yhel = (1 - xhyd) / (4. * xhyd);

	      ne = SphP[n].Ne;
	      mu = (1 + 4 * yhel) / (1 + yhel + ne);
	      energy = SphP[n].Entropy * P[n].Mass / GAMMA_MINUS1 * pow(SphP[n].d.Density * a3inv, GAMMA_MINUS1);	/* Total Energys */
	      temp = GAMMA_MINUS1 / BOLTZMANN * energy / P[n].Mass * PROTONMASS * mu;
	      temp *= All.UnitEnergy_in_cgs / All.UnitMass_in_g;	/* Temperature in Kelvin */

	      if(temp < All.Tcrit_Phase)
		{
		  /* 0 means "no bound found yet" on that side */
		  Left[n] = Right[n] = 0;

		  if(!(SphP[n].HotHsml > 0.))
		    SphP[n].HotHsml = All.InitialHotHsmlFactor * PPP[n].Hsml;	/* Estimation of HotHsml : ONLY first step */

		  P[n].Type = 10;	/* temporarily mark particles of interest with this number */
		}
	    }
	}
    }



  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
  do
    {
      i = FirstActiveParticle;	/* begin with this index */

      /* inner loop: one pass per filling of the export buffer, repeated until
         all NTask tasks have gone through their active list */
      do
	{
	  for(j = 0; j < NTask; j++)
	    {
	      Send_count[j] = 0;
	      Exportflag[j] = -1;
	    }

	  /* do local particles and prepare export list */

	  /* only marked particles that are not yet flagged as finished are evaluated;
	     a negative return value ends the pass early, and i keeps its position */
	  for(nexport = 0; i >= 0; i = NextActiveParticle[i])
	    if(P[i].Type == 10 && P[i].TimeBin >= 0)
	      {
		if(cs_hotngbs_evaluate(i, 0, &nexport, Send_count) < 0)
		  break;
	      }

#ifdef MYSORT
	  mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
	  qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif
	  /* every task obtains the send counts of all tasks */
	  MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

	  /* what task j sends to us is what we receive from j; build prefix-sum offsets */
	  for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
	    {
	      Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
	      nimport += Recv_count[j];

	      if(j > 0)
		{
		  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
		  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
		}
	    }

	  HotNgbsGet = (struct hotngbs_in *) mymalloc(nimport * sizeof(struct hotngbs_in));
	  HotNgbsIn = (struct hotngbs_in *) mymalloc(nexport * sizeof(struct hotngbs_in));

	  /* prepare particle data for export */
	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      HotNgbsIn[j].Pos[0] = P[place].Pos[0];
	      HotNgbsIn[j].Pos[1] = P[place].Pos[1];
	      HotNgbsIn[j].Pos[2] = P[place].Pos[2];
	      HotNgbsIn[j].HotHsml = SphP[place].HotHsml;
	      HotNgbsIn[j].Entropy = SphP[place].Entropy;
	      memcpy(HotNgbsIn[j].NodeList,
		     DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
	    }


	  /* pairwise exchange: the partner in round ngrp is ThisTask XOR ngrp;
	     partners with rank >= NTask do not exist and are skipped */
	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      sendTask = ThisTask;	/* NOTE(review): assigned but never read in this function */
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&HotNgbsIn[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
				   recvTask, TAG_DENS_A,
				   &HotNgbsGet[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
				   recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}
	    }

	  /* the export copy is no longer needed once it has been sent */
	  myfree(HotNgbsIn);
	  HotNgbsResult = (struct hotngbs_out *) mymalloc(nimport * sizeof(struct hotngbs_out));
	  HotNgbsOut = (struct hotngbs_out *) mymalloc(nexport * sizeof(struct hotngbs_out));

	  /* now do the particles that were imported from other tasks */
	  for(j = 0; j < nimport; j++)
	    cs_hotngbs_evaluate(j, 1, &dummy, &dummy);


	  /* i < 0 means the local active list has been traversed completely */
	  if(i < 0)
	    ndone_flag = 1;
	  else
	    ndone_flag = 0;

	  /* ndone = number of tasks that are finished */
	  MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);


	  /* get the result */
	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;
	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&HotNgbsResult[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct hotngbs_out),
				   MPI_BYTE, recvTask, TAG_DENS_B,
				   &HotNgbsOut[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct hotngbs_out),
				   MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}

	    }


	  /* add the result to the local particles */

	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      SphP[place].da.dDensityAvg += HotNgbsOut[j].DensitySum;
	      SphP[place].ea.dEntropyAvg += HotNgbsOut[j].EntropySum;
	      SphP[place].HotNgbNum += HotNgbsOut[j].HotNgbNum;
	    }

	  /* buffers are released in reverse order of their allocation */
	  myfree(HotNgbsOut);
	  myfree(HotNgbsResult);
	  myfree(HotNgbsGet);
	}
      while(ndone < NTask);

      /* do final operations on results */
      /* npleft counts the local particles that need another iteration */
      for(i = FirstActiveParticle, npleft = 0; i >= 0; i = NextActiveParticle[i])
	{
	  if(P[i].Type == 10 && P[i].TimeBin >= 0)
	    {
#ifdef FLTROUNDOFFREDUCTION
	      SphP[i].da.DensityAvg = FLT(SphP[i].da.dDensityAvg);
	      SphP[i].ea.EntropyAvg = FLT(SphP[i].ea.dEntropyAvg);
#endif
	      /* turn the accumulated sums into averages; zero if nothing was found */
	      if(SphP[i].HotNgbNum > 0)
		{
		  SphP[i].da.DensityAvg /= SphP[i].HotNgbNum;
		  SphP[i].ea.EntropyAvg /= SphP[i].HotNgbNum;
		}
	      else
		{
		  SphP[i].da.DensityAvg = 0;
		  SphP[i].ea.EntropyAvg = 0;
		}

	      /* now check whether we had enough neighbours */

	      if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation) ||
		 (SphP[i].HotNgbNum > (All.DesNumNgb + All.MaxNumHotNgbDeviation)))
		{
		  /* need to redo this particle */
		  npleft++;

		  /* both bounds known and closer than 0.1% of Left: accept the current result */
		  if(Left[i] > 0 && Right[i] > 0)
		    if((Right[i] - Left[i]) < 1.0e-3 * Left[i])
		      {
			/* this one should be ok */
			npleft--;
			P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */
			continue;
		      }

		  /* too few neighbours: raise the lower bound; too many: lower the upper bound */
		  if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation))
		    Left[i] = DMAX(SphP[i].HotHsml, Left[i]);
		  else
		    {
		      if(Right[i] != 0)
			{
			  if(SphP[i].HotHsml < Right[i])
			    Right[i] = SphP[i].HotHsml;
			}
		      else
			Right[i] = SphP[i].HotHsml;
		    }

		  if(Left[i] > All.MaxHotHsmlParam * PPP[i].Hsml)	/* prevent us from searching too far */
		    {
		      npleft--;
		      P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */


		      /* Ad-hoc definition of SAvg and RhoAvg when there are no hot neighbours  */
		      /* Note that a minimum number of hot neighbours is required for promotion, see c_enrichment.c  */
		      if(SphP[i].HotNgbNum == 0)
			{
			  SphP[i].da.DensityAvg = SphP[i].d.Density / 100;
			  SphP[i].ea.EntropyAvg = SphP[i].Entropy * 1000;

			  printf("WARNING: Used ad-hoc values for SAvg and RhoAvg, No hot neighbours\n");
			}

		      continue;
		    }

		  /* diagnostics for particles that are still unresolved shortly before the iteration limit */
		  if(iter >= MAXITER_HOT - 10)
		    {
		      printf
			("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n",
			 i, ThisTask, P[i].ID, SphP[i].HotHsml, Left[i], Right[i],
			 (float) SphP[i].HotNgbNum, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1],
			 P[i].Pos[2]);
		      fflush(stdout);
		    }

		  /* next guess: with both bounds known, take the radius whose cube is the
		     mean of the cubes of the bounds; otherwise scale by 1.26 (1.26^3 is
		     approximately 2) away from the single known bound */
		  if(Right[i] > 0 && Left[i] > 0)
		    SphP[i].HotHsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
		  else
		    {
		      if(Right[i] == 0 && Left[i] == 0)
			endrun(8188);	/* can't occur */

		      if(Right[i] == 0 && Left[i] > 0)
			SphP[i].HotHsml *= 1.26;

		      if(Right[i] > 0 && Left[i] == 0)
			SphP[i].HotHsml /= 1.26;
		    }
		}
	      else
		P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */
	    }
	}


      /* ntot = number of particles on all tasks that still need another iteration */
      MPI_Allreduce(&npleft, &ntot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      if(ntot > 0)
	{
	  iter++;

	  /* note: iter starts at 0 and was just incremented, so iter > 0 always holds here */
	  if(iter > 0 && ThisTask == 0)
	    {
	      printf("hotngb iteration %d: need to repeat for %d particles.\n", iter, ntot);
	      fflush(stdout);
	    }

	  if(iter > MAXITER_HOT)
	    {
	      printf("failed to converge in hot-neighbour iteration\n");
	      fflush(stdout);
	      endrun(1155);
	    }
	}
    }
  while(ntot > 0);


  /* buffers are released in reverse order of their allocation */
  myfree(DataNodeList);
  myfree(DataIndexTable);
  myfree(Ngblist);
  myfree(Right);
  myfree(Left);


  /* undo the temporary markers: Type 10 -> 0, and restore the original TimeBin
     (applying -x - 1 twice yields x again) */
  for(i = FirstActiveParticle; i >= 0; i = NextActiveParticle[i])
    if(P[i].Type == 10)
      {
	P[i].Type = 0;
	/* mark as active again */
	if(P[i].TimeBin < 0)
	  P[i].TimeBin = -P[i].TimeBin - 1;
      }


  CPU_Step[CPU_HOTNGBS] += measure_time();

}