Exemplo n.º 1
/*! \brief Estimate cost of PME FFT communication
 *
 * This only takes the communication into account and not imbalance
 * in the calculation. But the imbalance in communication and calculation
 * are similar and therefore these formulas also prefer load balance
 * in the FFT and pme_solve calculation.
 *
 * \param npme  Divisor passed to div_up() for \p a and \p b; also enters
 *              the result as the factor (npme - 1)*npme
 * \param a     First extent, divided over \p npme with div_up()
 * \param b     Second extent, divided over \p npme with div_up()
 * \param c     Third extent, used undivided
 * \return (npme - 1)*npme*div_up(a, npme)*div_up(b, npme)*c as a float
 */
static float comm_pme_cost_vol(int npme, int a, int b, int c)
{
    /* We use a float here, since an integer might overflow */
    float comm_vol;

    /* Accumulate factor by factor so every product is done in float */
    comm_vol  = npme - 1;
    comm_vol *= npme;
    comm_vol *= div_up(a, npme);
    comm_vol *= div_up(b, npme);
    comm_vol *= c;

    return comm_vol;
}
Exemplo n.º 2
static void * queue_init_perqueue(unsigned int numa_node)
	size_t len = __perqueue_end - __perqueue_start;
	char *addr;

	addr = mem_alloc_pages_onnode(div_up(len, PGSIZE_2MB),
				      PGSIZE_2MB, numa_node, MPOL_BIND);
	if (!addr)
		return NULL;

	memset(addr, 0, len);

	return addr;
Exemplo n.º 3
static void *cpu_init_percpu(unsigned int cpu, unsigned int numa_node)
	size_t len = __percpu_end - __percpu_start;
	char *addr, *addr_percpu;

	addr = mem_alloc_pages_onnode(div_up(len + PERCPU_DUNE_LEN, PGSIZE_2MB),
				      PGSIZE_2MB, numa_node, MPOL_BIND);
	if (!addr)
		return NULL;

	addr_percpu = addr + PERCPU_DUNE_LEN;

	memset(addr_percpu, 0, len);

	*((char **) addr) = addr_percpu;
	percpu_offsets[cpu] = addr_percpu;

	return addr;
Exemplo n.º 4
void walk_test( int **& M, int n_steps, int n_runs )
    if ( M == 0 ) M = ml_alloc<int > ( n_steps, n_runs );
    int n_pool = div_up(n_steps,32);
    int32 *walk = ml_alloc<int32 > (n_steps);
    int32 *pool = ml_alloc<int32 > (n_pool);
    ml_random rng;
    for (int j=0; j<n_runs; j++)
        for (int k=0; k<n_pool; k++)
            pool[k] = rng.gen_int();
        binom_to_random_walk ( pool, n_steps, walk );
        for ( int k=0; k<n_steps; k++ )
            M[k][j] = walk[k];
//    ml_free (walk);
//    ml_free (pool);
Exemplo n.º 5
/*! \brief Estimate the relative communication cost of the DD grid \p nc
 *
 * Returns -1 when the grid fails one of the checks below: the PBC/wall
 * restrictions, the triclinic requirements, the minimum cell size
 * \p limit, or the ordering rule for nearly equal box dimensions.
 * Otherwise returns 3*natoms*(comm_vol + cost_pbcdx) + comm_pme, where
 * comm_vol comes from comm_box_frac(), cost_pbcdx is \p pbcdxr scaled by
 * a machine dependent factor, and comm_pme is the PME communication
 * estimate for the PME grid derived from \p npme_tot.
 *
 * \p dd is not used in this function.
 */
static float comm_cost_est(gmx_domdec_t *dd,real limit,real cutoff,
                           matrix box,gmx_ddbox_t *ddbox,
                           int natoms,t_inputrec *ir,
                           float pbcdxr,
                           int npme_tot,ivec nc)
{
    ivec npme= {1,1,1};
    int  i,j,nk,overlap;
    rvec bt;
    float comm_vol,comm_vol_xf,comm_pme,cost_pbcdx;
    /* This is the cost of a pbc_dx call relative to the cost
     * of communicating the coordinate and force of an atom.
     * This will be machine dependent.
     * These factors are for x86 with SMP or Infiniband.
     */
    float pbcdx_rect_fac = 0.1;
    float pbcdx_tric_fac = 0.2;

    /* Check the DD algorithm restrictions */
    if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
            (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))
    {
        return -1;
    }

    if (inhomogeneous_z(ir) && nc[ZZ] > 1)
    {
        return -1;
    }

    /* Check if the triclinic requirements are met */
    for(i=0; i<DIM; i++)
    {
        for(j=i+1; j<ddbox->npbcdim; j++)
        {
            if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
                    (ir->epc != epcNO && ir->compress[j][i] != 0))
            {
                if (nc[j] > 1 && nc[i] == 1)
                {
                    return -1;
                }
            }
        }
    }

    for(i=0; i<DIM; i++)
    {
        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];

        /* Without PBC there are no cell size limits with 2 cells */
        if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
        {
            return -1;
        }
    }

    if (npme_tot > 1)
    {
        /* The following choices should match those
         * in init_domain_decomposition in domdec.c.
         */
        if (nc[XX] == 1 && nc[YY] > 1)
        {
            npme[XX] = 1;
            npme[YY] = npme_tot;
        }
        else if (nc[YY] == 1)
        {
            npme[XX] = npme_tot;
            npme[YY] = 1;
        }
        else
        {
            /* Will we use 1D or 2D PME decomposition? */
            npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot;
            npme[YY] = npme_tot/npme[XX];
        }
    }

    /* When two dimensions are (nearly) equal, use more cells
     * for the smallest index, so the decomposition does not
     * depend sensitively on the rounding of the box elements.
     */
    for(i=0; i<DIM; i++)
    {
        for(j=i+1; j<DIM; j++)
        {
            /* Check if the box size is nearly identical,
             * in that case we prefer nx > ny  and ny > nz.
             */
            if (fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
            {
                /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
                 * this means the swapped nc has nc[XX]==npme[XX],
                 * and we can also swap X and Y for PME.
                 */
                /* Check if dimension i and j are equivalent for PME.
                 * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
                 * For y/z: we can not have PME decomposition in z
                 */
                if (npme_tot <= 1 ||
                        !((i == XX && j == YY && nc[YY] != npme[YY]) ||
                          (i == YY && j == ZZ && npme[YY] > 1)))
                {
                    return -1;
                }
            }
        }
    }

    /* This function determines only half of the communication cost.
     * All PP, PME and PP-PME communication is symmetric
     * and the "back"-communication cost is identical to the forward cost.
     */

    comm_vol = comm_box_frac(nc,cutoff,ddbox);

    comm_pme = 0;
    for(i=0; i<2; i++)
    {
        /* Determine the largest volume for PME x/f redistribution */
        if (nc[i] % npme[i] != 0)
        {
            if (nc[i] > npme[i])
            {
                comm_vol_xf = (npme[i]==2 ? 1.0/3.0 : 0.5);
            }
            else
            {
                comm_vol_xf = 1.0 - lcd(nc[i],npme[i])/(double)npme[i];
            }
            comm_pme += 3*natoms*comm_vol_xf;
        }

        /* Grid overlap communication */
        if (npme[i] > 1)
        {
            nk = (i==0 ? ir->nkx : ir->nky);
            overlap = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order);
            /* The product is formed in double: as an int it can overflow
             * for large grids. nk is one of the factors, so the division
             * stays exact.
             */
            comm_pme += (float)((double)npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk);
        }
    }

    /* PME FFT communication volume.
     * This only takes the communication into account and not imbalance
     * in the calculation. But the imbalance in communication and calculation
     * are similar and therefore these formulas also prefer load balance
     * in the FFT and pme_solve calculation.
     * As above, the products are formed in double to avoid int overflow.
     */
    comm_pme += (float)((double)(npme[YY] - 1)*npme[YY]*div_up(ir->nky,npme[YY])*div_up(ir->nkz,npme[YY])*ir->nkx);
    comm_pme += (float)((double)(npme[XX] - 1)*npme[XX]*div_up(ir->nkx,npme[XX])*div_up(ir->nky,npme[XX])*ir->nkz);

    /* Add cost of pbc_dx for bondeds */
    cost_pbcdx = 0;
    if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
    {
        if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
                (ddbox->tric_dir[YY] && nc[YY] == 1))
        {
            cost_pbcdx = pbcdxr*pbcdx_tric_fac;
        }
        else
        {
            cost_pbcdx = pbcdxr*pbcdx_rect_fac;
        }
    }

    if (debug)
    {
        /* NOTE(review): the call head and the first eight arguments were
         * rebuilt to match the five %d and four float conversions of the
         * format string; confirm against the reference implementation.
         */
        fprintf(debug,
                "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
                nc[XX],nc[YY],nc[ZZ],npme[XX],npme[YY],
                comm_vol,cost_pbcdx,comm_pme,
                3*natoms*(comm_vol + cost_pbcdx) + comm_pme);
    }

    return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme;
}
Exemplo n.º 6
// Apply an HDAF kernel to `in` in Fourier space, caching one kernel FFT per
// distinct (m, sigma, diff_order, length, h) combination in function-local
// static vectors.
//
//   m, sigma, diff_order  kernel parameters; forwarded to hdaf_truncate_point,
//                         make_hdaf_ml_poly and differentiate_hdaf_poly
//   h                     step used to scale the kernel sample positions (k*h)
//   eps_min, eps_max      forwarded to hdaf_truncate_point; not part of the cache key
//   in, in_stride         input data and the stride passed to FFT
//   out, out_stride       not referenced by any statement visible in this block
//   length                number of samples; also the kernel length
//
// NOTE(review): this view of the function has no block delimiters, never
// stores kernel_fft into kernel_fft_list, has no exit from the lookup loop,
// and never writes `out`. Restore those from the reference implementation
// before relying on this code.
void apply_hdaf_reg(
	int m,
	double sigma,
	int diff_order,
	double h,
	double eps_min,
	double eps_max,
    double * in,
	double * & out,
    int length,
    int in_stride,
    int out_stride)
	//if (out == 0) out = ml_alloc<double> (length*out_stride);
	// Parallel lists: entry I of each vector describes one cached kernel.
	static vector<int > m_list;
	static vector<double > sigma_list;
	static vector<int > diff_order_list;
	static vector<int > length_list;
	static vector<double > h_list;
	static vector<complex<double> * > kernel_fft_list;
	// not dealing with eps_range for now, fix later !!
	// I persists across calls and is first tested against the current parameters.
	static int I = 0;
	bool found = 0;
	// look for existing parameter combination
	// Fast path: re-check the entry at the remembered index (skipped when I == 0).
	if (I)
	if (m_list[I] == m && sigma_list[I] == sigma && diff_order_list[I] == diff_order && length_list[I] == length && h_list[I] == h )
	    found = true;
	// Slow path: linear scan over all cached combinations.
	if (!found)
	for ( I = 0; I<m_list.size(); I++)
		if ( m_list[I] == m && sigma_list[I] == sigma && diff_order_list[I] == diff_order && length_list[I] == length && h_list[I] == h  )
			found = true;
	if (!found)
		// new parameter combination
		I = m_list.size();
		m_list.push_back( m );
		sigma_list.push_back( sigma );
		diff_order_list.push_back( diff_order );
		length_list.push_back( length );
		h_list.push_back( h );
		complex<double > *kernel_fft = 0;		
		// Real-space kernel, zero everywhere except the samples filled below.
		double * kernel = ml_alloc<double > (length);
		for (int k=0; k<length; k++)
			kernel[k] = 0.0;
		// k_max is the truncation point expressed in grid steps of size h.
		double x_max = hdaf_truncate_point (eps_min, eps_max, m, sigma, diff_order );
		int k_max = (int)ceil(x_max/h);
		if ( k_max >= length )
			std::cout << "error: bad combination of hdaf parameters; truncate point exceeds data length in apply_hdaf_reg:\n";
            std::cout << "(eps1,eps2) = (" << eps_min << ", " << eps_max << "),\tm= " << m << ",\tsigma:" << sigma << ",\tdiff_order: " << diff_order << ",\tdata_length: " << length << ",\tx_max: " << x_max << ",\tk_max: " << k_max << std::endl;
		ml_poly<mp_real > P;
		make_hdaf_ml_poly( P, m );
		differentiate_hdaf_poly( P, diff_order );
        static mp_real sqrt2 = pow((mp_real)2.0,0.5);
        // Common prefactor: (sqrt2*sigma)^(-diff_order) / (sqrt2*sigma) * h.
        mp_real p = (pow(sqrt2*sigma,-diff_order)/(sqrt2*sigma))*h;
		// Samples at non-negative offsets go to kernel[0..k_max].
		for (int k=0; k<=k_max; k++)
            mp_real r = (((mp_real)k)*h)/(sqrt2*sigma);
            kernel[k] = dble(exp(-r*r)*P(r)*p);
		// Samples at negative offsets wrap around to the end of the array.
		for (int k=1; k<=k_max; k++)
            mp_real r = -(((mp_real)k)*h)/(sqrt2*sigma);        
            kernel[length-k] = dble(exp(-r*r)*P(r)*p);
		FFT(kernel,kernel_fft, length, 1,1 );	
		ml_free( kernel );
	// run
	// Transform the input, then scale the first div_up(length,2) coefficients
	// by the cached kernel spectrum divided by length.
	complex<double> * q=0;
	complex<double> * ker_fft = kernel_fft_list[I];
	FFT(in, q, length, in_stride,1);
	for (int k=0; k<div_up(length,2); k++)
		q[k] *= ker_fft[k]/((double)length);
Exemplo n.º 7
/**
 * eth_process_recv - processes pending received packets
 *
 * Returns true if there are no remaining packets.
 */
int eth_process_recv(void)
	int i, count = 0;
	bool empty;
	unsigned long min_timestamp = -1;
	unsigned long timestamp;
	int value;
	double val;
	struct metrics_accumulator *this_metrics_acc = &percpu_get(metrics_acc);

	 * We round robin through each queue one packet at
	 * a time for fairness, and stop when all queues are
	 * empty or the batch limit is hit. We're okay with
	 * going a little over the batch limit if it means
	 * we're not favoring one queue over another.
	do {
		empty = true;
		for (i = 0; i < percpu_get(eth_num_queues); i++) {
			struct eth_rx_queue *rxq = percpu_get(eth_rxqs[i]);
			struct mbuf *pos = rxq->head;
			if (pos)
				min_timestamp = min(min_timestamp, pos->timestamp);
			if (!eth_process_recv_queue(rxq)) {
				empty = false;
	} while (!empty && count < eth_rx_max_batch);

	timestamp = rdtsc();
	value = count ? (timestamp - min_timestamp) / cycles_per_us : 0;
	this_metrics_acc->queuing_delay += value;
	this_metrics_acc->batch_size += count;
	if (timestamp - this_metrics_acc->timestamp > (long) cycles_per_us * METRICS_PERIOD_US) {
		if (this_metrics_acc->batch_size)
			val = (double) this_metrics_acc->queuing_delay / this_metrics_acc->batch_size;
			val = 0;
		EMA_UPDATE(cp_shmem->cpu_metrics[percpu_get(cpu_nr)].queuing_delay, val, EMA_SMOOTH_FACTOR);
		if (this_metrics_acc->count)
			val = (double) this_metrics_acc->batch_size / this_metrics_acc->count;
			val = 0;
		EMA_UPDATE(cp_shmem->cpu_metrics[percpu_get(cpu_nr)].batch_size, val, EMA_SMOOTH_FACTOR);
		this_metrics_acc->timestamp = timestamp;
		this_metrics_acc->count = 0;
		this_metrics_acc->queuing_delay = 0;
		this_metrics_acc->batch_size = 0;

	int backlog = 0;
	for (i = 0; i < percpu_get(eth_num_queues); i++) {
		struct eth_rx_queue *rxq = percpu_get(eth_rxqs[i]);
		backlog += rxq->len;
	backlog = div_up(backlog, eth_rx_max_batch);

	return empty;