/*! \brief Estimate the communication volume of the PME FFT
 *
 * Only communication is counted, not imbalance in the calculation.
 * Since imbalance in communication and in calculation are similar,
 * this estimate also favours load balance in the FFT and pme_solve
 * calculation.
 *
 * \param npme  value the volume is computed for; also the divisor
 *              used to split \p a and \p b
 * \param a     first extent, divided (rounding up) by \p npme
 * \param b     second extent, divided (rounding up) by \p npme
 * \param c     third extent, used undivided
 * \return (npme - 1)*npme*div_up(a,npme)*div_up(b,npme)*c as a float
 */
static float comm_pme_cost_vol(int npme, int a, int b, int c)
{
    /* The factors are multiplied one at a time into a float,
     * because the same product in integer arithmetic might overflow.
     */
    const int factor[] = { npme, div_up(a, npme), div_up(b, npme), c };
    const int nfactor  = (int)(sizeof(factor)/sizeof(factor[0]));
    float     volume   = npme - 1;
    int       f;

    for (f = 0; f < nfactor; f++)
    {
        volume *= factor[f];
    }

    return volume;
}
/**
 * queue_init_perqueue - allocates and zeroes a per-queue data region
 * @numa_node: the NUMA node handed to the page allocator
 *
 * The region is sized to hold the bytes between __perqueue_start and
 * __perqueue_end, rounded up to whole PGSIZE_2MB pages.
 *
 * Returns the start of the region, or NULL if the allocation failed.
 */
static void *queue_init_perqueue(unsigned int numa_node)
{
	const size_t region_len = __perqueue_end - __perqueue_start;
	char *region;

	region = mem_alloc_pages_onnode(div_up(region_len, PGSIZE_2MB),
					PGSIZE_2MB, numa_node, MPOL_BIND);

	/* only the bytes that are actually used get cleared */
	if (region)
		memset(region, 0, region_len);

	return region;
}
static void *cpu_init_percpu(unsigned int cpu, unsigned int numa_node) { size_t len = __percpu_end - __percpu_start; char *addr, *addr_percpu; addr = mem_alloc_pages_onnode(div_up(len + PERCPU_DUNE_LEN, PGSIZE_2MB), PGSIZE_2MB, numa_node, MPOL_BIND); if (!addr) return NULL; addr_percpu = addr + PERCPU_DUNE_LEN; memset(addr_percpu, 0, len); *((char **) addr) = addr_percpu; percpu_offsets[cpu] = addr_percpu; return addr; }
void walk_test( int **& M, int n_steps, int n_runs ) { if ( M == 0 ) M = ml_alloc<int > ( n_steps, n_runs ); int n_pool = div_up(n_steps,32); int32 *walk = ml_alloc<int32 > (n_steps); int32 *pool = ml_alloc<int32 > (n_pool); ml_random rng; for (int j=0; j<n_runs; j++) { for (int k=0; k<n_pool; k++) pool[k] = rng.gen_int(); binom_to_random_walk ( pool, n_steps, walk ); for ( int k=0; k<n_steps; k++ ) M[k][j] = walk[k]; } // ml_free (walk); // ml_free (pool); }
static float comm_cost_est(gmx_domdec_t *dd,real limit,real cutoff, matrix box,gmx_ddbox_t *ddbox, int natoms,t_inputrec *ir, float pbcdxr, int npme_tot,ivec nc) { ivec npme= {1,1,1}; int i,j,k,nk,overlap; rvec bt; float comm_vol,comm_vol_xf,comm_pme,cost_pbcdx; /* This is the cost of a pbc_dx call relative to the cost * of communicating the coordinate and force of an atom. * This will be machine dependent. * These factors are for x86 with SMP or Infiniband. */ float pbcdx_rect_fac = 0.1; float pbcdx_tric_fac = 0.2; /* Check the DD algorithm restrictions */ if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) || (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1))) { return -1; } if (inhomogeneous_z(ir) && nc[ZZ] > 1) { return -1; } /* Check if the triclinic requirements are met */ for(i=0; i<DIM; i++) { for(j=i+1; j<ddbox->npbcdim; j++) { if (box[j][i] != 0 || ir->deform[j][i] != 0 || (ir->epc != epcNO && ir->compress[j][i] != 0)) { if (nc[j] > 1 && nc[i] == 1) { return -1; } } } } for(i=0; i<DIM; i++) { bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i]; /* Without PBC there are no cell size limits with 2 cells */ if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit) { return -1; } } if (npme_tot > 1) { /* The following choices should match those * in init_domain_decomposition in domdec.c. */ if (nc[XX] == 1 && nc[YY] > 1) { npme[XX] = 1; npme[YY] = npme_tot; } else if (nc[YY] == 1) { npme[XX] = npme_tot; npme[YY] = 1; } else { /* Will we use 1D or 2D PME decomposition? */ npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot; npme[YY] = npme_tot/npme[XX]; } } /* When two dimensions are (nearly) equal, use more cells * for the smallest index, so the decomposition does not * depend sensitively on the rounding of the box elements. */ for(i=0; i<DIM; i++) { for(j=i+1; j<DIM; j++) { /* Check if the box size is nearly identical, * in that case we prefer nx > ny and ny > nz. 
*/ if (fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i]) { /* The XX/YY check is a bit compact. If nc[YY]==npme[YY] * this means the swapped nc has nc[XX]==npme[XX], * and we can also swap X and Y for PME. */ /* Check if dimension i and j are equivalent for PME. * For x/y: if nc[YY]!=npme[YY], we can not swap x/y * For y/z: we can not have PME decomposition in z */ if (npme_tot <= 1 || !((i == XX && j == YY && nc[YY] != npme[YY]) || (i == YY && j == ZZ && npme[YY] > 1))) { return -1; } } } } /* This function determines only half of the communication cost. * All PP, PME and PP-PME communication is symmetric * and the "back"-communication cost is identical to the forward cost. */ comm_vol = comm_box_frac(nc,cutoff,ddbox); comm_pme = 0; for(i=0; i<2; i++) { /* Determine the largest volume for PME x/f redistribution */ if (nc[i] % npme[i] != 0) { if (nc[i] > npme[i]) { comm_vol_xf = (npme[i]==2 ? 1.0/3.0 : 0.5); } else { comm_vol_xf = 1.0 - lcd(nc[i],npme[i])/(double)npme[i]; } comm_pme += 3*natoms*comm_vol_xf; } /* Grid overlap communication */ if (npme[i] > 1) { nk = (i==0 ? ir->nkx : ir->nky); overlap = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order); comm_pme += npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk; } } /* PME FFT communication volume. * This only takes the communication into account and not imbalance * in the calculation. But the imbalance in communication and calculation * are similar and therefore these formulas also prefer load balance * in the FFT and pme_solve calculation. 
*/ comm_pme += (npme[YY] - 1)*npme[YY]*div_up(ir->nky,npme[YY])*div_up(ir->nkz,npme[YY])*ir->nkx; comm_pme += (npme[XX] - 1)*npme[XX]*div_up(ir->nkx,npme[XX])*div_up(ir->nky,npme[XX])*ir->nkz; /* Add cost of pbc_dx for bondeds */ cost_pbcdx = 0; if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY)) { if ((ddbox->tric_dir[XX] && nc[XX] == 1) || (ddbox->tric_dir[YY] && nc[YY] == 1)) { cost_pbcdx = pbcdxr*pbcdx_tric_fac; } else { cost_pbcdx = pbcdxr*pbcdx_rect_fac; } } if (debug) { fprintf(debug, "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n", nc[XX],nc[YY],nc[ZZ],npme[XX],npme[YY], comm_vol,cost_pbcdx,comm_pme, 3*natoms*(comm_vol + cost_pbcdx) + comm_pme); } return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme; }
void apply_hdaf_reg( int m, double sigma, int diff_order, double h, double eps_min, double eps_max, double * in, double * & out, int length, int in_stride, int out_stride) { //if (out == 0) out = ml_alloc<double> (length*out_stride); static vector<int > m_list; static vector<double > sigma_list; static vector<int > diff_order_list; static vector<int > length_list; static vector<double > h_list; static vector<complex<double> * > kernel_fft_list; // not dealing with eps_range for now, fix later !! static int I = 0; bool found = 0; // look for existing parameter combination if (I) if (m_list[I] == m && sigma_list[I] == sigma && diff_order_list[I] == diff_order && length_list[I] == length && h_list[I] == h ) found = true; if (!found) for ( I = 0; I<m_list.size(); I++) if ( m_list[I] == m && sigma_list[I] == sigma && diff_order_list[I] == diff_order && length_list[I] == length && h_list[I] == h ) { found = true; break; } if (!found) { // new parameter combination I = m_list.size(); m_list.push_back( m ); sigma_list.push_back( sigma ); diff_order_list.push_back( diff_order ); length_list.push_back( length ); h_list.push_back( h ); complex<double > *kernel_fft = 0; double * kernel = ml_alloc<double > (length); for (int k=0; k<length; k++) kernel[k] = 0.0; double x_max = hdaf_truncate_point (eps_min, eps_max, m, sigma, diff_order ); int k_max = (int)ceil(x_max/h); if ( k_max >= length ) { std::cout << "error: bad combination of hdaf parameters; truncate point exceeds data length in apply_hdaf_reg:\n"; std::cout << "(eps1,eps2) = (" << eps_min << ", " << eps_max << "),\tm= " << m << ",\tsigma:" << sigma << ",\tdiff_order: " << diff_order << ",\tdata_length: " << length << ",\tx_max: " << x_max << ",\tk_max: " << k_max << std::endl; std_exit(); } mp::mp_init(30); ml_poly<mp_real > P; make_hdaf_ml_poly( P, m ); differentiate_hdaf_poly( P, diff_order ); static mp_real sqrt2 = pow((mp_real)2.0,0.5); mp_real p = (pow(sqrt2*sigma,-diff_order)/(sqrt2*sigma))*h; for (int k=0; 
k<=k_max; k++) { mp_real r = (((mp_real)k)*h)/(sqrt2*sigma); kernel[k] = dble(exp(-r*r)*P(r)*p); } for (int k=1; k<=k_max; k++) { mp_real r = -(((mp_real)k)*h)/(sqrt2*sigma); kernel[length-k] = dble(exp(-r*r)*P(r)*p); } FFT(kernel,kernel_fft, length, 1,1 ); kernel_fft_list.push_back(kernel_fft); ml_free( kernel ); } // run complex<double> * q=0; complex<double> * ker_fft = kernel_fft_list[I]; FFT(in, q, length, in_stride,1); for (int k=0; k<div_up(length,2); k++) q[k] *= ker_fft[k]/((double)length); IFFT(q,out,length,1,out_stride); }
/** * eth_process_recv - processes pending received packets * * Returns true if there are no remaining packets. */ int eth_process_recv(void) { int i, count = 0; bool empty; unsigned long min_timestamp = -1; unsigned long timestamp; int value; double val; struct metrics_accumulator *this_metrics_acc = &percpu_get(metrics_acc); /* * We round robin through each queue one packet at * a time for fairness, and stop when all queues are * empty or the batch limit is hit. We're okay with * going a little over the batch limit if it means * we're not favoring one queue over another. */ do { empty = true; for (i = 0; i < percpu_get(eth_num_queues); i++) { struct eth_rx_queue *rxq = percpu_get(eth_rxqs[i]); struct mbuf *pos = rxq->head; if (pos) min_timestamp = min(min_timestamp, pos->timestamp); if (!eth_process_recv_queue(rxq)) { count++; empty = false; } } } while (!empty && count < eth_rx_max_batch); timestamp = rdtsc(); this_metrics_acc->count++; value = count ? (timestamp - min_timestamp) / cycles_per_us : 0; this_metrics_acc->queuing_delay += value; this_metrics_acc->batch_size += count; if (timestamp - this_metrics_acc->timestamp > (long) cycles_per_us * METRICS_PERIOD_US) { if (this_metrics_acc->batch_size) val = (double) this_metrics_acc->queuing_delay / this_metrics_acc->batch_size; else val = 0; EMA_UPDATE(cp_shmem->cpu_metrics[percpu_get(cpu_nr)].queuing_delay, val, EMA_SMOOTH_FACTOR); if (this_metrics_acc->count) val = (double) this_metrics_acc->batch_size / this_metrics_acc->count; else val = 0; EMA_UPDATE(cp_shmem->cpu_metrics[percpu_get(cpu_nr)].batch_size, val, EMA_SMOOTH_FACTOR); this_metrics_acc->timestamp = timestamp; this_metrics_acc->count = 0; this_metrics_acc->queuing_delay = 0; this_metrics_acc->batch_size = 0; } KSTATS_PACKETS_INC(count); KSTATS_BATCH_INC(count); #ifdef ENABLE_KSTATS int backlog = 0; for (i = 0; i < percpu_get(eth_num_queues); i++) { struct eth_rx_queue *rxq = percpu_get(eth_rxqs[i]); backlog += rxq->len; } backlog = div_up(backlog, 
eth_rx_max_batch); KSTATS_BACKLOG_INC(backlog); #endif return empty; }