/*
 * iso_3dfd2 - time loop of the stencil computation, two steps per pass.
 *
 * The velocity array is transferred once up front.  Every pass then
 * transfers ptr_prev, runs the stencil into ptr_next, transfers ptr_next
 * and runs the stencil back into ptr_prev (i.e. the two buffers swap roles
 * for the second step).  Rank 0 prints the elapsed time after the first
 * transfer and again after the first stencil call of each pass, both
 * measured from the start of the pass.
 *
 *   ptr_next, ptr_prev  wavefield buffers, alternately written and read
 *   ptr_vel, coeff      forwarded unchanged to iso_3dfd_stencil2
 *   n1, n2, n3          grid extents, forwarded to transfer / the stencil
 *   nreps               number of time steps; the loop advances by 2
 */
void iso_3dfd2(float *ptr_next, float *ptr_prev, float *ptr_vel, float *coeff,
               const int n1, const int n2, const int n3, int nreps) {
  int step;
  int my_rank;

  transfer(ptr_vel, n1, n2, n3);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  for (step = 0; step < nreps; step += 2) {
    double pass_begin = walltime();

    transfer(ptr_prev, n1, n2, n3);
    double checkpoint = walltime();
    float elapsed = checkpoint - pass_begin;
    if (my_rank == 0)
      printf("%8.2f\n", elapsed);

    iso_3dfd_stencil2(ptr_next, ptr_prev, ptr_vel, coeff, n1, n2, n3);
    checkpoint = walltime();
    elapsed = checkpoint - pass_begin;
    if (my_rank == 0)
      printf("%8.2f\n", elapsed);

    /* here's where boundary conditions + halo exchanges happen */
    MPI_Barrier(MPI_COMM_WORLD);
    transfer(ptr_next, n1, n2, n3);

    /* second step of the pass: previous and next trade places */
    iso_3dfd_stencil2(ptr_prev, ptr_next, ptr_vel, coeff, n1, n2, n3);
    MPI_Barrier(MPI_COMM_WORLD);
  } /* time loop */
}
/*
 * Benchmark body for mzed_add: times C = A + B on two randomized
 * p->m x p->n matrices over the field built from
 * irreducible_polynomials[p->k][1].
 *
 *   _p        pointer to a struct smallops_params (fields k, m, n are read)
 *   data      output; data[0] receives the walltime() result, data[1] the
 *             cpucycles() difference around the mzed_add call
 *   data_len  output; set to 2
 *
 * Always returns 0.
 */
int run_mzed_add(void *_p, unsigned long long *data, int *data_len) {
  struct smallops_params *params = (struct smallops_params *)_p;
  *data_len = 2;

  gf2e *field = gf2e_init(irreducible_polynomials[params->k][1]);

  mzed_t *lhs = mzed_init(field, params->m, params->n);
  mzed_randomize(lhs);
  mzed_t *rhs = mzed_init(field, params->m, params->n);
  mzed_randomize(rhs);
  mzed_t *sum = mzed_init(field, params->m, params->n);

  /* wall clock is the outer measurement, cycle counter the inner one */
  data[0] = walltime(0);
  data[1] = cpucycles();
  mzed_add(sum, lhs, rhs);
  data[1] = cpucycles() - data[1];
  data[0] = walltime(data[0]);

  mzed_free(lhs);
  mzed_free(rhs);
  mzed_free(sum);
  gf2e_free(field);

  return 0;
}
/*
 * Draws `ntrials` samples from the multi-precision discrete Gaussian
 * sampler and returns sqrt(mean of squared samples).
 *
 *   sigma_, c_, tau, alg  forwarded to dgs_disc_gauss_mp_init (sigma_ and
 *                         c_ are first converted to mpfr values)
 *   ntrials               number of samples to draw
 *   t                     output; holds the walltime() result for the
 *                         sampling loop only (setup/teardown excluded)
 */
double run_mp(double sigma_, double c_, int tau, dgs_disc_gauss_alg_t alg, size_t ntrials, unsigned long long *t) {
  mpfr_set_default_prec(80);

  mpfr_t sigma;
  mpfr_init_set_d(sigma, sigma_, MPFR_RNDN);

  gmp_randstate_t state;
  gmp_randinit_default(state);

  mpfr_t c;
  mpfr_init_set_d(c, c_, MPFR_RNDN);

  dgs_disc_gauss_mp_t *gen = dgs_disc_gauss_mp_init(sigma, c, tau, alg);

  double sum_of_squares = 0.0;
  mpz_t sample;
  mpz_init(sample);

  *t = walltime(0);
  for (size_t trial = 0; trial != ntrials; ++trial) {
    gen->call(sample, gen, state);
    const double value = mpz_get_d(sample);
    sum_of_squares += value * value;
  }
  *t = walltime(*t);

  dgs_disc_gauss_mp_clear(gen);
  mpfr_clear(sigma);
  mpz_clear(sample);
  mpfr_clear(c);
  gmp_randclear(state);

  return sqrt(sum_of_squares / ntrials);
}
// shouldstop reports whether the search should stop: never while
// the cost is negative, otherwise once the wtime-weighted elapsed
// wall time reaches the wcost-weighted cost divided by lambda.
bool shouldstop() const {
	if (this->cost < 0)
		return false;

	const double elapsed = walltime() - this->res.wallstart;
	const double weightedTime = wtime * elapsed;
	const double weightedCost = wcost * this->cost;
	return weightedTime >= weightedCost / lambda;
}
// shouldstop reports whether the search should stop: never while
// the cost is negative, otherwise whatever mon.stop decides for
// the current cost and the wall time elapsed since wallstart.
bool shouldstop() const {
	if (this->cost < 0)
		return false;

	const double elapsed = walltime() - this->res.wallstart;
	const double currentCost = this->cost;
	return mon.stop(currentCost, elapsed);
}
/*
 * Draws `ntrials` samples from the double-precision discrete Gaussian
 * sampler and returns sqrt(mean of squared samples).
 *
 *   sigma, c, tau, alg  forwarded to dgs_disc_gauss_dp_init
 *   ntrials             number of samples to draw
 *   t                   output; holds the walltime() result for the
 *                       sampling loop only
 *
 * A GMP random state is created and destroyed here although the sampler
 * call itself is not handed that state.
 */
double run_dp(double sigma, double c, int tau, dgs_disc_gauss_alg_t alg, size_t ntrials, unsigned long long *t) {
  double sum_of_squares = 0.0;

  gmp_randstate_t state;
  gmp_randinit_default(state);

  dgs_disc_gauss_dp_t *gen = dgs_disc_gauss_dp_init(sigma, c, tau, alg);

  *t = walltime(0);
  for (size_t trial = 0; trial != ntrials; ++trial) {
    const double value = (double)gen->call(gen);
    sum_of_squares += value * value;
  }
  *t = walltime(*t);

  dgs_disc_gauss_dp_clear(gen);
  gmp_randclear(state);

  return sqrt(sum_of_squares / ntrials);
}
/*
 * Benchmark body for mzd_trsm_lower_right.
 *
 * Builds a random p->m x p->n matrix B and a random p->n x p->n matrix L,
 * forces L to be unit lower triangular (ones on the diagonal, zeros above
 * it) and times mzd_trsm_lower_right(L, B, 2048).
 *
 *   _p        pointer to a struct trsm_params (fields m, n are read)
 *   data      output; data[0] receives the walltime() result, data[1] the
 *             cpucycles() difference
 *   data_len  output; set to 2
 *
 * Always returns 0.
 */
int run(void *_p, unsigned long long *data, int *data_len) {
  struct trsm_params *params = (struct trsm_params *)_p;
  *data_len = 2;

  mzd_t *B = mzd_init(params->m, params->n);
  mzd_t *L = mzd_init(params->n, params->n);
  mzd_randomize(B);
  mzd_randomize(L);

  for (rci_t row = 0; row < params->n; ++row) {
    /* zero everything right of the diagonal, then pin the diagonal to 1 */
    rci_t col = row + 1;
    while (col < params->n) {
      mzd_write_bit(L, row, col, 0);
      ++col;
    }
    mzd_write_bit(L, row, row, 1);
  }

  data[0] = walltime(0);
  data[1] = cpucycles();
  mzd_trsm_lower_right(L, B, 2048);
  data[0] = walltime(data[0]);
  data[1] = cpucycles() - data[1];

  mzd_free(B);
  mzd_free(L);

  return 0;
}
// updateopen updates the utilities of all nodes on open and // reinitializes the heap every 2^i expansions. void updateopen() { if (this->res.expd < nextresort) return; double nexpd = this->res.expd - lastexpd; lastexpd = this->res.expd; double t = walltime(); timeper = (t - lasttime) / nexpd; lasttime = t; avgdelay = delaysum/nexpd; delaysum = 0; nextresort *= 2; nresort++; reinitheap(); }
// search runs the best-first loop from s0: nodes are popped from
// open and expanded until open is empty, the search limit is hit,
// or a goal node is popped (in which case its path is stored in
// this->res).  updateopen is invoked after every expansion.
void search(D &d, typename D::State &s0) {
	this->start();
	lasttime = walltime();
	closed.init(d);

	Node *root = init(d, s0);
	closed.add(root);
	open.push(root);

	for (;;) {
		if (open.empty() || SearchAlgorithm<D>::limit())
			break;

		Node *cur = *open.pop();
		State buf;
		State &state = d.unpack(buf, cur->state);

		if (d.isgoal(state)) {
			solpath<D, Node>(d, cur, this->res);
			break;
		}

		expand(d, cur, state);
		updateopen();
	}

	this->finish();
}
/** * \brief Function a given walltime into seconds * \param walltime The walltime to convert * \return the walltime converted to seconds */ long vishnu::convertStringToWallTime(const std::string& walltime_) { std::string walltime(walltime_); if(!walltime.empty()){ if(*(walltime.begin())=='\"'){ walltime.replace(walltime.begin(), walltime.begin()+1, ""); } if(*(walltime.end()-1)=='\"'){ walltime.replace(walltime.end()-1, walltime.end(), ""); } } if(walltime.size()!=0) { int seconds = 0; int minute = 0; int heure = 0; int jour = 0; std::string value; size_t size = walltime.size(); size_t pos = walltime.rfind(":"); if(pos!=std::string::npos) { if((size-pos > 1)) { value = walltime.substr(pos+1, size-1-pos); if(isNumericalValue(value)) { seconds = convertToInt(value); } } } else { if(walltime.size() > 0) { value = walltime; if(isNumericalValue(value)) { seconds = convertToInt(value); } } } if((pos!=std::string::npos) && (pos > 0)) { size = pos; pos = walltime.rfind(":", size-1); if(pos!=std::string::npos) { if((size-pos > 1)) { value = walltime.substr(pos+1, size-pos-1); if(isNumericalValue(value)) { minute = convertToInt(value); } } } else { value = walltime.substr(0, size); if(isNumericalValue(value)) { minute = convertToInt(value); } } } if((pos!=std::string::npos) && (pos > 0)) { size = pos; pos = walltime.rfind(":", size-1); if(pos!=std::string::npos) { if((size-pos > 1)) { value = walltime.substr(pos+1, size-pos-1); if(isNumericalValue(value)) { heure = convertToInt(value); } } } else { value = walltime.substr(0, size); if(isNumericalValue(value)) { heure = convertToInt(value); } } } if((pos!=std::string::npos) && (pos > 0)) { size = pos; pos = walltime.rfind(":", size-1); if(pos!=std::string::npos) { if((size-pos > 1)) { throw std::runtime_error("Invalid wallltime value: "+walltime); } } else { value = walltime.substr(0, size); if(isNumericalValue(value)) { jour = convertToInt(value); } } } long walltimeInSeconds = (jour*86400+heure*3600+minute*60+seconds); 
return walltimeInSeconds; } else { throw UserException(ERRCODE_INVALID_PARAM, ("Invalid walltime value: The given value is empty")); } }
void query (const char* fname,int num_threads) { int result = 0; double start, end; start = walltime(); //scan edges vector<tuple> edges = vector<tuple>(); ifstream f0(fname); while (!f0.eof()) { int j; f0 >> j; int k; f0 >> k; tuple t; t.to = j; t.from = k; edges.push_back(t); //tmp_vector0.push_back(j); //count0++; //if (count0 == 2) { // count0 = 0; // edges.push_back(tmp_vector0); // tmp_vector0 = vector<int>(); //} } f0.close(); end = walltime(); scan_runtime = end - start; cout << "done reading file.\n"; start = walltime(); //hash edges map<int, vector<tuple > > edges0_hash; for (int i = 0; i < edges.size(); i++) { if (edges0_hash.find(edges[i].to) == edges0_hash.end()) { edges0_hash[edges[i].to] = vector<tuple> (); } edges0_hash[edges[i].to].push_back(edges[i]); } end = walltime(); hash_runtime = end - start; cout << "done creating hash.\n"; omp_set_num_threads(num_threads); start = walltime(); //loop over edges #pragma omp parallel for reduction(+:result) schedule(static) for (int index0 = 0; index0 < edges.size(); ++index0) { if (edges[index0].to > edges[index0].from) { continue; } //if there is no match, continue if (edges0_hash.find(edges[index0].from) == edges0_hash.end()) { continue; } vector<tuple> table1 = edges0_hash[edges[index0].from]; //loop over table1 #pragma omp parallel for reduction(+:result) schedule(static) for (int index1 = 0; index1 < table1.size(); ++index1) { if (table1[index1].to > table1[index1].from) { continue;} //if there is no match, continue if (edges0_hash.find(table1[index1].from) == edges0_hash.end()) { continue; } vector<tuple> table2 = edges0_hash[table1[index1].from]; //loop over final join results #pragma omp parallel for reduction(+:result) schedule(static) for (int index2 = 0; index2 < table2.size(); ++index2) { if (table2[index2].from==edges[index0].to) { ++result; } } } } end = walltime(); triangles_runtime = end - start; cout << "Found " << result << " tuples.\n"; char scheduling[1024] = "static"; int64_t chunk = -1; 
DictOut out; DICT_ADD(out, hash_runtime); DICT_ADD(out, triangles_runtime); DICT_ADD(out, scan_runtime); DICT_ADD(out, (int64_t)num_threads); DICT_ADD(out, fname); DICT_ADD(out, scheduling); DICT_ADD(out, chunk); std::cout << out.toString() << std::endl; }
// row outputs an incumbent solution row. void row(unsigned long n, double epsprime) { dfrow(stdout, "incumbent", "uuugggg", n, this->res.expd, this->res.gend, wt, epsprime, cost, walltime() - this->res.wallstart); }
void bfs(GlobalAddress<G> _g, int nbfs, TupleGraph tg) { bool verified = false; double t; auto _frontier = GlobalBag<VertexID>::create(_g->nv); auto _next = GlobalBag<VertexID>::create(_g->nv); call_on_all_cores([=]{ frontier = _frontier; next = _next; g = _g; }); // do BFS from multiple different roots and average their times for (int root_idx = 0; root_idx < nbfs; root_idx++) { // intialize parent to -1 forall(g, [](G::Vertex& v){ v->init(); v->level = -1; }); VertexID root; if (FLAGS_max_degree_source) { forall(g, [](VertexID i, G::Vertex& v){ max_degree << MaxDegree(i, v.nadj); }); root = static_cast<MaxDegree>(max_degree).idx(); } else { root = choose_root(g); } // setup 'root' as the parent of itself delegate::call(g->vs+root, [=](G::Vertex& v){ v->parent = root; v->level = 0; }); // reset frontier queues next->clear(); frontier->clear(); // start with root as only thing in frontier delegate::call((g->vs+root).core(), [=]{ frontier->add(root); }); t = walltime(); bool top_down = true; int64_t prev_nf = -1; int64_t frontier_edges = 0; int64_t remaining_edges = g->nadj; while (!frontier->empty()) { auto nf = frontier->size(); VLOG(1) << "remaining_edges = " << remaining_edges << ", nf = " << nf << ", prev_nf = " << prev_nf << ", frontier_edges: " ; if (top_down && frontier_edges > remaining_edges/FLAGS_beamer_alpha && nf > prev_nf) { VLOG(1) << "switching to bottom-up"; top_down = false; } else if (!top_down && frontier_edges < g->nv/FLAGS_beamer_beta && nf < prev_nf) { VLOG(1) << "switching to top-down"; top_down = true; } edge_count = 0; if (top_down) { // iterate over vertices in this level of the frontier forall(frontier, [](VertexID& i){ // visit all the adjacencies of the vertex // note: this has to be 'async' to prevent deadlock from // running out of available workers forall<async>(adj(g,i), [i](G::Edge& e) { auto j = e.id; // at the core where the vertex is... 
delegate::call<async>(e.ga, [i,j](G::Vertex& vj){ // note: no synchronization needed because 'call' is // guaranteed to be executed atomically because it // does no blocking operations if (vj->parent == -1) { // claim parenthood vj->parent = i; vj->level = current_depth; next->add(j); edge_count += vj.nadj; } }); }); }); } else { // bottom-up forall<&phaser>(g, [](G::Vertex& v){ if (v->level != -1) return; auto va = make_linear(&v); forall<async,&phaser>(adj(g,v), [=,&v](G::Edge& e){ if (v->level != -1) return; phaser.enroll(); auto eva = e.ga; send_heap_message(eva.core(), [=]{ auto& ev = *eva.pointer(); if (ev->level != -1 && ev->level < current_depth) { auto eid = g->id(ev); send_heap_message(va.core(), [=]{ auto& v = *va.pointer(); if (v->level == -1) { next->add(g->id(v)); v->level = current_depth; v->parent = eid; edge_count += v.nadj; } phaser.complete(); }); } else { phaser.send_completion(va.core()); } }); }); }); } call_on_all_cores([=]{ current_depth++; // switch to next frontier level std::swap(frontier, next); }); next->clear(); frontier_edges = edge_count; remaining_edges -= frontier_edges; prev_nf = nf; } // while (frontier not empty) double this_bfs_time = walltime() - t; LOG(INFO) << "(root=" << root << ", time=" << this_bfs_time << ")"; if (!verified) { // only verify the first one to save time t = walltime(); bfs_nedge = verify(tg, g, root); verify_time = (walltime()-t); LOG(INFO) << verify_time; verified = true; Metrics::reset_all_cores(); // don't count the first one } else { total_time += this_bfs_time; } bfs_mteps += bfs_nedge / this_bfs_time / 1.0e6; } }
// duration returns the wall time elapsed since `start` and resets
// `start` to the current time, so successive calls yield lap times.
double duration(){
	const double current = walltime();
	const double lap = current - start;
	start = current;
	return lap;
}
void* thread_search(void * arg) { int id = thread_id.fetch_add(1); // closed list is waaaay too big for my computer. // original 512927357 // TODO: Must optimize these numbers // 9999943 // 14414443 //129402307 // HashTable<typename D::PackedState, Node> closed(512927357 / tnum); HashTable<typename D::PackedState, Node> closed(closedlistsize); // printf("closedlistsize = %u\n", closedlistsize); // Heap<Node> open(100, overrun); heap open(openlistsize, overrun); Pool<Node> nodes(2048); // If the buffer is locked when the thread pushes a node, // stores it locally and pushes it afterward. // TODO: Array of dynamic sized objects. // This array would be allocated in heap rather than stack. // Therefore, not the best optimized way to do. // Also we need to fix it to compile in clang++. std::vector<std::vector<Node*>> outgo_buffer; outgo_buffer.reserve(tnum); std::vector<Node*> tmp; tmp.reserve(10); // TODO: ad hoc random number uint expd_here = 0; uint gend_here = 0; int max_outgo_buffer_size = 0; int max_income_buffer_size = 0; unsigned int discarded_here = 0; int duplicate_here = 0; int current_f = 0; double lapse; int useless = 0; int fval = -1; // How many of the nodes sent to itself. // If this high, then lower communication overhead. 
unsigned int self_push = 0; // while (path.size() == 0) { printf("id = %d\n", id); printf("incumbent = %d\n", incumbent.load()); unsigned int over_incumbent_count = 0; unsigned int no_work_iteration = 0; double init_time = walltime(); while (true) { Node *n; if (this->isTimed) { double t = walltime() - init_time; // printf("t = %f\n", t); if (t > this->timer) { // closed.destruct_all(nodes); terminate[id] = true; break; } } #ifdef ANALYZE_LAP startlapse(lapse); // income buffer #endif if (!income_buffer[id].isempty()) { terminate[id] = false; if (income_buffer[id].size() >= income_threshold) { ++force_income; income_buffer[id].lock(); tmp = income_buffer[id].pull_all_with_lock(); income_buffer[id].release_lock(); uint size = tmp.size(); #ifdef ANALYZE_INCOME if (max_income_buffer_size < size) { max_income_buffer_size = size; } dbgprintf("size = %d\n", size); #endif // ANALYZE_INCOME for (int i = 0; i < size; ++i) { dbgprintf("pushing %d, ", i); open.push(tmp[i]); // Not sure optimal or not. Vector to Heap. } tmp.clear(); } else if (income_buffer[id].try_lock()) { tmp = income_buffer[id].pull_all_with_lock(); // printf("%d", __LINE__); income_buffer[id].release_lock(); uint size = tmp.size(); #ifdef ANALYZE_INCOME if (max_income_buffer_size < size) { max_income_buffer_size = size; } dbgprintf("size = %d\n", size); #endif // ANALYZE_INCOME for (int i = 0; i < size; ++i) { dbgprintf("pushing %d, ", i); open.push(tmp[i]); // Not sure optimal or not. 
} tmp.clear(); } } #ifdef ANALYZE_LAPSE endlapse(lapse, "incomebuffer"); startlapse(&lapse); // open list #endif #ifdef OUTSOURCING open_sizes[id] = open.getsize(); #endif if (open.isemptyunder(incumbent.load())) { dbgprintf("open is empty.\n"); terminate[id] = true; if (hasterminated() && incumbent != initmaxcost) { printf("terminated\n"); break; } ++no_work_iteration; for (int i = 0; i < tnum; ++i) { if (i != id && outgo_buffer[i].size() > 0) { if (income_buffer[i].try_lock()) { // acquired lock income_buffer[i].push_all_with_lock( outgo_buffer[i]); income_buffer[i].release_lock(); outgo_buffer[i].clear(); } } } continue; // ad hoc } n = static_cast<Node*>(open.pop()); // if (n->f >= incumbent.load()) { // printf("open list error: n->f >= incumbent: %u > %d\n", n->f, incumbent.load()); // } // printf("f,g = %d, %d\n", n->f, n->g); #ifdef ANALYZE_LAPSE endlapse(lapse, "openlist"); #endif #ifdef ANALYZE_FTRACE int newf = open.minf(); Logfvalue* lg = new Logfvalue(walltime() - wall0, n->f, n->f - n->g); logfvalue[id].push_back(*lg); if (fvalues[id] != newf) { // printf("ftrace %d %d %f\n", id, fvalues[id], // walltime() - wall0); fvalues[id] = newf; } #endif // ANALYZE_FTRACE // TODO: Might not be the best way. // Would there be more novel way? #ifdef ANALYZE_GLOBALF if (n->f != fvalues[id]) { fvalues[id] = n->f; int min = *std::min_element(fvalues, fvalues+tnum); if (min != globalf) { globalf = min; printf("globalf %d %d %f\n", id, min, walltime() - wall0); } } #endif // If the new node n is duplicated and // the f value is higher than or equal to the duplicate, discard it. #ifdef ANALYZE_LAPSE startlapse(&lapse); // closed list #endif // if (n->thrown == 0) { Node *duplicate = closed.find(n->packed); if (duplicate) { if (duplicate->f <= n->f) { dbgprintf("Discarded\n"); ++discarded_here; nodes.destruct(n); continue; #ifdef ANALYZE_DUPLICATE } else { duplicate_here++; #endif // ANALYZE_DUPLICATE } // Node access here is unnecessary duplicates. 
// printf("Duplicated\n"); } // } #ifdef ANALYZE_LAPSE endlapse(lapse, "closedlist"); #endif #ifdef OUTSOURCING if ((n->thrown < 5) && (expd_here > 600) && outsourcing(n, id)) { if (n->thrown == 0) { closed.add(n); } #ifdef ANALYZE_OUTSOURCING outsource_pushed++; #endif dbgprintf("Out sourced a node\n"); continue; } #endif // OUTSOURCING typename D::State state; this->dom.unpack(state, n->packed); #ifdef ANALYZE_ORDER if (fval != n->f) { fval = n->f; LogNodeOrder* ln = new LogNodeOrder(globalOrder.fetch_add(1), state.sequence, fval, open.getsize()); lognodeorder[id].push_back(*ln); } else { LogNodeOrder* ln = new LogNodeOrder(globalOrder.fetch_add(1), state.sequence, -1, open.getsize()); lognodeorder[id].push_back(*ln); } #endif // ANALYZE_ORDER if (this->dom.isgoal(state)) { // TODO: For some reason, sometimes pops broken node. // if (state.tiles[1] == 0) { // printf("isgoal ERROR\n"); // continue; // } // print_state(state); std::vector<typename D::State> newpath; for (Node *p = n; p; p = p->parent) { typename D::State s; this->dom.unpack(s, p->packed); newpath.push_back(s); // This triggers the main loop to terminate. } int length = newpath.size(); printf("Goal! length = %d\n", length); printf("cost = %u\n", n->g); if (incumbent > n->g) { // TODO: this should be changed to match non-unit cost domains. incumbent = n->g; LogIncumbent* li = new LogIncumbent(walltime() - wall0, incumbent); logincumbent[id].push_back(*li); path = newpath; } continue; } #ifdef OUTSOURCING if (n->thrown == 0) { closed.add(n); } #else closed.add(n); #endif expd_here++; // if (expd_here % 100000 == 0) { // printf("expd: %u\n", expd_here); // } // printf("expd: %d\n", id); #ifdef ANALYZE_LAPSE startlapse(&lapse); #endif // buffer<Node>* buffers; useless += uselessCalc(useless); for (int i = 0; i < this->dom.nops(state); i++) { // op is the next blank position. 
int op = this->dom.nthop(state, i); // if (op == n->pop) { // // printf("erase %d \n", i); // continue; // } // printf("gend: %d\n", id); // int moving_tile = 0; // int blank = 0; // Make this available for Grid pathfinding. // int moving_tile = state.tiles[op]; // int blank = state.blank; // Make this available for Grid pathfinding. Edge<D> e = this->dom.apply(state, op); Node* next = wrap(state, n, e.cost, e.pop, nodes); /////////////////////////// /// TESTS /// Compare nodes, n & next /// 1. f value does increase (or same) /// 2. g increases by cost /// /// Compare states /// 1. operation working fine /////////////////////////// if (n->f > next->f) { // // heuristic was calculating too big. printf("!!!ERROR: f decreases: %u %u\n", n->f, next->f); unsigned int nh = n->f - n->g; unsigned int nxh = next->f - next->g; printf("h = %u %u\n", nh, nxh); printf("cost = %d\n", e.cost); } // if (static_cast<unsigned int>(n->g + e.cost) != static_cast<unsigned int>(next->g)) { // printf("!!!ERROR: g is wrong: %u + %d != %u\n", n->g, e.cost, next->g); // } if (next->f >= incumbent.load()) { // printf("needless\n");- ++over_incumbent_count; nodes.destruct(next); this->dom.undo(state, e); // printf("%u >= %d\n", next->f, incumbent.load()); continue; } // Node *duplicate = closed.find(next->packed); // if (duplicate) { // if (duplicate->f <= next->f) { // dbgprintf("Discarded\n"); // ++discarded_here; // nodes.destruct(next); // this->dom.undo(state, e); // continue; // } else { // ++duplicate_here; // } // } // printf("inc: %d, inc.load: %d\n", incumbent.load()); // if (next->f >= initmaxcost || next->g >= initmaxcost // || next->f >= incumbent || next->g >= incumbent // || next->f >= incumbent.load() // || next->g >= incumbent.load()) { // printf("f >= initmaxcost: %d > %d\n", next->f, initmaxcost); // } gend_here++; // if (gend_here % 1000000 == 0) { // printf("gend: %u\n", gend_here); //// printf("%u < %d\n", next->f, incumbent.load()); // } //printf("mv blank op = %d %d %d 
\n", moving_tile, blank, op); // print_state(state); unsigned int zbr = z.inc_hash(n->zbr, 0, 0, op, 0, state); next->zbr = zbr; zbr = zbr % tnum; // next->zbr = z.inc_hash(n->zbr, moving_tile, blank, op, // 0, state); // next->zbr = z.inc_hash(state); // unsigned int zbr = next->zbr % tnum; // printf("zbr, zbr_tnum = (%u, %u)\n", next->zbr, zbr); // If the node belongs to itself, just push to this open list. if (zbr == id) { // double w = walltime(); ++self_push; open.push(next); // printf("self: %f\n", walltime() - w); // } #ifdef SEMISYNC // Synchronous communication to avoid search overhead else if (outgo_buffer[zbr].size() > outgo_threshold) { income_buffer[zbr].lock(); income_buffer[zbr].push_with_lock(next); income_buffer[zbr].push_all_with_lock(outgo_buffer[zbr]); // printf("%d", __LINE__); income_buffer[zbr].release_lock(); outgo_buffer[zbr].clear(); #ifdef ANALYZE_SEMISYNC ++force_outgo; // printf("semisync = %d to %d\n", id, zbr); #endif // ANALYZE_SEMISYNC } #endif // SEMISYNC // else if (income_buffer[zbr].try_lock()) { // // if able to acquire the lock, then push all nodes in local buffer. // income_buffer[zbr].push_with_lock(next); // if (outgo_buffer[zbr].size() != 0) { // income_buffer[zbr].push_all_with_lock( // outgo_buffer[zbr]); // } //// printf("%d", __LINE__); // income_buffer[zbr].release_lock(); // outgo_buffer[zbr].clear(); } else {
void bfs(GlobalAddress<G> g, int nbfs, TupleGraph tg) { bool verified = false; double t; auto frontier = GlobalVector<int64_t>::create(g->nv); auto next = GlobalVector<int64_t>::create(g->nv); // do BFS from multiple different roots and average their times for (int root_idx = 0; root_idx < nbfs; root_idx++) { // intialize parent to -1 forall(g, [](G::Vertex& v){ v->init(); }); int64_t root = choose_root(g); VLOG(1) << "root => " << root; // setup 'root' as the parent of itself delegate::call(g->vs+root, [=](G::Vertex& v){ v->parent = root; }); // reset frontier queues next->clear(); frontier->clear(); // start with root as only thing in frontier frontier->push(root); t = walltime(); while (!frontier->empty()) { // iterate over vertices in this level of the frontier forall(frontier, [g,next](int64_t& i){ // visit all the adjacencies of the vertex // note: this has to be 'async' to prevent deadlock from // running out of available workers forall<async>(adj(g,g->vs+i), [i,next](G::Edge& e) { auto j = e.id; // at the core where the vertex is... bool claimed = delegate::call(e.ga, [i](G::Vertex& v){ // note: no synchronization needed because 'call' is // guaranteed to be executed atomically because it // does no blocking operations if (v->parent == -1) { // claim parenthood v->parent = i; return true; } return false; }); if (claimed) { // add this vertex to the frontier for the next level // note: we (currently) can't do this 'push' inside the delegate because it may block next->push(j); } }); }); // switch to next frontier level std::swap(frontier, next); next->clear(); } double this_total_time = walltime() - t; LOG(INFO) << "(root=" << root << ", time=" << this_total_time << ")"; total_time += this_total_time; if (!verified) { // only verify the first one to save time t = walltime(); bfs_nedge = verify(tg, g, root); verify_time = (walltime()-t); LOG(INFO) << verify_time; verified = true; } bfs_mteps += bfs_nedge / this_total_time / 1.0e6; } }
int run(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - i); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if (papi_res) m4ri_die(""); #endif if(strcmp(p->algorithm, "m4ri") == 0) p->r = mzd_echelonize_m4ri(A, 0, 0); else if(strcmp(p->algorithm, "ple") == 0) p->r = mzd_ple(A, P, Q, 0); else if(strcmp(p->algorithm, "mmpf") == 0) p->r = _mzd_ple_russian(A, P, Q, 0); else m4ri_die("unknown algorithm %s",p->algorithm); #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else mzp_free(P); mzp_free(Q); PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { data[nv] -= loop_calibration[nv]; } #endif mzd_free(A); return 0; }
main (int argc, char **argv) { /* declaration of variables */ FILE *fp; /* file pointer */ char *auxChar; /* auxiliar character */ char *modelFile = " "; /* elastic model file */ /* THICK - RHO - VP - QP - VS - QS */ int i, k, iProc, iR; /* counters */ int initF, lastF; /* initial and final frequencies */ int apl_pid; /* PVM process id control */ int nSamplesOrig; /* time series length */ int die; /* flag used to kill processes */ int pid; /* process id */ int nProc; /* number of processes */ int processControl; /* monitoring PVM start */ int *processes; /* array with process ids */ int FReceived; /* number of frequencies processed */ int nFreqProc; /* number of frequencies per process */ int nFreqPart; /* number of frequency partitions */ int **statusFreq; /* monitors processed frequencies */ int FInfo[2]; /* frequency delimiters */ int **procInfo; /* frequency limits for each processor */ float wallcpu; /* wall clock time */ float dt; /* time sampling interval */ float f; /* current frequency */ float fR; /* reference frequency */ float tMax; /* maximum recording time */ float *thick, *alpha, *beta, *rho, *qP, *qS; /* elastic constants and thickness */ complex **freqPart; /* frequency arrays sent by the slaves */ complex **uRF, **uZF; /* final frequency components */ INFO info[1]; /* basic information for slaves */ /* Logging information */ /* CleanLog(); */ /* getting input */ initargs(argc, argv); requestdoc(0); if (!getparstring("model", &modelFile)) modelFile = "model"; if (!getparstring("recfile", &auxChar)) auxChar = " "; sprintf(info->recFile, "%s", auxChar); if (!getparint("directwave", &info->directWave)) info->directWave = 1; if (!getparfloat("r1", &info->r1)) info->r1 = 0; if (!getparint("nr", &info->nR)) info->nR = 148; if (!getparfloat("dr", &info->dR)) info->dR = .025; if (!getparfloat("zs", &info->zs)) info->zs = 0.001; if (info->zs <= 0) info->zs = 0.001; if (!getparfloat("u1", &info->u1)) info->u1 = 0.0002; if (!getparfloat("u2", &info->u2)) 
info->u2 = 1.; if (!getparint("nu", &info->nU)) info->nU = 1000; if (!getparfloat("f1", &info->f1)) info->f1 = 2; if (!getparfloat("f2", &info->f2)) info->f2 = 50; if (!getparfloat("dt", &dt)) dt = 0.004; if (!getparfloat("tmax", &tMax)) tMax = 8; if (!getparfloat("F1", &info->F1)) info->F1 = 0; if (!getparfloat("F2", &info->F2)) info->F2 = 0; if (!getparfloat("F3", &info->F3)) info->F3 = 1; if (!getparint("hanning", &info->hanningFlag)) info->hanningFlag = 0; if (!getparfloat("wu", &info->percU)) info->percU = 5; info->percU /= 100; if (!getparfloat("ww", &info->percW)) info->percW = 5; info->percW /= 100; if (!getparfloat("fr", &fR)) fR = 1; info->wR = 2 * PI * fR; if (!getparfloat("tau", &info->tau)) info->tau = 50; if (!getparint("nproc", &nProc)) nProc = 1; if (!getparint("nfreqproc", &nFreqProc) || nProc == 1) nFreqProc = 0; if (!getparint("verbose", &info->verbose)) info->verbose = 0; /* how many layers */ fp = fopen(modelFile,"r"); if (fp == NULL) err("No model file!\n"); info->nL = 0; while (fscanf(fp, "%f %f %f %f %f %f\n", &f, &f, &f, &f, &f, &f) != EOF) info->nL++; info->nL--; fclose(fp); if (info->verbose) fprintf(stderr,"Number of layers in model %s : %d\n", modelFile, info->nL + 1); /* if specific geometry, count number of receivers */ fp = fopen(info->recFile, "r"); if (fp != NULL) { info->nR = 0; while (fscanf(fp, "%f\n", &f) != EOF) info->nR++; } fclose(fp); /* memory allocation */ alpha = alloc1float(info->nL + 1); beta = alloc1float(info->nL + 1); rho = alloc1float(info->nL + 1); qP = alloc1float(info->nL + 1); qS = alloc1float(info->nL + 1); thick = alloc1float(info->nL + 1); processes = alloc1int(nProc); procInfo = alloc2int(2, nProc); /* reading the file */ fp = fopen(modelFile,"r"); if (info->verbose) fprintf(stderr,"Thickness rho vP qP vS qS\n"); for (i = 0; i < info->nL + 1; i++) { fscanf(fp, "%f %f %f %f %f %f\n", &thick[i], &rho[i], &alpha[i], &qP[i], &beta[i], &qS[i]); if (info->verbose) fprintf(stderr," %7.4f %4.3f %3.2f %5.1f %3.2f 
%5.1f\n", thick[i], rho[i], alpha[i], qP[i], beta[i], qS[i]); } fclose(fp); /* computing frequency interval */ info->nSamples = NINT(tMax / dt) + 1; nSamplesOrig = info->nSamples; info->nSamples = npfar(info->nSamples); /* slowness increment */ info->dU = (info->u2 - info->u1) / (float) info->nU; /* computing more frequency related quatities */ tMax = dt * (info->nSamples - 1); info->dF = 1. / (tMax); f = info->dF; while (f < info->f1) f += info->dF; info->f1 = f; while (f < info->f2) f += info->dF; info->f2 = f; initF = NINT(info->f1 / info->dF); lastF = NINT(info->f2 / info->dF); info->nF = lastF - initF + 1; if (info->nF%2 == 0) { info->nF++; lastF++; } /* attenuation of wrap-around */ info->tau = log(info->tau) / tMax; if (info->tau > TAUMAX) info->tau = TAUMAX; if (info->verbose) fprintf(stderr, "Discrete frequency range to model: [%d, %d]\n", initF, lastF); if (nFreqProc == 0) nFreqProc = NINT((float) info->nF / (float) nProc + .5); else while (nFreqProc > info->nF) nFreqProc /= 2; nFreqPart = NINT((float) info->nF / (float) nFreqProc + .5); /* memory allocation for frequency arrays */ uRF = alloc2complex(info->nSamples / 2 + 1, info->nR); uZF = alloc2complex(info->nSamples / 2 + 1, info->nR); freqPart = alloc2complex(nFreqProc, info->nR); statusFreq = alloc2int(3, nFreqPart); /* defining frequency partitions */ for (k = initF, i = 0; i < nFreqPart; i++, k += nFreqProc) { statusFreq[i][0] = k; statusFreq[i][1] = MIN(k + nFreqProc - 1, lastF); statusFreq[i][2] = 0; } if (info->verbose) fprintf(stderr, "Starting communication with PVM\n"); /* starting communication with PVM */ if ((apl_pid = pvm_mytid()) < 0) { err("Error enrolling master process"); /* exit(-1); */ } fprintf(stderr, "Starting %d slaves ... 
", nProc); processControl = CreateSlaves(processes, PROCESS, nProc); if (processControl != nProc) { err("Problem starting Slaves (%s)\n", PROCESS); /* exit(-1); */ } fprintf(stderr, " Ready \n"); info->nFreqProc = nFreqProc; /* Broadcasting all processes common information */ BroadINFO(info, 1, processes, nProc, GENERAL_INFORMATION); if (info->verbose) { fprintf(stderr, "Broadcasting model information to all slaves\n"); fflush(stderr); } /* sending all profiles */ BroadFloat(thick, info->nL + 1, processes, nProc, THICKNESS); BroadFloat(rho, info->nL + 1, processes, nProc, DENSITY); BroadFloat(alpha, info->nL + 1, processes, nProc, ALPHA); BroadFloat(qP, info->nL + 1, processes, nProc, QALPHA); BroadFloat(beta, info->nL + 1, processes, nProc, BETA); BroadFloat(qS, info->nL + 1, processes, nProc, QBETA); /* freeing memory */ free1float(thick); free1float(rho); free1float(alpha); free1float(qP); free1float(beta); free1float(qS); /* sending frequency partitions for each process */ for (iProc = 0; iProc < nProc; iProc++) { FInfo[0] = statusFreq[iProc][0]; FInfo[1] = statusFreq[iProc][1]; if (info->verbose) { fprintf(stderr, "Master sending frequencies [%d, %d] out of %d to slave %d [id:%d]\n" ,FInfo[0], FInfo[1], info->nF, iProc, processes[iProc]); fflush(stderr); } procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[iProc][2] = 1; } /* waiting modelled frequencies */ /* master process will send more frequencies if there's more work to do */ /* measuring elapsed time */ wallcpu = walltime(); /* reseting frequency counter */ FReceived = 0; while (FOREVER) { pid = RecvCplx(freqPart[0], info->nR * nFreqProc, -1, FREQUENCY_PARTITION_VERTICAL); /* finding the frequency limits of this process */ iProc = 0; while (pid != processes[iProc]) iProc++; /* copying into proper place of the total frequency array */ for (iR = 0; iR < info->nR; iR++) { for (k = 0, i = procInfo[iProc][0]; i <= procInfo[iProc][1]; 
i++, k++) { uZF[iR][i] = freqPart[iR][k]; } } pid = RecvCplx(freqPart[0], info->nR * nFreqProc, -1, FREQUENCY_PARTITION_RADIAL); /* finding the frequency limits of this process */ iProc = 0; while (pid != processes[iProc]) iProc++; /* copying into proper place of the total frequency array */ for (iR = 0; iR < info->nR; iR++) { for (k = 0, i = procInfo[iProc][0]; i <= procInfo[iProc][1]; i++, k++) { uRF[iR][i] = freqPart[iR][k]; } } /* summing frequencies that are done */ FReceived += procInfo[iProc][1] - procInfo[iProc][0] + 1; if (info->verbose) fprintf(stderr, "Master received %d frequencies, remaining %d\n", FReceived, info->nF - FReceived); /* if (FReceived >= info->nF) break; */ /* defining new frequency limits */ i = 0; while (i < nFreqPart && statusFreq[i][2]) i++; if (i < nFreqPart) { /* there is still more work to be done */ /* tell this process to not die */ die = 0; SendInt(&die, 1, processes[iProc], DIE); FInfo[0] = statusFreq[i][0]; FInfo[1] = statusFreq[i][1]; if (info->verbose) fprintf(stderr, "Master sending frequencies [%d, %d] to slave %d\n", FInfo[0], FInfo[1], processes[iProc]); procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[i][2] = 1; } else { /* tell this process to die since there is no more work to do */ if (info->verbose) fprintf(stderr, "Master ''killing'' slave %d\n", processes[iProc]); die = 1; SendInt(&die, 1, processes[iProc], DIE); } /* a check to get out the loop */ if (FReceived >= info->nF) break; } if (info->verbose) fprintf(stderr, "Master ''killing'' remaining slaves\n"); /* getting elapsed time */ wallcpu = walltime() - wallcpu; fprintf(stderr, "Wall clock time = %f seconds\n", wallcpu); /* going to time domain */ memset( (void *) &trZ, (int) '\0', sizeof(trZ)); memset( (void *) &trR, (int) '\0', sizeof(trR)); trZ.dt = dt * 1000000; trZ.ns = nSamplesOrig; trR.dt = dt * 1000000; trR.ns = nSamplesOrig; /* z component */ for (iR = 0; iR < info->nR; 
iR++) { trZ.tracl = iR + 1; /* inverse FFT */ pfacr(1, info->nSamples, uZF[iR], trZ.data); for (i = 0; i < info->nSamples; i++) { /* compensating for the complex frequency */ trZ.data[i] *= exp(info->tau * i * dt); } puttr(&trZ); } /* r component */ for (iR = 0; iR < info->nR; iR++) { trR.tracl = info->nR + iR + 1; /* inverse FFT */ pfacr(1, info->nSamples, uRF[iR], trR.data); for (i = 0; i < info->nSamples; i++) { /* compensating for the complex frequency */ trR.data[i] *= exp(info->tau * i * dt); } puttr(&trR); } return(EXIT_SUCCESS); }
void gradient(float *grad) { /* declaration of variables */ int i, iF, iR, iProc, iDer, iL, iU, offset; /* counters */ int FReceived; /* number of frequencies processed */ int die; /* die processor flag */ int apl_pid; /* PVM process id control */ int pid; /* process id */ int masterId; /* master id */ int processControl; /* monitoring PVM start */ int FInfo[2]; /* frequency delimiters */ float wallcpu; /* wall clock time */ float *gradPart; /* partition of gradients */ complex **resCDPart; /* partition of resCD */ /* Clean up log files */ CleanLog(); /* Reseting synchronization flags */ for (i = 0; i < nFreqPart; i++) { statusFreq[i][2] = 0; } /* allocating some memory */ gradPart = alloc1float(numberPar * limRange); for (i = 0; i < numberPar * limRange; i++) { grad[i] = 0; } fprintf(stderr, "Starting communication with PVM for derivatives\n"); /* starting communication with PVM */ if ((apl_pid = pvm_mytid()) < 0) { pvm_perror("Error enrolling master process"); exit(-1); } processControl = CreateSlaves(processes, PROCESS_FRECHET, nProc); if (processControl != nProc) { fprintf(stderr,"Problem starting PVM daemons\n"); exit(-1); } /* converting to velocities */ if (IMPEDANCE) { for (i = 0; i < info->nL + 1; i++) { alpha[i] /= rho[i]; beta[i] /= rho[i]; } } /* Broadcasting all processes common information */ BroadINFO(info, 1, processes, nProc, GENERAL_INFORMATION); /* sending all profiles */ BroadFloat(thick, info->nL + 1, processes, nProc, THICKNESS); BroadFloat(rho, info->nL + 1, processes, nProc, DENSITY); BroadFloat(alpha, info->nL + 1, processes, nProc, ALPHAS); BroadFloat(qP, info->nL + 1, processes, nProc, QALPHA); BroadFloat(beta, info->nL + 1, processes, nProc, BETAS); BroadFloat(qS, info->nL + 1, processes, nProc, QBETA); /* sending frequency partitions for each process */ for (iProc = 0; iProc < nProc; iProc++) { FInfo[0] = statusFreq[iProc][0]; FInfo[1] = statusFreq[iProc][1]; if (info->verbose) fprintf(stderr, "Master sending frequencies [%d, %d] out of 
%d to slave Frechet %d [id:%d]\n", FInfo[0], FInfo[1], info->nF, iProc, processes[iProc]); procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[iProc][2] = 1; /* and sending the appropriate correlation chunk */ /* allocating some memory */ resCDPart = alloc2complex(FInfo[1] - FInfo[0] + 1, info->nR); for (iR = 0; iR < info->nR; iR++) { for (i = 0, iF = FInfo[0]; iF <= FInfo[1]; iF++, i++) { resCDPart[iR][i] = resCD[iR][iF - initF]; /* fprintf(stderr, "iR %d iF %d [%f %f]\n", iR, iF, resCDPart[iR][i].r, resCDPart[iR][i].i);*/ } } /* sending frequency partition to the slave process */ SendCplx(resCDPart[0], (FInfo[1] - FInfo[0] + 1) * info->nR, processes[iProc], COVARIANCE_PARTITION); free2complex(resCDPart); } /* waiting modelled frequencies */ /* master process will send more frequencies if there's more work to do */ /* measuring elapsed time */ wallcpu = walltime(); /* reseting frequency counter */ FReceived = 0; while (FOREVER) { pid = RecvFloat(gradPart, info->numberPar * info->limRange, -1, PARTIAL_GRADIENT); /* finding the frequency limits of this process */ /* DD fprintf(stderr, "Master finding the frequency limits of this process\n"); */ iProc = 0; while (pid != processes[iProc]) iProc++; /* stacking gradient */ for (i = 0; i < info->numberPar * info->limRange; i++) { grad[i] += gradPart[i]; /* DD fprintf(stderr, "i %d grad %f gradPart %f\n", i, grad[i], gradPart[i]);*/ } /* summing frequencies that are done */ FReceived += procInfo[iProc][1] - procInfo[iProc][0] + 1; if (info->verbose) fprintf(stderr, "Master received %d frequencies, remaining %d\n", FReceived, info->nF - FReceived); /* defining new frequency limits */ i = 0; while (i < nFreqPart && statusFreq[i][2]) i++; /* DD fprintf(stderr, "i %d nFreqPart %d\n", i, nFreqPart);*/ if (i < nFreqPart) { /* there is still more work to be done */ /* tell this process to not die */ die = 0; SendInt(&die, 1, processes[iProc], DIE); 
FInfo[0] = statusFreq[i][0]; FInfo[1] = statusFreq[i][1]; if (info->verbose) fprintf(stderr, "Master sending frequencies [%d, %d] to slave %d\n", FInfo[0], FInfo[1], processes[iProc]); procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[i][2] = 1; /* sending covariance partition */ /* allocating some memory */ resCDPart = alloc2complex(FInfo[1] - FInfo[0] + 1, info->nR); for (iR = 0; iR < info->nR; iR++) { for (i = 0, iF = FInfo[0]; iF <= FInfo[1]; iF++, i++) { resCDPart[iR][i] = resCD[iR][iF - initF]; } } /* sending frequency partition to the slave process */ SendCplx(resCDPart[0], (FInfo[1] - FInfo[0] + 1) * info->nR, processes[iProc], COVARIANCE_PARTITION); free2complex(resCDPart); } else { /* tell this process to die since there is no more work to do */ if (info->verbose) fprintf(stderr, "Master ''killing'' slave %d\n", processes[iProc]); die = 1; SendInt(&die, 1, processes[iProc], DIE); } /* a check to get out the loop */ if (FReceived >= info->nF) break; } /* getting elapsed time */ wallcpu = walltime() - wallcpu; fprintf(stderr, "Frechet derivative wall clock time = %f seconds\n\n", wallcpu); /* back to impedances*/ if (IMPEDANCE) { for (i = 0; i < info->nL + 1; i++) { alpha[i] *= rho[i]; beta[i] *= rho[i]; } } /* finally the gradient, the 2 is due Parseval */ for (iDer = 0; iDer < numberPar * limRange; iDer++) { grad[iDer] *= 2 / (float) (nTotalSamples * oFNorm); } /* getting gradient in impedance domain */ if (IMPEDANCE) { offset = 0; for (i = lim[0], iL = 0; iL < limRange; iL++, i++) { if (vpFrechet) { grad[iL] /= rho[i]; offset = limRange; } if (vsFrechet) { grad[iL + offset] /= rho[i]; offset += limRange; } if (rhoFrechet) { grad[iL + offset] = - alpha[i] * grad[iL] - beta[i] * grad[iL + limRange] + grad[iL + 2 * limRange]; } } } if (PRIOR) { auxm1 = 1. 
/ (float) (numberPar * limRange); /* normalization */ /* considering the regularization or model covariance term */ for (i = 0; i < limRange; i++) { for (offset = i, iL = 0; iL < limRange; iL++) { iU = 0; if (vpFrechet) { grad[iL] += (alpha[i + lim[0]] - alphaMean[i + lim[0]]) * CMvP[offset] * auxm1; iU = limRange; /* used as offset in gradient vector */ } if (vsFrechet) { grad[iL + iU] += (beta[i + lim[0]] - betaMean[i + lim[0]]) * CMvS[offset] * auxm1; iU += limRange; } if (rhoFrechet) { grad[iL + iU] += (rho[i + lim[0]] - rhoMean[i + lim[0]]) * CMrho[offset] * auxm1; } offset += MAX(SGN0(i - iL) * (limRange - 1 - iL), 1); } } } /* normalizing gradient normalize(grad, numberPar * limRange);*/ /* freeing memory */ free1float(gradPart); }
int run_nothing(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j <p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - j); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if(papi_res) m4ri_die(""); #endif #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { if (data[nv] < loop_calibration[nv]) loop_calibration[nv] = data[nv]; } #endif mzd_free(A); return (0); }
/*
 * Program entry point: MPI + OpenMP time stepping in modeling or migration
 * mode, selected by the "mig" parameter.
 *
 *  - modeling  (mig=n): reads an image from "input", steps forward in time
 *    and writes one data slice per time step to "output";
 *  - migration (mig=y): reads data from "input", steps backward in time and
 *    writes the image gathered on rank 0 to "output".
 *
 * Each time step applies the "right" matrix to the transformed wavefield
 * (mcfft3 / imcfft3) and combines the result with the "left" matrix.  Every
 * rank works on the n_local slabs starting at o_local that mcfft3_init
 * reports; rank 0 writes all output files.  Optional snapshots are gathered
 * on rank 0 every "snap" steps.  Rank 0 finally prints the elapsed wall time
 * and the program exits with status 0.
 *
 * The comments that directly follow sf_get*() calls are kept verbatim.
 */
int main(int argc, char* argv[])
{
    bool mig, sub;
    int it, nt, ix, nx, iz, nz, nx2, nz2, nzx, nzx2, ih, nh, nh2;
    int im, i, j, m2, it1, it2, its, ikz, ikx, ikh, n2, nk, snap;
    float dt, dx, dz, c, old, dh;
    float *curr, *prev, **img, **dat, **lft, **rht, **wave;
    sf_complex *cwave, *cwavem;
    sf_file data, image, left, right, snaps;

    /*MPI related*/
    int cpuid,numprocs;
    int provided;
    int n_local, o_local, nz_local;
    int ozx2;
    float *sendbuf, *recvbuf, *wave_all;
    int *rcounts, *displs;

    /*wall time*/
    double startTime, elapsedTime;
    double clockZero = 0.0;

    /* threads_ok records whether the requested threading level was granted */
    MPI_Init_thread(&argc,&argv,MPI_THREAD_FUNNELED,&provided);
    threads_ok = provided >= MPI_THREAD_FUNNELED;

    sf_init(argc,argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    if (!sf_getbool("mig",&mig)) mig=false;
    /* if n, modeling; if y, migration */

    if (!sf_getint("snap",&snap)) snap=0;
    /* interval for snapshots */

    snaps = (snap > 0)? sf_output("snaps"): NULL;
    /* (optional) snapshot file */

    if (mig) { /* migration */
	data = sf_input("input");
	image = sf_output("output");

	if (!sf_histint(data,"n1",&nh)) sf_error("No n1=");
	if (!sf_histfloat(data,"d1",&dh)) sf_error("No d1=");

	if (!sf_histint(data,"n2",&nx)) sf_error("No n2=");
	if (!sf_histfloat(data,"d2",&dx)) sf_error("No d2=");

	if (!sf_histint(data,"n3",&nt)) sf_error("No n3=");
	if (!sf_histfloat(data,"d3",&dt)) sf_error("No d3=");

	if (!sf_getint("nz",&nz)) sf_error("Need nz=");
	/* time samples (if migration) */
	if (!sf_getfloat("dz",&dz)) sf_error("Need dz=");
	/* time sampling (if migration) */

	/* only rank 0 writes the output header */
	if (cpuid==0) {
	    /* NOTE(review): o1/o2 are written with sf_putint although a
	       floating literal is passed - confirm this is intended */
	    sf_putint(image,"o1",0.);
	    sf_putint(image,"n1",nz);
	    sf_putfloat(image,"d1",dz);
	    sf_putstring(image,"label1","Depth");
	    sf_putint(image,"o2",0.);
	    sf_putint(image,"n2",nx);
	    sf_putfloat(image,"d2",dx);
	    sf_putstring(image,"label2","Midpoint");
	    sf_putint(image,"n3",1); /* stack for now */
	}
    } else { /* modeling */
	image = sf_input("input");
	data = sf_output("output");

	if (!sf_histint(image,"n1",&nz)) sf_error("No n1=");
	if (!sf_histfloat(image,"d1",&dz)) sf_error("No d1=");

	if (!sf_histint(image,"n2",&nx)) sf_error("No n2=");
	if (!sf_histfloat(image,"d2",&dx)) sf_error("No d2=");

	if (!sf_getint("nt",&nt)) sf_error("Need nt=");
	/* time samples (if modeling) */
	if (!sf_getfloat("dt",&dt)) sf_error("Need dt=");
	/* time sampling (if modeling) */

	if (!sf_getint("nh",&nh)) sf_error("Need nh=");
	/* offset samples (if modeling) */
	if (!sf_getfloat("dh",&dh)) sf_error("Need dh=");
	/* offset sampling (if modeling) */

	/* only rank 0 writes the output header */
	if (cpuid==0) {
	    sf_putint(data,"n1",nh);
	    sf_putfloat(data,"d1",dh);
	    sf_putstring(data,"label1","Half-Offset");
	    sf_putint(data,"o2",0.);
	    sf_putint(data,"n2",nx);
	    sf_putfloat(data,"d2",dx);
	    sf_putstring(data,"label2","Midpoint");
	    sf_putint(data,"n3",nt);
	    sf_putfloat(data,"d3",dt);
	    sf_putstring(data,"label3","Time");
	    sf_putstring(data,"unit3","s");
	}
    }

    /* header of the optional snapshot file (snaps is non-NULL only when
       snap > 0, so nt/snap cannot divide by zero) */
    if (cpuid==0) {
	if (NULL != snaps) {
	    sf_putint(snaps,"n1",nh);
	    sf_putfloat(snaps,"d1",dh);
	    sf_putstring(snaps,"label1","Half-Offset");

	    sf_putint(snaps,"n2",nx);
	    sf_putfloat(snaps,"d2",dx);
	    sf_putstring(snaps,"label2","Midpoint");

	    sf_putint(snaps,"n3",nz);
	    sf_putfloat(snaps,"d3",dz);
	    sf_putstring(snaps,"label3","Depth");

	    sf_putint(snaps,"n4",nt/snap);
	    sf_putfloat(snaps,"d4",dt*snap);
	    /* migration steps backward, so its first snapshot is the last time */
	    if (mig) {
		sf_putfloat(snaps,"o4",(nt-1)*dt);
	    } else {
		sf_putfloat(snaps,"o4",0.);
	    }
	    sf_putstring(snaps,"label4","Time");
	}
    }

    /* Mark the starting time. */
    /* presumably walltime() returns the current time minus *arg - confirm */
    startTime = walltime( &clockZero );

    /* mcfft3_init fills in the padded sizes (nh2,nx2,nz2) and this rank's
       slab (n_local slabs starting at o_local); its return value nk is later
       required to equal n2 of "right" */
    nk = mcfft3_init(1,nh,nx,nz,&nh2,&nx2,&nz2,&n_local,&o_local);
    /* number of local slabs that fall inside the unpadded depth range */
    nz_local = (n_local < nz-o_local)? n_local:nz-o_local;
    sf_warning("Cpuid=%d,n2=%d,n1=%d,n0=%d,local_n0=%d,local_0_start=%d,nz_local=%d",cpuid,nh2,nx2,nz2,n_local,o_local,nz_local);

    /* rank 0 handles the z=0 slab below, so its slab must start at 0 */
    if (cpuid==0) if (o_local!=0) sf_error("Cpuid and o_local inconsistant!");

    nzx = nz*nx*nh;
    //nzx2 = nz2*nx2*nh2;
    nzx2 = n_local*nx2*nh2;   /* local padded size, used as gather count */
    ozx2 = o_local*nx2*nh2;   /* local offset, used as gather displacement */

    img = sf_floatalloc2(nz,nx);
    dat = sf_floatalloc2(nh,nx);

    /* propagator matrices */
    left = sf_input("left");
    right = sf_input("right");

    if (!sf_histbool(left,"sub",&sub) && !sf_getbool("sub",&sub)) sub=true;
    /* if -1 is included in the matrix */

    if (!sf_histint(left,"n1",&n2) || n2 != nzx) sf_error("Need n1=%d in left",nzx);
    if (!sf_histint(left,"n2",&m2)) sf_error("No n2= in left");

    if (!sf_histint(right,"n1",&n2) || n2 != m2) sf_error("Need n1=%d in right",m2);
    if (!sf_histint(right,"n2",&n2) || n2 != nk) sf_error("Need n2=%d in right",nk);

    lft = sf_floatalloc2(nzx,m2);
    rht = sf_floatalloc2(m2,nk);

    sf_floatread(lft[0],nzx*m2,left);
    sf_floatread(rht[0],m2*nk,right);

    curr = sf_floatalloc(nzx2);
    prev = sf_floatalloc(nzx2);

    cwave  = sf_complexalloc(nk);
    cwavem = sf_complexalloc(nk);
    wave = sf_floatalloc2(nzx2,m2);

#ifdef _OPENMP
#pragma omp parallel for default(shared) private(iz)
#endif
    for (iz=0; iz < nzx2; iz++) {
	curr[iz]=0.;
	prev[iz]=0.;
    }

    /* snapshots gather prev; the final migration gather switches to curr */
    sendbuf = prev;
    if (cpuid==0) {
	/* receive-side buffers are needed on the root rank only */
	wave_all = sf_floatalloc(nh2*nx2*nz2);
	recvbuf = wave_all;
	rcounts = sf_intalloc(numprocs);
	displs  = sf_intalloc(numprocs);
    } else {
	wave_all = NULL;
	recvbuf = NULL;
	rcounts = NULL;
	displs = NULL;
    }

    /* root collects every rank's count and displacement for MPI_Gatherv */
    MPI_Gather(&nzx2, 1, MPI_INT, rcounts, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Gather(&ozx2, 1, MPI_INT, displs, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (mig) { /* migration */
	/* step backward in time */
	it1 = nt-1;
	it2 = -1;
	its = -1;
    } else { /* modeling */
	sf_floatread(img[0],nz*nx,image);

	/* transpose and initialize at zero offset */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(iz,ix)
#endif
	for (iz=0; iz < nz_local; iz++) {
	    for (ix=0; ix < nx; ix++) {
		curr[nh2*(ix+iz*nx2)]=img[ix][iz+o_local];
	    }
	}

	/* step forward in time */
	it1 = 0;
	it2 = nt;
	its = +1;
    }

    /* time stepping */
    for (it=it1; it != it2; it += its) {
	sf_warning("it=%d;",it);

	if (mig) { /* migration <- read data */
	    sf_floatread(dat[0],nx*nh,data);
	} else {
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix,ih)
#endif
	    for (ix=0; ix < nx; ix++) {
		for (ih=0; ih < nh; ih++) {
		    dat[ix][ih] = 0.;
		}
	    }
	}

	/* snapshot: collective gather on all ranks, write on rank 0 only */
	if (NULL != snaps && 0 == it%snap) {
	    MPI_Gatherv(sendbuf, nzx2, MPI_FLOAT, recvbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
	    if (cpuid==0) {
		/* strip the padding: nh samples per (ix,iz) trace */
		for (iz = 0; iz < nz; iz++)
		    for (ix = 0; ix < nx; ix++)
			sf_floatwrite(wave_all+nh2*(ix+nx2*iz),nh,snaps);
	    }
	}

	/* at z=0 */
	/* rank 0 holds the first slab: inject data (migration) or record it
	   (modeling) */
	if (cpuid==0) {
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix,ih)
#endif
	    for (ix=0; ix < nx; ix++) {
		for (ih=0; ih < nh; ih++) {
		    if (mig) {
			curr[ix*nh2+ih] += dat[ix][ih];
		    } else {
			dat[ix][ih] = curr[ix*nh2+ih];
		    }
		}
	    }
	}

	/* matrix multiplication */
	mcfft3(curr,cwave);

	for (im = 0; im < m2; im++) {
	    //for (ik = 0; ik < nk; ik++) {
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ikz,ikx,ikh,i,j)
#endif
	    for (ikz = 0; ikz < n_local; ikz++) {
		for (ikx = 0; ikx < nx2; ikx++) {
		    for (ikh = 0; ikh < nh2; ikh++) {
			/* i: index into rht (includes o_local), j: local index */
			i = ikh + ikx*nh2 + (o_local+ikz)*nx2*nh2;
			j = ikh + ikx*nh2 + ikz*nx2*nh2;
#ifdef SF_HAS_COMPLEX_H
			cwavem[j] = cwave[j]*rht[i][im];
#else
			cwavem[j] = sf_crmul(cwave[j],rht[i][im]);
#endif
		    }
		}
	    }
	    imcfft3(wave[im],cwavem);
	}

	/* time update: curr becomes (2*curr if sub) - prev + sum_im lft*wave,
	   and prev takes the old curr */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix,iz,ih,i,j,im,old,c)
#endif
	for (iz=0; iz < nz_local; iz++) {
	    for (ix = 0; ix < nx; ix++) {
		for (ih=0; ih < nh; ih++) {
		    i = ih + ix*nh + (o_local+iz)*nx*nh;  /* original grid */
		    j = ih + ix*nh2+ iz*nx2*nh2; /* padded grid */

		    old = curr[j];

		    c = sub? 2*old: 0.0f;

		    c -= prev[j];

		    prev[j] = old;

		    for (im = 0; im < m2; im++) {
			c += lft[im][i]*wave[im][j];
		    }

		    curr[j] = c;
		}
	    }
	}

	if (!mig) { /* modeling -> write out data */
	    if (cpuid==0) sf_floatwrite(dat[0],nx*nh,data);
	}
    }
    sf_warning(".");

    if (mig) {
	/* gather the final wavefield on rank 0 and keep its zero-offset part */
	sendbuf = curr;
	MPI_Gatherv(sendbuf, nzx2, MPI_FLOAT, recvbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);

	if (cpuid==0) {
	    /* transpose */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(iz,ix)
#endif
	    for (iz=0; iz < nz; iz++) {
		for (ix=0; ix < nx; ix++) {
		    img[ix][iz] = wave_all[nh2*(ix+iz*nx2)];
		}
	    }

	    sf_floatwrite(img[0],nz*nx,image);
	}
    }

    mcfft3_finalize();

    /* Work's done. Get the elapsed wall time. */
    elapsedTime = walltime( &startTime );

    /* Print the wall time and terminate. */
    if (cpuid==0) printf("\nwall time = %.5fs\n", elapsedTime);

    MPI_Finalize();
    exit(0);
}
/*
 * reduce_gbla_matrix_keep_A - reduces a GBLA matrix that is split into the
 * parts referenced by mat, using the variant that stores A in C.
 *
 * Steps, in order:
 *   1. elim_fl_C_sparse_dense_keep_A() on CR and AR (skipped when AR has no
 *      rows);
 *   2. CR is copied to the block representation mat->C and then freed;
 *   3. elim_fl_C_sparse_dense_block() on B, C and D;
 *   4. D is copied to the dense representation mat->DR and reduced with
 *      elim_fl_dense_D_completely().
 *
 * mat      : matrix parts; AR, CR, C, D, DR and mod are read and/or replaced.
 * verbose  : > 2 prints per-step timings to stdout, > 3 also memory usage.
 * nthreads : forwarded to all elimination and copy routines.
 *
 * Returns the rank of D.  If one of the elimination steps reports failure a
 * message is printed and 1 is returned, which callers cannot tell apart from
 * a rank of 1.
 */
ri_t reduce_gbla_matrix_keep_A(mat_t *mat, int verbose, int nthreads)
{
  /* timing structs */
  struct timeval t_load_start;
  struct timeval t_complete;

  if (verbose > 2)
    gettimeofday(&t_complete, NULL);

  /* A^-1 * B */
  if (verbose > 2) {
    printf("---------------------------------------------------------------------------\n");
    printf("GBLA Matrix Reduction\n");
    printf("---------------------------------------------------------------------------\n");
    gettimeofday(&t_load_start, NULL);
    printf("%-38s","Storing A in C ...");
    fflush(stdout);
  }
  if (mat->AR->row != NULL) {
    if (elim_fl_C_sparse_dense_keep_A(mat->CR, &(mat->AR), mat->mod, nthreads)) {
      printf("Error while reducing A.\n");
      return 1;
    }
  }
  if (verbose > 2) {
    printf("%9.3f sec\n",
        walltime(t_load_start) / (1000000));
  }
  if (verbose > 3) {
    print_mem_usage();
  }

  /* reducing submatrix C to zero using methods of Faugère & Lachartre */
  if (verbose > 2) {
    gettimeofday(&t_load_start, NULL);
    printf("%-38s","Copying C to sparse block representation ...");
    fflush(stdout);
  }
  if (mat->CR->row != NULL) {
    mat->C = copy_sparse_to_block_matrix(mat->CR, nthreads);
    free_sparse_matrix(&(mat->CR), nthreads);
  }
  if (verbose > 2) {
    printf("%9.3f sec\n",
        walltime(t_load_start) / (1000000));
  }
  if (verbose > 3) {
    print_mem_usage();
  }
  if (verbose > 2) {
    /* restart the step timer; without this the time printed for this step
     * also contained the copy step above */
    gettimeofday(&t_load_start, NULL);
    printf("%-38s","Reducing C to zero ...");
    fflush(stdout);
  }
  if (mat->C != NULL) {
    if (elim_fl_C_sparse_dense_block(mat->B, &(mat->C), mat->D, mat->mod, nthreads)) {
      /* this step reduces C, so the message names C */
      printf("Error while reducing C.\n");
      return 1;
    }
  }
  if (verbose > 2) {
    printf("%9.3f sec\n",
        walltime(t_load_start) / (1000000));
  }
  if (verbose > 3) {
    print_mem_usage();
  }

  /* copy block D to dense wide (re_l_t) representation */
  mat->DR = copy_block_to_dense_matrix(&(mat->D), nthreads, 1);
  mat->DR->mod = mat->mod;

  /* eliminate mat->DR using a structured Gaussian Elimination process on the rows */
  nelts_t rank_D = 0;
  /* echelonizing D to zero using methods of Faugère & Lachartre */
  if (verbose > 2) {
    gettimeofday(&t_load_start, NULL);
    printf("%-38s","Reducing D ...");
    fflush(stdout);
  }
  if (mat->DR->nrows > 0)
    /* rank_D = elim_fl_dense_D(mat->DR, nthreads); */
    rank_D = elim_fl_dense_D_completely(mat->DR, nthreads);
  if (verbose > 2) {
    /* columns: time, rank of D, rows of DR minus rank, rows of DR */
    printf("%9.3f sec %5d %5d %5d\n",
        walltime(t_load_start) / (1000000),
        rank_D, mat->DR->nrows - rank_D, mat->DR->nrows);
  }
  if (verbose > 3) {
    print_mem_usage();
  }
  if (verbose > 2) {
    printf("---------------------------------------------------------------------------\n");
    printf("%-38s","Reduction completed ...");
    fflush(stdout);
    printf("%9.3f sec\n",
        walltime(t_complete) / (1000000));
    if (verbose > 3)
      print_mem_usage();
  }

  return rank_D;
}
float modeling() { /* declaration of variables */ FILE *fp; /* to report results */ int iF, iF1, iR, offset, iT1, iT2, iS, iProc, i, k; /* counters */ int wL; /* window length */ int die; /* die processor flag */ int FReceived; /* number of frequencies processed */ int apl_pid; /* PVM process id control */ int pid; /* process id */ int processControl; /* monitoring PVM start */ int FInfo[2]; /* frequency delimiters */ float wallcpu; /* wall clock time */ float oF; /* value of the objective function */ float residue; /* data residue */ float wdw; /* windowing purposes */ float *buffer, *bufferRCD; /* auxiliary buffers */ /* upgoing waves */ complex **dataS; /* synthethics in the frequency domain */ complex *bufferC; /* auxiliary buffer */ complex **freqPart; /* frequency arrays sent by the slaves */ /* Clean up log files */ CleanLog(); /* Reseting synchronization flags */ for (i = 0; i < nFreqPart; i++) { statusFreq[i][2] = 0; } /* allocating some memory */ dataS = alloc2complex(info->nF, info->nR); buffer = alloc1float(info->nSamples); bufferRCD = alloc1float(info->nSamples); bufferC = alloc1complex(info->nSamples / 2 + 1); freqPart = alloc2complex(info->nFreqProc, info->nR); /* reseting */ for (iF = 0; iF < info->nSamples / 2 + 1; iF++) bufferC[iF] = zeroC; for (iS = 0; iS < info->nSamples; iS++) { buffer[iS] = 0; bufferRCD[iS] = 0; } /* DD fprintf(stderr, "nF -> %d\n", info->nF);*/ fprintf(stderr, "Starting communication with PVM for modeling\n"); /* starting communication with PVM */ if ((apl_pid = pvm_mytid()) < 0) { pvm_perror("Error enrolling master process"); exit(-1); } processControl = CreateSlaves(processes, PROCESS_MODELING, nProc); if (processControl != nProc) { fprintf(stderr,"Problem starting PVM daemons\n"); exit(-1); } /* converting to velocities */ if (IMPEDANCE) { for (i = 0; i < info->nL + 1; i++) { alpha[i] /= rho[i]; beta[i] /= rho[i]; } } /* Broadcasting all processes common information */ BroadINFO(info, 1, processes, nProc, 
GENERAL_INFORMATION); /* sending all profiles */ BroadFloat(thick, info->nL + 1, processes, nProc, THICKNESS); BroadFloat(rho, info->nL + 1, processes, nProc, DENSITY); BroadFloat(alpha, info->nL + 1, processes, nProc, ALPHAS); BroadFloat(qP, info->nL + 1, processes, nProc, QALPHA); BroadFloat(beta, info->nL + 1, processes, nProc, BETAS); BroadFloat(qS, info->nL + 1, processes, nProc, QBETA); /* sending frequency partitions for each process */ for (iProc = 0; iProc < nProc; iProc++) { FInfo[0] = statusFreq[iProc][0]; FInfo[1] = statusFreq[iProc][1]; if (info->verbose) fprintf(stderr, "Master sending frequencies [%d, %d] out of %d to slave Modeling %d [id:%d]\n", FInfo[0], FInfo[1], info->nF, iProc, processes[iProc]); procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[iProc][2] = 1; } /* waiting modelled frequencies */ /* master process will send more frequencies if there's more work to do */ /* measuring elapsed time */ wallcpu = walltime(); /* reseting frequency counter */ FReceived = 0; while (FOREVER) { pid = RecvCplx(freqPart[0], info->nR * info->nFreqProc, -1, FREQUENCY_PARTITION); /* finding the frequency limits of this process */ /* DD fprintf(stderr, "Master finding the frequency limits of this process\n"); */ iProc = 0; while (pid != processes[iProc]) iProc++; /* DD fprintf(stderr, "iProc %d pid %d\n", iProc, pid);*/ /* copying into proper place of the total frequency array */ for (iR = 0; iR < info->nR; iR++) { for (k = 0, i = procInfo[iProc][0]; i <= procInfo[iProc][1]; i++, k++) { dataS[iR][i - initF] = freqPart[iR][k]; } } /* summing frequencies that are done */ FReceived += procInfo[iProc][1] - procInfo[iProc][0] + 1; if (info->verbose) fprintf(stderr, "Master received %d frequencies, remaining %d\n", FReceived, info->nF - FReceived); /* defining new frequency limits */ i = 0; while (i < nFreqPart && statusFreq[i][2]) i++; /* DD fprintf(stderr, "i %d nFreqPart %d\n", i, 
nFreqPart);*/ if (i < nFreqPart) { /* there is still more work to be done */ /* tell this process to not die */ die = 0; SendInt(&die, 1, processes[iProc], DIE); FInfo[0] = statusFreq[i][0]; FInfo[1] = statusFreq[i][1]; if (info->verbose) fprintf(stderr, "Master sending frequencies [%d, %d] to slave %d\n", FInfo[0], FInfo[1], processes[iProc]); procInfo[iProc][0] = FInfo[0]; procInfo[iProc][1] = FInfo[1]; SendInt(FInfo, 2, processes[iProc], FREQUENCY_LIMITS); statusFreq[i][2] = 1; } else { /* tell this process to die since there is no more work to do */ if (info->verbose) fprintf(stderr, "Master ''killing'' slave %d\n", processes[iProc]); die = 1; SendInt(&die, 1, processes[iProc], DIE); } /* a check to get out the loop */ if (FReceived >= info->nF) break; } /* quitting PVM */ EndOfMaster(); /* getting elapsed time */ wallcpu = walltime() - wallcpu; fprintf(stderr, "Modeling wall clock time = %f seconds\n", wallcpu); /* back to impedances*/ if (IMPEDANCE) { for (i = 0; i < info->nL + 1; i++) { alpha[i] *= rho[i]; beta[i] *= rho[i]; } } /* computing the objective function for the time window */ for (oF = 0, residue = 0, iR = 0; iR < info->nR; iR++) { /* windowing as it was done to the input data */ iT1 = NINT(info->f1 / info->dF); iT2 = NINT(info->f2 / info->dF); wL = info->nF * PERC_WINDOW / 2; wL = 2 * wL + 1; for (iS = 0, iF = 0; iF < info->nSamples / 2 + 1; iF++) { if (iF < iT1 || iF >= iT2) { bufferC[iF] = cmplx(0, 0); } else if (iF - iT1 < (wL - 1) / 2) { wdw = .42 - .5 * cos(2 * PI * (float) iS / ((float) (wL - 1))) + .08 * cos(4 * PI * (float) iS / ((float) (wL - 1))); bufferC[iF].r = dataS[iR][iF - iT1].r * wdw; bufferC[iF].i = dataS[iR][iF - iT1].i * wdw; iS++; } else if (iF - iT1 >= info->nF - (wL - 1) / 2) { iS++; wdw = .42 - .5 * cos(2 * PI * (float) iS / ((float) (wL - 1))) + .08 * cos(4 * PI * (float) iS / ((float) (wL - 1))); bufferC[iF].r = dataS[iR][iF - iT1].r * wdw; bufferC[iF].i = dataS[iR][iF - iT1].i * wdw; } else { bufferC[iF] = dataS[iR][iF 
- iT1]; } } /* going to time domain */ /* DD fprintf(stderr, "going to time domain \n");*/ pfacr(1, info->nSamples, bufferC, buffer); /* muting ? */ if (MUTE) { for (iS = 0; iS <= NINT(t1Mute[iR] / dt); iS++) { buffer[iS] = 0; } } /* and computing data misfit and likelihood function */ iS = NINT(t1 / dt); for (iT1 = 0; iT1 < nDM; iT1++) { bufferRCD[iT1 + iS] = 0; for (offset = iT1, iT2 = 0; iT2 < nDM; iT2++) { bufferRCD[iT1 + iS] += (buffer[iT2 + iS] - dataObs[iR][iT2]) * CD[offset]; offset += MAX(SGN0(iT1 - iT2) * (nDM - 1 - iT2), 1); } oF += (buffer[iT1 + iS] - dataObs[iR][iT1]) * bufferRCD[iT1 + iS]; residue += (buffer[iT1 + iS] - dataObs[iR][iT1]) * (buffer[iT1 + iS] - dataObs[iR][iT1]); /* DD fprintf(stdout, "%d %f %f %f %f %f %d %f %f\n", nTotalSamples, oF, dt, auxm1, info->tau, residue, iT1, buffer[iT1], dataObs[iR][iT1 - NINT(t1 / dt)]); */ } /* windowing bufferRCD */ iT1 = NINT(t1 / dt); iT2 = NINT(t2 / dt); wL = nDM * PERC_WINDOW / 2; wL = 2 * wL + 1; for (iS = 0, iF = 0; iF < info->nSamples; iF++) { if (iF < iT1 || iF >= iT2) { bufferRCD[iF] = 0; } else if (iF - iT1 < (wL - 1) / 2) { wdw = .42 - .5 * cos(2 * PI * (float) iS / ((float) (wL - 1))) + .08 * cos(4 * PI * (float) iS / ((float) (wL - 1))); bufferRCD[iF] *= wdw; iS++; } else if (iF - iT1 >= nDM - (wL - 1) / 2) { iS++; wdw = .42 - .5 * cos(2 * PI * (float) iS / ((float) (wL - 1))) + .08 * cos(4 * PI * (float) iS / ((float) (wL - 1))); bufferRCD[iF] *= wdw; } } /* going back to Fourier domain */ pfarc(-1, info->nSamples, bufferRCD, bufferC); for (iF1 = 0, iF = NINT(info->f1 / info->dF); iF <= NINT(info->f2 / info->dF); iF++, iF1++) { resCD[iR][iF1] = bufferC[iF]; } } /* considering the .5 factor of the exponent of the Gaussian */ /* and normalizing the likelihood by the number of samples */ oF /= (2 * nTotalSamples); /* freeing some memory */ /* allocating some memory */ free2complex(dataS); free1float(buffer); free1float(bufferRCD); free1complex(bufferC); free2complex(freqPart); /* considering 
the regularizaton or model covariance term */ if (PRIOR) { auxm1 = 1. / (float) (numberPar * limRange); /* normalization */ for (auxm2 = 0, iF = 0; iF < limRange; iF++) { for (offset = iF, iF1 = 0; iF1 < limRange; iF1++) { if (vpFrechet) { auxm2 += (alpha[iF + lim[0]] - alphaMean[iF + lim[0]]) * CMvP[offset] * auxm1 * (alpha[iF1 + lim[0]] - alphaMean[iF1 + lim[0]]); } if (vsFrechet) { auxm2 += (beta[iF + lim[0]] - betaMean[iF + lim[0]]) * CMvS[offset] * auxm1 * (beta[iF1 + lim[0]] - betaMean[iF1 + lim[0]]); } if (rhoFrechet) { auxm2 += (rho[iF + lim[0]] - rhoMean[iF + lim[0]]) * CMrho[offset] * auxm1 * (rho[iF1 + lim[0]] - rhoMean[iF1 + lim[0]]); } offset += MAX(SGN0(iF - iF1) * (limRange - 1 - iF1), 1); } } } /* getting normalization factor */ fp = fopen("report", "a"); fprintf(fp,"-----------------------\n"); if (modCount == 0) { oFNorm = oF; fprintf(fp,">> Normalization constant for objective function: %f <<\n", oFNorm); } /* normalizing residue */ residue /= (nTotalSamples); if (!DATACOV && noiseVar == 0) noiseVar = residue / 10.; if (PRIOR) { fprintf(fp, "residue at iteration [%d] : Data residue variance %f , Noise variance %f , Likelihood %f , Prior %f\n", modCount, residue, noiseVar, oF / oFNorm, auxm2 / oFNorm); } else { fprintf(fp,"residue at iteration [%d] : Data residue variance %f , Noise variance %f , Likelihood %f , No Prior\n", modCount, residue, noiseVar, oF / oFNorm); } /* checking if we reached noise variance with the data residue */ if (residue / noiseVar <= 1) { /* DATA IS FIT, stop the procedure */ fprintf(fp, "[][][][][][][][][][][][][][][][][][][][]\n"); fprintf(fp, "DATA WAS FIT UP TO 1 VARIANCE!\n"); fprintf(fp, "[][][][][][][][][][][][][][][][][][][][]\n"); exit(0); } /* adding Likelihood and Prior */ if (PRIOR) oF += auxm2 / 2; fprintf(fp,"TOTAL residue at iteration [%d] : %f\n", modCount, oF / oFNorm); fprintf(fp,"-----------------------\n"); fclose(fp); /* returning objective function value */ return(oF / oFNorm); }
int main(int argc, char *argv[]) { char tbuf[100]; initialize_mycallable(); initialize_mycaller(); void *obj_intern = get_f_callable_intern(); void *obj_key = get_f_callable_key(); printf("Direct result: %f\n", func(2.0)); printf("Intern result: %f\n", docall_intern(obj_intern, 2.0)); printf("Key result: %f\n", docall_key(obj_key, 2.0)); printf("Intern+getfunc result: %f\n", docall_getfunc_intern(obj_intern, 2.0)); printf("Key+getfunc result: %f\n", docall_getfunc_key(obj_key, 2.0)); double s = 0; { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += func(2.0); } times[k] = walltime() - t0; } snftime(tbuf, 100, arrmin(times, K) / (double)J); printf("Direct took %s\n", tbuf); } { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += docall_dispatch(&func, 2.0); } times[k] = walltime() - t0; } snftime(tbuf, 100, arrmin(times, K) / (double)J); printf("Dispatch took %s\n", tbuf); } { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += docall_intern(obj_intern, 2.0); } times[k] = walltime() - t0; } snftime(tbuf, 100, arrmin(times, K) / (double)J); printf("Intern method took %s\n", tbuf); } { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += docall_key(obj_key, 2.0); } times[k] = walltime() - t0; } snftime(tbuf, 100, arrmin(times, K) / (double)J); printf("Key method took %s\n", tbuf); } { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += docall_getfunc_intern(obj_intern, 2.0); } times[k] = walltime() - t0; } snftime(tbuf, 100, arrmin(times, K) / (double)J); printf("Intern+getfunc method took %s\n", tbuf); } { double times[K]; for (int k = 0; k != K; ++k) { double t0 = walltime(); for (int i = 0; i != J; i++) { s += docall_getfunc_key(obj_key, 2.0); } times[k] = walltime() - t0; } snftime(tbuf, 
100, arrmin(times, K) / (double)J); printf("Key+getfunc method took %s\n", tbuf); } printf("s: %f\n", s); return 0; }
// PageRank benchmark driver.
//
// Calls init(&argc, &argv) and then hands a lambda to run(). Inside that
// lambda, in order:
//   1. Builds a TupleGraph inside GRAPPA_TIME_REGION(tuple_time): when
//      FLAGS_path is empty it calls TupleGraph::Kronecker(FLAGS_scale, NE,
//      111, 222) with NE = (1L << FLAGS_scale) * FLAGS_edgefactor; otherwise
//      it logs the path and calls TupleGraph::Load(FLAGS_path, FLAGS_format).
//   2. Builds the graph with G::create(tg) and stores the elapsed
//      walltime() difference in construction_time.
//   3. Sums v.n_out over masters(g) into `count` and CHECK_EQs it against
//      g->ne.
//   4. Between Metrics::start_tracing() and Metrics::stop_tracing(), runs
//      FLAGS_trials trials. Each trial sets every vertex rank to 1.0, then
//      calls activate_all(g) and
//      GraphlabEngine<G,PagerankVertexProgram>::run_sync(g) inside
//      GRAPPA_TIME_REGION(total_time). After trial 0 only, total_time is
//      reset and the rank sum over forall(g, ...) is written to std::cerr.
//   5. Logs total_time, then sums the ranks over masters(g) and logs the
//      total.
//
// tuple_time, construction_time, total_time, count and total_rank are not
// declared in this block; they come from the enclosing file.
//
// NOTE(review): the first-trial rank sum iterates forall(g, ...) while the
// final one iterates forall(masters(g), ...) -- confirm the difference is
// intended.
// NOTE(review): this definition is flattened onto one line here, so the
// `// don't count the first one` comment as written runs to the end of the
// line. The closing brace of main() is also not visible in this chunk.
// Restore the original line breaks and check the function tail.
int main(int argc, char* argv[]) { init(&argc, &argv); run([]{ double t; TupleGraph tg; GRAPPA_TIME_REGION(tuple_time) { if (FLAGS_path.empty()) { int64_t NE = (1L << FLAGS_scale) * FLAGS_edgefactor; tg = TupleGraph::Kronecker(FLAGS_scale, NE, 111, 222); } else { LOG(INFO) << "loading " << FLAGS_path; tg = TupleGraph::Load(FLAGS_path, FLAGS_format); } } LOG(INFO) << tuple_time; LOG(INFO) << "constructing graph"; t = walltime(); auto g = G::create(tg); construction_time = walltime()-t; LOG(INFO) << construction_time; count = 0; forall(masters(g), [](G::Vertex& v){ count += v.n_out; }); CHECK_EQ(count, g->ne); Metrics::start_tracing(); for (int i = 0; i < FLAGS_trials; i++) { if (FLAGS_trials > 1) LOG(INFO) << "trial " << i; forall(g, [](G::Vertex& v){ v->rank = 1.0; }); GRAPPA_TIME_REGION(total_time) { activate_all(g); GraphlabEngine<G,PagerankVertexProgram>::run_sync(g); } if (i == 0) { total_time.reset(); // don't count the first one total_rank = 0; forall(g, [](G::Vertex& v){ total_rank += v->rank; }); std::cerr << "total_rank: " << total_rank << "\n"; } } Metrics::stop_tracing(); LOG(INFO) << total_time; total_rank = 0; forall(masters(g), [](G::Vertex& v){ total_rank += v->rank; }); LOG(INFO) << "total_rank: " << total_rank << "\n"; });
/*
 * Reduce a GBLA block matrix and return the rank found for its D part.
 *
 * Steps, in order:
 *   1. eliminate A against B (elim_fl_A_sparse_dense_block); skipped when
 *      mat->A has no blocks;
 *   2. eliminate C using B and D (elim_fl_C_sparse_dense_block); skipped
 *      when mat->C has no blocks;
 *   3. copy D into the dense row matrix mat->DR and eliminate it;
 *   4. if mat->sl > 0 and B still has blocks, copy B into mat->BR.
 *
 * mat       matrix holding the parts A, B, C, D; DR (and optionally BR)
 *           are assigned here
 * verbose   > 2 prints a banner and one timing line per step,
 *           > 3 additionally calls print_mem_usage()
 * nthreads  forwarded to the elimination and copy routines; the value 1
 *           selects elim_fl_dense_D_completely() for the D step
 *
 * Returns the rank reported for D. If reducing A or C fails, an error
 * message is printed and 1 is returned (callers cannot tell this apart
 * from a rank of 1).
 */
ri_t reduce_gbla_matrix(mat_t * mat, int verbose, int nthreads)
{
  const int timed   = (verbose > 2);  /* print banner and timings  */
  const int memstat = (verbose > 3);  /* also report memory usage  */
  struct timeval step_begin;          /* start of the current step */
  struct timeval run_begin;           /* start of the whole run    */
  nelts_t rank_D = 0;

  /* --- step 1: A^-1 * B --- */
  if (timed) {
    gettimeofday(&run_begin, NULL);
    printf("---------------------------------------------------------------------------\n");
    printf("GBLA Matrix Reduction\n");
    printf("---------------------------------------------------------------------------\n");
    gettimeofday(&step_begin, NULL);
    printf("%-38s","Reducing A ...");
    fflush(stdout);
  }
  if (mat->A->blocks != NULL &&
      elim_fl_A_sparse_dense_block(&(mat->A), mat->B, mat->mod, nthreads)) {
    printf("Error while reducing A.\n");
    return 1;
  }
  if (timed) {
    /* walltime() result is scaled by 1e6 to get seconds
     * (presumably it reports microseconds -- confirm) */
    printf("%9.3f sec\n", walltime(step_begin) / (1000000));
  }
  if (memstat) {
    print_mem_usage();
  }

  /* --- step 2: bring C to zero (Faugère & Lachartre) --- */
  if (timed) {
    gettimeofday(&step_begin, NULL);
    printf("%-38s","Reducing C ...");
    fflush(stdout);
  }
  if (mat->C->blocks != NULL &&
      elim_fl_C_sparse_dense_block(mat->B, &(mat->C), mat->D, mat->mod, nthreads)) {
    printf("Error while reducing C.\n");
    return 1;
  }
  if (timed) {
    printf("%9.3f sec\n", walltime(step_begin) / (1000000));
  }
  if (memstat) {
    print_mem_usage();
  }

  /* --- step 3: D as dense wide (re_l_t) rows, then eliminate --- */
  mat->DR = copy_block_to_dense_matrix(&(mat->D), nthreads, 1);
  mat->DR->mod = mat->mod;
#if 0
  /* debug dump of DR, disabled */
  printf("number of rows of DR %u\n", mat->DR->nrows);
  for (int ii=0; ii<mat->DR->nrows; ++ii) {
    printf("ROW %d\n",ii);
    if (mat->DR->row[ii]->init_val == NULL)
      printf("NULL!");
    else {
      printf("%u || ", mat->DR->row[ii]->lead);
      for (int jj=0; jj<mat->DR->ncols; ++jj)
#if defined(GBLA_USE_UINT16) || defined(GBLA_USE_UINT32)
        printf("%u (%u) ", mat->DR->row[ii]->init_val[jj], jj+mat->ncl);
#else
        printf("%.0f ", mat->DR->row[ii]->init_val[jj]);
#endif
    }
    printf("\n");
  }
#endif

  if (timed) {
    gettimeofday(&step_begin, NULL);
    printf("%-38s","Reducing D ...");
    fflush(stdout);
  }
  if (mat->DR->nrows > 0) {
    if (nthreads != 1) {
      nelts_t back;

      rank_D = elim_fl_dense_D(mat->DR, nthreads);
      /* walk the pivot rows from index rank-2 down to 0 and fully
       * reduce each one */
      for (back = 1; back < mat->DR->rank; ++back) {
        copy_piv_to_val(mat->DR, mat->DR->rank-back-1);
        completely_reduce_D(mat->DR, mat->DR->rank-back-1);
      }
    } else {
      /* single-threaded: one call performs the complete reduction */
      rank_D = elim_fl_dense_D_completely(mat->DR, nthreads);
    }
  }
  if (timed) {
    /* columns: time, rank of D, rows minus rank, total rows */
    printf("%9.3f sec %5d %5d %5d\n",
        walltime(step_begin) / (1000000),
        rank_D, mat->DR->nrows - rank_D, mat->DR->nrows);
  }

  /* --- step 4: when simplifying, keep B in dense row format too --- */
  if (mat->sl > 0 && mat->B->blocks != NULL) {
    mat->BR = copy_block_to_dense_matrix(&(mat->B), nthreads, 0);
    mat->BR->mod = mat->mod;
  }

  if (memstat) {
    print_mem_usage();
  }
  if (timed) {
    printf("---------------------------------------------------------------------------\n");
    printf("%-38s","Reduction completed ...");
    fflush(stdout);
    printf("%9.3f sec\n", walltime(run_begin) / (1000000));
    if (memstat) {
      print_mem_usage();
    }
  }

  return rank_D;
}
// Runs the vertex program VertexProg over the graph `_g` and writes a
// per-vertex dump to a file after every iteration.
//
// Steps as written:
//   1. Stores `_g` into `g` from within call_on_all_cores, then zeroes `ct`.
//   2. For every vertex: allocates a new VertexProg into v->prog and
//      increments `ct` when prog(v).gather_edges(v) is true.
//   3. If ct > 0: for every edge e of every vertex v, computes
//      prog(v).gather(v, e) and hands the result to post_delta() on the
//      vertex addressed by e.ga.
//   4. Loops while `active` (sampled from V::total_active before the loop
//      and at the end of each pass) is > 0 and `iteration` is below
//      FLAGS_max_iterations; the loop body is wrapped in
//      GRAPPA_TIME_REGION(iteration_time). Each pass:
//        - apply:   every active vertex is deactivated, p.apply(v, p.cache)
//                   is called, and v->active_minor_step is set from
//                   p.scatter_edges(v);
//        - scatter: for each vertex with active_minor_step set, the flag is
//                   cleared and _do_scatter(prog_copy, e,
//                   &VertexProg::scatter) is invoked for every adjacent edge
//                   using a copy of the vertex program;
//        - output:  opens a file named
//                   "<OutputPath>-<pid>-<mycore()>-<iteration>" and calls
//                   exit(1) if it cannot be opened. For each vertex it
//                   writes the vertex id, then label_count[0 ..
//                   Number_of_groups) from the program cache, then v->label,
//                   and finally closes the file;
//        - logs the pass's elapsed walltime() via VLOG(1).
//   5. Deletes the VertexProg stored in every v->prog.
//
// NOTE(review): `myFile` is declared symmetric_static and is re-constructed
// with placement new on every pass; no destructor call is visible between
// passes. Confirm this is intended.
// NOTE(review): the on_all_cores wrappers around open/close are commented
// out, so as written the open and close execute only where run_sync itself
// runs, while the forall body writes to `myFile` from wherever forall
// executes it. Verify against the runtime's semantics.
// NOTE(review): `path` is assigned but not used in the visible code (the
// file name is built from `OutputPath` directly). The log text
// "end writig file" looks like a typo for "writing".
// NOTE(review): this definition is flattened onto one line here, so each
// `//` comment as written runs to the end of the line. Restore the
// original line breaks.
static void run_sync(GlobalAddress<Graph<V,E>> _g) { call_on_all_cores([=]{ g = _g; }); ct = 0; // initialize GraphlabVertexProgram forall(g, [=](Vertex& v){ v->prog = new VertexProg(v); if (prog(v).gather_edges(v)) ct++; }); if (ct > 0) { forall(g, [=](Vertex& v){ forall<async>(adj(g,v), [=,&v](Edge& e){ // gather auto delta = prog(v).gather(v, e); call<async>(e.ga, [=](Vertex& ve){ prog(ve).post_delta(delta); }); }); }); } int iteration = 0; size_t active = V::total_active; while ( active > 0 && iteration < FLAGS_max_iterations ) GRAPPA_TIME_REGION(iteration_time) { VLOG(1) << "iteration " << std::setw(3) << iteration; VLOG(1) << " active: " << active; double t = walltime(); forall(g, [=](Vertex& v){ if (!v->active) return; v->deactivate(); auto& p = prog(v); // apply p.apply(v, p.cache); v->active_minor_step = p.scatter_edges(v); }); forall(g, [=](Vertex& v){ if (v->active_minor_step) { v->active_minor_step = false; auto prog_copy = prog(v); // scatter forall<async>(adj(g,v), [=](Edge& e){ _do_scatter(prog_copy, e, &VertexProg::scatter); }); } }); { symmetric_static std::ofstream myFile; //std::ofstream myFile; int pid = getpid(); LOG(INFO) << "start writing file"; std::string path = NaiveGraphlabEngine<G,VertexProg>::OutputPath; //on_all_cores( [pid, iteration, path] { std::ostringstream oss; oss << OutputPath << "-" << pid << "-" << mycore() << "-" << iteration; new (&myFile) std::ofstream(oss.str()); if (!myFile.is_open()) exit(1); //}); forall(g, [](VertexID i, Vertex& v){ // LOG(INFO) << "id: " << i << " label: " << v->label; myFile << i << " "; for (int j = 0; j < NaiveGraphlabEngine<G,VertexProg>::Number_of_groups; j++) { myFile << prog(v).cache.label_count[j] << " "; } myFile << v->label << "\n"; }); //on_all_cores( [] { myFile.close(); //}); LOG(INFO) << "end writig file"; } iteration++; VLOG(1) << " time: " << walltime()-t; active = V::total_active; } forall(g, [](Vertex& v){ delete static_cast<VertexProg*>(v->prog); }); }