/*
 * Parallel tree search driver: repeatedly pulls a node of work from the
 * steal stack and expands it until no work remains.
 *
 * Two build configurations share this loop:
 *  - USING_GTC: node buffers are GTC task objects (created from uts_tclass);
 *    `parent`/`child` point at the task bodies inside those buffers.
 *  - otherwise: node buffers are plain heap allocations, and the buffer
 *    pointers alias the node pointers directly.
 *
 * @param ss  per-thread steal stack supplying work items
 */
void parTreeSearch(StealStack *ss) {
    Node *parent;
    Node *child;
    void *parent_buf, *child_buf;

#ifdef USING_GTC
    // GTC build: allocate task-class buffers and expose their Node bodies.
    parent_buf = (void*) gtc_task_create_ofclass(sizeof(Node), uts_tclass);
    parent = gtc_task_body((task_t*)parent_buf);
    child_buf = (void*) gtc_task_create_ofclass(sizeof(Node), uts_tclass);
    child = gtc_task_body((task_t*)child_buf);
#else
    // Plain build: buffer pointer and node pointer are the same allocation.
    // NOTE(review): malloc results are not checked here — assumed to succeed.
    child = malloc(sizeof(Node));
    parent = malloc(sizeof(Node));
    parent_buf = parent;
    child_buf = child;
#endif

    // Main work loop: ss_get_work fills parent_buf with the next node (or
    // reports that the search is globally finished).
    while (ss_get_work(ss, parent_buf) == STATUS_HAVEWORK) {
        genChildren(parent, child_buf, child, ss);
#if DEBUG_PROGRESS > 0
        // Debugging: Witness progress...
        if (ss->nNodes % DEBUG_PROGRESS == 0)
            printf("Thread %3d: Progress is %d nodes\n", ss_get_thread_num(), ss->nNodes);
#endif
    }

#ifdef USING_GTC
    // Release the task buffers through the GTC API that created them.
    gtc_task_destroy(parent_buf);
    gtc_task_destroy(child_buf);
#else
    free(parent);
    free(child);
#endif
}
/*
 * parallel search of UTS trees using work stealing
 *
 * Note: tree size is measured by the number of
 *       push operations
 *
 * @param ss  per-thread steal stack supplying work items
 *
 * Repeatedly fetches a node into `parent` and expands it via genChildren
 * until ss_get_work reports no work left anywhere.
 */
void parTreeSearch(StealStack *ss) {
  Node* parent;
  Node* child;

  parent = malloc(sizeof(Node));
  child  = malloc(sizeof(Node));
  // Fail fast on allocation failure; otherwise ss_get_work/genChildren
  // would dereference NULL and crash with a far less obvious diagnostic.
  if (parent == NULL || child == NULL) {
    fprintf(stderr, "parTreeSearch: out of memory allocating node buffers\n");
    exit(EXIT_FAILURE);
  }

  while (ss_get_work(ss, parent) == STATUS_HAVEWORK) {
    genChildren(parent, child, ss);
    // Debugging: Uncomment to witness progress...
    //if (ss->nNodes % 10000 == 0)
    //  printf("Thread %d: Progress is %d nodes\n", ss_get_thread_num(), ss->nNodes);
  }

  free(child);
  free(parent);
}
/* Main() function for: Sequential, OpenMP, UPC, and Shmem
 *
 * Notes on execution model:
 *   - under openMP, global vars are all shared
 *   - under UPC, global vars are private unless explicitly shared
 *   - UPC is SPMD starting with main, OpenMP goes SPMD after
 *     parsing parameters
 */
int main(int argc, char *argv[]) {
#ifdef THREAD_METADATA
    // Zero per-thread task counters before any worker runs.
    memset(t_metadata, 0x00, MAX_OMP_THREADS * sizeof(thread_metadata));
#endif
    // Zero per-thread node/leaf counters and the per-PE steal-buffer locks.
    memset(thread_info, 0x00, MAX_OMP_THREADS * sizeof(per_thread_info));
    memset(steal_buffer_locks, 0x00, MAX_SHMEM_THREADS * sizeof(long));

    // Everything below runs inside the HClib runtime on every PE (SPMD).
    hclib::launch([argc, argv] {
        pe = hclib::pe_for_locale(hclib::shmem_my_pe());
        npes = hclib::shmem_n_pes();

        /* determine benchmark parameters (all PEs) */
        uts_parseParams(argc, argv);

#ifdef UTS_STAT
        if (stats) {
            initHist();
        }
#endif

        double t1, t2, et;

        /* show parameter settings */
        if (pe == 0) {
            uts_printParams();
        }

        Node root;
        initRootNode(&root, type);

        // Synchronize all PEs before starting the timed region.
        hclib::shmem_barrier_all();

        /* time parallel search */
        t1 = uts_wctime();

        int n_omp_threads;

        /********** SPMD Parallel Region **********/
        int first = 1;

        n_omp_threads = hclib::num_workers();
        assert(n_omp_threads <= MAX_OMP_THREADS);

        Node child;
retry:
        // Each pass through `retry` expands one tree rooted at `root`:
        // the very first pass expands the global root (PE 0 only); later
        // passes expand nodes obtained from the local steal buffer or
        // from a remote PE.
        initNode(&child);

        hclib::finish([&first, &root, &child] {
            if (first) {
                // Only PE 0 owns the global root on the first pass.
                if (pe == 0) {
                    genChildren(&root, &child);
                }
            } else {
                genChildren(&root, &child);
            }
        });
        first = 0;

        // Drain the local steal buffer first (double-checked under the
        // per-PE SHMEM lock, since remote PEs may be stealing from it).
        if (n_buffered_steals > 0) {
            hclib::shmem_set_lock(&steal_buffer_locks[pe]);
            if (n_buffered_steals > 0) {
                n_buffered_steals--;
                memcpy(&root, &steal_buffer[n_buffered_steals], sizeof(root));
                hclib::shmem_clear_lock(&steal_buffer_locks[pe]);
                goto retry;
            } else {
                hclib::shmem_clear_lock(&steal_buffer_locks[pe]);
            }
        }

        // Local buffer empty: try to steal a node from another PE.
        const int got_more_work = remote_steal(&root);
        if (got_more_work == 1) {
            goto retry;
        }

        hclib::shmem_barrier_all();

        t2 = uts_wctime();
        et = t2 - t1;

        // Reduce per-worker counters into this PE's totals.
        int i;
        for (i = 0; i < MAX_OMP_THREADS; i++) {
            n_nodes += thread_info[i].n_nodes;
            n_leaves += thread_info[i].n_leaves;
        }

        hclib::shmem_barrier_all();

        // Accumulate every PE's totals onto PE 0.
        // NOTE(review): assumes shmem_int_add(target, value, pe) semantics,
        // i.e. this atomically adds the local total to PE 0's counter.
        if (pe != 0) {
            hclib::shmem_int_add(&n_nodes, n_nodes, 0);
            hclib::shmem_int_add(&n_leaves, n_leaves, 0);
        }

        hclib::shmem_barrier_all();

        if (pe == 0) {
            showStats(et);
        }
        /********** End Parallel Region **********/

#ifdef THREAD_METADATA
        // Print per-thread task counts, one PE at a time (barrier between
        // PEs keeps the output ordered).
        int p;
        for (p = 0; p < npes; p++) {
            if (p == pe) {
                printf("\n");
                int i;
                for (i = 0; i < n_omp_threads; i++) {
                    printf("PE %d, thread %d: %lu tasks\n", p, i, t_metadata[i].ntasks);
                }
            }
            hclib::shmem_barrier_all();
        }
#endif
    });

    return 0;
}
/* * Generate all children of the parent * * details depend on tree type, node type and shape function * */ void genChildren(Node * parent, Node * child) { int parentHeight = parent->height; int numChildren, childType; #ifdef THREAD_METADATA t_metadata[omp_get_thread_num()].ntasks += 1; #endif thread_info[hclib::get_current_worker()].n_nodes++; numChildren = uts_numChildren(parent); childType = uts_childType(parent); // record number of children in parent parent->numChildren = numChildren; // construct children and push onto stack if (numChildren > 0) { int i, j; child->type = childType; child->height = parentHeight + 1; #ifdef UTS_STAT if (stats) { child->pp = parent; // pointer to parent } #endif const unsigned char * parent_state = parent->state.state; unsigned char * child_state = child->state.state; for (i = 0; i < numChildren; i++) { for (j = 0; j < computeGranularity; j++) { // TBD: add parent height to spawn // computeGranularity controls number of rng_spawn calls per node rng_spawn(parent_state, child_state, i); } Node parent = *child; int made_available_for_stealing = 0; if (hclib::get_current_worker() == 0 && n_buffered_steals < N_BUFFERED_STEALS) { hclib::shmem_set_lock(&steal_buffer_locks[pe]); if (n_buffered_steals < N_BUFFERED_STEALS) { steal_buffer[n_buffered_steals++] = parent; made_available_for_stealing = 1; } hclib::shmem_clear_lock(&steal_buffer_locks[pe]); } if (!made_available_for_stealing) { if (parent.height < 9) { hclib::async([parent] { Node child; initNode(&child); Node tmp = parent; genChildren(&tmp, &child); }); } else { Node child; initNode(&child); genChildren(&parent, &child); } } } } else { thread_info[hclib::get_current_worker()].n_leaves++; } }