/*
 * Generate all children of the parent
 *
 * details depend on tree type, node type and shape function
 *
 */
void genChildren(Node *parent, void *child_buf, Node *child, StealStack *ss) {
  int parentHeight = parent->height;
  int numChildren, childType;

  ss->maxTreeDepth = max(ss->maxTreeDepth, parent->height);

  numChildren = uts_numChildren(parent);
  childType = uts_childType(parent);

  // record number of children in parent
  parent->numChildren = numChildren;

  // construct children and push onto stack
  if (numChildren > 0) {
    int i, j;
    child->type = childType;
    child->height = parentHeight + 1;

    for (i = 0; i < numChildren; i++) {
      for (j = 0; j < computeGranularity; j++) {
        // TBD: add parent height to spawn
        // computeGranularity controls number of rng_spawn calls per node
        rng_spawn(parent->state.state, child->state.state, i);
      }
      ss_put_work(ss, child_buf);
    }
  } else {
    ss->nLeaves++;
  }
}
unsigned long long parTreeSearch(int depth, Node *parent, int numChildren) {
  Node *n = (Node *)malloc(numChildren * sizeof(Node));
  Node *nodePtr;
  int i, j;
  unsigned long long subtreesize = 1;
  unsigned long long *partialCount =
      (unsigned long long *)malloc(numChildren * sizeof(unsigned long long));

  // Recurse on the children
  for (i = 0; i < numChildren; i++) {
    nodePtr = &n[i];
    nodePtr->height = parent->height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(parent->state.state, nodePtr->state.state, i);
    }
    nodePtr->numChildren = uts_numChildren(nodePtr);

    hclib_pragma_marker("omp",
                        "task untied firstprivate(i, nodePtr) shared(partialCount)",
                        "pragma221_omp_task");
    partialCount[i] = parTreeSearch(depth + 1, nodePtr, nodePtr->numChildren);
  }

  hclib_pragma_marker("omp", "taskwait", "pragma225_omp_taskwait");

  for (i = 0; i < numChildren; i++) {
    subtreesize += partialCount[i];
  }

  free(n);
  free(partialCount);

  return subtreesize;
}
// Notes:
// - Each task receives a distinct copy of parent
// - Copy of child is shallow; be careful with the `state` member
static long visit(node_t parent) {
  node_t child;
  uint64_t *child_descendants = calloc(parent.num_children, sizeof(uint64_t));
  CILK_C_REDUCER_OPADD(num_descendants, ulong, 0);
  uint64_t tmp;

  // Spawn children, if any
  for (int i = 0; i < parent.num_children; i++) {
    child.height = parent.height + 1;
    for (int j = 0; j < num_samples; j++) {
      rng_spawn(parent.state.state, child.state.state, i);
    }
    child.num_children = calc_num_children(&child);
    child_descendants[i] = _Cilk_spawn visit(child);
  }
  _Cilk_sync;

  // Accumulate descendant counts with an opadd reducer
  CILK_C_REGISTER_REDUCER(num_descendants);
  _Cilk_for(int i = 0; i < parent.num_children; i++) {
    REDUCER_VIEW(num_descendants) += child_descendants[i];
  }
  tmp = 1 + REDUCER_VIEW(num_descendants);
  CILK_C_UNREGISTER_REDUCER(num_descendants);

  free(child_descendants);
  return tmp;
}
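/*
 * Hedged usage sketch (an assumption, not part of the original file): one way
 * to kick off the Cilk traversal above from a pre-seeded root. `rng_init`
 * follows the standard UTS RNG interface, and `root_seed` is a hypothetical
 * parameter; adjust to match the node_t definition in this codebase.
 */
static long search_tree(int root_seed) {
  node_t root;
  root.height = 0;
  rng_init(root.state.state, root_seed);         // assumed UTS helper: seed the root RNG state
  root.num_children = calc_num_children(&root);  // shape function gives the root fan-out
  return visit(root);                            // recursion spawns internally via _Cilk_spawn
}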
unsigned long long parTreeSearch(int depth, Node *parent, int numChildren) {
  Node n[numChildren], *nodePtr;
  int i, j;
  unsigned long long subtreesize = 1, partialCount[numChildren];

  // Recurse on the children
  for (i = 0; i < numChildren; i++) {
    nodePtr = &n[i];
    nodePtr->height = parent->height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(parent->state.state, nodePtr->state.state, i);
    }
    nodePtr->numChildren = uts_numChildren(nodePtr);

    #pragma omp task untied firstprivate(i, nodePtr) shared(partialCount)
    partialCount[i] = parTreeSearch(depth + 1, nodePtr, nodePtr->numChildren);
  }

  #pragma omp taskwait

  for (i = 0; i < numChildren; i++) {
    subtreesize += partialCount[i];
  }
  return subtreesize;
}
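/*
 * Hedged usage sketch (an assumption, not part of the original file): the task
 * version above needs an enclosing parallel region with a single initial call,
 * so that the other threads in the team are available to execute the generated
 * tasks. `uts_initRoot` and the global `type` follow the standard UTS driver
 * and are assumptions about this codebase.
 */
unsigned long long searchTree(void) {
  Node root;
  unsigned long long treeSize = 0;

  uts_initRoot(&root, type);  // assumed UTS helper: seed the root via the SHA-1 RNG

  #pragma omp parallel
  {
    #pragma omp single
    treeSize = parTreeSearch(0, &root, uts_numChildren(&root));
    // threads waiting at the single's barrier pick up the spawned tasks
  }
  return treeSize;
}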
static counter_t _uts_action(void *args, size_t size) {
  int i, j;
  struct thread_data *my_data;
  struct thread_data temp, input;

  my_data = (struct thread_data *)args;

  Node n[my_data->numChildren], *nodePtr;
  counter_t subtreesize = 1, partialCount[my_data->numChildren];

  temp.depth = my_data->depth;
  memcpy(&temp.parent, &my_data->parent, sizeof(Node));
  temp.numChildren = my_data->numChildren;

  //hpx_lco_sema_p(mutex);
  //printf("D: %d; child: %d; spawns:%.0f\n", temp.depth, temp.numChildren, spawns_counter++);
  //hpx_lco_sema_v_sync(mutex);

  /*
  printf("\n[Node] height = %d; numChildren = %d\n",
         temp.parent.height, temp.parent.numChildren);
  */

  hpx_addr_t theThread = HPX_HERE;
  hpx_addr_t done = hpx_lco_future_new(sizeof(uint64_t));

  // Recurse on the children
  for (i = 0; i < temp.numChildren; i++) {
    nodePtr = &n[i];
    nodePtr->height = temp.parent.height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(temp.parent.state.state, nodePtr->state.state, i);
    }
    nodePtr->numChildren = uts_numChildren(nodePtr);

    input.depth = temp.depth + 1;
    memcpy(&input.parent, nodePtr, sizeof(Node));
    input.numChildren = nodePtr->numChildren;

    //partialCount[i] = parTreeSearch(depth+1, nodePtr, nodePtr->numChildren);
    hpx_call_sync(theThread, _uts, &partialCount[i], sizeof(partialCount[i]),
                  &input, sizeof(input));
  }

  for (i = 0; i < temp.numChildren; i++) {
    subtreesize += partialCount[i];
  }

  HPX_THREAD_CONTINUE(subtreesize);
  return HPX_SUCCESS;
}
// Notes:
// - Each task receives distinct copy of parent
// - Copy of child is shallow, be careful with `state` member
static long visit(node_t *parent, int num_children) {
  uint64_t num_descendants = 1;

#ifdef BIG_STACKS
  uint64_t child_descendants[num_children];
  node_t child_nodes[num_children];
#else
  uint64_t *child_descendants = NULL;
  node_t *child_nodes = NULL;
  if (num_children > 0) {
    child_descendants = calloc(num_children, sizeof(uint64_t));
    child_nodes = malloc(sizeof(node_t) * num_children);
  }
#endif

  // Spawn children, if any
  for (int i = 0; i < num_children; i++) {
    node_t *child = &child_nodes[i];
    child->height = parent->height + 1;
    for (int j = 0; j < num_samples; j++) {
      rng_spawn(parent->state.state, child->state.state, i);
    }
    child->num_children = calc_num_children(child);

    #pragma omp task untied firstprivate(i, child) shared(child_descendants)
    child_descendants[i] = visit(child, child->num_children);
  }

  #pragma omp taskwait

  // #pragma omp parallel for reduction(+:num_descendants)
  for (int i = 0; i < num_children; i++) {
    num_descendants += child_descendants[i];
  }

#ifndef BIG_STACKS
  if (num_children > 0) {
    free(child_descendants);
    free(child_nodes);
  }
#endif

  return num_descendants;
}
// Notes:
// - Each task receives distinct copy of parent
// - Copy of child is shallow, be careful with `state` member
static aligned_t visit(void *args_) {
  node_t *parent = (node_t *)args_;
  int parent_height = parent->height;
  int num_children = parent->num_children;
  aligned_t expect = parent->expect;
  aligned_t num_descendants[num_children];
  aligned_t sum_descendants = 1;

  if (num_children != 0) {
    node_t child __attribute__((aligned(8)));
    aligned_t donec = 0;

    // Spawn children, if any
    child.height = parent_height + 1;
    child.dc = &donec;
    child.expect = num_children;
    qthread_empty(&donec);

    for (int i = 0; i < num_children; i++) {
      child.acc = &num_descendants[i];
      for (int j = 0; j < num_samples; j++) {
        rng_spawn(parent->state.state, child.state.state, i);
      }
      child.num_children = calc_num_children(&child);
      qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL);
    }

    // Wait for the children to finish: the last child to report fills donec
    qthread_readFF(NULL, &donec);

    // Accumulate descendant counts
    for (int i = 0; i < num_children; i++) {
      sum_descendants += num_descendants[i];
    }
  }

  *parent->acc = sum_descendants;
  if (qthread_incr(parent->dc, 1) + 1 == expect) {
    qthread_fill(parent->dc);
  }

  return 0;
}
unsigned long long serTreeSearch(int depth, Node *parent, int numChildren) {
  unsigned long long subtreesize = 1, partialCount[numChildren];
  Node n[numChildren];
  int i, j;

  // Recurse on the children
  for (i = 0; i < numChildren; i++) {
    n[i].height = parent->height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(parent->state.state, n[i].state.state, i);
    }
    partialCount[i] = serTreeSearch(depth + 1, &n[i], uts_numChildren(&n[i]));
  }

  // computing total size
  for (i = 0; i < numChildren; i++)
    subtreesize += partialCount[i];

  return subtreesize;
}
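/*
 * Hedged verification sketch (an assumption, not part of the original file):
 * on small trees the serial routine above can cross-check a parallel count.
 * `uts_initRoot` and the global `type` follow the standard UTS driver and are
 * assumptions about this codebase.
 */
static int checkTreeSize(unsigned long long parallelSize) {
  Node root;
  uts_initRoot(&root, type);  // assumed UTS helper: seed the root via the SHA-1 RNG
  unsigned long long serialSize =
      serTreeSearch(0, &root, uts_numChildren(&root));
  return serialSize == parallelSize;  // 1 if the two counts agree
}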
counter_t parTreeSearch(int depth, Node *parent, int numChildren) {
  //JK
  //Node n[numChildren], *nodePtr;
  Node *n, *nodePtr;
  int i, j;
  counter_t subtreesize = 1;
  counter_t *partialCount;
  //counter_t partialCount[numChildren];

  n = (Node *)malloc(numChildren * sizeof(Node));
  partialCount = (counter_t *)malloc(numChildren * sizeof(counter_t));

  // Recurse on the children
  for (i = 0; i < numChildren; i++) {
    nodePtr = &n[i];
    nodePtr->height = parent->height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(parent->state.state, nodePtr->state.state, i);
    }
    nodePtr->numChildren = uts_numChildren(nodePtr);

    #pragma omp task firstprivate(i, nodePtr) shared(partialCount) untied
    partialCount[i] = parTreeSearch(depth + 1, nodePtr, nodePtr->numChildren);
  }

  #pragma omp taskwait

  for (i = 0; i < numChildren; i++) {
    subtreesize += partialCount[i];
  }

  free(n);
  free(partialCount);

  return subtreesize;
}
TASK_2(Result, parTreeSearch, int, depth, Node *, parent) {
  int numChildren, childType;
  counter_t parentHeight = parent->height;

  Result r = { depth, 1, 0 };

  numChildren = uts_numChildren(parent);
  childType = uts_childType(parent);

  // record number of children in parent
  parent->numChildren = numChildren;

  // Recurse on the children
  if (numChildren > 0) {
    int i, j;
    for (i = 0; i < numChildren; i++) {
      Node *child = (Node *)alloca(sizeof(Node));
      child->type = childType;
      child->height = parentHeight + 1;
      child->numChildren = -1;  // not yet determined
      for (j = 0; j < computeGranularity; j++) {
        rng_spawn(parent->state.state, child->state.state, i);
      }
      SPAWN(parTreeSearch, depth + 1, child);
    }

    /* Wait a bit */
    struct timespec tim = (struct timespec){0, 100L * numChildren};
    nanosleep(&tim, NULL);

    for (i = 0; i < numChildren; i++) {
      Result c = SYNC(parTreeSearch);
      if (c.maxdepth > r.maxdepth) r.maxdepth = c.maxdepth;
      r.size += c.size;
      r.leaves += c.leaves;
    }
  } else {
    r.leaves = 1;
  }

  return r;
}
counter_t parTreeSearch(int depth, Node *parent, int numChildren) {
  Node n[numChildren], *nodePtr;
  int i, j;
  counter_t subtreesize = 1, partialCount[numChildren];

  //printf("[p] *** depth = %d ***\n", depth);
  //printf("[p] *** height = %d ***\n", parent->height);
  //printf("[p] *** numChildren = %d ***\n", parent->numChildren);

  // Recurse on the children
  for (i = 0; i < numChildren; i++) {
    nodePtr = &n[i];
    nodePtr->height = parent->height + 1;
    // The following line is the work (one or more SHA-1 ops)
    for (j = 0; j < computeGranularity; j++) {
      rng_spawn(parent->state.state, nodePtr->state.state, i);
    }
    nodePtr->numChildren = uts_numChildren(nodePtr);

    //#pragma omp task firstprivate(i, nodePtr) shared(partialCount) untied
    partialCount[i] = parTreeSearch(depth + 1, nodePtr, nodePtr->numChildren);
  }

  //#pragma omp taskwait

  for (i = 0; i < numChildren; i++) {
    subtreesize += partialCount[i];
  }

  return subtreesize;
}
/*
 * Generate all children of the parent
 *
 * details depend on tree type, node type and shape function
 *
 */
void genChildren(Node *parent, Node *child) {
  int parentHeight = parent->height;
  int numChildren, childType;

#ifdef THREAD_METADATA
  t_metadata[omp_get_thread_num()].ntasks += 1;
#endif
  thread_info[hclib::get_current_worker()].n_nodes++;

  numChildren = uts_numChildren(parent);
  childType = uts_childType(parent);

  // record number of children in parent
  parent->numChildren = numChildren;

  // construct children and push onto stack
  if (numChildren > 0) {
    int i, j;
    child->type = childType;
    child->height = parentHeight + 1;

#ifdef UTS_STAT
    if (stats) {
      child->pp = parent;  // pointer to parent
    }
#endif

    const unsigned char *parent_state = parent->state.state;
    unsigned char *child_state = child->state.state;

    for (i = 0; i < numChildren; i++) {
      for (j = 0; j < computeGranularity; j++) {
        // TBD: add parent height to spawn
        // computeGranularity controls number of rng_spawn calls per node
        rng_spawn(parent_state, child_state, i);
      }

      // The freshly generated child becomes the parent of the next level
      Node parent = *child;

      int made_available_for_stealing = 0;
      if (hclib::get_current_worker() == 0 &&
          n_buffered_steals < N_BUFFERED_STEALS) {
        hclib::shmem_set_lock(&steal_buffer_locks[pe]);
        if (n_buffered_steals < N_BUFFERED_STEALS) {
          steal_buffer[n_buffered_steals++] = parent;
          made_available_for_stealing = 1;
        }
        hclib::shmem_clear_lock(&steal_buffer_locks[pe]);
      }

      if (!made_available_for_stealing) {
        if (parent.height < 9) {
          hclib::async([parent] {
            Node child;
            initNode(&child);
            Node tmp = parent;
            genChildren(&tmp, &child);
          });
        } else {
          Node child;
          initNode(&child);
          genChildren(&parent, &child);
        }
      }
    }
  } else {
    thread_info[hclib::get_current_worker()].n_leaves++;
  }
}