void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;
    if (isMainThread())
    {
        tree.bottom_up_traversal(                  // do a bottom-up traversal (M2M pass)
            if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
                p2m_function(localContext),        // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)         // else shift the coefficients of all children to the center of the inner node
            )
        )(tree.root());

        tree.forall_well_separated_pairs(                // do a WSPD traversal (M2L pass, direct evaluation)
            pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
            p2p_function(localContext),                  // direct evaluation
            p2p_function(localContext)                   // direct evaluation
        )(tree.root());

        tree.top_down_traversal(                   // top-down traversal (L2L pass)
            if_then_else(tree.is_leaf_condition(), // if the node is a leaf
                do_nothing(),                      // then do nothing; we will deal with this case later
                l2l_function(localContext)         // else shift the node's local coeffs to the children
            )
        )(tree.root());                            // start at the root

        // evaluate all leaves and store the forces in the thread's array
        for_loop(nodePointPartition,             // loop over points
            func_comp(                           // composition of two statements
                l2p_function(localContext),      // evaluate the forces due to the local expansion in the corresponding leaf
                collect_force_function           // collect the forces of all threads with the following options:
                <
                    COLLECT_REPULSIVE_FACTOR |   // multiply by the repulsive factor stored in the global options
                    COLLECT_TREE_2_GRAPH_ORDER | // the thread's data is stored in quadtree leaf order; transform it into graph order
                    COLLECT_ZERO_THREAD_ARRAY    // reset the thread's array
                >(localContext)
            )
        );
    }
}
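// The functor pipeline above is the textbook FMM pass order: P2M at the leaves,
// M2M up the tree, M2L plus P2P across the well-separated pairs, L2L down the
// tree, and finally L2P at the points. The following is a minimal sketch of that
// control flow on a plain pointer-based quadtree; the Node type and the pass
// stubs are hypothetical stand-ins for OGDF's traversal templates and functors,
// with all coefficient math elided.

struct Node
{
    bool  isLeaf = false;
    Node* child[4] = {}; // quadtree children (null if absent)
    // multipole and local expansion coefficients would live here
};

static void p2m(Node*) {} // points -> multipole coeffs of the leaf
static void m2m(Node*) {} // children's multipole coeffs -> parent
static void l2l(Node*) {} // parent's local coeffs -> children

static void bottomUpM2M(Node* u) // post-order: children before parent
{
    if (u->isLeaf) { p2m(u); return; }
    for (Node* c : u->child)
        if (c) bottomUpM2M(c);
    m2m(u);
}

static void topDownL2L(Node* u) // pre-order: parent before children
{
    if (u->isLeaf) return; // leaves are handled later by the L2P pass
    l2l(u);
    for (Node* c : u->child)
        if (c) topDownL2L(c);
}

// The M2L/P2P step in between follows the same scheme; OGDF drives it with the
// WSPD pair traversal rather than an explicit recursion.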
/*! put implementation
 *
 * <pre>
 * init:
 *
 *                        1(head)
 *                 -------------------------
 *                |                         |
 *                4                         2
 *         --------------             -------------
 *        |              |           |             |
 *    6(parent)          9           7             8
 *    ---------
 *   |         |
 * 10(last)  (hole) <= 5(val)
 *
 * after:
 *
 *                        1(head)
 *                 -------------------------
 *                |                         |
 *                4                         2
 *         --------------             -------------
 *        |              |           |             |
 *     5(hole)           9           7             8
 *    ---------
 *   |         |
 * 10(last)  6(last)
 * </pre>
 */
tb_void_t tb_heap_put(tb_heap_ref_t heap, tb_cpointer_t data)
{
    // check
    tb_heap_impl_t* impl = (tb_heap_impl_t*)heap;
    tb_assert_and_check_return(impl && impl->data);

    // full? grow it
    if (impl->size == impl->maxn)
    {
        // the maxn
        tb_size_t maxn = tb_align4(impl->maxn + impl->grow);
        tb_assert_and_check_return(maxn < TB_HEAD_MAXN);

        // realloc data
        impl->data = (tb_byte_t*)tb_ralloc(impl->data, maxn * impl->func.size);
        tb_assert_and_check_return(impl->data);

        // must be aligned to 4 bytes
        tb_assert_and_check_return(!(((tb_size_t)(impl->data)) & 3));

        // clear the grown data
        tb_memset(impl->data + impl->size * impl->func.size, 0, (maxn - impl->maxn) * impl->func.size);

        // save maxn
        impl->maxn = maxn;
    }

    // check
    tb_assert_and_check_return(impl->size < impl->maxn);

    // init func
    tb_item_func_comp_t func_comp = impl->func.comp;
    tb_item_func_data_t func_data = impl->func.data;
    tb_assert_and_check_return(func_comp && func_data);

    // walk, (hole - 1) / 2: the parent node of the hole
    tb_size_t  parent = 0;
    tb_byte_t* head = impl->data;
    tb_size_t  hole = impl->size;
    tb_size_t  step = impl->func.size;
    switch (step)
    {
#ifndef __tb_small__
    case sizeof(tb_uint64_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint64_t*)(head + hole * step)) = *((tb_uint64_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint32_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint32_t*)(head + hole * step)) = *((tb_uint32_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint16_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint16_t*)(head + hole * step)) = *((tb_uint16_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint8_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint8_t*)(head + hole * step)) = *((tb_uint8_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
#endif
    default:
        for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
        {
            // move item: parent => hole
            tb_memcpy(head + hole * step, head + parent * step, step);

            // move node: hole => parent
            hole = parent;
        }
        break;
    }

    // save data
    impl->func.dupl(&impl->func, head + hole * step, data);

    // size++
    impl->size++;

    // check
//  tb_heap_check(impl);
}
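// The size-specialized switch above is an optimization of a single generic
// sift-up loop: starting from the hole at the end of the array, each parent
// that compares greater than the new value is pulled down into the hole, and
// the value is finally written into the last hole. A minimal sketch over plain
// ints (not tbox's type-erased item functions):
#include <stddef.h>

static void heap_sift_up(int* data, size_t hole, int val)
{
    // while the parent is larger than the new value, pull it into the hole
    while (hole && data[(hole - 1) >> 1] > val)
    {
        data[hole] = data[(hole - 1) >> 1]; // move item: parent => hole
        hole = (hole - 1) >> 1;             // move node: hole => parent
    }
    data[hole] = val;                       // finally place the new value
}

// tb_heap_put is effectively heap_sift_up(data, size, val) followed by size++.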
/*! remove the impl item
 *
 * <pre>
 * init:
 *
 *                        1(head)
 *                 -------------------------
 *                |                         |
 *             (hole)                       2
 *         --------------             -------------
 *        |              |           |             |
 *    6(smaller)         9           7             8
 *    ---------       ------
 *   |         |            |
 *  10        16        8(last) => 8(val)
 *
 * after:
 *
 *                        1(head)
 *                 -------------------------
 *                |                         |
 *                6                         2
 *         --------------             -------------
 *        |              |           |             |
 *     (hole)            9           7             8
 *    ---------
 *   |         |
 * 10(smaller) 16                   8(val)
 *
 * after:
 *
 *                        1(head)
 *                 -------------------------
 *                |                         |
 *                6                         2
 *         --------------             -------------
 *        |              |           |             |
 *        8              9           7             8
 *    ---------
 *   |         |
 *  10        16
 * </pre>
 */
static tb_void_t tb_heap_itor_remove(tb_iterator_ref_t iterator, tb_size_t itor)
{
    // check
    tb_heap_impl_t* impl = (tb_heap_impl_t*)iterator;
    tb_assert_and_check_return(impl && impl->data && impl->size && itor < impl->size);

    // init func
    tb_item_func_comp_t func_comp = impl->func.comp;
    tb_item_func_data_t func_data = impl->func.data;
    tb_assert_and_check_return(func_comp && func_data);

    // walk, 2 * hole + 1: the left child node of the hole
    tb_size_t    step = impl->func.size;
    tb_byte_t*   head = impl->data;
    tb_byte_t*   hole = head + itor * step;
    tb_byte_t*   tail = head + impl->size * step;
    tb_byte_t*   last = head + (impl->size - 1) * step;
    tb_byte_t*   child = head + ((itor << 1) + 1) * step;
    tb_pointer_t data_child = tb_null;
    tb_pointer_t data_rchild = tb_null;
    tb_pointer_t data_last = func_data(&impl->func, last);
    switch (step)
    {
#ifndef __tb_small__
    case sizeof(tb_uint64_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {
                // the smaller child node
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0)
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end?
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint64_t*)hole) = *((tb_uint64_t*)child);

                // move the hole down to its smaller child node
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint32_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {
                // the smaller child node
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0)
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end?
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint32_t*)hole) = *((tb_uint32_t*)child);

                // move the hole down to its smaller child node
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint16_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {
                // the smaller child node
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0)
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end?
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint16_t*)hole) = *((tb_uint16_t*)child);

                // move the hole down to its smaller child node
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint8_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {
                // the smaller child node
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0)
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end?
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint8_t*)hole) = *((tb_uint8_t*)child);

                // move the hole down to its smaller child node
                hole = child;
            }
        }
        break;
#endif
    default:
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {
                // the smaller child node
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0)
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end?
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                tb_memcpy(hole, child, step);

                // move the hole down to its smaller child node
                hole = child;
            }
        }
        break;
    }

    // the last node => hole
    if (hole != last) tb_memcpy(hole, last, step);

    // size--
    impl->size--;

    // check
//  tb_heap_check(impl);
}
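// remove() is the matching sift-down: the removed slot becomes a hole, the
// smaller child is pulled up while it compares less than the last element,
// and the last element finally fills the hole. A minimal sketch over plain
// ints (not tbox's type-erased item functions); the caller decrements size
// afterwards:
#include <stddef.h>

static void heap_remove_at(int* data, size_t size, size_t itor)
{
    int    last  = data[size - 1];   // the value that will fill the hole
    size_t hole  = itor;
    size_t child = (hole << 1) + 1;  // left child of the hole
    while (child < size - 1)         // children among the remaining elements
    {
        // pick the smaller of the two children
        if (child + 1 < size - 1 && data[child + 1] < data[child]) child++;

        // stop once the smaller child is not smaller than the last value
        if (data[child] >= last) break;

        data[hole] = data[child];    // the smaller child node => hole
        hole  = child;               // move the hole down to its smaller child
        child = (hole << 1) + 1;
    }
    data[hole] = last;               // the last node => hole
}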
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
    __uint32 maxNumIterations = globalContext->pOptions->maxNumIterations;
    __uint32 minNumIterations = globalContext->pOptions->minNumIterations;
    __uint32 numPoints        = globalContext->pQuadtree->numberOfPoints();
    ArrayGraph& graph = *globalContext->pGraph;
    LinearQuadtree& tree = *globalContext->pQuadtree;
    LinearQuadtreeExpansion& treeExp = *globalContext->pExpansion;
    WSPD& wspd = *globalContext->pWSPD;
    FMELocalContext* localContext = globalContext->pLocalContext[threadNr()];
    FMEGlobalOptions* options = globalContext->pOptions;
    float* threadsForceArrayX = localContext->forceX;
    float* threadsForceArrayY = localContext->forceY;
    float* globalForceArrayX  = globalContext->globalForceX;
    float* globalForceArrayY  = globalContext->globalForceY;

    ArrayPartition edgePartition      = arrayPartition(graph.numEdges());
    ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());

    m_pLocalContext  = localContext;
    m_pGlobalContext = globalContext;

    /****************************/
    /*           INIT           */
    /****************************/
    // reset the global force array
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

    // reset the thread's force array
    for (__uint32 i = 0; i < tree.numberOfPoints(); i++)
    {
        threadsForceArrayX[i] = 0.0f;
        threadsForceArrayY[i] = 0.0f;
    }

    __uint32 maxNumIt = options->preProcMaxNumIterations;
    for (__uint32 currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
    {
        // iterate over all edges and store the resulting forces in the thread's array
        for_loop(edgePartition,
            edge_force_function<EDGE_FORCE_DIV_DEGREE>(localContext) // divide the forces by the degree of the node to avoid oscillation
        );
        // wait until all edges are done
        sync();
        // now collect the forces in parallel, put the sum into the global array, and move the nodes accordingly
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR_PREP | COLLECT_ZERO_THREAD_ARRAY>(localContext),
                node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );
    }

    if (isMainThread())
    {
        globalContext->coolDown = 1.0f;
    }
    sync();

    for (__uint32 currNumIteration = 0; (currNumIteration < maxNumIterations) && !globalContext->earlyExit; currNumIteration++)
    {
        // reset the coefficients
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp * (treeExp.m_numCoeff << 1), 0.0);
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp * (treeExp.m_numCoeff << 1), 0.0);

        localContext->maxForceSq = 0.0;
        localContext->avgForce   = 0.0;

        // construct the quadtree
        quadtreeConstruction(nodePointPartition);
        // wait for all threads to finish
        sync();

        if (isSingleThreaded()) // if single-threaded, run the simple approximation
            multipoleApproxSingleThreaded(nodePointPartition);
        else                    // otherwise use the partitioning
            multipoleApproxFinal(nodePointPartition);
        // now wait until all forces are summed up in the global array and mapped to graph node order
        sync();

        // run the edge forces
        for_loop(edgePartition, // iterate over all edges and sum up the forces in the thread's array
            edge_force_function<EDGE_FORCE_DIV_DEGREE>(localContext) // divide the forces by the degree of the node to avoid oscillation
        );
        // wait until the edges are finished
        sync();

        // collect the edge forces and move the nodes without waiting
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR | COLLECT_ZERO_THREAD_ARRAY>(localContext),
                node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );
        // wait so we can decide whether we need another iteration
        sync();

        // check the max squared force over all threads
        if (isMainThread())
        {
            double maxForceSq = 0.0;
            for (__uint32 j = 0; j < numThreads(); j++)
                maxForceSq = max(globalContext->pLocalContext[j]->maxForceSq, maxForceSq);

            // if we are allowed to quit and the max squared force falls below the threshold, tell all threads we are done
            if ((currNumIteration >= minNumIterations) && (maxForceSq < globalContext->pOptions->stopCritForce))
            {
                globalContext->earlyExit = true;
            }
        }
        // this sync is required to wait for the earlyExit result
        sync();
    }
}
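// The main loop above relies on a simple barrier pattern: every thread reaches
// a barrier, one designated thread evaluates the global stop criterion and
// writes a shared flag, and a second barrier guarantees all threads observe
// the flag before re-reading the loop condition. A minimal sketch with C++20
// std::barrier (not OGDF's sync()/thread-pool machinery; the iteration counts
// and the stop criterion are dummies):
#include <atomic>
#include <barrier>
#include <thread>
#include <vector>

int main()
{
    const unsigned numThreads = 4;
    std::barrier sync(numThreads);
    std::atomic<bool> earlyExit{false};

    auto worker = [&](unsigned id) {
        for (int it = 0; it < 100 && !earlyExit.load(); ++it)
        {
            // ... compute this thread's forces for the iteration ...
            sync.arrive_and_wait();     // all per-thread work is done
            if (id == 0 && it >= 10)    // the main thread checks the stop criterion
                earlyExit.store(true);  // (here: a dummy criterion)
            sync.arrive_and_wait();     // everyone sees earlyExit before looping
        }
    };

    std::vector<std::jthread> pool;
    for (unsigned i = 0; i < numThreads; ++i) pool.emplace_back(worker, i);
}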
//! the final approximation algorithm, which runs the WSPD in parallel without storing it in the threads' subtrees
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;

    // big multithreaded bottom-up traversal
    for_tree_partition(                            // for all roots in the thread's tree partition
        tree.bottom_up_traversal(                  // do a bottom-up traversal
            if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
                p2m_function(localContext),        // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)         // else shift the coefficients of all children to the center of the inner node
            )
        )
    );
    sync();

    // the top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.bottom_up_traversal(                  // start a bottom-up traversal
            if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
                p2m_function(localContext),        // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)         // else shift the coefficients of all children to the center of the inner node
            ),
            not_condition(tree.is_fence_condition()))(tree.root()); // start at the root, stop when the fence to the threads is reached

        tree.forall_well_separated_pairs(    // do a WSPD traversal
            tree.StoreWSPairFunction(),      // store the well-separated pairs in the WSPD
            tree.StoreDirectPairFunction(),  // store the direct pairs
            tree.StoreDirectNodeFunction(),  // store the direct nodes
            not_condition(tree.is_fence_condition()))(tree.root());
    }
    // wait for the main thread to finish
    sync();

    // M2L pass with the WSPD for the result of the single-threaded pass above
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();

    // D2D pass; store the forces in the thread's force array
    for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
    for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));
    // wait until all local coeffs and all direct forces are computed
    sync();

    // the rest of the WSPD can be done on the fly by the thread
    for_tree_partition(
        tree.forall_well_separated_pairs(                // do a WSPD traversal
            pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
            p2p_function(localContext),                  // direct evaluation
            p2p_function(localContext)                   // direct evaluation
        )
    );
    // wait until all local coeffs and all direct forces are computed
    sync();

    // big multithreaded top-down traversal; the top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.top_down_traversal(                   // top-down traversal (L2L pass)
            if_then_else(tree.is_leaf_condition(), // if the node is a leaf
                do_nothing(),                      // then do nothing; we will deal with this case later
                l2l_function(localContext)         // else shift the node's local coeffs to the children
            ),
            not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
        )(tree.root());                            // start at the root
    }
    // wait for the top of the tree
    sync();

    for_tree_partition(                            // for all roots in the thread's tree partition (L2L pass)
        tree.top_down_traversal(                   // do a top-down traversal
            if_then_else(tree.is_leaf_condition(), // if the node is a leaf
                do_nothing(),                      // then do nothing; we will deal with this case later
                l2l_function(localContext)         // else shift the node's local coeffs to the children
            )
        )
    );
    // wait until the traversal is finished and all leaves have their accumulated local coeffs
    sync();

    // evaluate all leaves and store the forces in the thread's array
    // (note: we could store them in the global array, but then we would have to use random access)
    // we can start collecting the forces immediately because we evaluated point by point before
    for_loop(nodePointPartition,             // loop over the thread's points
        func_comp(                           // composition of two statements
            l2p_function(localContext),      // evaluate the forces due to the local expansion in the corresponding leaf
            collect_force_function           // collect the forces of all threads with the following options:
            <
                COLLECT_REPULSIVE_FACTOR |   // multiply by the repulsive factor stored in the global options
                COLLECT_TREE_2_GRAPH_ORDER | // the thread's data is stored in quadtree leaf order; transform it into graph order
                COLLECT_ZERO_THREAD_ARRAY    // reset the thread's array
            >(localContext)
        )
    );
}
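// The fence condition used above splits one global traversal into a parallel
// phase (each worker walks the subtrees of its partition) and a sequential
// phase (the main thread walks the remaining top of the tree, stopping as soon
// as it crosses the fence into a worker's subtree). A minimal sketch of that
// split on a plain pointer-based quadtree; FenceNode and accumulate() are
// hypothetical stand-ins for OGDF's tree and coefficient passes:
struct FenceNode
{
    bool       isLeaf = false;
    bool       isFence = false; // root of a subtree owned by some worker thread
    FenceNode* child[4] = {};
};

static void accumulate(FenceNode*) { /* e.g. P2M/M2M coefficient math, elided */ }

// parallel phase: each worker calls this on the roots of its partition
static void bottomUpSubtree(FenceNode* u)
{
    if (!u->isLeaf)
        for (FenceNode* c : u->child)
            if (c) bottomUpSubtree(c);
    accumulate(u);
}

// sequential phase: the main thread finishes the top, never descending past a fence
static void bottomUpTop(FenceNode* u)
{
    if (u->isFence) return; // below the fence the workers already did the work
    if (!u->isLeaf)
        for (FenceNode* c : u->child)
            if (c) bottomUpTop(c);
    accumulate(u);
}

// A barrier between the two phases (the sync() calls above) ensures the fence
// nodes' coefficients are complete before the main thread reads them.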