Example 1
0
//! Single-threaded multipole approximation.
//! Runs the full FMM pipeline (P2M/M2M upward pass, M2L/P2P interaction pass,
//! L2L downward pass, L2P evaluation) entirely on the main thread; worker
//! threads skip the body and only participate in the final force collection
//! over their partition of the node points.
//! \param nodePointPartition the range of graph points this thread evaluates in the L2P/collect loop.
void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	if (isMainThread())
	{
		tree.bottom_up_traversal(					// do a bottom up traversal M2M pass
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficients of all children to center of the inner node
			)
		)(tree.root());

		tree.forall_well_separated_pairs(				// do a wspd traversal M2L direct eval
			pair_vice_versa(m2l_function(localContext)),// M2L for a well-separated pair
			p2p_function(localContext),					// direct evaluation
			p2p_function(localContext)					// direct evaluation
		)(tree.root());

		tree.top_down_traversal(						// top down traversal
			if_then_else( tree.is_leaf_condition(),		// if the node is a leaf
				do_nothing(),							// then do nothing, we will deal with this case later
				l2l_function(localContext)				// else shift the nodes local coeffs to the children
			)
		)(tree.root());	// start at the root

		// evaluate all leaves and store the forces in the threads array
		for_loop(nodePointPartition,				// loop over points
			func_comp(								// composition of two statements
				l2p_function(localContext),			// evaluate the forces due to the local expansion in the corresponding leaf
				collect_force_function				// collect the forces of all threads with the following options:
				<
					COLLECT_REPULSIVE_FACTOR | 		// multiply by the repulsive factor stored in the global options
					COLLECT_TREE_2_GRAPH_ORDER |	// threads data is stored in quadtree leaf order, transform it into graph order
					COLLECT_ZERO_THREAD_ARRAY		// reset threads array
				>(localContext)
			)
		);
	}
}
Example 2
0
/*! put impl
 *
 * Inserts \p data into the (binary, array-backed) heap: grows the storage if
 * full, then sifts the new element up from the first free slot ("hole") until
 * its parent compares <= it, and finally duplicates \p data into the hole.
 *
 * <pre>
 * init:
 * 
 *                                          1(head)
 *                               -------------------------
 *                              |                         |
 *                              4                         2
 *                        --------------             -------------
 *                       |              |           |             |
 *                       6(parent)      9           7             8
 *                   ---------       
 *                  |         |     
 *                  10(last) (hole) <= 5(val)
 * after:
 *
 *                                          1(head)
 *                               -------------------------
 *                              |                         |
 *                              4                         2
 *                        --------------             -------------
 *                       |              |           |             |
 *                       5(hole)      9           7             8
 *                   ---------       
 *                  |         |     
 *                  10(last)  6(last)
 * </pre>
 *
 * @param heap      the heap reference (cast to the internal impl type)
 * @param data      the item data to insert (copied via impl->func.dupl)
 */
tb_void_t tb_heap_put(tb_heap_ref_t heap, tb_cpointer_t data)
{
    // check
    tb_heap_impl_t* impl = (tb_heap_impl_t*)heap;
    tb_assert_and_check_return(impl && impl->data);

    // full? grow it
    if (impl->size == impl->maxn)
    {
        // the maxn, rounded up to a multiple of 4 items
        tb_size_t maxn = tb_align4(impl->maxn + impl->grow);
        // NOTE(review): "TB_HEAD_MAXN" looks like a typo of TB_HEAP_MAXN — confirm against the macro definition
        tb_assert_and_check_return(maxn < TB_HEAD_MAXN);

        // realloc data
        impl->data = (tb_byte_t*)tb_ralloc(impl->data, maxn * impl->func.size);
        tb_assert_and_check_return(impl->data);

        // must be align by 4-bytes (required by the fixed-width fast paths below)
        tb_assert_and_check_return(!(((tb_size_t)(impl->data)) & 3));

        // clear the grow data (only the newly added tail region)
        tb_memset(impl->data + impl->size * impl->func.size, 0, (maxn - impl->maxn) * impl->func.size);

        // save maxn
        impl->maxn = maxn;
    }

    // check
    tb_assert_and_check_return(impl->size < impl->maxn);
    
    // init func
    tb_item_func_comp_t func_comp = impl->func.comp;
    tb_item_func_data_t func_data = impl->func.data;
    tb_assert_and_check_return(func_comp && func_data);

    // walk, (hole - 1) / 2: the parent node of the hole
    // sift-up: while the parent is greater than the new item, pull the parent
    // down into the hole and move the hole up; `hole &&` stops at the root
    // before the (hole - 1) underflow in the update expression is ever used.
    tb_size_t           parent = 0;
    tb_byte_t*          head = impl->data;
    tb_size_t           hole = impl->size;
    tb_size_t           step = impl->func.size;
    // dispatch on the item width: fixed-width copies are faster than tb_memcpy
    switch (step)
    {
#ifndef __tb_small__
    case sizeof(tb_uint64_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint64_t*)(head + hole * step)) = *((tb_uint64_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint32_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint32_t*)(head + hole * step)) = *((tb_uint32_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint16_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint16_t*)(head + hole * step)) = *((tb_uint16_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
    case sizeof(tb_uint8_t):
        {
            for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
            {
                // move item: parent => hole
                *((tb_uint8_t*)(head + hole * step)) = *((tb_uint8_t*)(head + parent * step));

                // move node: hole => parent
                hole = parent;
            }
        }
        break;
#endif
    default:
        // generic path for arbitrary item sizes
        for (parent = (hole - 1) >> 1; hole && (func_comp(&impl->func, func_data(&impl->func, head + parent * step), data) > 0); parent = (hole - 1) >> 1)
        {
            // move item: parent => hole
            tb_memcpy(head + hole * step, head + parent * step, step);

            // move node: hole => parent
            hole = parent;
        }
        break;
    }

    // save data: copy the new item into its final position
    impl->func.dupl(&impl->func, head + hole * step, data);

    // size++
    impl->size++;

    // check
//  tb_heap_check(impl);
}
Example 3
0
/*! remove the impl item
 *
 * Removes the item at iterator position \p itor: the last item conceptually
 * fills the hole, the hole is sifted down towards the smaller child until the
 * last item fits, then the last item is copied into the final hole position
 * and the size is decremented.
 *
 * NOTE(review): this only ever sifts the hole *down*; if \p itor is not the
 * root and the last item compares smaller than the hole's parent, a sift-up
 * would also be needed to restore the heap invariant — confirm callers only
 * rely on root removal or that this case cannot occur.
 *
 * <pre>
 * init:
 *                                          1(head)
 *                               -------------------------
 *                              |                         |
 *                           (hole)                       2
 *                        --------------             -------------
 *                       |              |           |             |
 *                       6(smaller)     9           7             8
 *                   ---------       ----                                            (hole) <-
 *                  |         |     |                                                         |
 *                 10        16    8 (last)---------------------------------------------> 8 (val)
 *
 *
 * after:
 *                                          1(head)
 *                               -------------------------
 *                              |                         |
 *                              6                         2
 *                        --------------             -------------
 *                       |              |           |             |
 *                     (hole)           9           7             8
 *                   ---------                                                              <-
 *                  |         |                                                               |
 *                 10(smaller)16                                                          8 (val)
 *
 *
 * after:
 *                                          1(head)
 *                               -------------------------
 *                              |                         |
 *                              6                         2
 *                        --------------             -------------
 *                       |              |           |             |
 *                       8              9           7             8
 *                   ---------                                                              
 *                  |         |                                                               
 *                 10        16 
 * 
 * </pre>
 *
 * @param iterator  the iterator (the heap impl itself)
 * @param itor      the index of the item to remove, must be < size
 */
static tb_void_t tb_heap_itor_remove(tb_iterator_ref_t iterator, tb_size_t itor)
{
    // check
    tb_heap_impl_t* impl = (tb_heap_impl_t*)iterator;
    tb_assert_and_check_return(impl && impl->data && impl->size && itor < impl->size);

    // init func
    tb_item_func_comp_t func_comp = impl->func.comp;
    tb_item_func_data_t func_data = impl->func.data;
    tb_assert_and_check_return(func_comp && func_data);

    // walk, 2 * hole + 1: the left child node of hole
    // all cursors below are byte pointers; a node at byte offset `off` has its
    // left child at byte offset 2*off + step (hence the loop update below)
    tb_size_t           step = impl->func.size;
    tb_byte_t*          head = impl->data;
    tb_byte_t*          hole = head + itor * step;                  // slot being vacated
    tb_byte_t*          tail = head + impl->size * step;            // one past the last item
    tb_byte_t*          last = head + (impl->size - 1) * step;      // the last item (fills the hole)
    tb_byte_t*          child = head + ((itor << 1) + 1) * step;    // left child of the hole
    tb_pointer_t        data_child = tb_null;
    tb_pointer_t        data_rchild = tb_null;
    tb_pointer_t        data_last = func_data(&impl->func, last);
    // dispatch on the item width: fixed-width copies are faster than tb_memcpy
    switch (step)
    {
#ifndef __tb_small__
    case sizeof(tb_uint64_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {   
                // the smaller child node (prefer the right child if it is smaller)
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0) 
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end? the last item fits here, stop sifting
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint64_t*)hole) = *((tb_uint64_t*)child);

                // move the hole down to it's larger child node 
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint32_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {   
                // the smaller child node (prefer the right child if it is smaller)
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0) 
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end? the last item fits here, stop sifting
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint32_t*)hole) = *((tb_uint32_t*)child);

                // move the hole down to it's larger child node 
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint16_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {   
                // the smaller child node (prefer the right child if it is smaller)
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0) 
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end? the last item fits here, stop sifting
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint16_t*)hole) = *((tb_uint16_t*)child);

                // move the hole down to it's larger child node 
                hole = child;
            }
        }
        break;
    case sizeof(tb_uint8_t):
        {
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {   
                // the smaller child node (prefer the right child if it is smaller)
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0) 
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end? the last item fits here, stop sifting
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                *((tb_uint8_t*)hole) = *((tb_uint8_t*)child);

                // move the hole down to it's larger child node 
                hole = child;
            }

        }
        break;
#endif
    default:
        {
            // generic path for arbitrary item sizes
            for (; child < tail; child = head + (((child - head) << 1) + step))
            {   
                // the smaller child node (prefer the right child if it is smaller)
                data_child = func_data(&impl->func, child);
                if (child + step < tail && func_comp(&impl->func, data_child, (data_rchild = func_data(&impl->func, child + step))) > 0) 
                {
                    child += step;
                    data_child = data_rchild;
                }

                // end? the last item fits here, stop sifting
                if (func_comp(&impl->func, data_child, data_last) > 0) break;

                // the smaller child node => hole
                tb_memcpy(hole, child, step);

                // move the hole down to it's larger child node 
                hole = child;
            }
        }
        break;
    }

    // the last node => hole (skip the self-copy when removing the last item)
    if (hole != last) tb_memcpy(hole, last, step);

    // size--
    impl->size--;

    // check
//  tb_heap_check(impl);
}
Example 4
0
//! Thread entry point of the multipole kernel.
//! Initializes the per-thread and global force arrays, runs a short
//! edge-force-only preprocessing phase, then iterates the full FMM force
//! computation (quadtree construction, multipole approximation, edge forces,
//! node movement) until maxNumIterations is reached or all threads agree the
//! maximum force fell below the stop criterion (earlyExit).
//! \param globalContext shared state for all kernel threads.
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
	__uint32					maxNumIterations    =  globalContext->pOptions->maxNumIterations;
	__uint32					minNumIterations    =  globalContext->pOptions->minNumIterations;
	ArrayGraph&					graph				= *globalContext->pGraph;
	LinearQuadtree&				tree				= *globalContext->pQuadtree;
	LinearQuadtreeExpansion&	treeExp				= *globalContext->pExpansion;
	FMELocalContext*			localContext		= globalContext->pLocalContext[threadNr()];
	FMEGlobalOptions*			options				= globalContext->pOptions;
	float*						threadsForceArrayX	= localContext->forceX;
	float*						threadsForceArrayY	= localContext->forceY;
	float*						globalForceArrayX	= globalContext->globalForceX;
	float*						globalForceArrayY	= globalContext->globalForceY;

	ArrayPartition edgePartition = arrayPartition(graph.numEdges());
	ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());

	m_pLocalContext = localContext;
	m_pGlobalContext = globalContext;
	/****************************/
	/* INIT						*/
	/****************************/
	//! reset the global force array 
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

	// reset the threads force array
	for (__uint32 i = 0; i < tree.numberOfPoints(); i++)
	{
		threadsForceArrayX[i] = 0.0f;
		threadsForceArrayY[i] = 0.0f;
	}

	// preprocessing phase: edge forces only, to untangle the initial layout
	__uint32 maxNumIt = options->preProcMaxNumIterations;
	for (__uint32 currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
	{
		// iterate over all edges and store the resulting forces in the threads array
		for_loop(edgePartition, 
			edge_force_function< EDGE_FORCE_DIV_DEGREE > (localContext)	// divide the forces by degree of the node to avoid oscillation
		);
		// wait until all edges are done
		sync();
		// now collect the forces in parallel and put the sum into the global array and move the nodes accordingly
		for_loop(nodePointPartition, 
			func_comp(
				 collect_force_function<COLLECT_EDGE_FACTOR_PREP | COLLECT_ZERO_THREAD_ARRAY >(localContext),
				 node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
			)
		);
	}
	if (isMainThread())
	{
		globalContext->coolDown = 1.0f;
	}
	sync();

	// main phase: full multipole + edge force iterations
	for (__uint32 currNumIteration = 0; (currNumIteration < maxNumIterations) && !globalContext->earlyExit; currNumIteration++)
	{
		// reset the coefficients 
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);

		localContext->maxForceSq = 0.0;
		localContext->avgForce = 0.0;

		// construct the quadtree
		quadtreeConstruction(nodePointPartition);
		// wait for all threads to finish
		sync();

		if (isSingleThreaded()) // if is single threaded run the simple approximation
			multipoleApproxSingleThreaded(nodePointPartition);
		else // otherwise use the partitioning
			multipoleApproxFinal(nodePointPartition); 
		// now wait until all forces are summed up in the global array and mapped to graph node order
		sync();

		// run the edge forces
		for_loop(edgePartition,							// iterate over all edges and sum up the forces in the threads array 
			edge_force_function< EDGE_FORCE_DIV_DEGREE >(localContext)	// divide the forces by degree of the node to avoid oscillation
		);	
		// wait until edges are finished
		sync();

		// collect the edge forces and move nodes without waiting
		for_loop(nodePointPartition, 
			func_comp(
				 collect_force_function<COLLECT_EDGE_FACTOR | COLLECT_ZERO_THREAD_ARRAY>(localContext),
				 node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
			)
		);
		// wait so we can decide if we need another iteration
		sync();
		// check the max force square for all threads
		if (isMainThread())
		{
			double maxForceSq = 0.0;
			for (__uint32 j=0; j < numThreads(); j++)
				maxForceSq = max(globalContext->pLocalContext[j]->maxForceSq, maxForceSq);

			// if we are allowed to quit and the max force sq falls under the threshold tell all threads we are done
			if ((currNumIteration >= minNumIterations) && (maxForceSq < globalContext->pOptions->stopCritForce ))
			{
				globalContext->earlyExit = true;
			}
		}
		// this is required to wait for the earlyExit result
		sync();
	}
}
Example 5
0
//! The final approximation algorithm which runs the wspd parallel without storing it in the threads subtrees.
//! Each thread handles the subtrees of its tree partition; the region above
//! the per-thread fence (the top of the quadtree) is processed by the main
//! thread alone, with sync() barriers between the dependent passes.
//! \param nodePointPartition the range of graph points this thread evaluates in the final L2P/collect loop.
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	// big multithreaded bottom up traversal.
	for_tree_partition(								// for all roots in the threads tree partition
		tree.bottom_up_traversal(					// do a bottom up traversal 
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficients of all children to center of the inner node
			)
		)
	);
	sync();
	// top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal(					// start a bottom up traversal 
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficients of all children to center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root());// start at the root, stop when the fence to the threads is reached

		tree.forall_well_separated_pairs(	// do a wspd traversal
			tree.StoreWSPairFunction(),		// store the ws pairs in the WSPD
			tree.StoreDirectPairFunction(), // store the direct pairs
			tree.StoreDirectNodeFunction(),	// store the direct nodes
			not_condition(tree.is_fence_condition()))(tree.root());
	}
	// wait for the main thread to finish
	sync();

	// M2L pass with the WSPD for the result of the single threaded pass above
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();

	// D2D pass and store in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));

	// wait until all local coeffs and all direct forces are computed
	sync();

	// the rest of the WSPD can be done on the fly by the thread
	for_tree_partition(	
		tree.forall_well_separated_pairs(					// do a wspd traversal
			pair_vice_versa(m2l_function(localContext)),	// M2L for a well-separated pair
			p2p_function(localContext),						// direct evaluation
			p2p_function(localContext)						// direct evaluation
		)
	);	
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top down traversal. top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal(						// top down traversal L2L pass
			if_then_else( tree.is_leaf_condition(),		// if the node is a leaf
				do_nothing(),							// then do nothing, we will deal with this case later
				l2l_function(localContext)				// else shift the nodes local coeffs to the children
			),
			not_condition(tree.is_fence_condition())	// stop when the fence to the threads is reached 
		)(tree.root());									// start at the root, 
	}
	// wait for the top of the tree
	sync();

	for_tree_partition(								// for all roots in the threads tree partition L2L pass
		tree.top_down_traversal(					// do a top down traversal 
			if_then_else( tree.is_leaf_condition(),	// if the node is a leaf
				do_nothing(),						// then do nothing, we will deal with this case later
				l2l_function(localContext)			// else shift the nodes local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync(); 
	// evaluate all leaves and store the forces in the threads array (Note we can store them in the global array but then we have to use random access)
	// we can start immediately to collect the forces because we evaluated before point by point 
	for_loop(nodePointPartition,				// loop over threads points
		func_comp(								// composition of two statements
			l2p_function(localContext),			// evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function				// collect the forces of all threads with the following options:
			<
				COLLECT_REPULSIVE_FACTOR | 		// multiply by the repulsive factor stored in the global options
				COLLECT_TREE_2_GRAPH_ORDER |	// threads data is stored in quadtree leaf order, transform it into graph order
				COLLECT_ZERO_THREAD_ARRAY		// reset threads array
			>(localContext)
		)
	);
}