Ejemplo n.º 1
0
/// Drops contact point @p i from @p newContactCache.
///
/// The entries at index @p i of m_localPosA, m_localPosB and m_worldPosB are
/// exchanged with the entries of the last stored point (unless @p i already is
/// the last point), and the stored point count is then reduced by one.
///
/// @param newContactCache  contact record that is modified in place.
/// @param i                index of the point to remove.
void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i)
{
	const int lastIndex = b3Contact4Data_getNumPoints(&newContactCache) - 1;

	// Park the removed point in the last slot so that shrinking the count discards it.
	if (lastIndex != i)
	{
		b3Swap(newContactCache.m_localPosA[i], newContactCache.m_localPosA[lastIndex]);
		b3Swap(newContactCache.m_localPosB[i], newContactCache.m_localPosB[lastIndex]);
		b3Swap(newContactCache.m_worldPosB[i], newContactCache.m_worldPosB[lastIndex]);
	}

	b3Contact4Data_setNumPoints(&newContactCache, lastIndex);
}
Ejemplo n.º 2
0
/// Finds the cached pair for two proxy ids.
///
/// The ids are ordered (smaller first) before hashing, so the argument order
/// does not matter. Increments the global b3g_findPairs counter on every call.
///
/// @param proxy0  first proxy id.
/// @param proxy1  second proxy id.
/// @return pointer to the matching entry of m_overlappingPairArray, or NULL
///         when the hash slot lies outside m_hashTable or no entry in the
///         slot's chain matches.
b3BroadphasePair* b3HashedOverlappingPairCache::findPair(int proxy0, int proxy1)
{
	b3g_findPairs++;

	if (proxy0 > proxy1)
	{
		b3Swap(proxy0, proxy1);
	}
	int lowId = proxy0;
	int highId = proxy1;

	// The pair array capacity (minus one) is used as the hash mask.
	int slot = static_cast<int>(getHash(static_cast<unsigned int>(lowId), static_cast<unsigned int>(highId)) & (m_overlappingPairArray.capacity() - 1));

	if (slot >= m_hashTable.size())
	{
		return NULL;
	}

	// Follow the chain that starts at the slot until a matching pair shows up.
	for (int cursor = m_hashTable[slot]; cursor != B3_NULL_PAIR; cursor = m_next[cursor])
	{
		if (equalsPair(m_overlappingPairArray[cursor], lowId, highId))
		{
			b3Assert(cursor < m_overlappingPairArray.size());
			return &m_overlappingPairArray[cursor];
		}
	}

	return NULL;
}
Ejemplo n.º 3
0
/// Returns the cached pair for two proxy ids, creating it when it is missing.
///
/// The ids are ordered (smaller first) before hashing. A new pair is
/// constructed in place at the end of m_overlappingPairArray and linked at the
/// head of its hash chain. When appending changes the array capacity,
/// growTables() is called and the hash slot is recomputed with the new mask.
///
/// @param proxy0  first proxy id.
/// @param proxy1  second proxy id.
/// @return the existing pair if internalFindPair() locates one, otherwise the
///         newly constructed pair.
b3BroadphasePair* b3HashedOverlappingPairCache::internalAddPair(int proxy0, int proxy1)
{
	if (proxy0 > proxy1)
	{
		b3Swap(proxy0, proxy1);
	}
	int lowId = proxy0;
	int highId = proxy1;

	// Slot under the capacity mask that is valid right now.
	int slot = static_cast<int>(getHash(static_cast<unsigned int>(lowId), static_cast<unsigned int>(highId)) & (m_overlappingPairArray.capacity() - 1));

	b3BroadphasePair* existing = internalFindPair(proxy0, proxy1, slot);
	if (existing != NULL)
	{
		return existing;
	}

	// Remember where the new element lands and how large the array was,
	// then reserve raw storage for it.
	const int newIndex = m_overlappingPairArray.size();
	const int capacityBefore = m_overlappingPairArray.capacity();
	void* storage = &m_overlappingPairArray.expandNonInitializing();
	const int capacityAfter = m_overlappingPairArray.capacity();

	if (capacityBefore < capacityAfter)
	{
		growTables();
		// The mask changed together with the capacity, so the slot must be recomputed.
		slot = static_cast<int>(getHash(static_cast<unsigned int>(lowId), static_cast<unsigned int>(highId)) & (capacityAfter - 1));
	}

	b3BroadphasePair* created = new (storage) b3BroadphasePair(proxy0, proxy1);

	// Push the new entry onto the front of its chain.
	m_next[newIndex] = m_hashTable[slot];
	m_hashTable[slot] = newIndex;

	return created;
}
Ejemplo n.º 4
0
	void	Process(const b3DbvtNode* na,const b3DbvtNode* nb)
	{
		if(na!=nb)
		{
			b3DbvtProxy*	pa=(b3DbvtProxy*)na->data;
			b3DbvtProxy*	pb=(b3DbvtProxy*)nb->data;
#if B3_DBVT_BP_SORTPAIRS
			if(pa->m_uniqueId>pb->m_uniqueId) 
				b3Swap(pa,pb);
#endif
			pbp->m_paircache->addOverlappingPair(pa->getUid(),pb->getUid());
			++pbp->m_newpairs;
		}
	}
/// Greedily reorders cs[0 .. numConstraints) in place and tags every
/// constraint with a batch id.
///
/// Each pass of the outer loop sweeps the constraints that are not placed yet
/// and claims every one whose non-static bodies are not marked in the bodyUsed
/// bit set. A claimed constraint gets the current pass number as m_batchId and
/// is swapped to the front of the unplaced range. After every simdWidth claims
/// within a pass the marks are cleared again. A body counts as static when its
/// signed index is negative or equals staticIdx.
///
/// One value is appended to m_gpuData->m_batchSizes per pass: the running
/// claim counter of that pass, which restarts at 0 after every simdWidth claims.
///
/// @param cs              constraints to reorder (modified in place).
/// @param numConstraints  number of entries in cs.
/// @param simdWidth       number of claims after which the marks are reset.
/// @param staticIdx       body index that is treated as static.
/// @param numBodies       used to size the bodyUsed bit set (numBodies/32 + 1 words).
/// @return the number of passes, i.e. the number of batch ids handed out.
inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3(b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
{
	B3_PROFILE("sortConstraintByBatch3");

	// High-water marks kept across calls; only the disabled debug printfs ever read them.
	static int maxSwaps = 0;
	static int maxNumConstraints = 0;

	int swapCount = 0;

	curUsed.resize(2 * simdWidth);

	if (numConstraints > maxNumConstraints)
	{
		maxNumConstraints = numConstraints;
	}

	// One bit per body, packed 32 to a word.
	const int wordCount = numBodies / 32 + 1;
	bodyUsed.resize(wordCount);
	for (int w = 0; w < wordCount; w++)
	{
		bodyUsed[w] = 0;
	}

#if defined(_DEBUG)
	for (int c = 0; c < numConstraints; c++)
	{
		cs[c].m_batchId = -1;
	}
#endif

	int markedCount = 0;  // number of live entries in curUsed
	int placed = 0;       // constraints already moved into their final position
	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");

		while (placed < numConstraints)
		{
			int claimsInGroup = 0;

			// Forget the marks left over from the previous pass.
			for (int k = 0; k < markedCount; k++)
			{
				bodyUsed[curUsed[k] / 32] = 0;
			}
			markedCount = 0;

			for (int i = placed; i < numConstraints; i++)
			{
				b3Assert(i < numConstraints);

				const int signedA = cs[i].m_bodyAPtrAndSignBit;
				const int signedB = cs[i].m_bodyBPtrAndSignBit;
				const int bodyA = abs(signedA);
				const int bodyB = abs(signedB);
				const bool aIsStatic = (signedA < 0) || (signedA == staticIdx);
				const bool bIsStatic = (signedB < 0) || (signedB == staticIdx);

				// Static bodies never block; B is only examined when A is free.
				int aBusy = 0;
				if (!aIsStatic)
				{
					aBusy = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
				}
				int bBusy = 0;
				if (!aBusy && !bIsStatic)
				{
					bBusy = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
				}

				if (aBusy != 0 || bBusy != 0)
				{
					continue;
				}

				// Claim both dynamic bodies for the current group.
				if (!aIsStatic)
				{
					bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
					curUsed[markedCount++] = bodyA;
				}
				if (!bIsStatic)
				{
					bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
					curUsed[markedCount++] = bodyB;
				}

				cs[i].m_batchId = batchIdx;

				// Move the claimed constraint to the front of the unplaced range.
				if (i != placed)
				{
					b3Swap(cs[i], cs[placed]);
					swapCount++;
				}
				placed++;

				claimsInGroup++;
				if (claimsInGroup == simdWidth)
				{
					// Group is full: start a fresh one with all bodies available again.
					claimsInGroup = 0;
					for (int k = 0; k < markedCount; k++)
					{
						bodyUsed[curUsed[k] / 32] = 0;
					}
					markedCount = 0;
				}
			}

			m_gpuData->m_batchSizes.push_back(claimsInGroup);
			batchIdx++;
		}
	}

#if defined(_DEBUG)
	for (int c = 0; c < numConstraints; c++)
	{
		b3Assert(cs[c].m_batchId != -1);
	}
#endif

	if (swapCount > maxSwaps)
	{
		maxSwaps = swapCount;
	}

	return batchIdx;
}
Ejemplo n.º 6
0
// Runs one broadphase step:
//   1. incrementally optimizes the two trees held in m_sets,
//   2. moves every proxy linked in the current stage list from m_sets[0] into m_sets[1],
//   3. runs the tree-vs-tree queries, handing a b3DbvtTreeCollider to each of them,
//   4. when m_needcleanup is set, removes cached pairs whose leaf volumes no longer intersect,
//   5. updates the per-step counters and m_updates_ratio.
// 'dispatcher' is only forwarded to the pair cache removal calls.
void							b3DynamicBvhBroadphase::collide(b3Dispatcher* dispatcher)
{
	/*printf("---------------------------------------------------------\n");
	printf("m_sets[0].m_leaves=%d\n",m_sets[0].m_leaves);
	printf("m_sets[1].m_leaves=%d\n",m_sets[1].m_leaves);
	printf("numPairs = %d\n",getOverlappingPairCache()->getNumOverlappingPairs());
	{
		int i;
		for (i=0;i<getOverlappingPairCache()->getNumOverlappingPairs();i++)
		{
			printf("pair[%d]=(%d,%d),",i,getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy0->getUid(),
				getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy1->getUid());
		}
		printf("\n");
	}
*/



	b3SPC(m_profiling.m_total);
	/* optimize				*/ 
	// The pass count handed to optimizeIncremental is 1 plus m_dupdates percent of the leaf count.
	m_sets[0].optimizeIncremental(1+(m_sets[0].m_leaves*m_dupdates)/100);
	if(m_fixedleft)
	{
		// NOTE(review): 'count' holds the same expression that is spelled out again in the call below.
		const int count=1+(m_sets[1].m_leaves*m_fupdates)/100;
		m_sets[1].optimizeIncremental(1+(m_sets[1].m_leaves*m_fupdates)/100);
		// Reduce the remaining budget by 'count', never going below zero.
		m_fixedleft=b3Max<int>(0,m_fixedleft-count);
	}
	/* dynamic -> fixed set	*/ 
	// Advance to the next stage slot (wrapping at STAGECOUNT) and drain its proxy list.
	m_stageCurrent=(m_stageCurrent+1)%STAGECOUNT;
	b3DbvtProxy*	current=m_stageRoots[m_stageCurrent];
	if(current)
	{
		b3DbvtTreeCollider	collider(this);
		do	{
			// Read the successor first: the list operations below relink 'current'.
			b3DbvtProxy*	next=current->links[1];
			b3ListRemove(current,m_stageRoots[current->stage]);
			b3ListAppend(current,m_stageRoots[STAGECOUNT]);
#if B3_DBVT_BP_ACCURATESLEEPING
			m_paircache->removeOverlappingPairsContainingProxy(current,dispatcher);
			collider.proxy=current;
			b3DynamicBvh::collideTV(m_sets[0].m_root,current->aabb,collider);
			b3DynamicBvh::collideTV(m_sets[1].m_root,current->aabb,collider);
#endif
			// Re-home the proxy: take its leaf out of set 0 and insert a new leaf,
			// built from the proxy's min/max bounds, into set 1.
			m_sets[0].remove(current->leaf);
			B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)	curAabb=b3DbvtVolume::FromMM(current->m_aabbMin,current->m_aabbMax);
			current->leaf	=	m_sets[1].insert(curAabb,current);
			current->stage	=	STAGECOUNT;	
			current			=	next;
		} while(current);
		m_fixedleft=m_sets[1].m_leaves;
		m_needcleanup=true;
	}
	/* collide dynamics		*/ 
	{
		b3DbvtTreeCollider	collider(this);
		// Set 0 against set 1.
		if(m_deferedcollide)
		{
			b3SPC(m_profiling.m_fdcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[1].m_root,collider);
		}
		// Set 0 against itself.
		// NOTE(review): this tests the same flag as the branch above; confirm that is intended.
		if(m_deferedcollide)
		{
			b3SPC(m_profiling.m_ddcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[0].m_root,collider);
		}
	}
	/* clean up				*/ 
	if(m_needcleanup)
	{
		b3SPC(m_profiling.m_cleanup);
		b3BroadphasePairArray&	pairs=m_paircache->getOverlappingPairArray();
		if(pairs.size()>0)
		{

			// Number of pairs to inspect this step: the larger of m_newpairs and
			// m_cupdates percent of the cache, capped at the cache size.
			int			ni=b3Min(pairs.size(),b3Max<int>(m_newpairs,(pairs.size()*m_cupdates)/100));
			for(int i=0;i<ni;++i)
			{
				// Inspection starts at m_cid and wraps around the pair array.
				b3BroadphasePair&	p=pairs[(m_cid+i)%pairs.size()];
				b3DbvtProxy*		pa=&m_proxies[p.x];
				b3DbvtProxy*		pb=&m_proxies[p.y];
				if(!b3Intersect(pa->leaf->volume,pb->leaf->volume))
				{
#if B3_DBVT_BP_SORTPAIRS
					if(pa->m_uniqueId>pb->m_uniqueId) 
						b3Swap(pa,pb);
#endif
					m_paircache->removeOverlappingPair(pa->getUid(),pb->getUid(),dispatcher);
					// Stay on the same index and shrink the budget
					// (presumably because the removal moves another pair into this slot - TODO confirm).
					--ni;--i;
				}
			}
			// Remember where the next clean-up should resume.
			if(pairs.size()>0) m_cid=(m_cid+ni)%pairs.size(); else m_cid=0;
		}
	}
	// Per-step bookkeeping.
	++m_pid;
	m_newpairs=1;
	m_needcleanup=false;
	if(m_updates_call>0)
	{ m_updates_ratio=m_updates_done/(b3Scalar)m_updates_call; }
	else
	{ m_updates_ratio=0; }
	// Halve both counters after the ratio has been taken.
	m_updates_done/=2;
	m_updates_call/=2;
}
Ejemplo n.º 7
0
/// Assigns a batch index to every contact in cs[0 .. numConstraints) and
/// reorders cs so that contacts appear in the order in which they were claimed.
///
/// Each pass of the outer loop sweeps the contacts that are not placed yet (via
/// the index list in m_data->m_idxBuffer) and claims every contact whose
/// non-static bodies are not present in the bodyUsed2 list. The list is
/// emptied after every simdWidth claims within a pass. A body counts as static
/// when its signed index is negative or equals staticIdx. Finally the contacts
/// are copied to m_data->m_old and written back to cs in index-list order.
///
/// @param cs              contacts to batch and reorder (modified in place).
/// @param numConstraints  number of entries in cs.
/// @param simdWidth       number of claims after which the used-body list is reset.
/// @param staticIdx       body index that is treated as static.
/// @param numBodies       not read by this function.
/// @return the number of passes, i.e. the number of batch indices handed out.
inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies)
{
	B3_PROFILE("sortConstraintByBatch2");

	// List of the bodies claimed by the current group (at most two per contact).
	const int usedCapacity = 2 * simdWidth;
	bodyUsed2.resize(usedCapacity);
	for (int u = 0; u < usedCapacity; u++)
	{
		bodyUsed2[u] = 0;
	}
	int usedCount = 0;

	m_data->m_sortData.resize(numConstraints);
	m_data->m_idxBuffer.resize(numConstraints);
	m_data->m_old.resize(numConstraints);

	unsigned int* order = &m_data->m_idxBuffer[0];

#if defined(_DEBUG)
	for (int c = 0; c < numConstraints; c++)
	{
		cs[c].getBatchIdx() = -1;
	}
#endif
	// Start from the identity ordering.
	for (int c = 0; c < numConstraints; c++)
	{
		order[c] = c;
	}

	int placed = 0;  // entries of 'order' that already hold their final value
	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");

		while (placed < numConstraints)
		{
			int claimsInGroup = 0;

			// Empty the used-body list left over from the previous pass.
			for (int u = 0; u < usedCount; u++)
			{
				bodyUsed2[u] = 0;
			}
			usedCount = 0;

			for (int i = placed; i < numConstraints; i++)
			{
				int idx = order[i];
				b3Assert(idx < numConstraints);

				const int signedA = cs[idx].m_bodyAPtrAndSignBit;
				const int signedB = cs[idx].m_bodyBPtrAndSignBit;
				const int bodyA = abs(signedA);
				const int bodyB = abs(signedB);
				const bool aIsStatic = (signedA < 0) || (signedA == staticIdx);
				const bool bIsStatic = (signedB < 0) || (signedB == staticIdx);

				// Static bodies never block; B is only searched for when A is free.
				bool aBusy = false;
				if (!aIsStatic)
				{
					for (int j = 0; j < usedCount && !aBusy; j++)
					{
						aBusy = (bodyA == bodyUsed2[j]);
					}
				}
				bool bBusy = false;
				if (!aBusy && !bIsStatic)
				{
					for (int j = 0; j < usedCount && !bBusy; j++)
					{
						bBusy = (bodyB == bodyUsed2[j]);
					}
				}

				if (aBusy || bBusy)
				{
					continue;
				}

				// Claim both dynamic bodies for the current group.
				if (!aIsStatic)
				{
					bodyUsed2[usedCount++] = bodyA;
				}
				if (!bIsStatic)
				{
					bodyUsed2[usedCount++] = bodyB;
				}

				cs[idx].getBatchIdx() = batchIdx;
				m_data->m_sortData[idx].m_key = batchIdx;
				m_data->m_sortData[idx].m_value = idx;

				// Move the claimed index to the front of the unplaced range.
				if (i != placed)
				{
					b3Swap(order[i], order[placed]);
				}
				placed++;

				claimsInGroup++;
				if (claimsInGroup == simdWidth)
				{
					// Group is full: start a fresh one with an empty used-body list.
					claimsInGroup = 0;
					for (int u = 0; u < usedCount; u++)
					{
						bodyUsed2[u] = 0;
					}
					usedCount = 0;
				}
			}

			batchIdx++;
		}
	}

	{
		// The sort itself is disabled; only the profiling scope remains.
		B3_PROFILE("quickSort");
		//m_data->m_sortData.quickSort(sortfnc);
	}

	{
		B3_PROFILE("reorder");

		// Snapshot the contacts, then write them back in claim order.
		memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);

		for (int c = 0; c < numConstraints; c++)
		{
			b3Assert(m_data->m_sortData[order[c]].m_value == order[c]);
			int from = m_data->m_sortData[order[c]].m_value;
			cs[c] = m_data->m_old[from];
		}
	}

#if defined(_DEBUG)
	for (int c = 0; c < numConstraints; c++)
	{
		b3Assert(cs[c].getBatchIdx() != -1);
	}
#endif

	return batchIdx;
}
Ejemplo n.º 8
0
/// Assigns a batch index to every contact in cs[0 .. n), sorts the (batch,
/// index) records with sortData.quickSort(sortfnc) and reorders cs to follow
/// the sorted records.
///
/// Each pass of the outer loop sweeps the contacts that are still pending and
/// claims every contact whose non-static bodies have their flag bit clear.
/// Bodies are mapped onto a 256-bit flag set by the low 8 bits of their index,
/// so two different bodies can share a bit. The flags are cleared after every
/// simdWidth claims within a pass. Contacts that could not be claimed are kept
/// for the next pass. A body counts as static when its signed index is
/// negative or equals staticIdx.
///
/// @param cs         contacts to batch and reorder (modified in place).
/// @param n          number of entries in cs.
/// @param simdWidth  number of claims after which the flags are reset.
/// @param staticIdx  body index that is treated as static.
/// @param numBodies  not read by this function.
/// @return the number of passes, i.e. the number of batch indices handed out.
inline int b3GpuPgsContactSolver::sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies)
{
	B3_PROFILE("sortConstraintByBatch");

	sortData.resize(n);
	idxBuffer.resize(n);
	old.resize(n);

	// Both cursors address the same buffer: rejected indices are compacted in
	// place, which works because an entry is always read before the write
	// position can reach it.
	unsigned int* pending = &idxBuffer[0];
	unsigned int* deferred = &idxBuffer[0];
	int pendingCount;
	int deferredCount;

	const int N_FLG = 256;
	const int FLG_MASK = N_FLG - 1;
	unsigned int flg[N_FLG / 32];

#if defined(_DEBUG)
	for (int c = 0; c < n; c++)
	{
		cs[c].getBatchIdx() = -1;
	}
#endif
	// Every contact starts out pending.
	for (int c = 0; c < n; c++)
	{
		pending[c] = c;
	}
	pendingCount = n;

	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");

		while (pendingCount)
		{
			deferredCount = 0;
			int claimsInGroup = 0;

			for (int w = 0; w < N_FLG / 32; w++)
			{
				flg[w] = 0;
			}

			for (int i = 0; i < pendingCount; i++)
			{
				int idx = pending[i];
				b3Assert(idx < n);

				const int signedA = cs[idx].m_bodyAPtrAndSignBit;
				const int signedB = cs[idx].m_bodyBPtrAndSignBit;

				// Flag position of each body: low bits of its absolute index.
				const int flagA = abs(signedA) & FLG_MASK;
				const int flagB = abs(signedB) & FLG_MASK;

				const bool aIsStatic = (signedA < 0) || (signedA == staticIdx);
				const bool bIsStatic = (signedB < 0) || (signedB == staticIdx);

				// Static bodies never block a contact.
				unsigned int aBusy = 0;
				if (!aIsStatic)
				{
					aBusy = flg[flagA / 32] & (1 << (flagA & 31));
				}
				unsigned int bBusy = 0;
				if (!bIsStatic)
				{
					bBusy = flg[flagB / 32] & (1 << (flagB & 31));
				}

				if (aBusy != 0 || bBusy != 0)
				{
					// Try again in the next pass.
					deferred[deferredCount++] = idx;
					continue;
				}

				if (!aIsStatic)
				{
					flg[flagA / 32] |= (1 << (flagA & 31));
				}
				if (!bIsStatic)
				{
					flg[flagB / 32] |= (1 << (flagB & 31));
				}

				cs[idx].getBatchIdx() = batchIdx;
				sortData[idx].m_key = batchIdx;
				sortData[idx].m_value = idx;

				claimsInGroup++;
				if (claimsInGroup == simdWidth)
				{
					// Group is full: start a fresh one with all flags clear.
					claimsInGroup = 0;
					for (int w = 0; w < N_FLG / 32; w++)
					{
						flg[w] = 0;
					}
				}
			}

			// The deferred list becomes the pending list of the next pass.
			b3Swap(pending, deferred);
			b3Swap(pendingCount, deferredCount);
			batchIdx++;
		}
	}

	{
		B3_PROFILE("quickSort");
		sortData.quickSort(sortfnc);
	}

	{
		B3_PROFILE("reorder");

		// Snapshot the contacts, then write them back in sorted-record order.
		memcpy(&old[0], cs, sizeof(b3Contact4) * n);
		for (int c = 0; c < n; c++)
		{
			int from = sortData[c].m_value;
			cs[c] = old[from];
		}
	}

#if defined(_DEBUG)
	for (int c = 0; c < n; c++)
	{
		b3Assert(cs[c].getBatchIdx() != -1);
	}
#endif

	return batchIdx;
}
Ejemplo n.º 9
0
void* b3HashedOverlappingPairCache::removeOverlappingPair(int proxy0, int proxy1,b3Dispatcher* dispatcher)
{
	b3g_removePairs++;
	if(proxy0>proxy1) 
		b3Swap(proxy0,proxy1);
	int proxyId1 = proxy0;
	int proxyId2 = proxy1;

	/*if (proxyId1 > proxyId2) 
		b3Swap(proxyId1, proxyId2);*/

	int	hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1),static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity()-1));

	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
	if (pair == NULL)
	{
		return 0;
	}

	cleanOverlappingPair(*pair,dispatcher);

	

	int pairIndex = int(pair - &m_overlappingPairArray[0]);
	b3Assert(pairIndex < m_overlappingPairArray.size());

	// Remove the pair from the hash table.
	int index = m_hashTable[hash];
	b3Assert(index != B3_NULL_PAIR);

	int previous = B3_NULL_PAIR;
	while (index != pairIndex)
	{
		previous = index;
		index = m_next[index];
	}

	if (previous != B3_NULL_PAIR)
	{
		b3Assert(m_next[previous] == pairIndex);
		m_next[previous] = m_next[pairIndex];
	}
	else
	{
		m_hashTable[hash] = m_next[pairIndex];
	}

	// We now move the last pair into spot of the
	// pair being removed. We need to fix the hash
	// table indices to support the move.

	int lastPairIndex = m_overlappingPairArray.size() - 1;

	//if (m_ghostPairCallback)
	//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);

	// If the removed pair is the last pair, we are done.
	if (lastPairIndex == pairIndex)
	{
		m_overlappingPairArray.pop_back();
		return 0;
	}

	// Remove the last pair from the hash table.
	const b3BroadphasePair* last = &m_overlappingPairArray[lastPairIndex];
		/* missing swap here too, Nat. */ 
	int lastHash = static_cast<int>(getHash(static_cast<unsigned int>(last->x), static_cast<unsigned int>(last->y)) & (m_overlappingPairArray.capacity()-1));

	index = m_hashTable[lastHash];
	b3Assert(index != B3_NULL_PAIR);

	previous = B3_NULL_PAIR;
	while (index != lastPairIndex)
	{
		previous = index;
		index = m_next[index];
	}

	if (previous != B3_NULL_PAIR)
	{
		b3Assert(m_next[previous] == lastPairIndex);
		m_next[previous] = m_next[lastPairIndex];
	}
	else
	{
		m_hashTable[lastHash] = m_next[lastPairIndex];
	}

	// Copy the last pair into the remove pair's spot.
	m_overlappingPairArray[pairIndex] = m_overlappingPairArray[lastPairIndex];

	// Insert the last pair into the hash table
	m_next[pairIndex] = m_hashTable[lastHash];
	m_hashTable[lastHash] = pairIndex;

	m_overlappingPairArray.pop_back();

	return 0;
}
Ejemplo n.º 10
0
/// Sorts @p inout on the host by ascending m_key with a least-significant-digit
/// radix sort that consumes 8 key bits per pass.
///
/// Passes are run for start bits 0, 8, 16, ... while the start bit is below
/// @p sortBits, so only the low key bits covered by those passes take part in
/// the ordering. Each pass is a counting sort that preserves the relative
/// order of equal digits; the passes alternate between @p inout and a
/// temporary buffer of the same size.
///
/// When the number of passes is odd the sorted sequence ends up in the
/// temporary buffer; it is then copied back so that @p inout always holds the
/// result (previously this case only hit b3Assert(0) and the result was lost).
/// An empty array is returned untouched.
///
/// @param inout     key/value records, sorted in place.
/// @param sortBits  number of low key bits to sort on (default 32 per the
///                  signature comment).
///                  NOTE(review): assumes sortBits does not exceed the bit
///                  width of m_key - confirm against b3SortData.
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	if (n <= 0)
	{
		// Nothing to sort; also avoids taking the address of element 0 of an empty array.
		return;
	}

	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1<<BITS_PER_PASS);

	int tables[NUM_TABLES];

	b3SortData* src = &inout[0];
	b3AlignedObjectArray<b3SortData> workbuffer;
	workbuffer.resize(n);
	b3SortData* dst = &workbuffer[0];

	int count=0;
	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
	{
		//	histogram of the current digit
		for(int i=0; i<NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		for(int i=0; i<n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n",NUM_TABLES);
		for (int i=0;i<NUM_TABLES;i++)
		{
			if (tables[i]!=0)
			{
				printf("tables[%d]=%d]\n",i,tables[i]);
			}

		}
#endif //TEST
		//	exclusive prefix scan: tables[d] becomes the first output index for digit d
		int sum = 0;
		for(int i=0; i<NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
		}

		//	distribute: each record goes to the next free index of its digit
		for(int i=0; i<n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);

			dst[tables[tableIdx]] = src[i];
			tables[tableIdx]++;
		}

		//	the output of this pass is the input of the next one
		b3Swap( src, dst );
		count++;
	}

	if (count&1)
	{
		//	odd number of passes: 'src' now points into workbuffer and 'dst'
		//	into inout, so the result has to be copied back to the caller's array
		for(int i=0; i<n; i++)
		{
			dst[i] = src[i];
		}
	}
}
Ejemplo n.º 11
0
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	
	int originalSize = keyValuesInOut.size();
	int workingSize = originalSize;
	
			
	int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
    b3AlignedObjectArray<b3SortData>   test2;
    keyValuesInOut.copyToHost(test2);
    printf("numElem = %d\n",test2.size());
    for (int i=0;i<test2.size();i++)
    {
        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
    }
#endif //DEBUG_RADIXSORT2
    
	b3OpenCLArray<b3SortData>* src = 0;

	if (workingSize%dataAlignment)
	{
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
		m_workBuffer4->resize(workingSize);
		b3SortData fillValue;
		fillValue.m_key = 0xffffffff;
		fillValue.m_value = 0xffffffff;

#define USE_BTFILL
#ifdef USE_BTFILL
		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
#else
		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
		
		for (int i=originalSize; i<workingSize;i++)
		{
			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
		}
#endif//USE_BTFILL

		src = m_workBuffer4;
	} else
	{
		src = &keyValuesInOut;
		m_workBuffer4->resize(0);
	}
		
	b3Assert( workingSize%DATA_ALIGNMENT == 0 );
	int minCap = NUM_BUCKET*NUM_WGS;


	int n = workingSize;

	m_workBuffer1->resize(minCap);
	m_workBuffer3->resize(workingSize);
	

//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	b3Assert( BITS_PER_PASS == 4 );
	b3Assert( WG_SIZE == 64 );
	b3Assert( (sortBits&0x3) == 0 );

	
	
	b3OpenCLArray<b3SortData>* dst = m_workBuffer3;

	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;


	int nWGs = NUM_WGS;
	b3ConstData cdata;

	{
        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
     	int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	int count=0;
	for(int ib=0; ib<sortBits; ib+=4)
	{
#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
		cdata.m_startBit = ib;
		
		if (src->size())
		{
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst(  cdata );
			
			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

        
        
#ifdef DEBUG_RADIXSORT
		b3AlignedObjectArray<unsigned int> testHist;
		srcHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
	
	

//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	bool fastScan=!m_deviceCPU;//only use fast scan on GPU
#else
	bool fastScan=false;
#endif

		if (fastScan)
		{//	prefix scan group histogram
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
			b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( 128, 128 );
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
		}


#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
        
        for (int i=0;i<testHist.size();i+=NUM_WGS)
		{
				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
		}

#endif //DEBUG_RADIXSORT

#define USE_GPU
#ifdef USE_GPU
        
		if (src->size())
		{//	local sort and distribute
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
			b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
            
		}
#else
        {
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            int tables[NUM_TABLES];
            int startBit = ib;
            
            destHisto->copyToHost(testHist);
            b3AlignedObjectArray<b3SortData> srcHost;
            b3AlignedObjectArray<b3SortData> dstHost;
            dstHost.resize(src->size());
            
            src->copyToHost(srcHost);
            
            for (int i=0;i<NUM_TABLES;i++)
            {
                tables[i] = testHist[i*NUM_WGS];
            }
            
            //	distribute
            for(int i=0; i<n; i++)
            {
                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                
                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                counter2[tableIdx] ++;
            }
            
            
#else
          
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            int tables[NUM_TABLES];
             b3AlignedObjectArray<b3SortData> dstHostOK;
            dstHostOK.resize(src->size());

            destHisto->copyToHost(testHist);
            b3AlignedObjectArray<b3SortData> srcHost;
            src->copyToHost(srcHost);
        
            int blockSize = 256;
            int nBlocksPerWG = cdata.m_nBlocksPerWG;
            int startBit = ib;

            {
                for (int i=0;i<NUM_TABLES;i++)
                {
                    tables[i] = testHist[i*NUM_WGS];
                }
                
                //	distribute
                for(int i=0; i<n; i++)
                {
                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                    
                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                    counter2[tableIdx] ++;
                }

            
            }
            
            
            b3AlignedObjectArray<b3SortData> dstHost;
            dstHost.resize(src->size());
            
            
            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            
            
            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
            {
              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
                
              for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
              {
                for (int lIdx = 0;lIdx < 64;lIdx++)
                {
                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
                    
                    //	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
                    //	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
                    //	AMD: AtomInc performs better while NV prefers ++
                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
                    {
                        if( addr+j < n )
                        {
                          //  printf ("addr+j=%d\n", addr+j);
                            
                            int i = addr+j;
                            
                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                            
                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
                            
                            b3SortData ok = dstHostOK[destIndex];
                                                    
                            if (ok.m_key != srcHost[i].m_key)
                            {
                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
                            }
                            if (ok.m_value != srcHost[i].m_value)
                            {
                                
                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );

                            }
                   
                            dstHost[destIndex] = srcHost[i];
                            counter[tableIdx] ++;
                            
                        }
                    }
                }
              }
            }
            
         
#endif //SEQUENTIAL
            
            dst->copyFromHost(dstHost);
        }
#endif//USE_GPU
        
        
        
#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
		b3Swap(src, dst );
		b3Swap(srcHisto,destHisto);

#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
        count++;
                
        
	}