Exemplo n.º 1
0
// Builds solver constraints from contacts by launching m_contactToConstraintKernel.
//
// bodyBuf / shapeBuf : body and inertia buffers handed to the kernel as buffer arguments.
// contactsIn         : input contacts (first kernel buffer).
// contactCOut        : output constraints; resized to nContacts before the launch.
// additionalData     : not used by this function.
// nContacts          : number of contacts; also the 1D launch size (work-group size 64).
// cfg                : supplies m_dt, m_positionDrift and m_positionConstraintCoeff, which are
//                      passed to the kernel as individual scalar arguments.
//
// The function blocks on clFinish(m_queue) before returning.
void Solver::convertToConstraints( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf,
                                   const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf,
                                   btOpenCLArray<Contact4>* contactsIn, btOpenCLArray<Constraint4>* contactCOut, void* additionalData,
                                   int nContacts, const ConstraintCfg& cfg )
{
    // Size the output first: the kernel writes into contactCOut's buffer, so the buffer handle
    // passed below must already be the one that can hold nContacts constraints.
    contactCOut->resize(nContacts);

    // Typed staging area for the scalar kernel arguments (int + three floats).
    struct CB
    {
        int m_nContacts;
        float m_dt;
        float m_positionDrift;
        float m_positionConstraintCoeff;
    };

    {
        BT_PROFILE("m_contactToConstraintKernel");
        CB cdata;
        cdata.m_nContacts = nContacts;
        cdata.m_dt = cfg.m_dt;
        cdata.m_positionDrift = cfg.m_positionDrift;
        cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;

        btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ),
                                   btBufferInfoCL( bodyBuf->getBufferCL() ),
                                   btBufferInfoCL( shapeBuf->getBufferCL() ),
                                   btBufferInfoCL( contactCOut->getBufferCL() )
                                 };
        btLauncherCL launcher( m_queue, m_contactToConstraintKernel );
        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );

        // The constants are passed one by one rather than as a single CB struct.
        launcher.setConst(cdata.m_nContacts);
        launcher.setConst(cdata.m_dt);
        launcher.setConst(cdata.m_positionDrift);
        launcher.setConst(cdata.m_positionConstraintCoeff);

        launcher.launch1D( nContacts, 64 );
        clFinish(m_queue);
    }
}
// Runs one bound-search pass over the sorted key/value array `src`.
//
// src    : sort data, passed to the kernels as a read-only buffer.
// nSrc   : number of entries in src.
// dst    : output buffer of unsigned ints.
// nDst   : number of entries in dst.
// option : BOUND_LOWER -> m_lowerSortDataKernel over nSrc work items,
//          BOUND_UPPER -> m_upperSortDataKernel over nSrc+1 work items,
//          COUNT       -> zero-fills m_lower / m_upper, runs both bound passes into them and
//                         then m_subtractKernel (upper, lower -> dst) over nDst work items.
//          Any other value trips btAssert(0).
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, unsigned int nSrc, btOpenCLArray<unsigned int>& dst, unsigned int nDst, Option option )
{
	if( option == BOUND_LOWER )
	{
		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) };

		btLauncherCL launcher( m_queue, m_lowerSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( nSrc );
		launcher.setConst( nDst );

		launcher.launch1D( nSrc, 64 );
	}
	else if( option == BOUND_UPPER )
	{
		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };

		btLauncherCL launcher(m_queue, m_upperSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( nSrc );
		launcher.setConst( nDst );

		// One more work item than the lower pass.
		launcher.launch1D( nSrc+1, 64 );
	}
	else if( option == COUNT )
	{
		btAssert( m_lower );
		btAssert( m_upper );
		// NOTE(review): nDst entries of m_lower/m_upper are written below, so a check of the form
		// capacity() >= nDst would be expected here; the comparison looks inverted - confirm.
		btAssert( m_lower->capacity() <= (int)nDst );
		btAssert( m_upper->capacity() <= (int)nDst );

		// Clear both scratch arrays over nDst entries before the bound passes write into them.
		int zero = 0;
		m_filler->execute( *m_lower, zero, nDst );
		m_filler->execute( *m_upper, zero, nDst );

		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );

		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };

			btLauncherCL  launcher( m_queue, m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( nSrc );
			launcher.setConst( nDst );

			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		// Unknown option.
		btAssert( 0 );
	}
}
Exemplo n.º 3
0
// Launches m_fillUnsignedIntKernel on `src` over n work items, passing n and `value` as
// scalar kernel arguments. n must be positive (asserted).
// NOTE(review): `offset` is accepted but never forwarded to the kernel in this overload -
// confirm against the kernel signature whether that is intended.
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int& value, int n, int offset)
{
	btAssert( n>0 );

	// Single buffer argument: the array being written.
	btBufferInfoCL kernelBuffers[] = { btBufferInfoCL( src.getBufferCL() ) };

	btLauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );
	fillLauncher.setBuffers( kernelBuffers, sizeof(kernelBuffers)/sizeof(btBufferInfoCL) );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.launch1D( n );
}
Exemplo n.º 4
0
// Launches m_fillUnsignedIntKernel on `src` over n work items. The offset, the element count
// and `value` (replicated into all four lanes of m_UnsignedData) travel to the kernel packed
// in one btConstData argument. n must be positive (asserted).
// NOTE(review): this int overload goes through the unsigned-int kernel and the unsigned data
// member - presumably relying on identical bit patterns; confirm.
void btFillCL::execute(btOpenCLArray<int>& src, const int& value, int n, int offset)
{
	btAssert( n>0 );

	btConstData fillParams;
	fillParams.m_offset = offset;
	fillParams.m_n = n;
	fillParams.m_UnsignedData = btMakeUnsignedInt4( value,value,value,value );

	// Single buffer argument: the array being written.
	btBufferInfoCL kernelBuffers[] = { btBufferInfoCL( src.getBufferCL() ) };

	btLauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );
	fillLauncher.setBuffers( kernelBuffers, sizeof(kernelBuffers)/sizeof(btBufferInfoCL) );
	fillLauncher.setConst( fillParams );
	fillLauncher.launch1D( n );
}
Exemplo n.º 5
0
// Launches m_fillKernelInt2 on `src` over n work items, passing n, `value` and `offset`
// as three separate scalar kernel arguments. n must be positive (asserted).
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	btAssert( n>0 );

	// Single buffer argument: the array being written.
	btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };

	btLauncherCL launcher(m_commandQueue, m_fillKernelInt2);
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
	// Argument order matters: count, fill value, start offset.
	launcher.setConst(n);
	launcher.setConst(value);
	launcher.setConst(offset);

	launcher.launch1D( n );
}
Exemplo n.º 6
0
// Rebuilds m_overlappingPairs from the current AABBs.
//
// forceHost == true : copies m_allAabbsGPU to the host, refreshes m_smallAabbsCPU and
//                     m_largeAabbsCPU from it, then brute-force tests small-vs-small and
//                     small-vs-large boxes on the CPU and uploads the resulting pairs.
// forceHost == false: refreshes m_smallAabbsGPU / m_largeAabbsGPU with m_copyAabbsKernel,
//                     builds sort keys (m_flipFloatKernel), sorts them (m_sorter), scatters the
//                     small AABBs into m_gpuSmallSortedAabbs, then runs m_sap2Kernel
//                     (large vs sorted small) and m_sapKernel (sorted small only). Both kernels
//                     are given m_overlappingPairs and a one-element pair counter; the counter
//                     is read back, clamped to maxPairs and used as the final array size.
void  btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
	int axis = 0;//todo on GPU for now hardcode

	btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());

	if (forceHost)
	{
		btAlignedObjectArray<btSapAabb> allHostAabbs;
		m_allAabbsGPU.copyToHost(allHostAabbs);

		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int j=0;j<numSmallAabbs;j++)
			{
				//sync aabb: slot 3 of m_signedMaxIndices holds the index into the full AABB array;
				//it is restored after the copy overwrites the whole entry
				int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
				m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}

		{
			int numLargeAabbs = m_largeAabbsCPU.size();
			for (int j=0;j<numLargeAabbs;j++)
			{
				//sync aabb (same scheme as for the small AABBs above)
				int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
				m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}

		btAlignedObjectArray<btInt2> hostPairs;

		//small versus small: every unordered pair (i<j) is tested once
		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int i=0;i<numSmallAabbs;i++)
			{
				for (int j=i+1;j<numSmallAabbs;j++)
				{
					if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
						(btVector3&)m_smallAabbsCPU[j].m_min,(btVector3&)m_smallAabbsCPU[j].m_max))
					{
						btInt2 pair;
						pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
						pair.y = m_smallAabbsCPU[j].m_minIndices[3];
						hostPairs.push_back(pair);
					}
				}
			}
		}

		//small versus large: the large AABB index goes into pair.x
		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			int numLargeAabbs = m_largeAabbsCPU.size();
			for (int i=0;i<numSmallAabbs;i++)
			{
				for (int j=0;j<numLargeAabbs;j++)
				{
					if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
						(btVector3&)m_largeAabbsCPU[j].m_min,(btVector3&)m_largeAabbsCPU[j].m_max))
					{
						btInt2 pair;
						pair.x = m_largeAabbsCPU[j].m_minIndices[3];
						pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
						hostPairs.push_back(pair);
					}
				}
			}
		}

		if (hostPairs.size())
		{
			m_overlappingPairs.copyFromHost(hostPairs);
		} else
		{
			m_overlappingPairs.resize(0);
		}

		return;
	}

	{

	//when true, the small/large AABB lists are refreshed through the host instead of the copy kernel
	bool syncOnHost = false;

	if (syncOnHost)
	{
		BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
		btAlignedObjectArray<btSapAabb> allHostAabbs;
		m_allAabbsGPU.copyToHost(allHostAabbs);

		m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int j=0;j<numSmallAabbs;j++)
			{
				//sync aabb
				int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
				m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}
		m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);

	} else
	{
		{
			int numSmallAabbs = m_smallAabbsGPU.size();
			//skip the launch when there is nothing to copy: a zero-sized ND-range is not a valid launch
			if (numSmallAabbs)
			{
				BT_PROFILE("copyAabbsKernelSmall");
				btBufferInfoCL bInfo[] = {
					btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
					btBufferInfoCL( m_smallAabbsGPU.getBufferCL()),
				};

				btLauncherCL launcher(m_queue, m_copyAabbsKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
				launcher.setConst( numSmallAabbs  );
				int num = numSmallAabbs;
				launcher.launch1D( num);
				clFinish(m_queue);
			}
		}
	}

	if (syncOnHost)
	{
		BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
		btAlignedObjectArray<btSapAabb> allHostAabbs;
		m_allAabbsGPU.copyToHost(allHostAabbs);

		m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
		{
			int numLargeAabbs = m_largeAabbsCPU.size();
			for (int j=0;j<numLargeAabbs;j++)
			{
				//sync aabb
				int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
				m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}
		m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);

	} else
	{
		int numLargeAabbs = m_largeAabbsGPU.size();

		if (numLargeAabbs)
		{
			BT_PROFILE("copyAabbsKernelLarge");
			btBufferInfoCL bInfo[] = {
				btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
				btBufferInfoCL( m_largeAabbsGPU.getBufferCL()),
			};

			btLauncherCL launcher(m_queue, m_copyAabbsKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numLargeAabbs  );
			int num = numLargeAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}
	}




		BT_PROFILE("GPU SAP");

		int numSmallAabbs = m_smallAabbsGPU.size();
		m_gpuSmallSortData.resize(numSmallAabbs);

#if 1
		//build one sort entry per small AABB for the chosen axis
		if (m_smallAabbsGPU.size())
		{
			BT_PROFILE("flipFloatKernel");
			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL())};
			btLauncherCL launcher(m_queue, m_flipFloatKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numSmallAabbs  );
			launcher.setConst( axis  );

			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		{
			BT_PROFILE("gpu radix sort\n");
			m_sorter->execute(m_gpuSmallSortData);
			clFinish(m_queue);
		}

		//write the small AABBs into m_gpuSmallSortedAabbs using the sorted entries
		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
		if (numSmallAabbs)
		{
			BT_PROFILE("scatterKernel");
			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
			btLauncherCL launcher(m_queue, m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numSmallAabbs);
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);

		}


			//the pair buffer is sized for a fixed budget per small AABB; results beyond it are clamped away below
			int maxPairsPerBody = 64;
			int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
			m_overlappingPairs.resize(maxPairs);

			//single-element counter shared by both SAP kernels
			btOpenCLArray<int> pairCount(m_context, m_queue);
			pairCount.push_back(0);
            int numPairs=0;

			{
				int numLargeAabbs = m_largeAabbsGPU.size();
				if (numLargeAabbs && numSmallAabbs)
				{
					BT_PROFILE("sap2Kernel");
					btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
					btLauncherCL launcher(m_queue, m_sap2Kernel);
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
					launcher.setConst(   numLargeAabbs  );
					launcher.setConst( numSmallAabbs);
					launcher.setConst( axis  );
					launcher.setConst( maxPairs  );
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
					launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);

					numPairs = pairCount.at(0);
					if (numPairs >maxPairs)
						numPairs =maxPairs;

				}
			}
			if (m_gpuSmallSortedAabbs.size())
			{
				BT_PROFILE("sapKernel");
				btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
				btLauncherCL launcher(m_queue, m_sapKernel);
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
				launcher.setConst( numSmallAabbs  );
				launcher.setConst( axis  );
				launcher.setConst( maxPairs  );


				int num = numSmallAabbs;
#if 0
                //debug aid: dump the serialized kernel arguments (plus the launch size) to disk
                int buffSize = launcher.getSerializationBufferSize();
                unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
                for (int i=0;i<buffSize+1;i++)
                {
                    unsigned char* ptr = (unsigned char*)&buf[i];
                    *ptr = 0xff;
                }
                int actualWrite = launcher.serializeArguments(buf,buffSize);

                unsigned char* cptr = (unsigned char*)&buf[buffSize];
    //            printf("buf[buffSize] = %d\n",*cptr);

                assert(buf[buffSize]==0xff);//check for buffer overrun
                int* ptr = (int*)&buf[buffSize];

                *ptr = num;

                FILE* f = fopen("m_sapKernelArgs.bin","wb");
                fwrite(buf,buffSize+sizeof(int),1,f);
                fclose(f);
#endif//

                launcher.launch1D( num);
				clFinish(m_queue);

                //the counter now covers the pairs of both kernels
                numPairs = pairCount.at(0);
                if (numPairs>maxPairs)
					numPairs = maxPairs;
			}

#else
        //debug aid: replay a kernel launch from arguments previously dumped to m_sapKernelArgs.bin
        int numPairs = 0;


        btLauncherCL launcher(m_queue, m_sapKernel);

        const char* fileName = "m_sapKernelArgs.bin";
        FILE* f = fopen(fileName,"rb");
        if (f)
        {
            int sizeInBytes=0;
            if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
            {
                printf("error, cannot get file size\n");
                exit(0);
            }

            unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
            fread(buf,sizeInBytes,1,f);
            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
            int num = *(int*)&buf[serializedBytes];
            launcher.launch1D( num);

            btOpenCLArray<int> pairCount(m_context, m_queue);
            int numElements = launcher.m_arrays[2]->size()/sizeof(int);
            pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
            numPairs = pairCount.at(0);
            //printf("overlapping pairs = %d\n",numPairs);
            btAlignedObjectArray<btInt2>		hostOoverlappingPairs;
            btOpenCLArray<btInt2> tmpGpuPairs(m_context,m_queue);
            tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );

            tmpGpuPairs.copyToHost(hostOoverlappingPairs);
            m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
            //printf("hello %d\n", m_overlappingPairs.size());
            free(buf);
            fclose(f);

        } else {
            printf("error: cannot find file %s\n",fileName);
        }

        clFinish(m_queue);


#endif


        //shrink the pair array from its budget size to the number of pairs actually reported
        m_overlappingPairs.resize(numPairs);

	}//BT_PROFILE("GPU_RADIX SORT");

}
// Runs the narrowphase on the given broadphase pairs and then the GPU contact solver.
//
// broadphasePairs    : OpenCL buffer of pairs; handed to a btOpenCLArray<int2> through
//                      setFromOpenCLBuffer.
// numBroadphasePairs : number of pairs in that buffer.
//
// Steps, in order: culling of the broadphase pairs, plane-versus-convex contacts (only when
// m_planeBodyIndex >= 0), convex-versus-convex contacts, and - if any contact was produced -
// sort-data setup, radix sort, bound search + scan, contact reordering, copy-back, batching,
// constraint conversion and finally the contact solve (4 iterations).
//
// Returns early when there is no internal data, no accelerated rigid body, or no contact.
// numPairsOut and gpuBatchContacts are declared outside this function.
void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs) 
{


	BT_PROFILE("computeContactsAndSolver");
	// Everything below dereferences m_internalData, so bail out instead of crashing when it is missing.
	if (!m_internalData)
		return;
	int maxBodyIndex = m_internalData->m_numAcceleratedRigidBodies;

	if (!maxBodyIndex)
		return;
	int numOfConvexRBodies = maxBodyIndex;	// only referenced by the disabled read-back block at the end

	ChNarrowphaseBase::Config cfgNP;
	cfgNP.m_collisionMargin = 0.01f;
	int nContactOut = 0;
	//printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size);


	btOpenCLArray<int2> broadphasePairsGPU(m_context,m_queue);
	broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs);

	bool useCulling = true;
	if (useCulling)
	{
		BT_PROFILE("ChNarrowphase::culling");
		clFinish(m_queue);

		numPairsOut = m_internalData->m_narrowPhase->culling(
			&broadphasePairsGPU, 
			numBroadphasePairs,
			m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
			m_internalData->m_convexPairsOutGPU,
			cfgNP);
	}
	{
			if (m_planeBodyIndex>=0)
			{
				BT_PROFILE("ChNarrowphase:: plane versus convex");
				// Pair the plane body with every other accelerated body.
				//todo: get rid of this dynamic allocation
				int2* hostPairs = new int2[m_internalData->m_numAcceleratedRigidBodies-1];
				int index=0;
				for (int i=0;i<m_internalData->m_numAcceleratedRigidBodies;i++)
				{
					if (i!=m_planeBodyIndex)
					{
						hostPairs[index].x = m_planeBodyIndex;
						hostPairs[index].y = i;
						index++;
					}
				}
				assert(m_internalData->m_numAcceleratedRigidBodies-1 == index);
				m_internalData->m_planePairs->copyFromHostPointer(hostPairs,index);
				clFinish(m_queue);

				delete[]hostPairs;
				//convex versus plane
				m_internalData->m_narrowPhase->execute(m_internalData->m_planePairs, index, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, 
					0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
			}
	}
	{
		BT_PROFILE("ChNarrowphase::execute");
		if (useCulling)
		{
			//convex versus convex
			//m_internalData->m_narrowPhase->execute(m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
#define USE_CONVEX_CONVEX_HOST 1
#ifdef USE_CONVEX_CONVEX_HOST
			// Give both arrays their logical size before the host routine works with them.
			m_internalData->m_convexPairsOutGPU->resize(numPairsOut);
			m_internalData->m_pBufContactOutGPU->resize(nContactOut);
			
			m_internalData->m_gpuSatCollision->computeConvexConvexContactsHost(
				m_internalData->m_convexPairsOutGPU,
				numPairsOut, 
				m_internalData->m_bodyBufferGPU, 
				m_internalData->m_ShapeBuffer, 
				m_internalData->m_pBufContactOutGPU, 
				nContactOut, cfgNP, m_internalData->m_convexPolyhedra,m_internalData->m_convexVertices,m_internalData->m_uniqueEdges,
				m_internalData->m_convexFaces,m_internalData->m_convexIndices);
#else

			m_internalData->m_narrowPhase->execute(
				m_internalData->m_convexPairsOutGPU,
				numPairsOut, 
				m_internalData->m_bodyBufferGPU, 
				m_internalData->m_ShapeBuffer, 
				m_internalData->m_pBufContactOutGPU, 
				nContactOut, cfgNP);
#endif


		} else
		{
			m_internalData->m_narrowPhase->execute(&broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
		}

		clFinish(m_queue);
	}
	
	// Nothing to solve without contacts.
	if (!nContactOut)
		return;
	

	bool useSolver = true;//true;//false;

	if (useSolver)
	{
		float dt=1./60.;
		SolverBase::ConstraintCfg csCfg( dt );
		csCfg.m_enableParallelSolve = true;
		csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
		csCfg.m_staticIdx = m_planeBodyIndex;

		btOpenCLArray<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU;
		const btOpenCLArray<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU;
		void* additionalData = m_internalData->m_frictionCGPU;
		const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU;
		SolverData contactCOut = m_internalData->m_contactCGPU;
		int nContacts = nContactOut;

		{
			BT_PROFILE("GPU batch");

			{
				// Make sure the solver's scratch contact buffer exists and holds nContacts entries.
				//@todo: just reserve it, without copy of original contact (unless we use warmstarting)
				if( m_internalData->m_solverGPU->m_contactBuffer)
				{
					m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts);
				}

				if( m_internalData->m_solverGPU->m_contactBuffer == 0 )
				{
					m_internalData->m_solverGPU->m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue, nContacts );
					m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts);
				}

				btOpenCLArray<Contact4>* contactNative  = contactsIn;
				

				{
					
					//btOpenCLArray<RigidBodyBase::Body>* bodyNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
					//btOpenCLArray<Contact4>* contactNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );

					const int sortAlignment = 512; // todo. get this out of sort
					if( csCfg.m_enableParallelSolve )
					{
						

						// Launch size for the sort-data kernel: nContacts rounded up to the sort alignment.
						int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

						btOpenCLArray<u32>* countsNative = m_internalData->m_solverGPU->m_numConstraints;
						btOpenCLArray<u32>* offsetsNative = m_internalData->m_solverGPU->m_offsets;

						{	//	2. set cell idx
							BT_PROFILE("GPU set cell idx");
							struct CB
							{
								int m_nContacts;
								int m_staticIdx;
								float m_scale;
								int m_nSplit;
							};

							ADLASSERT( sortSize%64 == 0 );
							CB cdata;
							cdata.m_nContacts = nContacts;
							cdata.m_staticIdx = csCfg.m_staticIdx;
							cdata.m_scale = 1.f/(BT_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
							cdata.m_nSplit = BT_SOLVER_N_SPLIT;

							m_internalData->m_solverGPU->m_sortDataBuffer->resize(nContacts);

							
							btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL()), btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
							btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_setSortDataKernel );
							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
							launcher.setConst( cdata );
							launcher.launch1D( sortSize, 64 );
						}
						bool gpuRadixSort=true;
						if (gpuRadixSort)
						{	//	3. sort by cell idx
							BT_PROFILE("gpuRadixSort");
							//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
							//adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
							btOpenCLArray<btSortData>& keyValuesInOut = *(m_internalData->m_solverGPU->m_sortDataBuffer);
							this->m_internalData->m_solverGPU->m_sort32->execute(keyValuesInOut);

							/*btAlignedObjectArray<btSortData> hostValues;
							keyValuesInOut.copyToHost(hostValues);
							printf("hostValues.size=%d\n",hostValues.size());
							*/

						}



						
						{	
							//	4. find entries
							BT_PROFILE("gpuBoundSearch");
							
							m_internalData->m_solverGPU->m_search->execute(*m_internalData->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,
								BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT,btBoundSearchCL::COUNT);

							
							//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, 
							//	BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );

							//unsigned int sum;
							m_internalData->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT);//,&sum );
							//printf("sum = %d\n",sum);
						} 


						{	//	5. sort constraints by cellIdx
							{
								BT_PROFILE("gpu m_reorderContactKernel");
							
								// Only the x component is set; it carries the contact count.
								btInt4 cdata; 
								cdata.x = nContacts;

								btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL())
									, btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
								btLauncherCL launcher(m_queue,m_internalData->m_solverGPU->m_reorderContactKernel);
								launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
								launcher.setConst( cdata );
								launcher.launch1D( nContacts, 64 );
							}
						}

					}

				}

				clFinish(m_queue);

				{
					// Buffers are passed in the opposite order to the reorder step: scratch buffer first, contacts second.
					BT_PROFILE("gpu m_copyConstraintKernel");
					
					btInt4 cdata; cdata.x = nContacts;
					btBufferInfoCL bInfo[] = { btBufferInfoCL(  m_internalData->m_solverGPU->m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactNative->getBufferCL() ) };
					btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_copyConstraintKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
					launcher.setConst(  cdata );
					launcher.launch1D( nContacts, 64 );
					clFinish(m_queue);
				}
					
				if (gpuBatchContacts)
				{
					BT_PROFILE("gpu batchContacts");
					m_internalData->m_solverGPU->batchContacts( contactNative, nContacts, m_internalData->m_solverGPU->m_numConstraints, m_internalData->m_solverGPU->m_offsets, csCfg.m_staticIdx );
				}

				if (1)
				{
					BT_PROFILE("gpu convertToConstraints");
					m_internalData->m_solverGPU->convertToConstraints( bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, csCfg );
					clFinish(m_queue);
				}

			}
		}


		if (1)
		{
			BT_PROFILE("GPU solveContactConstraint");
			m_internalData->m_solverGPU->m_nIterations = 4;//10
			m_internalData->m_solverGPU->solveContactConstraint(m_internalData->m_bodyBufferGPU, 
				m_internalData->m_inertiaBufferGPU, 
				m_internalData->m_contactCGPU,
				0, 
				nContactOut );

			clFinish(m_queue);
		}


#if 0
		if (0)
		{
			BT_PROFILE("read body velocities back to CPU");
			//read body updated linear/angular velocities back to CPU
			m_internalData->m_bodyBufferGPU->read(
				m_internalData->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
		}
#endif

	}

}
Exemplo n.º 8
0
// Launches m_batchingKernel to batch the contacts for the solver.
//
// contacts      : first kernel buffer.
// nContacts     : expected size of m_contactBuffer (asserted after the launch).
// nNative       : third kernel buffer.
// offsetsNative : fourth kernel buffer.
// staticIdx     : passed to the kernel as its only scalar argument.
//
// The kernel runs over 64*N_SPLIT*N_SPLIT work items in work-groups of 64, with
// m_contactBuffer as its second buffer; the function waits on clFinish(m_queue).
void Solver::batchContacts(  btOpenCLArray<Contact4>* contacts, int nContacts, btOpenCLArray<u32>* nNative, btOpenCLArray<u32>* offsetsNative, int staticIdx )
{

    {
        BT_PROFILE("batch generation");

        int numWorkItems = 64*N_SPLIT*N_SPLIT;
#ifdef BATCH_DEBUG
        SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
        adl::btOpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
        memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
        gpuDebugInfo.write(debugInfo,numWorkItems);
#endif


        btBufferInfoCL bInfo[] = {
            btBufferInfoCL( contacts->getBufferCL() ),
            btBufferInfoCL( m_contactBuffer->getBufferCL() ),
            btBufferInfoCL( nNative->getBufferCL() ),
            btBufferInfoCL( offsetsNative->getBufferCL() )
#ifdef BATCH_DEBUG
            ,	btBufferInfoCL(&gpuDebugInfo)
#endif
        };


        {
            BT_PROFILE("batchingKernel");
            btLauncherCL launcher( m_queue, m_batchingKernel);
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
            // staticIdx is the only scalar argument passed to the kernel.
            launcher.setConst(staticIdx);

            launcher.launch1D( numWorkItems, 64 );
            clFinish(m_queue);
        }

#ifdef BATCH_DEBUG
        // NOTE(review): the stray token below looks like an intentional compile break for this
        // debug path - confirm before enabling BATCH_DEBUG.
        aaaa
        Contact4* hostContacts = new Contact4[nContacts];
        m_contactBuffer->read(hostContacts,nContacts);
        clFinish(m_queue);

        gpuDebugInfo.read(debugInfo,numWorkItems);
        clFinish(m_queue);

        for (int i=0; i<numWorkItems; i++)
        {
            if (debugInfo[i].m_valInt1>0)
            {
                printf("catch\n");
            }
            if (debugInfo[i].m_valInt2>0)
            {
                printf("catch22\n");
            }

            if (debugInfo[i].m_valInt3>0)
            {
                printf("catch666\n");
            }

            if (debugInfo[i].m_valInt4>0)
            {
                printf("catch777\n");
            }
        }
        delete[] debugInfo;
        delete[] hostContacts;	// was leaked before
#endif //BATCH_DEBUG

    }

//	copy buffer to buffer
    btAssert(m_contactBuffer->size()==nContacts);
    //contacts->copyFromOpenCLArray( *m_contactBuffer);
    //clFinish(m_queue);//needed?


}
Exemplo n.º 9
0
// Sorts the contacts by solver cell; does nothing unless cfg.m_enableParallelSolve is set.
//
// bodyBuf        : body buffer, second argument of the sort-data kernel.
// contactsIn     : contacts to sort (read by the sort-data and reorder kernels).
// additionalData : not used by this function.
// nContacts      : number of contacts.
// cfg            : supplies m_enableParallelSolve, m_staticIdx and m_averageExtent.
//
// Steps: m_setSortDataKernel fills m_sortDataBuffer, m_sort32 sorts it, m_search/m_scan
// produce m_numConstraints and m_offsets over N_SPLIT*N_SPLIT cells, and
// m_reorderContactKernel is launched with contactsIn, m_contactBuffer and the sort data.
void Solver::sortContacts(  const btOpenCLArray<RigidBodyBase::Body>* bodyBuf,
                            btOpenCLArray<Contact4>* contactsIn, void* additionalData,
                            int nContacts, const Solver::ConstraintCfg& cfg )
{
    const int sortAlignment = 512; // todo. get this out of sort
    if( cfg.m_enableParallelSolve )
    {
        // Contact count rounded up to the sort alignment; used as launch and sort size.
        int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

        btOpenCLArray<u32>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
        btOpenCLArray<u32>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );

        {   //	2. set cell idx
            struct CB
            {
                int m_nContacts;
                int m_staticIdx;
                float m_scale;
                int m_nSplit;
            };

            btAssert( sortSize%64 == 0 );
            CB cdata;
            cdata.m_nContacts = nContacts;
            cdata.m_staticIdx = cfg.m_staticIdx;
            cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
            cdata.m_nSplit = N_SPLIT;


            btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
            btLauncherCL launcher( m_queue, m_setSortDataKernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
            launcher.setConst(  cdata );
            launcher.launch1D( sortSize, 64 );
        }

        {   //	3. sort by cell idx
            // NOTE(review): sortSize (not nContacts) entries are sorted and m_sortDataBuffer is
            // not resized in this function - presumably it is already large enough; confirm.
            m_sort32->execute(*m_sortDataBuffer,sortSize);
        }
        {   //	4. find entries
            m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, btBoundSearchCL::COUNT);

            m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
        }

        {   //	5. sort constraints by cellIdx
            //	todo. preallocate this
//			btAssert( contactsIn->getType() == TYPE_HOST );
//			btOpenCLArray<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer

            {
                // Only the x component is set; it carries the contact count.
                btInt4 cdata;
                cdata.x = nContacts;
                btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
                btLauncherCL launcher( m_queue, m_reorderContactKernel );
                launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                launcher.setConst(  cdata );
                launcher.launch1D( nContacts, 64 );
            }
//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
        }
    }
}
Exemplo n.º 10
0
// Runs the batched contact solve on the GPU, followed by the batched friction solve.
//
// Both phases repeat m_nIterations times; each repetition launches one kernel per
// batch index ib in [0, N_BATCHES). Every launch receives, in this order, the
// buffers bodyBuf, shapeBuf, constraint, m_numConstraints and m_offsets, followed
// by the constants maxNumBatches, ib and N_SPLIT. The queue is drained with
// clFinish() after each phase.
//
//   bodyBuf, shapeBuf - device buffers handed to the kernels as-is
//   constraint        - device constraint buffer handed to the kernels
//   additionalData    - not used by this function
//   n                 - only seeds cdata.x, which is never sent to a kernel
//   maxNumBatches     - forwarded to both kernels as their first constant
void Solver::solveContactConstraint(  const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf,
                                      btOpenCLArray<Constraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
{
    btInt4 cdata = btMakeInt4( n, 0, 0, 0 );
    {
        const int nn = N_SPLIT*N_SPLIT;

        // Same launch size for the contact and the friction pass. The second
        // launch1D argument (64) is presumably the work-group size.
        const int numWorkItems = 64*nn/N_BATCHES;

        // cdata.x is kept as a phase marker (0 = contact, 1 = friction) but is not
        // forwarded to the kernels: the matching setConst calls are commented out.
        cdata.x = 0;
        cdata.y = maxNumBatches;//250;
        cdata.w = N_SPLIT;      // loop invariant, shared by both phases

#ifdef DEBUG_ME
        // NOTE(review): this debug path references `data` and the `adl::` namespace,
        // neither of which is declared in this function - confirm it still builds.
        SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
        adl::btOpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
#endif

        {
            BT_PROFILE("m_batchSolveKernel iterations");
            for(int iter=0; iter<m_nIterations; iter++)
            {
                for(int ib=0; ib<N_BATCHES; ib++)
                {
#ifdef DEBUG_ME
                    // Clear the per-work-item debug records before every launch.
                    memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
                    gpuDebugInfo.write(debugInfo,numWorkItems);
#endif

                    cdata.z = ib;

                    btLauncherCL launcher( m_queue, m_solveContactKernel );
#if 1
                    btBufferInfoCL bInfo[] = {
                        btBufferInfoCL( bodyBuf->getBufferCL() ),
                        btBufferInfoCL( shapeBuf->getBufferCL() ),
                        btBufferInfoCL( constraint->getBufferCL() ),
                        btBufferInfoCL( m_numConstraints->getBufferCL() ),
                        btBufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
                        ,	btBufferInfoCL(&gpuDebugInfo)
#endif
                    };

                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                    //launcher.setConst(  cdata.x );
                    launcher.setConst(  cdata.y );
                    launcher.setConst(  cdata.z );
                    launcher.setConst(  cdata.w );
                    launcher.launch1D( numWorkItems, 64 );
#else
                    // Disabled alternative: replay kernel arguments that were
                    // serialized to disk instead of binding the live buffers.
                    const char* fileName = "m_batchSolveKernel.bin";
                    FILE* f = fopen(fileName,"rb");
                    if (f)
                    {
                        int sizeInBytes=0;
                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
                        {
                            printf("error, cannot get file size\n");
                            exit(0);
                        }

                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
                        if (buf && fread(buf,sizeInBytes,1,f) == 1)
                        {
                            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
                            // The launch size is read from the bytes that follow the serialized arguments.
                            int num = *(int*)&buf[serializedBytes];

                            launcher.launch1D( num);

                            //this clFinish is for testing on errors
                            clFinish(m_queue);
                        }
                        else
                        {
                            printf("error, cannot read %s\n", fileName);
                        }

                        // Released only after clFinish above, so queued work that might
                        // still reference the host buffer has completed.
                        free(buf);
                        fclose(f);
                    }
#endif

#ifdef DEBUG_ME
                    clFinish(m_queue);
                    gpuDebugInfo.read(debugInfo,numWorkItems);
                    clFinish(m_queue);
                    for (int i=0; i<numWorkItems; i++)
                    {
                        // Report both the work-item index and the recorded value.
                        if (debugInfo[i].m_valInt2>0)
                        {
                            printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
                        }

                        if (debugInfo[i].m_valInt3>0)
                        {
                            printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
                        }
                    }
#endif //DEBUG_ME
                }
            }

            // Wait for the whole contact phase before starting friction.
            clFinish(m_queue);
        }

        cdata.x = 1;
        bool applyFriction=true;
        if (applyFriction)
        {
            BT_PROFILE("m_batchSolveKernel iterations2");
            for(int iter=0; iter<m_nIterations; iter++)
            {
                for(int ib=0; ib<N_BATCHES; ib++)
                {
                    cdata.z = ib;

                    btBufferInfoCL bInfo[] = {
                        btBufferInfoCL( bodyBuf->getBufferCL() ),
                        btBufferInfoCL( shapeBuf->getBufferCL() ),
                        btBufferInfoCL( constraint->getBufferCL() ),
                        btBufferInfoCL( m_numConstraints->getBufferCL() ),
                        btBufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
                        ,btBufferInfoCL(&gpuDebugInfo)
#endif //DEBUG_ME
                    };
                    btLauncherCL launcher( m_queue, m_solveFrictionKernel );
                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                    //launcher.setConst(  cdata.x );
                    launcher.setConst(  cdata.y );
                    launcher.setConst(  cdata.z );
                    launcher.setConst(  cdata.w );

                    launcher.launch1D( numWorkItems, 64 );
                }
            }
            clFinish(m_queue);
        }
#ifdef DEBUG_ME
        delete[] debugInfo;
#endif //DEBUG_ME
    }
}
Exemplo n.º 11
0
void Solver::reorderConvertToConstraints( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf,
        const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf,
        btOpenCLArray<Contact4>* contactsIn, btOpenCLArray<Constraint4>* contactCOut, void* additionalData,
        int nContacts, const Solver::ConstraintCfg& cfg )
{
    if( m_contactBuffer )
    {
        m_contactBuffer->resize(nContacts);
    }
    if( m_contactBuffer == 0 )
    {
        BT_PROFILE("new m_contactBuffer;");
        m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue,nContacts );
        m_contactBuffer->resize(nContacts);
    }




    //DeviceUtils::Config dhCfg;
    //Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
    if( cfg.m_enableParallelSolve )
    {


        clFinish(m_queue);

        //	contactsIn -> m_contactBuffer
        {
            BT_PROFILE("sortContacts");
            sortContacts( bodyBuf, contactsIn, additionalData, nContacts, cfg );
            clFinish(m_queue);
        }


        {
            BT_PROFILE("m_copyConstraintKernel");



            btInt4 cdata;
            cdata.x = nContacts;
            btBufferInfoCL bInfo[] = { btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactsIn->getBufferCL() ) };
//			btLauncherCL launcher( m_queue, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
            btLauncherCL launcher( m_queue, m_copyConstraintKernel );
            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
            launcher.setConst(  cdata );
            launcher.launch1D( nContacts, 64 );
            clFinish(m_queue);
        }

        {
            BT_PROFILE("batchContacts");
            Solver::batchContacts( contactsIn, nContacts, m_numConstraints, m_offsets, cfg.m_staticIdx );

        }
    }
    {
        BT_PROFILE("waitForCompletion (batchContacts)");
        clFinish(m_queue);
    }

    //================

    {
        BT_PROFILE("convertToConstraints");
        Solver::convertToConstraints(  bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
    }

    {
        BT_PROFILE("convertToConstraints waitForCompletion");
        clFinish(m_queue);
    }

}
Exemplo n.º 12
0
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	
	int originalSize = keyValuesInOut.size();
	int workingSize = originalSize;
	
			
	int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
    btAlignedObjectArray<btSortData>   test2;
    keyValuesInOut.copyToHost(test2);
    printf("numElem = %d\n",test2.size());
    for (int i=0;i<test2.size();i++)
    {
        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
    }
#endif //DEBUG_RADIXSORT2
    
	btOpenCLArray<btSortData>* src = 0;

	if (workingSize%dataAlignment)
	{
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
		m_workBuffer4->resize(workingSize);
		btSortData fillValue;
		fillValue.m_key = 0xffffffff;
		fillValue.m_value = 0xffffffff;

#define USE_BTFILL
#ifdef USE_BTFILL
		m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
#else
		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
		
		for (int i=originalSize; i<workingSize;i++)
		{
			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
		}
#endif//USE_BTFILL

		src = m_workBuffer4;
	} else
	{
		src = &keyValuesInOut;
		m_workBuffer4->resize(0);
	}
		
	btAssert( workingSize%DATA_ALIGNMENT == 0 );
	int minCap = NUM_BUCKET*NUM_WGS;


	int n = workingSize;

	m_workBuffer1->resize(minCap);
	m_workBuffer3->resize(workingSize);
	

//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	btAssert( BITS_PER_PASS == 4 );
	btAssert( WG_SIZE == 64 );
	btAssert( (sortBits&0x3) == 0 );

	
	
	btOpenCLArray<btSortData>* dst = m_workBuffer3;

	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;


	int nWGs = NUM_WGS;
	btConstData cdata;

	{
        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
     	int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	int count=0;
	for(int ib=0; ib<sortBits; ib+=4)
	{
#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
		cdata.m_startBit = ib;
		
		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			
			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

        
        
#ifdef DEBUG_RADIXSORT
		btAlignedObjectArray<unsigned int> testHist;
		srcHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
	
	

//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	bool fastScan=true;
#else
	bool fastScan=false;
#endif

		if (fastScan)
		{//	prefix scan group histogram
			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( 128, 128 );
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
		}


#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
        
        for (int i=0;i<testHist.size();i+=NUM_WGS)
		{
				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
		}

#endif //DEBUG_RADIXSORT

#define USE_GPU
#ifdef USE_GPU
        
		{//	local sort and distribute
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
			btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
            
		}
#else
        {
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            int tables[NUM_TABLES];
            int startBit = ib;
            
            destHisto->copyToHost(testHist);
            btAlignedObjectArray<btSortData> srcHost;
            btAlignedObjectArray<btSortData> dstHost;
            dstHost.resize(src->size());
            
            src->copyToHost(srcHost);
            
            for (int i=0;i<NUM_TABLES;i++)
            {
                tables[i] = testHist[i*NUM_WGS];
            }
            
            //	distribute
            for(int i=0; i<n; i++)
            {
                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                
                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                counter2[tableIdx] ++;
            }
            
            
#else
          
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            int tables[NUM_TABLES];
             btAlignedObjectArray<btSortData> dstHostOK;
            dstHostOK.resize(src->size());

            destHisto->copyToHost(testHist);
            btAlignedObjectArray<btSortData> srcHost;
            src->copyToHost(srcHost);
        
            int blockSize = 256;
            int nBlocksPerWG = cdata.m_nBlocksPerWG;
            int startBit = ib;

            {
                for (int i=0;i<NUM_TABLES;i++)
                {
                    tables[i] = testHist[i*NUM_WGS];
                }
                
                //	distribute
                for(int i=0; i<n; i++)
                {
                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                    
                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                    counter2[tableIdx] ++;
                }

            
            }
            
            
            btAlignedObjectArray<btSortData> dstHost;
            dstHost.resize(src->size());
            
            
            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            
            
            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
            {
              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
                
              for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
              {
                for (int lIdx = 0;lIdx < 64;lIdx++)
                {
                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
                    
                    //	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
                    //	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
                    //	AMD: AtomInc performs better while NV prefers ++
                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
                    {
                        if( addr+j < n )
                        {
                          //  printf ("addr+j=%d\n", addr+j);
                            
                            int i = addr+j;
                            
                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                            
                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
                            
                            btSortData ok = dstHostOK[destIndex];
                                                    
                            if (ok.m_key != srcHost[i].m_key)
                            {
                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
                            }
                            if (ok.m_value != srcHost[i].m_value)
                            {
                                
                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );

                            }
                   
                            dstHost[destIndex] = srcHost[i];
                            counter[tableIdx] ++;
                            
                        }
                    }
                }
              }
            }
            
         
#endif //SEQUENTIAL
            
            dst->copyFromHost(dstHost);
        }
#endif//USE_GPU
        
        
        
#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
		btSwap(src, dst );
		btSwap(srcHisto,destHisto);

#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
        count++;
                
        
	}