void AdlPrimitivesDemo::render() { int size = 1024*256; // int size = 1024*64; size = NEXTMULTIPLEOF( size, 512 ); int* host1 = new int[size]; int2* host2 = new int2[size]; int4* host4 = new int4[size]; for(int i=0; i<size; i++) { host1[i] = getRandom(0,0xffff); host2[i] = make_int2( host1[i], i ); host4[i] = make_int4( host2[i].x, host2[i].y, host2[i].x, host2[i].y ); } Buffer<int> buf1( m_deviceData, size ); Buffer<int2> buf2( m_deviceData, size ); Buffer<int4> buf4( m_deviceData, size ); buf1.write( host1, size ); buf2.write( host2, size ); buf4.write( host4, size ); Stopwatch sw( m_deviceData ); m_nTxtLines = 0; sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "%d elems", size); // testSort( (Buffer<SortData>&)buf2, size, sw ); testFill1( buf1, size, sw ); testFill2( buf2, size, sw ); testFill4( buf4, size, sw ); test( buf2, size, sw ); delete [] host1; delete [] host2; delete [] host4; }
bool radixSortTest() { TEST_INIT; int maxSize = 1024*256; b3AlignedObjectArray<b3SortData> buf0Host; buf0Host.resize(maxSize); b3AlignedObjectArray<b3SortData> buf1Host; buf1Host.resize(maxSize ); b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize); b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize); int dx = maxSize/NUM_TESTS; for(int iter=0; iter<NUM_TESTS; iter++) { int size = b3Min( 128+dx*iter, maxSize-512 ); size = NEXTMULTIPLEOF( size, 512 );//not necessary buf0Host.resize(size); for(int i=0; i<size; i++) { b3SortData v; v.m_key = getRandom(0,0xff); v.m_value = i; buf0Host[i] = v; } buf2CL.copyFromHost( buf0Host); sort->executeHost( buf0Host); sort->execute(buf2CL); buf2CL.copyToHost(buf1Host); for(int i=0; i<size; i++) { TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key ); } } delete sort; TEST_REPORT( "radixSort" ); return g_testFailed; }
/// Runs narrowphase contact generation followed by the contact solver.
///
/// Steps, in order:
///  1. view the caller's broadphase pair buffer as int2 pairs and cull it into
///     m_convexPairsOutGPU,
///  2. if a plane body is registered (m_planeBodyIndex >= 0), pair it with every
///     other body and run the narrowphase on those pairs,
///  3. generate convex-convex contacts (host path while USE_CONVEX_CONVEX_HOST
///     is defined),
///  4. if at least one contact was produced: assign sort keys, radix sort,
///     count/scan per cell, reorder the contacts, optionally batch them,
///     convert them to constraints and solve.
///
/// @param broadphasePairs     OpenCL buffer with the broadphase pairs (viewed as int2).
/// @param numBroadphasePairs  Number of pairs stored in @p broadphasePairs.
///
/// Returns early when there is no internal data or no accelerated rigid body,
/// and before the solver stage when no contacts were generated.
///
/// NOTE(review): `numPairsOut` and `gpuBatchContacts` are not declared inside
/// this function; they are presumably a member/file-scope variable - confirm.
void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs)
{
    BT_PROFILE("computeContactsAndSolver");

    // Everything below dereferences m_internalData, so bail out if it is missing.
    if (!m_internalData)
        return;

    const int numBodies = m_internalData->m_numAcceleratedRigidBodies;
    if (!numBodies)
        return;

    ChNarrowphaseBase::Config cfgNP;
    cfgNP.m_collisionMargin = 0.01f;
    int nContactOut = 0;
    //printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size);

    // View the caller's cl_mem as an array of int2 pairs.
    btOpenCLArray<int2> broadphasePairsGPU(m_context,m_queue);
    broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs);

    const bool useCulling = true;
    if (useCulling)
    {
        BT_PROFILE("ChNarrowphase::culling");
        clFinish(m_queue);

        numPairsOut = m_internalData->m_narrowPhase->culling(
            &broadphasePairsGPU,
            numBroadphasePairs,
            m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
            m_internalData->m_convexPairsOutGPU,
            cfgNP);
    }

    if (m_planeBodyIndex>=0)
    {
        BT_PROFILE("ChNarrowphase:: plane versus convex");
        //todo: get rid of this dynamic allocation
        // One (plane, body) pair per non-plane body. The array holds numBodies
        // entries (one more than needed) so the loop cannot write past the end
        // even if m_planeBodyIndex is out of range and the assert is compiled out.
        int2* hostPairs = new int2[numBodies];
        int index=0;
        for (int i=0;i<numBodies;i++)
        {
            if (i!=m_planeBodyIndex)
            {
                hostPairs[index].x = m_planeBodyIndex;
                hostPairs[index].y = i;
                index++;
            }
        }
        assert(numBodies-1 == index);
        m_internalData->m_planePairs->copyFromHostPointer(hostPairs,index);
        // Wait for the queue before releasing the host staging array.
        clFinish(m_queue);
        delete[] hostPairs;

        //convex versus plane
        m_internalData->m_narrowPhase->execute(m_internalData->m_planePairs, index,
            m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
            0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
    }

    {
        BT_PROFILE("ChNarrowphase::execute");
        if (useCulling)
        {
            //convex versus convex
            //m_internalData->m_narrowPhase->execute(m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
#define USE_CONVEX_CONVEX_HOST 1
#ifdef USE_CONVEX_CONVEX_HOST
            m_internalData->m_convexPairsOutGPU->resize(numPairsOut);
            m_internalData->m_pBufContactOutGPU->resize(nContactOut);

            m_internalData->m_gpuSatCollision->computeConvexConvexContactsHost(
                m_internalData->m_convexPairsOutGPU,
                numPairsOut,
                m_internalData->m_bodyBufferGPU,
                m_internalData->m_ShapeBuffer,
                m_internalData->m_pBufContactOutGPU,
                nContactOut, cfgNP,
                m_internalData->m_convexPolyhedra,m_internalData->m_convexVertices,m_internalData->m_uniqueEdges,
                m_internalData->m_convexFaces,m_internalData->m_convexIndices);
#else
            m_internalData->m_narrowPhase->execute(
                m_internalData->m_convexPairsOutGPU,
                numPairsOut,
                m_internalData->m_bodyBufferGPU,
                m_internalData->m_ShapeBuffer,
                m_internalData->m_pBufContactOutGPU,
                nContactOut, cfgNP);
#endif
        }
        else
        {
            m_internalData->m_narrowPhase->execute(&broadphasePairsGPU, numBroadphasePairs,
                m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
                m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
        }

        clFinish(m_queue);
    }

    // Nothing to solve.
    if (!nContactOut)
        return;

    const bool useSolver = true;//true;//false;
    if (useSolver)
    {
        float dt=1./60.;
        SolverBase::ConstraintCfg csCfg( dt );
        csCfg.m_enableParallelSolve = true;
        csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
        csCfg.m_staticIdx = m_planeBodyIndex;

        btOpenCLArray<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU;
        const btOpenCLArray<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU;
        void* additionalData = m_internalData->m_frictionCGPU;
        const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU;
        SolverData contactCOut = m_internalData->m_contactCGPU;
        int nContacts = nContactOut;

        {
            BT_PROFILE("GPU batch");

            //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
            // Make sure the solver's scratch contact buffer exists and holds nContacts entries.
            if( m_internalData->m_solverGPU->m_contactBuffer == 0 )
            {
                m_internalData->m_solverGPU->m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue, nContacts );
            }
            m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts);

            {
                //btOpenCLArray<RigidBodyBase::Body>* bodyNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
                //btOpenCLArray<Contact4>* contactNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );

                const int sortAlignment = 512; // todo. get this out of sort
                if( csCfg.m_enableParallelSolve )
                {
                    int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

                    btOpenCLArray<u32>* countsNative = m_internalData->m_solverGPU->m_numConstraints;
                    btOpenCLArray<u32>* offsetsNative = m_internalData->m_solverGPU->m_offsets;

                    {	//	2. set cell idx
                        BT_PROFILE("GPU set cell idx");
                        // Constant block handed to the kernel through setConst().
                        struct CB
                        {
                            int m_nContacts;
                            int m_staticIdx;
                            float m_scale;
                            int m_nSplit;
                        };

                        ADLASSERT( sortSize%64 == 0 );
                        CB cdata;
                        cdata.m_nContacts = nContacts;
                        cdata.m_staticIdx = csCfg.m_staticIdx;
                        cdata.m_scale = 1.f/(BT_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
                        cdata.m_nSplit = BT_SOLVER_N_SPLIT;

                        m_internalData->m_solverGPU->m_sortDataBuffer->resize(nContacts);

                        btBufferInfoCL bInfo[] = {
                            btBufferInfoCL( contactsIn->getBufferCL() ),
                            btBufferInfoCL( bodyBuf->getBufferCL()),
                            btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
                        btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_setSortDataKernel );
                        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                        launcher.setConst( cdata );
                        // Launched over the padded size; the real count travels in cdata.m_nContacts.
                        launcher.launch1D( sortSize, 64 );
                    }

                    bool gpuRadixSort=true;
                    if (gpuRadixSort)
                    {	//	3. sort by cell idx
                        BT_PROFILE("gpuRadixSort");
                        //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
                        //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
                        btOpenCLArray<btSortData>& keyValuesInOut = *(m_internalData->m_solverGPU->m_sortDataBuffer);
                        this->m_internalData->m_solverGPU->m_sort32->execute(keyValuesInOut);
                        /*btAlignedObjectArray<btSortData> hostValues;
                        keyValuesInOut.copyToHost(hostValues);
                        printf("hostValues.size=%d\n",hostValues.size());
                        */
                    }

                    {	//	4. find entries
                        BT_PROFILE("gpuBoundSearch");
                        m_internalData->m_solverGPU->m_search->execute(*m_internalData->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,
                            BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT,btBoundSearchCL::COUNT);

                        //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
                        //	BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );

                        //unsigned int sum;
                        m_internalData->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT);//,&sum );
                        //printf("sum = %d\n",sum);
                    }

                    {	//	5. sort constraints by cellIdx
                        {
                            BT_PROFILE("gpu m_reorderContactKernel");

                            btInt4 cdata;
                            cdata.x = nContacts;

                            // Buffers: input contacts, solver contact buffer, sort data.
                            btBufferInfoCL bInfo[] = {
                                btBufferInfoCL( contactsIn->getBufferCL() ),
                                btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL()),
                                btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
                            btLauncherCL launcher(m_queue,m_internalData->m_solverGPU->m_reorderContactKernel);
                            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                            launcher.setConst( cdata );
                            launcher.launch1D( nContacts, 64 );
                        }
                    }
                }
            }

            clFinish(m_queue);

            {
                BT_PROFILE("gpu m_copyConstraintKernel");

                btInt4 cdata;
                cdata.x = nContacts;
                // Buffers: solver contact buffer first, original contact buffer second.
                btBufferInfoCL bInfo[] = {
                    btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL() ),
                    btBufferInfoCL( contactsIn->getBufferCL() ) };
                btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_copyConstraintKernel );
                launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
                launcher.setConst( cdata );
                launcher.launch1D( nContacts, 64 );
                clFinish(m_queue);
            }

            if (gpuBatchContacts)
            {
                BT_PROFILE("gpu batchContacts");
                m_internalData->m_solverGPU->batchContacts( contactsIn, nContacts,
                    m_internalData->m_solverGPU->m_numConstraints,
                    m_internalData->m_solverGPU->m_offsets,
                    csCfg.m_staticIdx );
            }

            if (1)
            {
                BT_PROFILE("gpu convertToConstraints");
                m_internalData->m_solverGPU->convertToConstraints( bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, csCfg );
                clFinish(m_queue);
            }
        }

        if (1)
        {
            BT_PROFILE("GPU solveContactConstraint");
            m_internalData->m_solverGPU->m_nIterations = 4;//10
            m_internalData->m_solverGPU->solveContactConstraint(m_internalData->m_bodyBufferGPU,
                m_internalData->m_inertiaBufferGPU,
                m_internalData->m_contactCGPU,
                0,
                nContactOut );

            clFinish(m_queue);
        }

#if 0
        if (0)
        {
            BT_PROFILE("read body velocities back to CPU");
            //read body updated linear/angular velocities back to CPU
            m_internalData->m_bodyBufferGPU->read(
                m_internalData->m_bodyBufferCPU->m_ptr,numBodies);
            adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
        }
#endif
    }
}
/// Creates the helper primitives, device buffers and OpenCL kernels used by the solver.
///
/// @param ctx           OpenCL context used for every buffer and program.
/// @param device        Device the programs are compiled for.
/// @param queue         Command queue stored in m_queue and passed to the helpers.
/// @param pairCapacity  Padded with NEXTMULTIPLEOF(…, 512) and used as the initial
///                      capacity of the sort-data buffer.
///
/// Every compiled program and kernel handle is checked with btAssert only.
Solver::Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
    :m_nIterations(4),
    m_context(ctx),
    m_device(device),
    m_queue(queue)
{
    // One slot per cell of the N_SPLIT x N_SPLIT grid.
    const int numCells = N_SPLIT*N_SPLIT;

    m_sort32 = new btRadixSort32CL(ctx,device,queue);
    m_scan = new btPrefixScanCL(ctx,device,queue,numCells);
    m_search = new btBoundSearchCL(ctx,device,queue,numCells);

    const int paddedCapacity = NEXTMULTIPLEOF( pairCapacity, 512 );

    m_sortDataBuffer = new btOpenCLArray<btSortData>(ctx,queue,paddedCapacity);
    // The contact buffer starts without an explicit capacity.
    m_contactBuffer = new btOpenCLArray<Contact4>(ctx,queue);

    m_numConstraints = new btOpenCLArray<u32>(ctx,queue,numCells );
    m_numConstraints->resize(numCells);

    m_offsets = new btOpenCLArray<u32>( ctx,queue, numCells );
    m_offsets->resize(numCells);

    const char* extraMacros = "";
    cl_int errNum;

    {
        // Programs first (contact, friction, setup2, setup) ...
        cl_program contactProgram = btOpenCLUtils::compileCLProgramFromString(
            ctx, device, solveContactCL, &errNum, extraMacros, SOLVER_CONTACT_KERNEL_PATH);
        btAssert(contactProgram);

        cl_program frictionProgram = btOpenCLUtils::compileCLProgramFromString(
            ctx, device, solveFrictionCL, &errNum, extraMacros, SOLVER_FRICTION_KERNEL_PATH);
        btAssert(frictionProgram);

        cl_program setup2Program = btOpenCLUtils::compileCLProgramFromString(
            ctx, device, solverSetup2CL, &errNum, extraMacros, SOLVER_SETUP2_KERNEL_PATH);
        btAssert(setup2Program);

        cl_program setupProgram = btOpenCLUtils::compileCLProgramFromString(
            ctx, device, solverSetupCL, &errNum, extraMacros, SOLVER_SETUP_KERNEL_PATH);
        btAssert(setupProgram);

        // ... then the kernels that live in them.
        m_solveFrictionKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solveFrictionCL, "BatchSolveKernelFriction", &errNum, frictionProgram, extraMacros );
        btAssert(m_solveFrictionKernel);

        m_solveContactKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solveContactCL, "BatchSolveKernelContact", &errNum, contactProgram, extraMacros );
        btAssert(m_solveContactKernel);

        m_contactToConstraintKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solverSetupCL, "ContactToConstraintKernel", &errNum, setupProgram, extraMacros );
        btAssert(m_contactToConstraintKernel);

        m_setSortDataKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solverSetup2CL, "SetSortDataKernel", &errNum, setup2Program, extraMacros );
        btAssert(m_setSortDataKernel);

        m_reorderContactKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solverSetup2CL, "ReorderContactKernel", &errNum, setup2Program, extraMacros );
        btAssert(m_reorderContactKernel);

        m_copyConstraintKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, solverSetup2CL, "CopyConstraintKernel", &errNum, setup2Program, extraMacros );
        btAssert(m_copyConstraintKernel);
    }

    {
        cl_program batchProgram = btOpenCLUtils::compileCLProgramFromString(
            ctx, device, batchingKernelsCL, &errNum, extraMacros, BATCHING_PATH);
        btAssert(batchProgram);

        m_batchingKernel = btOpenCLUtils::compileCLKernelFromString(
            ctx, device, batchingKernelsCL, "CreateBatches", &errNum, batchProgram, extraMacros );
        btAssert(m_batchingKernel);
    }
}
void Solver::sortContacts( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, btOpenCLArray<Contact4>* contactsIn, void* additionalData, int nContacts, const Solver::ConstraintCfg& cfg ) { const int sortAlignment = 512; // todo. get this out of sort if( cfg.m_enableParallelSolve ) { int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); btOpenCLArray<u32>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost ); btOpenCLArray<u32>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost ); { // 2. set cell idx struct CB { int m_nContacts; int m_staticIdx; float m_scale; int m_nSplit; }; btAssert( sortSize%64 == 0 ); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = cfg.m_staticIdx; cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); cdata.m_nSplit = N_SPLIT; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; btLauncherCL launcher( m_queue, m_setSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( sortSize, 64 ); } { // 3. sort by cell idx int n = N_SPLIT*N_SPLIT; int sortBit = 32; //if( n <= 0xffff ) sortBit = 16; //if( n <= 0xff ) sortBit = 8; m_sort32->execute(*m_sortDataBuffer,sortSize); } { // 4. find entries m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, btBoundSearchCL::COUNT); m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT ); } { // 5. sort constraints by cellIdx // todo. 
preallocate this // btAssert( contactsIn->getType() == TYPE_HOST ); // btOpenCLArray<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer { btInt4 cdata; cdata.x = nContacts; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; btLauncherCL launcher( m_queue, m_reorderContactKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); } // BufferUtils::unmap<true>( out, contactsIn, nContacts ); } } }