void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index) { B3_PROFILE("solveContacts"); m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies); m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies); m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts); if (optionalSortContactsDeterminism) { if (!gCpuSortContactsDeterminism) { B3_PROFILE("GPU Sort contact constraints (determinism)"); m_data->m_pBufContactOutGPUCopy->resize(numContacts); m_data->m_contactKeyValues->resize(numContacts); m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0); { b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel,"m_setDeterminismSortDataChildShapeBKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); launcher.launch1D( numContacts, 64 ); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel,"m_setDeterminismSortDataChildShapeAKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); launcher.launch1D( numContacts, 64 ); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel,"m_setDeterminismSortDataBodyBKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); launcher.launch1D( numContacts, 64 ); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel,"m_setDeterminismSortDataBodyAKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); launcher.launch1D( numContacts, 64 ); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { B3_PROFILE("gpu reorderContactKernel (determinism)"); b3Int4 cdata; cdata.x = numContacts; //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst( cdata ); launcher.launch1D( numContacts, 64 ); } } else { B3_PROFILE("CPU Sort contact constraints (determinism)"); b3AlignedObjectArray<b3Contact4> cpuConstraints; m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints); bool sort = true; if (sort) { cpuConstraints.quickSort(b3ContactCmp); for (int i=0;i<cpuConstraints.size();i++) { cpuConstraints[i].m_batchIdx = i; } } m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints); if (m_debugOutput==100) { for (int i=0;i<cpuConstraints.size();i++) { printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n",i,cpuConstraints[i].m_bodyAPtrAndSignBit,cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx); } } m_debugOutput++; } } int nContactOut = m_data->m_pBufContactOutGPU->size(); bool useSolver = true; if (useSolver) { float dt=1./60.; b3ConstraintCfg csCfg( dt ); csCfg.m_enableParallelSolve = true; csCfg.m_batchCellSize = 6; csCfg.m_staticIdx = static0Index; b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU; void* additionalData = 0;//m_data->m_frictionCGPU; const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU; b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU; int nContacts = nContactOut; int maxNumBatches = 0; if (!gUseLargeBatches) { if( m_data->m_solverGPU->m_contactBuffer2) { m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); } if( m_data->m_solverGPU->m_contactBuffer2 == 0 ) { m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context,m_data->m_queue, nContacts ); m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); } //clFinish(m_data->m_queue); { B3_PROFILE("batching"); //@todo: just reserve it, without copy of original contact (unless we use warmstarting) const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf; { //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf ); //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn ); const int sortAlignment = 512; // todo. get this out of sort if( csCfg.m_enableParallelSolve ) { int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment ); b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; if (!gCpuSetSortData) { // 2. set cell idx B3_PROFILE("GPU set cell idx"); struct CB { int m_nContacts; int m_staticIdx; float m_scale; b3Int4 m_nSplit; }; b3Assert( sortSize%64 == 0 ); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = csCfg.m_staticIdx; cdata.m_scale = 1.f/csCfg.m_batchCellSize; cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X; cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y; cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z; m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel,"m_setSortDataKernel" ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata.m_nContacts ); launcher.setConst( cdata.m_scale ); launcher.setConst(cdata.m_nSplit); launcher.setConst(cdata.m_staticIdx); launcher.launch1D( sortSize, 64 ); } else { m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); b3AlignedObjectArray<b3SortData> sortDataCPU; m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU); b3AlignedObjectArray<b3Contact4> contactCPU; m_data->m_pBufContactOutGPU->copyToHost(contactCPU); b3AlignedObjectArray<b3RigidBodyData> bodiesCPU; bodyBuf->copyToHost(bodiesCPU); float scale = 1.f/csCfg.m_batchCellSize; b3Int4 nSplit; nSplit.x = B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx); m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU); } if (!gCpuRadixSort) { // 3. sort by cell idx B3_PROFILE("gpuRadixSort"); //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; //int sortBit = 32; //if( n <= 0xffff ) sortBit = 16; //if( n <= 0xff ) sortBit = 8; //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize ); //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut); } else { b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); b3AlignedObjectArray<b3SortData> hostValues; keyValuesInOut.copyToHost(hostValues); hostValues.quickSort(sortfnc); keyValuesInOut.copyFromHost(hostValues); } if (gUseScanHost) { // 4. find entries B3_PROFILE("cpuBoundSearch"); b3AlignedObjectArray<unsigned int> countsHost; countsNative->copyToHost(countsHost); b3AlignedObjectArray<b3SortData> sortDataHost; m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost); //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); m_data->m_solverGPU->m_search->executeHost(sortDataHost,nContacts,countsHost,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); countsNative->copyFromHost(countsHost); //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); //unsigned int sum; //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); b3AlignedObjectArray<unsigned int> offsetsHost; offsetsHost.resize(offsetsNative->size()); m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum ); offsetsNative->copyFromHost(offsetsHost); //printf("sum = %d\n",sum); } else { // 4. find entries B3_PROFILE("gpuBoundSearch"); m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); } if (nContacts) { // 5. sort constraints by cellIdx if (gReorderContactsOnCpu) { B3_PROFILE("cpu m_reorderContactKernel"); b3AlignedObjectArray<b3SortData> sortDataHost; m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost); b3AlignedObjectArray<b3Contact4> inContacts; b3AlignedObjectArray<b3Contact4> outContacts; m_data->m_pBufContactOutGPU->copyToHost(inContacts); outContacts.resize(inContacts.size()); for (int i=0;i<nContacts;i++) { int srcIdx = sortDataHost[i].y; outContacts[i] = inContacts[srcIdx]; } m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts); /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" "{\n" " int nContacts = cb.x;\n" " int gIdx = GET_GLOBAL_IDX;\n" " if( gIdx < nContacts )\n" " {\n" " int srcIdx = sortData[gIdx].y;\n" " out[gIdx] = in[srcIdx];\n" " }\n" "}\n" */ } else { B3_PROFILE("gpu m_reorderContactKernel"); b3Int4 cdata; cdata.x = nContacts; b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); } } } } //clFinish(m_data->m_queue); // { // b3AlignedObjectArray<unsigned int> histogram; // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram); // printf(",,,\n"); // } if (nContacts) { if (gUseCpuCopyConstraints) { for (int i=0;i<nContacts;i++) { m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2); // m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); // m_data->m_pBufContactOutGPU->getBufferCL() } } else { B3_PROFILE("gpu m_copyConstraintKernel"); b3Int4 cdata; cdata.x = nContacts; b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL() ), b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ) }; b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel,"m_copyConstraintKernel" ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); //we use the clFinish for proper benchmark/profile clFinish(m_data->m_queue); } } bool compareGPU = false; if (nContacts) { if (!gCpuBatchContacts) { B3_PROFILE("gpu batchContacts"); maxNumBatches = 150;//250; m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); clFinish(m_data->m_queue); } else { B3_PROFILE("cpu batchContacts"); static b3AlignedObjectArray<b3Contact4> cpuContacts; b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; { B3_PROFILE("copyToHost"); contactsIn->copyToHost(cpuContacts); } b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; b3AlignedObjectArray<unsigned int> nNativeHost; b3AlignedObjectArray<unsigned int> offsetsNativeHost; { B3_PROFILE("countsNative/offsetsNative copyToHost"); countsNative->copyToHost(nNativeHost); offsetsNative->copyToHost(offsetsNativeHost); } int numNonzeroGrid=0; if (gUseLargeBatches) { m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); int totalNumConstraints = cpuContacts.size(); int simdWidth =numBodies+1;//-1;//64;//-1;//32; int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]); // on GPU maxNumBatches = b3Max(numBatches,maxNumBatches); static int globalMaxBatch = 0; if (maxNumBatches>globalMaxBatch ) { globalMaxBatch = maxNumBatches; b3Printf("maxNumBatches = %d\n",maxNumBatches); } } else { m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS*B3_MAX_NUM_BATCHES); B3_PROFILE("cpu batch grid"); for(int i=0; i<B3_SOLVER_N_CELLS; i++) { int n = (nNativeHost)[i]; int offset = (offsetsNativeHost)[i]; if( n ) { numNonzeroGrid++; int simdWidth =numBodies+1;//-1;//64;//-1;//32; int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[i*B3_MAX_NUM_BATCHES]); // on GPU maxNumBatches = b3Max(numBatches,maxNumBatches); static int globalMaxBatch = 0; if (maxNumBatches>globalMaxBatch ) { globalMaxBatch = maxNumBatches; b3Printf("maxNumBatches = %d\n",maxNumBatches); } //we use the clFinish for proper benchmark/profile } } //clFinish(m_data->m_queue); } { B3_PROFILE("m_contactBuffer->copyFromHost"); m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); } } } } } //printf("maxNumBatches = %d\n", maxNumBatches); if (gUseLargeBatches) { if (nContacts) { B3_PROFILE("cpu batchContacts"); static b3AlignedObjectArray<b3Contact4> cpuContacts; // b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; { B3_PROFILE("copyToHost"); m_data->m_pBufContactOutGPU->copyToHost(cpuContacts); } b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; int numNonzeroGrid=0; { m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); int totalNumConstraints = cpuContacts.size(); int simdWidth =numBodies+1;//-1;//64;//-1;//32; int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]); // on GPU maxNumBatches = b3Max(numBatches,maxNumBatches); static int globalMaxBatch = 0; if (maxNumBatches>globalMaxBatch ) { globalMaxBatch = maxNumBatches; b3Printf("maxNumBatches = %d\n",maxNumBatches); } } { B3_PROFILE("m_contactBuffer->copyFromHost"); m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); } } } if (nContacts) { B3_PROFILE("gpu convertToConstraints"); m_data->m_solverGPU->convertToConstraints( bodyBuf, shapeBuf, m_data->m_solverGPU->m_contactBuffer2, contactConstraintOut, additionalData, nContacts, (b3SolverBase::ConstraintCfg&) csCfg ); clFinish(m_data->m_queue); } if (1) { int numIter = 4; m_data->m_solverGPU->m_nIterations = numIter;//10 if (!gCpuSolveConstraint) { B3_PROFILE("GPU solveContactConstraint"); /*m_data->m_solverGPU->solveContactConstraint( m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut , maxNumBatches); */ //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes); if (gUseLargeBatches) { solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut , maxNumBatches,numIter,&m_data->m_batchSizes); } else { solveContactConstraint( m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut , maxNumBatches,numIter,&m_data->m_batchSizes);//m_data->m_batchSizesGpu); } } else { B3_PROFILE("Host solveContactConstraint"); m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches,&m_data->m_batchSizes); } } #if 0 if (0) { B3_PROFILE("read body velocities back to CPU"); //read body updated linear/angular velocities back to CPU m_data->m_bodyBufferGPU->read( m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies); adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL ); } #endif } }
void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs) { BT_PROFILE("computeContactsAndSolver"); bool bGPU = (m_internalData != 0); int maxBodyIndex = m_internalData->m_numAcceleratedRigidBodies; if (!maxBodyIndex) return; int numOfConvexRBodies = maxBodyIndex; ChNarrowphaseBase::Config cfgNP; cfgNP.m_collisionMargin = 0.01f; int nContactOut = 0; //printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size); btOpenCLArray<int2> broadphasePairsGPU(m_context,m_queue); broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs); bool useCulling = true; if (useCulling) { BT_PROFILE("ChNarrowphase::culling"); clFinish(m_queue); numPairsOut = m_internalData->m_narrowPhase->culling( &broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_convexPairsOutGPU, cfgNP); } { if (m_planeBodyIndex>=0) { BT_PROFILE("ChNarrowphase:: plane versus convex"); //todo: get rid of this dynamic allocation int2* hostPairs = new int2[m_internalData->m_numAcceleratedRigidBodies-1]; int index=0; for (int i=0;i<m_internalData->m_numAcceleratedRigidBodies;i++) { if (i!=m_planeBodyIndex) { hostPairs[index].x = m_planeBodyIndex; hostPairs[index].y = i; index++; } } assert(m_internalData->m_numAcceleratedRigidBodies-1 == index); m_internalData->m_planePairs->copyFromHostPointer(hostPairs,index); clFinish(m_queue); delete[]hostPairs; //convex versus plane m_internalData->m_narrowPhase->execute(m_internalData->m_planePairs, index, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, 0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP); } } { BT_PROFILE("ChNarrowphase::execute"); if (useCulling) { //convex versus convex //m_internalData->m_narrowPhase->execute(m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP); #define USE_CONVEX_CONVEX_HOST 1 #ifdef USE_CONVEX_CONVEX_HOST m_internalData->m_convexPairsOutGPU->resize(numPairsOut); m_internalData->m_pBufContactOutGPU->resize(nContactOut); m_internalData->m_gpuSatCollision->computeConvexConvexContactsHost( m_internalData->m_convexPairsOutGPU, numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP, m_internalData->m_convexPolyhedra,m_internalData->m_convexVertices,m_internalData->m_uniqueEdges, m_internalData->m_convexFaces,m_internalData->m_convexIndices); #else m_internalData->m_narrowPhase->execute( m_internalData->m_convexPairsOutGPU, numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP); #endif } else { m_internalData->m_narrowPhase->execute(&broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP); } clFinish(m_queue); } if (!nContactOut) return; bool useSolver = true;//true;//false; if (useSolver) { float dt=1./60.; SolverBase::ConstraintCfg csCfg( dt ); csCfg.m_enableParallelSolve = true; csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent; csCfg.m_staticIdx = m_planeBodyIndex; btOpenCLArray<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU; const btOpenCLArray<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU; void* additionalData = m_internalData->m_frictionCGPU; const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU; SolverData contactCOut = m_internalData->m_contactCGPU; int nContacts = nContactOut; bool useCPU=false; { BT_PROFILE("GPU batch"); { //@todo: just reserve it, without copy of original contact (unless we use warmstarting) if( m_internalData->m_solverGPU->m_contactBuffer) { m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts); } if( m_internalData->m_solverGPU->m_contactBuffer == 0 ) { m_internalData->m_solverGPU->m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue, nContacts ); m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts); } btOpenCLArray<Contact4>* contactNative = contactsIn; const btOpenCLArray<RigidBodyBase::Body>* bodyNative = bodyBuf; { //btOpenCLArray<RigidBodyBase::Body>* bodyNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf ); //btOpenCLArray<Contact4>* contactNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn ); const int sortAlignment = 512; // todo. get this out of sort if( csCfg.m_enableParallelSolve ) { int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); btOpenCLArray<u32>* countsNative = m_internalData->m_solverGPU->m_numConstraints; btOpenCLArray<u32>* offsetsNative = m_internalData->m_solverGPU->m_offsets; { // 2. set cell idx BT_PROFILE("GPU set cell idx"); struct CB { int m_nContacts; int m_staticIdx; float m_scale; int m_nSplit; }; ADLASSERT( sortSize%64 == 0 ); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = csCfg.m_staticIdx; cdata.m_scale = 1.f/(BT_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent); cdata.m_nSplit = BT_SOLVER_N_SPLIT; m_internalData->m_solverGPU->m_sortDataBuffer->resize(nContacts); btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL()), btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_setSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( sortSize, 64 ); } bool gpuRadixSort=true; if (gpuRadixSort) { // 3. sort by cell idx BT_PROFILE("gpuRadixSort"); int n = BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT; int sortBit = 32; //if( n <= 0xffff ) sortBit = 16; //if( n <= 0xff ) sortBit = 8; //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize ); //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); btOpenCLArray<btSortData>& keyValuesInOut = *(m_internalData->m_solverGPU->m_sortDataBuffer); this->m_internalData->m_solverGPU->m_sort32->execute(keyValuesInOut); /*btAlignedObjectArray<btSortData> hostValues; keyValuesInOut.copyToHost(hostValues); printf("hostValues.size=%d\n",hostValues.size()); */ } { // 4. find entries BT_PROFILE("gpuBoundSearch"); m_internalData->m_solverGPU->m_search->execute(*m_internalData->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT,btBoundSearchCL::COUNT); //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, // BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); //unsigned int sum; m_internalData->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT);//,&sum ); //printf("sum = %d\n",sum); } { // 5. sort constraints by cellIdx { BT_PROFILE("gpu m_reorderContactKernel"); btInt4 cdata; cdata.x = nContacts; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL()) , btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; btLauncherCL launcher(m_queue,m_internalData->m_solverGPU->m_reorderContactKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); } } } } clFinish(m_queue); { BT_PROFILE("gpu m_copyConstraintKernel"); btInt4 cdata; cdata.x = nContacts; btBufferInfoCL bInfo[] = { btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactNative->getBufferCL() ) }; btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_copyConstraintKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); clFinish(m_queue); } bool compareGPU = false; if (gpuBatchContacts) { BT_PROFILE("gpu batchContacts"); m_internalData->m_solverGPU->batchContacts( contactNative, nContacts, m_internalData->m_solverGPU->m_numConstraints, m_internalData->m_solverGPU->m_offsets, csCfg.m_staticIdx ); } if (1) { BT_PROFILE("gpu convertToConstraints"); m_internalData->m_solverGPU->convertToConstraints( bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, csCfg ); clFinish(m_queue); } } } if (1) { BT_PROFILE("GPU solveContactConstraint"); m_internalData->m_solverGPU->m_nIterations = 4;//10 m_internalData->m_solverGPU->solveContactConstraint(m_internalData->m_bodyBufferGPU, m_internalData->m_inertiaBufferGPU, m_internalData->m_contactCGPU, 0, nContactOut ); clFinish(m_queue); } #if 0 if (0) { BT_PROFILE("read body velocities back to CPU"); //read body updated linear/angular velocities back to CPU m_internalData->m_bodyBufferGPU->read( m_internalData->m_bodyBufferCPU->m_ptr,numOfConvexRBodies); adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL ); } #endif } }