void Solver::convertToConstraints( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf, btOpenCLArray<Contact4>* contactsIn, btOpenCLArray<Constraint4>* contactCOut, void* additionalData, int nContacts, const ConstraintCfg& cfg ) { btOpenCLArray<Constraint4>* constraintNative =0; struct CB { int m_nContacts; float m_dt; float m_positionDrift; float m_positionConstraintCoeff; }; { BT_PROFILE("m_contactToConstraintKernel"); CB cdata; cdata.m_nContacts = nContacts; cdata.m_dt = cfg.m_dt; cdata.m_positionDrift = cfg.m_positionDrift; cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( shapeBuf->getBufferCL()), btBufferInfoCL( contactCOut->getBufferCL() ) }; btLauncherCL launcher( m_queue, m_contactToConstraintKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); //launcher.setConst( cdata ); launcher.setConst(cdata.m_nContacts); launcher.setConst(cdata.m_dt); launcher.setConst(cdata.m_positionDrift); launcher.setConst(cdata.m_positionConstraintCoeff); launcher.launch1D( nContacts, 64 ); clFinish(m_queue); } contactCOut->resize(nContacts); }
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, unsigned int nSrc, btOpenCLArray<unsigned int>& dst, unsigned int nDst, Option option ) { btInt4 constBuffer; constBuffer.x = nSrc; constBuffer.y = nDst; if( option == BOUND_LOWER ) { btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) }; btLauncherCL launcher( m_queue, m_lowerSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( nSrc ); launcher.setConst( nDst ); launcher.launch1D( nSrc, 64 ); } else if( option == BOUND_UPPER ) { btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) }; btLauncherCL launcher(m_queue, m_upperSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( nSrc ); launcher.setConst( nDst ); launcher.launch1D( nSrc+1, 64 ); } else if( option == COUNT ) { btAssert( m_lower ); btAssert( m_upper ); btAssert( m_lower->capacity() <= (int)nDst ); btAssert( m_upper->capacity() <= (int)nDst ); int zero = 0; m_filler->execute( *m_lower, zero, nDst ); m_filler->execute( *m_upper, zero, nDst ); execute( src, nSrc, *m_lower, nDst, BOUND_LOWER ); execute( src, nSrc, *m_upper, nDst, BOUND_UPPER ); { btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) }; btLauncherCL launcher( m_queue, m_subtractKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( nSrc ); launcher.setConst( nDst ); launcher.launch1D( nDst, 64 ); } } else { btAssert( 0 ); } }
// Fills n elements of the GPU array 'src' with 'value'.
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int& value, int n, int offset)
{
	btAssert( n>0 );

	{
		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };

		btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( n );
		launcher.setConst(value);
		// NOTE(review): 'offset' is never forwarded to the kernel, so this
		// overload always fills starting at element 0. The btInt2 overload does
		// pass its offset — confirm against the kernel signature whether an
		// offset argument is expected here.
		launcher.launch1D( n );
	}
}
// Fills n elements of the GPU int array 'src', starting at 'offset', with 'value'.
void btFillCL::execute(btOpenCLArray<int>& src, const int& value, int n, int offset)
{
	btAssert( n>0 );
	btConstData constBuffer;
	{
		constBuffer.m_offset = offset;
		constBuffer.m_n = n;
		// The signed value is replicated into an unsigned int4 — the bit
		// pattern is identical, so the unsigned fill kernel can be reused.
		constBuffer.m_UnsignedData = btMakeUnsignedInt4( value,value,value,value );
	}
	{
		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };

		// NOTE(review): this launches m_fillUnsignedIntKernel with a single
		// struct constant, while the unsigned-int overload launches the same
		// kernel with two scalar constants — both argument layouts cannot match
		// one kernel signature; verify which layout the kernel actually expects.
		btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( constBuffer );
		launcher.launch1D( n );
	}
}
// Fills n btInt2 elements of the GPU array 'src', starting at 'offset', with 'value'.
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	btAssert( n>0 );

	{
		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };

		btLauncherCL launcher(m_commandQueue, m_fillKernelInt2);
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		// The kernel takes the three scalars directly; the old btConstData
		// packing was dead code (its struct was assembled but never passed).
		launcher.setConst(n);
		launcher.setConst(value);
		launcher.setConst(offset);
		launcher.launch1D( n );
	}
}
// Sweep-and-prune broadphase: produces the set of overlapping AABB pairs into
// m_overlappingPairs. With forceHost it runs a brute-force O(n^2) CPU reference
// path; otherwise it runs the GPU pipeline: sync AABBs, flip-float sort keys,
// radix sort along one axis, scatter into sorted order, then the SAP kernels.
void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
	int axis = 0;//todo on GPU for now hardcode
	btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());

	if (forceHost)
	{
		// ---- CPU reference path (brute force, for debugging/validation) ----
		btAlignedObjectArray<btSapAabb> allHostAabbs;
		m_allAabbsGPU.copyToHost(allHostAabbs);

		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int j=0;j<numSmallAabbs;j++)
			{
				//sync aabb: index of the unsorted master AABB is stashed in m_signedMaxIndices[3]
				int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
				m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}
		{
			int numLargeAabbs = m_largeAabbsCPU.size();
			for (int j=0;j<numLargeAabbs;j++)
			{
				//sync aabb
				int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
				m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
				m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
			}
		}

		btAlignedObjectArray<btInt2> hostPairs;

		// small-vs-small overlap tests
		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int i=0;i<numSmallAabbs;i++)
			{
				// NOTE(review): 'reference' is computed but unused in this loop.
				float reference = m_smallAabbsCPU[i].m_max[axis];

				for (int j=i+1;j<numSmallAabbs;j++)
				{
					if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
						(btVector3&)m_smallAabbsCPU[j].m_min,(btVector3&)m_smallAabbsCPU[j].m_max))
					{
						btInt2 pair;
						pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
						pair.y = m_smallAabbsCPU[j].m_minIndices[3];
						hostPairs.push_back(pair);
					}
				}
			}
		}

		// small-vs-large overlap tests
		{
			int numSmallAabbs = m_smallAabbsCPU.size();
			for (int i=0;i<numSmallAabbs;i++)
			{
				// NOTE(review): 'reference' is unused here as well.
				float reference = m_smallAabbsCPU[i].m_max[axis];
				int numLargeAabbs = m_largeAabbsCPU.size();
				for (int j=0;j<numLargeAabbs;j++)
				{
					if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
						(btVector3&)m_largeAabbsCPU[j].m_min,(btVector3&)m_largeAabbsCPU[j].m_max))
					{
						btInt2 pair;
						pair.x = m_largeAabbsCPU[j].m_minIndices[3];
						pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
						hostPairs.push_back(pair);
					}
				}
			}
		}

		if (hostPairs.size())
		{
			m_overlappingPairs.copyFromHost(hostPairs);
		} else
		{
			m_overlappingPairs.resize(0);
		}
		return;
	}

	// ---- GPU path ----
	{
		bool syncOnHost = false;

		// Step 1a: refresh the small-AABB working set from the master AABB array.
		if (syncOnHost)
		{
			BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
			btAlignedObjectArray<btSapAabb> allHostAabbs;
			m_allAabbsGPU.copyToHost(allHostAabbs);
			m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
			{
				int numSmallAabbs = m_smallAabbsCPU.size();
				for (int j=0;j<numSmallAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
					m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
					m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
		} else
		{
			{
				int numSmallAabbs = m_smallAabbsGPU.size();
				BT_PROFILE("copyAabbsKernelSmall");
				btBufferInfoCL bInfo[] = {
					btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
					btBufferInfoCL( m_smallAabbsGPU.getBufferCL()),
				};
				btLauncherCL launcher(m_queue, m_copyAabbsKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
				launcher.setConst( numSmallAabbs );
				int num = numSmallAabbs;
				launcher.launch1D( num);
				clFinish(m_queue);
			}
		}

		// Step 1b: refresh the large-AABB working set the same way.
		if (syncOnHost)
		{
			BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
			btAlignedObjectArray<btSapAabb> allHostAabbs;
			m_allAabbsGPU.copyToHost(allHostAabbs);
			m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
			{
				int numLargeAabbs = m_largeAabbsCPU.size();
				for (int j=0;j<numLargeAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
					m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
					m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
		} else
		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs)
			{
				BT_PROFILE("copyAabbsKernelLarge");
				btBufferInfoCL bInfo[] = {
					btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
					btBufferInfoCL( m_largeAabbsGPU.getBufferCL()),
				};
				btLauncherCL launcher(m_queue, m_copyAabbsKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				int num = numLargeAabbs;
				launcher.launch1D( num);
				clFinish(m_queue);
			}
		}

		BT_PROFILE("GPU SAP");

		int numSmallAabbs = m_smallAabbsGPU.size();
		m_gpuSmallSortData.resize(numSmallAabbs);
		// NOTE(review): reads m_smallAabbsGPU, not m_largeAabbsGPU — looks like a
		// copy/paste slip. The variable is shadowed by a fresh read of
		// m_largeAabbsGPU.size() before any use below, so it appears harmless,
		// but it should be confirmed and cleaned up.
		int numLargeAabbs = m_smallAabbsGPU.size();

#if 1
		// Step 2: build sort keys (flipped floats so negative values sort correctly).
		if (m_smallAabbsGPU.size())
		{
			BT_PROFILE("flipFloatKernel");
			btBufferInfoCL bInfo[] = {
				btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ),
				btBufferInfoCL( m_gpuSmallSortData.getBufferCL())};
			btLauncherCL launcher(m_queue, m_flipFloatKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		// Step 3: radix sort along the sweep axis.
		{
			BT_PROFILE("gpu radix sort\n");
			m_sorter->execute(m_gpuSmallSortData);
			clFinish(m_queue);
		}

		// Step 4: scatter the AABBs into sorted order.
		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
		if (numSmallAabbs)
		{
			BT_PROFILE("scatterKernel");
			btBufferInfoCL bInfo[] = {
				btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ),
				btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),
				btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
			btLauncherCL launcher(m_queue, m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numSmallAabbs);
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		// Pair output capacity is capped; the kernels clamp against maxPairs.
		int maxPairsPerBody = 64;
		int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
		m_overlappingPairs.resize(maxPairs);

		btOpenCLArray<int> pairCount(m_context, m_queue);
		pairCount.push_back(0);
		int numPairs=0;

		// Step 5a: large-vs-small pairs (2D launch: one test per (large, small) pair).
		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs && numSmallAabbs)
			{
				BT_PROFILE("sap2Kernel");
				btBufferInfoCL bInfo[] = {
					btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),
					btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ),
					btBufferInfoCL( m_overlappingPairs.getBufferCL() ),
					btBufferInfoCL(pairCount.getBufferCL())};
				btLauncherCL launcher(m_queue, m_sap2Kernel);
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				launcher.setConst( numSmallAabbs);
				launcher.setConst( axis );
				launcher.setConst( maxPairs );
				//@todo: use actual maximum work item sizes of the device instead of hardcoded values
				launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);

				numPairs = pairCount.at(0);
				if (numPairs >maxPairs)
					numPairs =maxPairs;
			}
		}

		// Step 5b: small-vs-small pairs via the sweep over the sorted AABBs.
		if (m_gpuSmallSortedAabbs.size())
		{
			BT_PROFILE("sapKernel");
			btBufferInfoCL bInfo[] = {
				btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ),
				btBufferInfoCL( m_overlappingPairs.getBufferCL() ),
				btBufferInfoCL(pairCount.getBufferCL())};
			btLauncherCL launcher(m_queue, m_sapKernel);
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );
			launcher.setConst( maxPairs );

			int num = numSmallAabbs;
#if 0
			// Disabled: capture kernel arguments to disk for offline replay.
			int buffSize = launcher.getSerializationBufferSize();
			unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
			for (int i=0;i<buffSize+1;i++)
			{
				unsigned char* ptr = (unsigned char*)&buf[i];
				*ptr = 0xff;
			}
			int actualWrite = launcher.serializeArguments(buf,buffSize);
			unsigned char* cptr = (unsigned char*)&buf[buffSize];
			//	printf("buf[buffSize] = %d\n",*cptr);
			assert(buf[buffSize]==0xff);//check for buffer overrun
			int* ptr = (int*)&buf[buffSize];
			*ptr = num;
			FILE* f = fopen("m_sapKernelArgs.bin","wb");
			fwrite(buf,buffSize+sizeof(int),1,f);
			fclose(f);
#endif//
			launcher.launch1D( num);
			clFinish(m_queue);

			numPairs = pairCount.at(0);
			if (numPairs>maxPairs)
				numPairs = maxPairs;
		}
#else
		// Replay path: deserialize previously captured sapKernel arguments.
		int numPairs = 0;
		btLauncherCL launcher(m_queue, m_sapKernel);
		const char* fileName = "m_sapKernelArgs.bin";
		FILE* f = fopen(fileName,"rb");
		if (f)
		{
			int sizeInBytes=0;
			if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
			{
				printf("error, cannot get file size\n");
				exit(0);
			}
			unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
			fread(buf,sizeInBytes,1,f);
			int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
			int num = *(int*)&buf[serializedBytes];
			launcher.launch1D( num);

			btOpenCLArray<int> pairCount(m_context, m_queue);
			int numElements = launcher.m_arrays[2]->size()/sizeof(int);
			pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
			numPairs = pairCount.at(0);
			//printf("overlapping pairs = %d\n",numPairs);
			btAlignedObjectArray<btInt2> hostOoverlappingPairs;
			btOpenCLArray<btInt2> tmpGpuPairs(m_context,m_queue);
			tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
			tmpGpuPairs.copyToHost(hostOoverlappingPairs);
			m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
			//printf("hello %d\n", m_overlappingPairs.size());
			free(buf);
			fclose(f);
		} else
		{
			printf("error: cannot find file %s\n",fileName);
		}
		clFinish(m_queue);
#endif

		// Shrink the pair array to the actual number of pairs written.
		m_overlappingPairs.resize(numPairs);
	}//BT_PROFILE("GPU_RADIX SORT");
}
// Full narrowphase + constraint-solver step for one frame: culls broadphase
// pairs, generates plane-vs-convex and convex-vs-convex contacts, sorts and
// batches them by cell, converts them to constraints and runs the GPU solver.
//
// broadphasePairs    : cl_mem holding int2 body-index pairs from the broadphase
// numBroadphasePairs : number of pairs in that buffer
void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs)
{
	BT_PROFILE("computeContactsAndSolver");

	// NOTE(review): 'bGPU' is computed but never used below.
	bool bGPU = (m_internalData != 0);
	int maxBodyIndex = m_internalData->m_numAcceleratedRigidBodies;
	if (!maxBodyIndex)
		return;
	int numOfConvexRBodies = maxBodyIndex;

	ChNarrowphaseBase::Config cfgNP;
	cfgNP.m_collisionMargin = 0.01f;
	int nContactOut = 0;
	//printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size);

	// Wrap the raw cl_mem so the narrowphase can consume it without a copy.
	btOpenCLArray<int2> broadphasePairsGPU(m_context,m_queue);
	broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs);

	bool useCulling = true;
	if (useCulling)
	{
		BT_PROFILE("ChNarrowphase::culling");
		clFinish(m_queue);
		// NOTE(review): 'numPairsOut' is not declared in this function —
		// presumably a member (or declared earlier in the file); confirm.
		numPairsOut = m_internalData->m_narrowPhase->culling(
			&broadphasePairsGPU,
			numBroadphasePairs,
			m_internalData->m_bodyBufferGPU,
			m_internalData->m_ShapeBuffer,
			m_internalData->m_convexPairsOutGPU,
			cfgNP);
	}

	// Pair every body against the single static plane body (if present).
	{
		if (m_planeBodyIndex>=0)
		{
			BT_PROFILE("ChNarrowphase:: plane versus convex");
			//todo: get rid of this dynamic allocation
			int2* hostPairs = new int2[m_internalData->m_numAcceleratedRigidBodies-1];
			int index=0;
			for (int i=0;i<m_internalData->m_numAcceleratedRigidBodies;i++)
			{
				if (i!=m_planeBodyIndex)
				{
					hostPairs[index].x = m_planeBodyIndex;
					hostPairs[index].y = i;
					index++;
				}
			}
			assert(m_internalData->m_numAcceleratedRigidBodies-1 == index);
			m_internalData->m_planePairs->copyFromHostPointer(hostPairs,index);
			clFinish(m_queue);
			delete[]hostPairs;

			//convex versus plane
			m_internalData->m_narrowPhase->execute(m_internalData->m_planePairs, index, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, 0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
		}
	}

	// Convex-vs-convex contact generation.
	{
		BT_PROFILE("ChNarrowphase::execute");
		if (useCulling)
		{
			//convex versus convex
			//m_internalData->m_narrowPhase->execute(m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
#define USE_CONVEX_CONVEX_HOST 1
#ifdef USE_CONVEX_CONVEX_HOST
			// Host-side SAT contact generation path (currently the active one).
			m_internalData->m_convexPairsOutGPU->resize(numPairsOut);
			m_internalData->m_pBufContactOutGPU->resize(nContactOut);
			m_internalData->m_gpuSatCollision->computeConvexConvexContactsHost(
				m_internalData->m_convexPairsOutGPU,
				numPairsOut,
				m_internalData->m_bodyBufferGPU,
				m_internalData->m_ShapeBuffer,
				m_internalData->m_pBufContactOutGPU,
				nContactOut,
				cfgNP,
				m_internalData->m_convexPolyhedra,m_internalData->m_convexVertices,m_internalData->m_uniqueEdges,
				m_internalData->m_convexFaces,m_internalData->m_convexIndices);
#else
			m_internalData->m_narrowPhase->execute(
				m_internalData->m_convexPairsOutGPU,
				numPairsOut,
				m_internalData->m_bodyBufferGPU,
				m_internalData->m_ShapeBuffer,
				m_internalData->m_pBufContactOutGPU,
				nContactOut,
				cfgNP);
#endif
		} else
		{
			// No culling: feed the raw broadphase pairs straight in.
			m_internalData->m_narrowPhase->execute(&broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
		}
		clFinish(m_queue);
	}

	if (!nContactOut)
		return;

	bool useSolver = true;//true;//false;
	if (useSolver)
	{
		float dt=1./60.;
		SolverBase::ConstraintCfg csCfg( dt );
		csCfg.m_enableParallelSolve = true;
		csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
		csCfg.m_staticIdx = m_planeBodyIndex;

		btOpenCLArray<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU;
		const btOpenCLArray<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU;
		void* additionalData = m_internalData->m_frictionCGPU;
		const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU;
		SolverData contactCOut = m_internalData->m_contactCGPU;
		int nContacts = nContactOut;
		// NOTE(review): 'useCPU' is set but never read.
		bool useCPU=false;

		{
			BT_PROFILE("GPU batch");
			{
				//@todo: just reserve it, without copy of original contact (unless we use warmstarting)
				// Lazily (re)size the solver's scratch contact buffer.
				if( m_internalData->m_solverGPU->m_contactBuffer)
				{
					m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts);
				}

				if( m_internalData->m_solverGPU->m_contactBuffer == 0 )
				{
					m_internalData->m_solverGPU->m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue, nContacts );
					m_internalData->m_solverGPU->m_contactBuffer->resize(nContacts);
				}

				btOpenCLArray<Contact4>* contactNative = contactsIn;
				// NOTE(review): 'bodyNative' is unused below.
				const btOpenCLArray<RigidBodyBase::Body>* bodyNative = bodyBuf;

				{
					//btOpenCLArray<RigidBodyBase::Body>* bodyNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
					//btOpenCLArray<Contact4>* contactNative = btOpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );

					const int sortAlignment = 512; // todo. get this out of sort
					if( csCfg.m_enableParallelSolve )
					{
						int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

						btOpenCLArray<u32>* countsNative = m_internalData->m_solverGPU->m_numConstraints;
						btOpenCLArray<u32>* offsetsNative = m_internalData->m_solverGPU->m_offsets;

						{
							// 2. set cell idx: assign each contact to a spatial grid cell
							BT_PROFILE("GPU set cell idx");
							struct CB
							{
								int m_nContacts;
								int m_staticIdx;
								float m_scale;
								int m_nSplit;
							};

							ADLASSERT( sortSize%64 == 0 );
							CB cdata;
							cdata.m_nContacts = nContacts;
							cdata.m_staticIdx = csCfg.m_staticIdx;
							cdata.m_scale = 1.f/(BT_SOLVER_N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
							cdata.m_nSplit = BT_SOLVER_N_SPLIT;

							m_internalData->m_solverGPU->m_sortDataBuffer->resize(nContacts);

							btBufferInfoCL bInfo[] = {
								btBufferInfoCL( contactNative->getBufferCL() ),
								btBufferInfoCL( bodyBuf->getBufferCL()),
								btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
							btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_setSortDataKernel );
							launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
							launcher.setConst( cdata );
							launcher.launch1D( sortSize, 64 );
						}

						bool gpuRadixSort=true;
						if (gpuRadixSort)
						{
							// 3. sort by cell idx
							BT_PROFILE("gpuRadixSort");
							// NOTE(review): 'n' and 'sortBit' are kept only as
							// documentation of the old adl sort configuration.
							int n = BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT;
							int sortBit = 32;
							//if( n <= 0xffff ) sortBit = 16;
							//if( n <= 0xff ) sortBit = 8;
							//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
							//adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
							btOpenCLArray<btSortData>& keyValuesInOut = *(m_internalData->m_solverGPU->m_sortDataBuffer);
							this->m_internalData->m_solverGPU->m_sort32->execute(keyValuesInOut);
							/*btAlignedObjectArray<btSortData> hostValues;
							keyValuesInOut.copyToHost(hostValues);
							printf("hostValues.size=%d\n",hostValues.size());
							*/
						}

						{
							// 4. find entries: per-cell counts, then prefix-scan into offsets
							BT_PROFILE("gpuBoundSearch");
							m_internalData->m_solverGPU->m_search->execute(*m_internalData->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT,btBoundSearchCL::COUNT);
							//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
							//	BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );

							//unsigned int sum;
							m_internalData->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, BT_SOLVER_N_SPLIT*BT_SOLVER_N_SPLIT);//,&sum );
							//printf("sum = %d\n",sum);
						}

						{
							// 5. sort constraints by cellIdx: scatter contacts
							// into the solver's scratch buffer in sorted order.
							{
								BT_PROFILE("gpu m_reorderContactKernel");
								btInt4 cdata;
								cdata.x = nContacts;
								btBufferInfoCL bInfo[] = {
									btBufferInfoCL( contactNative->getBufferCL() ),
									btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL()),
									btBufferInfoCL( m_internalData->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
								btLauncherCL launcher(m_queue,m_internalData->m_solverGPU->m_reorderContactKernel);
								launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
								launcher.setConst( cdata );
								launcher.launch1D( nContacts, 64 );
							}
						}
					}
				}

				clFinish(m_queue);

				{
					// Copy the sorted contacts back over the original buffer.
					BT_PROFILE("gpu m_copyConstraintKernel");
					btInt4 cdata;
					cdata.x = nContacts;
					btBufferInfoCL bInfo[] = {
						btBufferInfoCL( m_internalData->m_solverGPU->m_contactBuffer->getBufferCL() ),
						btBufferInfoCL( contactNative->getBufferCL() ) };
					btLauncherCL launcher(m_queue, m_internalData->m_solverGPU->m_copyConstraintKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
					launcher.setConst( cdata );
					launcher.launch1D( nContacts, 64 );
					clFinish(m_queue);
				}

				// NOTE(review): 'compareGPU' is unused; 'gpuBatchContacts' is not
				// declared in this function — presumably a member/global; confirm.
				bool compareGPU = false;
				if (gpuBatchContacts)
				{
					BT_PROFILE("gpu batchContacts");
					m_internalData->m_solverGPU->batchContacts( contactNative, nContacts, m_internalData->m_solverGPU->m_numConstraints, m_internalData->m_solverGPU->m_offsets, csCfg.m_staticIdx );
				}

				if (1)
				{
					BT_PROFILE("gpu convertToConstraints");
					m_internalData->m_solverGPU->convertToConstraints( bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, csCfg );
					clFinish(m_queue);
				}
			}
		}

		if (1)
		{
			BT_PROFILE("GPU solveContactConstraint");
			m_internalData->m_solverGPU->m_nIterations = 4;//10
			m_internalData->m_solverGPU->solveContactConstraint(
				m_internalData->m_bodyBufferGPU,
				m_internalData->m_inertiaBufferGPU,
				m_internalData->m_contactCGPU,
				0,
				nContactOut );
			clFinish(m_queue);
		}

#if 0
		if (0)
		{
			BT_PROFILE("read body velocities back to CPU");
			//read body updated linear/angular velocities back to CPU
			m_internalData->m_bodyBufferGPU->read(
				m_internalData->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
		}
#endif
	}
}
// Splits the contacts into independent batches (per broad-phase grid cell) so
// the solver can process each batch in parallel without two work items touching
// the same body. The batched contacts are produced into m_contactBuffer.
//
// contacts      : input contact list (GPU)
// nContacts     : number of valid entries in 'contacts'
// nNative       : per-cell constraint counts (from the bound search)
// offsetsNative : per-cell start offsets (prefix sum of the counts)
// staticIdx     : index of the static body (may appear in many batches)
void Solver::batchContacts( btOpenCLArray<Contact4>* contacts, int nContacts, btOpenCLArray<u32>* nNative, btOpenCLArray<u32>* offsetsNative, int staticIdx )
{
	{
		BT_PROFILE("batch generation");

		// Constant block: x = contact count, z = static body index.
		btInt4 cdata;
		cdata.x = nContacts;
		cdata.y = 0;
		cdata.z = staticIdx;

		// One 64-wide work group per grid cell.
		int numWorkItems = 64*N_SPLIT*N_SPLIT;

#ifdef BATCH_DEBUG
		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
		adl::btOpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
		gpuDebugInfo.write(debugInfo,numWorkItems);
#endif

		btBufferInfoCL bInfo[] = {
			btBufferInfoCL( contacts->getBufferCL() ),
			btBufferInfoCL( m_contactBuffer->getBufferCL() ),
			btBufferInfoCL( nNative->getBufferCL() ),
			btBufferInfoCL( offsetsNative->getBufferCL() )
#ifdef BATCH_DEBUG
			, btBufferInfoCL(&gpuDebugInfo)
#endif
		};

		{
			BT_PROFILE("batchingKernel");
			btLauncherCL launcher( m_queue, m_batchingKernel);
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			// NOTE(review): only 'staticIdx' is sent as a constant — the cdata
			// struct above is assembled but no longer passed (see commented-out
			// call). Confirm the kernel signature expects the single int.
			//launcher.setConst( cdata );
			launcher.setConst(staticIdx);
			launcher.launch1D( numWorkItems, 64 );
			clFinish(m_queue);
		}

#ifdef BATCH_DEBUG
		// NOTE(review): the stray token 'aaaa' below breaks the build if
		// BATCH_DEBUG is ever defined — verify before enabling this path.
		aaaa
		Contact4* hostContacts = new Contact4[nContacts];
		m_contactBuffer->read(hostContacts,nContacts);
		clFinish(m_queue);

		gpuDebugInfo.read(debugInfo,numWorkItems);
		clFinish(m_queue);

		for (int i=0; i<numWorkItems; i++)
		{
			if (debugInfo[i].m_valInt1>0)
			{
				printf("catch\n");
			}
			if (debugInfo[i].m_valInt2>0)
			{
				printf("catch22\n");
			}
			if (debugInfo[i].m_valInt3>0)
			{
				printf("catch666\n");
			}
			if (debugInfo[i].m_valInt4>0)
			{
				printf("catch777\n");
			}
		}
		delete[] debugInfo;
#endif //BATCH_DEBUG
	}

	// copy buffer to buffer
	btAssert(m_contactBuffer->size()==nContacts);
	//contacts->copyFromOpenCLArray( *m_contactBuffer);
	//clFinish(m_queue);//needed?
}
void Solver::sortContacts( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, btOpenCLArray<Contact4>* contactsIn, void* additionalData, int nContacts, const Solver::ConstraintCfg& cfg ) { const int sortAlignment = 512; // todo. get this out of sort if( cfg.m_enableParallelSolve ) { int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment ); btOpenCLArray<u32>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost ); btOpenCLArray<u32>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost ); { // 2. set cell idx struct CB { int m_nContacts; int m_staticIdx; float m_scale; int m_nSplit; }; btAssert( sortSize%64 == 0 ); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = cfg.m_staticIdx; cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent); cdata.m_nSplit = N_SPLIT; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; btLauncherCL launcher( m_queue, m_setSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( sortSize, 64 ); } { // 3. sort by cell idx int n = N_SPLIT*N_SPLIT; int sortBit = 32; //if( n <= 0xffff ) sortBit = 16; //if( n <= 0xff ) sortBit = 8; m_sort32->execute(*m_sortDataBuffer,sortSize); } { // 4. find entries m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, btBoundSearchCL::COUNT); m_scan->execute( *countsNative, *offsetsNative, N_SPLIT*N_SPLIT ); } { // 5. sort constraints by cellIdx // todo. 
preallocate this // btAssert( contactsIn->getType() == TYPE_HOST ); // btOpenCLArray<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer { btInt4 cdata; cdata.x = nContacts; btBufferInfoCL bInfo[] = { btBufferInfoCL( contactsIn->getBufferCL() ), btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( m_sortDataBuffer->getBufferCL() ) }; btLauncherCL launcher( m_queue, m_reorderContactKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); } // BufferUtils::unmap<true>( out, contactsIn, nContacts ); } } }
// Iteratively solves the batched contact constraints on the GPU: first the
// contact (normal impulse) pass, then the friction pass. Each solver iteration
// processes the N_BATCHES independent batches in sequence; within a batch all
// constraints can be solved in parallel.
//
// bodyBuf        : rigid body state (GPU)
// shapeBuf       : per-body inertia data (GPU)
// constraint     : constraint rows produced by convertToConstraints
// additionalData : unused here
// n              : number of constraints
// maxNumBatches  : upper bound on batches, forwarded to the kernels (cdata.y)
void Solver::solveContactConstraint( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf, btOpenCLArray<Constraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
{
	btInt4 cdata = btMakeInt4( n, 0, 0, 0 );
	{
		const int nn = N_SPLIT*N_SPLIT;

		cdata.x = 0;
		cdata.y = maxNumBatches;//250;

		int numWorkItems = 64*nn/N_BATCHES;

#ifdef DEBUG_ME
		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
		adl::btOpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
#endif

		{
			BT_PROFILE("m_batchSolveKernel iterations");
			// Contact (normal impulse) pass.
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<N_BATCHES; ib++)
				{
#ifdef DEBUG_ME
					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
					gpuDebugInfo.write(debugInfo,numWorkItems);
#endif

					cdata.z = ib;       // which batch this launch solves
					cdata.w = N_SPLIT;

					btLauncherCL launcher( m_queue, m_solveContactKernel );
#if 1
					btBufferInfoCL bInfo[] = {
						btBufferInfoCL( bodyBuf->getBufferCL() ),
						btBufferInfoCL( shapeBuf->getBufferCL() ),
						btBufferInfoCL( constraint->getBufferCL() ),
						btBufferInfoCL( m_numConstraints->getBufferCL() ),
						btBufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
						, btBufferInfoCL(&gpuDebugInfo)
#endif
					};

					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
					// NOTE(review): cdata.x is deliberately not passed (see the
					// commented line) — the kernel receives only y, z, w.
					//launcher.setConst( cdata.x );
					launcher.setConst( cdata.y );
					launcher.setConst( cdata.z );
					launcher.setConst( cdata.w );
					launcher.launch1D( numWorkItems, 64 );
#else
					// Disabled replay path: deserialize previously captured
					// kernel arguments from disk and re-launch.
					const char* fileName = "m_batchSolveKernel.bin";
					FILE* f = fopen(fileName,"rb");
					if (f)
					{
						int sizeInBytes=0;
						if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
						{
							printf("error, cannot get file size\n");
							exit(0);
						}
						unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
						fread(buf,sizeInBytes,1,f);
						int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
						int num = *(int*)&buf[serializedBytes];
						launcher.launch1D( num);

						//this clFinish is for testing on errors
						clFinish(m_queue);
					}
#endif

#ifdef DEBUG_ME
					clFinish(m_queue);
					gpuDebugInfo.read(debugInfo,numWorkItems);
					clFinish(m_queue);

					for (int i=0; i<numWorkItems; i++)
					{
						if (debugInfo[i].m_valInt2>0)
						{
							printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
						}
						if (debugInfo[i].m_valInt3>0)
						{
							printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
						}
					}
#endif //DEBUG_ME
				}
			}
			clFinish(m_queue);
		}

		cdata.x = 1;
		bool applyFriction=true;
		if (applyFriction)
		{
			BT_PROFILE("m_batchSolveKernel iterations2");
			// Friction pass: same batch schedule, different kernel.
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<N_BATCHES; ib++)
				{
					cdata.z = ib;
					cdata.w = N_SPLIT;

					btBufferInfoCL bInfo[] = {
						btBufferInfoCL( bodyBuf->getBufferCL() ),
						btBufferInfoCL( shapeBuf->getBufferCL() ),
						btBufferInfoCL( constraint->getBufferCL() ),
						btBufferInfoCL( m_numConstraints->getBufferCL() ),
						btBufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
						,btBufferInfoCL(&gpuDebugInfo)
#endif //DEBUG_ME
					};
					btLauncherCL launcher( m_queue, m_solveFrictionKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
					//launcher.setConst( cdata.x );
					launcher.setConst( cdata.y );
					launcher.setConst( cdata.z );
					launcher.setConst( cdata.w );
					launcher.launch1D( 64*nn/N_BATCHES, 64 );
				}
			}
			clFinish(m_queue);
		}

#ifdef DEBUG_ME
		delete[] debugInfo;
#endif //DEBUG_ME
	}
}
void Solver::reorderConvertToConstraints( const btOpenCLArray<RigidBodyBase::Body>* bodyBuf, const btOpenCLArray<RigidBodyBase::Inertia>* shapeBuf, btOpenCLArray<Contact4>* contactsIn, btOpenCLArray<Constraint4>* contactCOut, void* additionalData, int nContacts, const Solver::ConstraintCfg& cfg ) { if( m_contactBuffer ) { m_contactBuffer->resize(nContacts); } if( m_contactBuffer == 0 ) { BT_PROFILE("new m_contactBuffer;"); m_contactBuffer = new btOpenCLArray<Contact4>(m_context,m_queue,nContacts ); m_contactBuffer->resize(nContacts); } //DeviceUtils::Config dhCfg; //Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg ); if( cfg.m_enableParallelSolve ) { clFinish(m_queue); // contactsIn -> m_contactBuffer { BT_PROFILE("sortContacts"); sortContacts( bodyBuf, contactsIn, additionalData, nContacts, cfg ); clFinish(m_queue); } { BT_PROFILE("m_copyConstraintKernel"); btInt4 cdata; cdata.x = nContacts; btBufferInfoCL bInfo[] = { btBufferInfoCL( m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactsIn->getBufferCL() ) }; // btLauncherCL launcher( m_queue, data->m_device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ -Wf,--c++", 0 ) ); btLauncherCL launcher( m_queue, m_copyConstraintKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nContacts, 64 ); clFinish(m_queue); } { BT_PROFILE("batchContacts"); Solver::batchContacts( contactsIn, nContacts, m_numConstraints, m_offsets, cfg.m_staticIdx ); } } { BT_PROFILE("waitForCompletion (batchContacts)"); clFinish(m_queue); } //================ { BT_PROFILE("convertToConstraints"); Solver::convertToConstraints( bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg ); } { BT_PROFILE("convertToConstraints waitForCompletion"); clFinish(m_queue); } }
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */) { int originalSize = keyValuesInOut.size(); int workingSize = originalSize; int dataAlignment = DATA_ALIGNMENT; #ifdef DEBUG_RADIXSORT2 btAlignedObjectArray<btSortData> test2; keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } #endif //DEBUG_RADIXSORT2 btOpenCLArray<btSortData>* src = 0; if (workingSize%dataAlignment) { workingSize += dataAlignment-(workingSize%dataAlignment); m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); m_workBuffer4->resize(workingSize); btSortData fillValue; fillValue.m_key = 0xffffffff; fillValue.m_value = 0xffffffff; #define USE_BTFILL #ifdef USE_BTFILL m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize); #else //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) for (int i=originalSize; i<workingSize;i++) { m_workBuffer4->copyFromHostPointer(&fillValue,1,i); } #endif//USE_BTFILL src = m_workBuffer4; } else { src = &keyValuesInOut; m_workBuffer4->resize(0); } btAssert( workingSize%DATA_ALIGNMENT == 0 ); int minCap = NUM_BUCKET*NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); btAssert( BITS_PER_PASS == 4 ); btAssert( WG_SIZE == 64 ); btAssert( (sortBits&0x3) == 0 ); btOpenCLArray<btSortData>* dst = m_workBuffer3; btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1; btOpenCLArray<unsigned int>* destHisto = m_workBuffer2; int nWGs = NUM_WGS; btConstData cdata; { int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 int nBlocks = (n+blockSize-1)/(blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; if( nBlocks < NUM_WGS ) { 
cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } int count=0; for(int ib=0; ib<sortBits; ib+=4) { #ifdef DEBUG_RADIXSORT2 keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { if (test2[i].m_key != test2[i].m_value) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } } #endif //DEBUG_RADIXSORT2 cdata.m_startBit = ib; { btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) }; btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); int num = NUM_WGS*WG_SIZE; launcher.launch1D( num, WG_SIZE ); } #ifdef DEBUG_RADIXSORT btAlignedObjectArray<unsigned int> testHist; srcHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } #endif //DEBUG_RADIXSORT //fast prefix scan is not working properly on Mac OSX yet #ifdef _WIN32 bool fastScan=true; #else bool fastScan=false; #endif if (fastScan) {// prefix scan group histogram btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) }; btLauncherCL launcher( m_commandQueue, m_prefixScanKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( 128, 128 ); destHisto = srcHisto; }else { //unsigned int sum; //for debugging m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); } #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } for (int i=0;i<testHist.size();i+=NUM_WGS) { printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); } #endif //DEBUG_RADIXSORT 
#define USE_GPU #ifdef USE_GPU {// local sort and distribute btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )}; btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); } #else { #define NUM_TABLES 16 //#define SEQUENTIAL #ifdef SEQUENTIAL int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; int startBit = ib; destHisto->copyToHost(testHist); btAlignedObjectArray<btSortData> srcHost; btAlignedObjectArray<btSortData> dstHost; dstHost.resize(src->size()); src->copyToHost(srcHost); for (int i=0;i<NUM_TABLES;i++) { tables[i] = testHist[i*NUM_WGS]; } // distribute for(int i=0; i<n; i++) { int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; counter2[tableIdx] ++; } #else int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; btAlignedObjectArray<btSortData> dstHostOK; dstHostOK.resize(src->size()); destHisto->copyToHost(testHist); btAlignedObjectArray<btSortData> srcHost; src->copyToHost(srcHost); int blockSize = 256; int nBlocksPerWG = cdata.m_nBlocksPerWG; int startBit = ib; { for (int i=0;i<NUM_TABLES;i++) { tables[i] = testHist[i*NUM_WGS]; } // distribute for(int i=0; i<n; i++) { int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; counter2[tableIdx] ++; } } btAlignedObjectArray<btSortData> dstHost; dstHost.resize(src->size()); int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) { int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); 
iblock++) { for (int lIdx = 0;lIdx < 64;lIdx++) { int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops // AMD: AtomInc performs better while NV prefers ++ for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) { if( addr+j < n ) { // printf ("addr+j=%d\n", addr+j); int i = addr+j; int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; btSortData ok = dstHostOK[destIndex]; if (ok.m_key != srcHost[i].m_key) { printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); } if (ok.m_value != srcHost[i].m_value) { printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); } dstHost[destIndex] = srcHost[i]; counter[tableIdx] ++; } } } } } #endif //SEQUENTIAL dst->copyFromHost(dstHost); } #endif//USE_GPU #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } #endif //DEBUG_RADIXSORT btSwap(src, dst ); btSwap(srcHisto,destHisto); #ifdef DEBUG_RADIXSORT2 keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { if (test2[i].m_key != test2[i].m_value) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } } #endif //DEBUG_RADIXSORT2 count++; }