void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache, int i)
{
	int numContacts = b3Contact4Data_getNumPoints(&newContactCache);
	if (i != (numContacts - 1))
	{
		b3Swap(newContactCache.m_localPosA[i], newContactCache.m_localPosA[numContacts - 1]);
		b3Swap(newContactCache.m_localPosB[i], newContactCache.m_localPosB[numContacts - 1]);
		b3Swap(newContactCache.m_worldPosB[i], newContactCache.m_worldPosB[numContacts - 1]);
	}
	b3Contact4Data_setNumPoints(&newContactCache, numContacts - 1);
}
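// removeContactPoint above uses the swap-with-last idiom: instead of shifting the
// array tail down, the doomed entry is exchanged with the last one and the count is
// decremented. A minimal standalone sketch of the same idiom, assuming nothing from
// Bullet beyond std::swap; removeSwapLast is a hypothetical helper name, not Bullet API.
#include <algorithm>  // std::swap

// O(1) unordered removal: element order is not preserved, which is acceptable
// for contact points and overlapping pairs.
template <typename T>
void removeSwapLast(T* data, int& count, int i)
{
	if (i != count - 1)
		std::swap(data[i], data[count - 1]);
	--count;
}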
b3BroadphasePair* b3HashedOverlappingPairCache::findPair(int proxy0, int proxy1)
{
	b3g_findPairs++;
	if (proxy0 > proxy1)
		b3Swap(proxy0, proxy1);
	int proxyId1 = proxy0;
	int proxyId2 = proxy1;

	/*if (proxyId1 > proxyId2)
		b3Swap(proxyId1, proxyId2);*/

	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));

	if (hash >= m_hashTable.size())
	{
		return NULL;
	}

	int index = m_hashTable[hash];
	while (index != B3_NULL_PAIR && equalsPair(m_overlappingPairArray[index], proxyId1, proxyId2) == false)
	{
		index = m_next[index];
	}

	if (index == B3_NULL_PAIR)
	{
		return NULL;
	}

	b3Assert(index < m_overlappingPairArray.size());

	return &m_overlappingPairArray[index];
}
b3BroadphasePair* b3HashedOverlappingPairCache::internalAddPair(int proxy0, int proxy1)
{
	if (proxy0 > proxy1)
		b3Swap(proxy0, proxy1);
	int proxyId1 = proxy0;
	int proxyId2 = proxy1;

	/*if (proxyId1 > proxyId2)
		b3Swap(proxyId1, proxyId2);*/

	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));  // New hash value with new mask

	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
	if (pair != NULL)
	{
		return pair;
	}
	/*for(int i=0;i<m_overlappingPairArray.size();++i)
	{
		if(	(m_overlappingPairArray[i].m_pProxy0==proxy0)&&
			(m_overlappingPairArray[i].m_pProxy1==proxy1))
		{
			printf("Adding duplicated %u<>%u\r\n",proxyId1,proxyId2);
			internalFindPair(proxy0, proxy1, hash);
		}
	}*/
	int count = m_overlappingPairArray.size();
	int oldCapacity = m_overlappingPairArray.capacity();
	void* mem = &m_overlappingPairArray.expandNonInitializing();

	//this is where we add an actual pair, so also call the 'ghost'
	//	if (m_ghostPairCallback)
	//		m_ghostPairCallback->addOverlappingPair(proxy0,proxy1);

	int newCapacity = m_overlappingPairArray.capacity();

	if (oldCapacity < newCapacity)
	{
		growTables();
		//hash with new capacity
		hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));
	}

	pair = new (mem) b3BroadphasePair(proxy0, proxy1);
	//	pair->m_pProxy0 = proxy0;
	//	pair->m_pProxy1 = proxy1;
	//pair->m_algorithm = 0;
	//pair->m_internalTmpValue = 0;

	m_next[count] = m_hashTable[hash];
	m_hashTable[hash] = count;

	return pair;
}
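// A minimal usage sketch of the hashed pair cache, assuming only the behavior
// visible above (addOverlappingPair routes to internalAddPair, and both add and
// find sort the two ids first, so argument order does not matter). The proxy ids
// 7 and 12 are illustrative; passing a null dispatcher to removeOverlappingPair
// is assumed safe here because no collision algorithm was attached to the pair.
void pairCacheUsageSketch()
{
	b3HashedOverlappingPairCache cache;

	cache.addOverlappingPair(12, 7);
	b3Assert(cache.findPair(7, 12) != 0);

	cache.removeOverlappingPair(7, 12, 0);
	b3Assert(cache.findPair(7, 12) == 0);
}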
void Process(const b3DbvtNode* na, const b3DbvtNode* nb)
{
	if (na != nb)
	{
		b3DbvtProxy* pa = (b3DbvtProxy*)na->data;
		b3DbvtProxy* pb = (b3DbvtProxy*)nb->data;
#if B3_DBVT_BP_SORTPAIRS
		if (pa->m_uniqueId > pb->m_uniqueId)
			b3Swap(pa, pb);
#endif
		pbp->m_paircache->addOverlappingPair(pa->getUid(), pb->getUid());
		++pbp->m_newpairs;
	}
}
inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3(b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
{
	//int sz = sizeof(b3BatchConstraint);
	B3_PROFILE("sortConstraintByBatch3");

	static int maxSwaps = 0;
	int numSwaps = 0;

	curUsed.resize(2 * simdWidth);

	static int maxNumConstraints = 0;
	if (maxNumConstraints < numConstraints)
	{
		maxNumConstraints = numConstraints;
		//printf("maxNumConstraints  = %d\n",maxNumConstraints );
	}

	int numUsedArray = numBodies / 32 + 1;
	bodyUsed.resize(numUsedArray);

	for (int q = 0; q < numUsedArray; q++)
		bodyUsed[q] = 0;

	int curBodyUsed = 0;

	int numIter = 0;

#if defined(_DEBUG)
	for (int i = 0; i < numConstraints; i++)
		cs[i].m_batchId = -1;
#endif

	int numValidConstraints = 0;
	//	int unprocessedConstraintIndex = 0;

	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");

		while (numValidConstraints < numConstraints)
		{
			numIter++;
			int nCurrentBatch = 0;
			// clear flag
			for (int i = 0; i < curBodyUsed; i++)
				bodyUsed[curUsed[i] / 32] = 0;

			curBodyUsed = 0;

			for (int i = numValidConstraints; i < numConstraints; i++)
			{
				int idx = i;
				b3Assert(idx < numConstraints);
				// check if it can go
				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
				int bodyA = abs(bodyAS);
				int bodyB = abs(bodyBS);
				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
				int aUnavailable = 0;
				int bUnavailable = 0;
				if (!aIsStatic)
				{
					aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
				}
				if (!aUnavailable)
					if (!bIsStatic)
					{
						bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
					}

				if (aUnavailable == 0 && bUnavailable == 0)  // ok
				{
					if (!aIsStatic)
					{
						bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
						curUsed[curBodyUsed++] = bodyA;
					}
					if (!bIsStatic)
					{
						bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
						curUsed[curBodyUsed++] = bodyB;
					}

					cs[idx].m_batchId = batchIdx;

					if (i != numValidConstraints)
					{
						b3Swap(cs[i], cs[numValidConstraints]);
						numSwaps++;
					}

					numValidConstraints++;
					{
						nCurrentBatch++;
						if (nCurrentBatch == simdWidth)
						{
							nCurrentBatch = 0;
							// batch is full: release all bodies for the next sub-batch
							for (int j = 0; j < curBodyUsed; j++)
								bodyUsed[curUsed[j] / 32] = 0;
							curBodyUsed = 0;
						}
					}
				}
			}
			m_gpuData->m_batchSizes.push_back(nCurrentBatch);
			batchIdx++;
		}
	}

#if defined(_DEBUG)
	//		debugPrintf( "nBatches: %d\n", batchIdx );
	for (int i = 0; i < numConstraints; i++)
	{
		b3Assert(cs[i].m_batchId != -1);
	}
#endif

	if (maxSwaps < numSwaps)
	{
		maxSwaps = numSwaps;
		//printf("maxSwaps = %d\n", maxSwaps);
	}

	return batchIdx;
}
void b3DynamicBvhBroadphase::collide(b3Dispatcher* dispatcher)
{
	/*printf("---------------------------------------------------------\n");
	printf("m_sets[0].m_leaves=%d\n",m_sets[0].m_leaves);
	printf("m_sets[1].m_leaves=%d\n",m_sets[1].m_leaves);
	printf("numPairs = %d\n",getOverlappingPairCache()->getNumOverlappingPairs());
	{
		int i;
		for (i=0;i<getOverlappingPairCache()->getNumOverlappingPairs();i++)
		{
			printf("pair[%d]=(%d,%d),",i,getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy0->getUid(),
				getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy1->getUid());
		}
		printf("\n");
	}
	*/

	b3SPC(m_profiling.m_total);

	/* optimize				*/
	m_sets[0].optimizeIncremental(1 + (m_sets[0].m_leaves * m_dupdates) / 100);
	if (m_fixedleft)
	{
		const int count = 1 + (m_sets[1].m_leaves * m_fupdates) / 100;
		m_sets[1].optimizeIncremental(1 + (m_sets[1].m_leaves * m_fupdates) / 100);
		m_fixedleft = b3Max<int>(0, m_fixedleft - count);
	}
	/* dynamic -> fixed set	*/
	m_stageCurrent = (m_stageCurrent + 1) % STAGECOUNT;
	b3DbvtProxy* current = m_stageRoots[m_stageCurrent];
	if (current)
	{
		b3DbvtTreeCollider collider(this);
		do
		{
			b3DbvtProxy* next = current->links[1];
			b3ListRemove(current, m_stageRoots[current->stage]);
			b3ListAppend(current, m_stageRoots[STAGECOUNT]);
#if B3_DBVT_BP_ACCURATESLEEPING
			m_paircache->removeOverlappingPairsContainingProxy(current, dispatcher);
			collider.proxy = current;
			b3DynamicBvh::collideTV(m_sets[0].m_root, current->aabb, collider);
			b3DynamicBvh::collideTV(m_sets[1].m_root, current->aabb, collider);
#endif
			m_sets[0].remove(current->leaf);
			B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
			curAabb = b3DbvtVolume::FromMM(current->m_aabbMin, current->m_aabbMax);
			current->leaf = m_sets[1].insert(curAabb, current);
			current->stage = STAGECOUNT;
			current = next;
		} while (current);
		m_fixedleft = m_sets[1].m_leaves;
		m_needcleanup = true;
	}
	/* collide dynamics		*/
	{
		b3DbvtTreeCollider collider(this);
		if (m_deferedcollide)
		{
			b3SPC(m_profiling.m_fdcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[1].m_root, collider);
		}
		if (m_deferedcollide)
		{
			b3SPC(m_profiling.m_ddcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[0].m_root, collider);
		}
	}
	/* clean up				*/
	if (m_needcleanup)
	{
		b3SPC(m_profiling.m_cleanup);
		b3BroadphasePairArray& pairs = m_paircache->getOverlappingPairArray();
		if (pairs.size() > 0)
		{
			int ni = b3Min(pairs.size(), b3Max<int>(m_newpairs, (pairs.size() * m_cupdates) / 100));
			for (int i = 0; i < ni; ++i)
			{
				b3BroadphasePair& p = pairs[(m_cid + i) % pairs.size()];
				b3DbvtProxy* pa = &m_proxies[p.x];
				b3DbvtProxy* pb = &m_proxies[p.y];
				if (!b3Intersect(pa->leaf->volume, pb->leaf->volume))
				{
#if B3_DBVT_BP_SORTPAIRS
					if (pa->m_uniqueId > pb->m_uniqueId)
						b3Swap(pa, pb);
#endif
					m_paircache->removeOverlappingPair(pa->getUid(), pb->getUid(), dispatcher);
					--ni;
					--i;
				}
			}
			if (pairs.size() > 0)
				m_cid = (m_cid + ni) % pairs.size();
			else
				m_cid = 0;
		}
	}
	++m_pid;
	m_newpairs = 1;
	m_needcleanup = false;
	if (m_updates_call > 0)
	{
		m_updates_ratio = m_updates_done / (b3Scalar)m_updates_call;
	}
	else
	{
		m_updates_ratio = 0;
	}
	m_updates_done /= 2;
	m_updates_call /= 2;
}
inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
{
	B3_PROFILE("sortConstraintByBatch2");

	bodyUsed2.resize(2 * simdWidth);

	for (int q = 0; q < 2 * simdWidth; q++)
		bodyUsed2[q] = 0;

	int curBodyUsed = 0;

	int numIter = 0;

	m_data->m_sortData.resize(numConstraints);
	m_data->m_idxBuffer.resize(numConstraints);
	m_data->m_old.resize(numConstraints);

	unsigned int* idxSrc = &m_data->m_idxBuffer[0];

#if defined(_DEBUG)
	for (int i = 0; i < numConstraints; i++)
		cs[i].getBatchIdx() = -1;
#endif

	for (int i = 0; i < numConstraints; i++)
		idxSrc[i] = i;

	int numValidConstraints = 0;
	int unprocessedConstraintIndex = 0;

	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");

		while (numValidConstraints < numConstraints)
		{
			numIter++;
			int nCurrentBatch = 0;
			// clear flag
			for (int i = 0; i < curBodyUsed; i++)
				bodyUsed2[i] = 0;
			curBodyUsed = 0;

			for (int i = numValidConstraints; i < numConstraints; i++)
			{
				int idx = idxSrc[i];
				b3Assert(idx < numConstraints);
				// check if it can go
				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
				int bodyA = abs(bodyAS);
				int bodyB = abs(bodyBS);
				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
				int aUnavailable = 0;
				int bUnavailable = 0;
				if (!aIsStatic)
				{
					for (int j = 0; j < curBodyUsed; j++)
					{
						if (bodyA == bodyUsed2[j])
						{
							aUnavailable = 1;
							break;
						}
					}
				}
				if (!aUnavailable)
					if (!bIsStatic)
					{
						for (int j = 0; j < curBodyUsed; j++)
						{
							if (bodyB == bodyUsed2[j])
							{
								bUnavailable = 1;
								break;
							}
						}
					}

				if (aUnavailable == 0 && bUnavailable == 0)  // ok
				{
					if (!aIsStatic)
					{
						bodyUsed2[curBodyUsed++] = bodyA;
					}
					if (!bIsStatic)
					{
						bodyUsed2[curBodyUsed++] = bodyB;
					}

					cs[idx].getBatchIdx() = batchIdx;
					m_data->m_sortData[idx].m_key = batchIdx;
					m_data->m_sortData[idx].m_value = idx;

					if (i != numValidConstraints)
					{
						b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
					}

					numValidConstraints++;
					{
						nCurrentBatch++;
						if (nCurrentBatch == simdWidth)
						{
							nCurrentBatch = 0;
							// batch is full: release all bodies for the next sub-batch
							for (int j = 0; j < curBodyUsed; j++)
								bodyUsed2[j] = 0;
							curBodyUsed = 0;
						}
					}
				}
			}
			batchIdx++;
		}
	}
	{
		B3_PROFILE("quickSort");
		//m_data->m_sortData.quickSort(sortfnc);
	}

	{
		B3_PROFILE("reorder");
		// reorder
		memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);

		for (int i = 0; i < numConstraints; i++)
		{
			b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
			int idx = m_data->m_sortData[idxSrc[i]].m_value;
			cs[i] = m_data->m_old[idx];
		}
	}

#if defined(_DEBUG)
	//		debugPrintf( "nBatches: %d\n", batchIdx );
	for (int i = 0; i < numConstraints; i++)
	{
		b3Assert(cs[i].getBatchIdx() != -1);
	}
#endif

	return batchIdx;
}
inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
{
	B3_PROFILE("sortConstraintByBatch");
	int numIter = 0;

	sortData.resize(n);
	idxBuffer.resize(n);
	old.resize(n);

	unsigned int* idxSrc = &idxBuffer[0];
	unsigned int* idxDst = &idxBuffer[0];
	int nIdxSrc, nIdxDst;

	const int N_FLG = 256;
	const int FLG_MASK = N_FLG - 1;
	unsigned int flg[N_FLG / 32];

#if defined(_DEBUG)
	for (int i = 0; i < n; i++)
		cs[i].getBatchIdx() = -1;
#endif

	for (int i = 0; i < n; i++)
		idxSrc[i] = i;
	nIdxSrc = n;

	int batchIdx = 0;

	{
		B3_PROFILE("cpu batch innerloop");
		while (nIdxSrc)
		{
			numIter++;
			nIdxDst = 0;
			int nCurrentBatch = 0;

			// clear flag
			for (int i = 0; i < N_FLG / 32; i++)
				flg[i] = 0;

			for (int i = 0; i < nIdxSrc; i++)
			{
				int idx = idxSrc[i];
				b3Assert(idx < n);
				// check if it can go
				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;

				int bodyA = abs(bodyAS);
				int bodyB = abs(bodyBS);

				int aIdx = bodyA & FLG_MASK;
				int bIdx = bodyB & FLG_MASK;

				unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
				unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));

				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;

				//use inv_mass!
				aUnavailable = !aIsStatic ? aUnavailable : 0;
				bUnavailable = !bIsStatic ? bUnavailable : 0;

				if (aUnavailable == 0 && bUnavailable == 0)  // ok
				{
					if (!aIsStatic)
						flg[aIdx / 32] |= (1 << (aIdx & 31));
					if (!bIsStatic)
						flg[bIdx / 32] |= (1 << (bIdx & 31));

					cs[idx].getBatchIdx() = batchIdx;
					sortData[idx].m_key = batchIdx;
					sortData[idx].m_value = idx;

					{
						nCurrentBatch++;
						if (nCurrentBatch == simdWidth)
						{
							nCurrentBatch = 0;
							for (int j = 0; j < N_FLG / 32; j++)
								flg[j] = 0;
						}
					}
				}
				else
				{
					idxDst[nIdxDst++] = idx;
				}
			}
			b3Swap(idxSrc, idxDst);
			b3Swap(nIdxSrc, nIdxDst);
			batchIdx++;
		}
	}
	{
		B3_PROFILE("quickSort");
		sortData.quickSort(sortfnc);
	}
	{
		B3_PROFILE("reorder");
		// reorder
		memcpy(&old[0], cs, sizeof(b3Contact4) * n);
		for (int i = 0; i < n; i++)
		{
			int idx = sortData[i].m_value;
			cs[i] = old[idx];
		}
	}

#if defined(_DEBUG)
	//		debugPrintf( "nBatches: %d\n", batchIdx );
	for (int i = 0; i < n; i++)
	{
		b3Assert(cs[i].getBatchIdx() != -1);
	}
#endif

	return batchIdx;
}
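// All three sortConstraintByBatch variants implement the same greedy
// independent-set batching: scan the unassigned constraints, admit one whose
// bodies are untouched in the current batch, and close the batch once it holds
// simdWidth constraints. A stripped-down sketch of the idea with a plain
// two-body constraint; SimpleConstraint and batchConstraints are illustrative
// names, not Bullet API, and static bodies are ignored here for brevity.
#include <algorithm>
#include <vector>

struct SimpleConstraint
{
	int bodyA;
	int bodyB;
	int batchId;
};

// Constraints within one batch never share a body, so each batch can be
// solved by simdWidth parallel lanes without write conflicts.
int batchConstraints(std::vector<SimpleConstraint>& cs, int numBodies, int simdWidth)
{
	std::vector<bool> bodyUsed(numBodies, false);
	int numAssigned = 0;
	int batchIdx = 0;

	while (numAssigned < (int)cs.size())
	{
		std::fill(bodyUsed.begin(), bodyUsed.end(), false);
		int batchCount = 0;

		for (int i = numAssigned; i < (int)cs.size(); i++)
		{
			if (bodyUsed[cs[i].bodyA] || bodyUsed[cs[i].bodyB])
				continue;  // conflicts with the current batch; retry next pass

			bodyUsed[cs[i].bodyA] = true;
			bodyUsed[cs[i].bodyB] = true;
			cs[i].batchId = batchIdx;
			std::swap(cs[i], cs[numAssigned++]);  // compact assigned constraints to the front

			if (++batchCount == simdWidth)
			{
				// batch is full: release all bodies and start a fresh sub-batch
				std::fill(bodyUsed.begin(), bodyUsed.end(), false);
				batchCount = 0;
			}
		}
		batchIdx++;
	}
	return batchIdx;
}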
void* b3HashedOverlappingPairCache::removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher)
{
	b3g_removePairs++;
	if (proxy0 > proxy1)
		b3Swap(proxy0, proxy1);
	int proxyId1 = proxy0;
	int proxyId2 = proxy1;

	/*if (proxyId1 > proxyId2)
		b3Swap(proxyId1, proxyId2);*/

	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));

	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
	if (pair == NULL)
	{
		return 0;
	}

	cleanOverlappingPair(*pair, dispatcher);

	int pairIndex = int(pair - &m_overlappingPairArray[0]);
	b3Assert(pairIndex < m_overlappingPairArray.size());

	// Remove the pair from the hash table.
	int index = m_hashTable[hash];
	b3Assert(index != B3_NULL_PAIR);

	int previous = B3_NULL_PAIR;
	while (index != pairIndex)
	{
		previous = index;
		index = m_next[index];
	}

	if (previous != B3_NULL_PAIR)
	{
		b3Assert(m_next[previous] == pairIndex);
		m_next[previous] = m_next[pairIndex];
	}
	else
	{
		m_hashTable[hash] = m_next[pairIndex];
	}

	// We now move the last pair into spot of the
	// pair being removed. We need to fix the hash
	// table indices to support the move.

	int lastPairIndex = m_overlappingPairArray.size() - 1;

	//if (m_ghostPairCallback)
	//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);

	// If the removed pair is the last pair, we are done.
	if (lastPairIndex == pairIndex)
	{
		m_overlappingPairArray.pop_back();
		return 0;
	}

	// Remove the last pair from the hash table.
	const b3BroadphasePair* last = &m_overlappingPairArray[lastPairIndex];
	/* missing swap here too, Nat. */
	int lastHash = static_cast<int>(getHash(static_cast<unsigned int>(last->x), static_cast<unsigned int>(last->y)) & (m_overlappingPairArray.capacity() - 1));

	index = m_hashTable[lastHash];
	b3Assert(index != B3_NULL_PAIR);

	previous = B3_NULL_PAIR;
	while (index != lastPairIndex)
	{
		previous = index;
		index = m_next[index];
	}

	if (previous != B3_NULL_PAIR)
	{
		b3Assert(m_next[previous] == lastPairIndex);
		m_next[previous] = m_next[lastPairIndex];
	}
	else
	{
		m_hashTable[lastHash] = m_next[lastPairIndex];
	}

	// Copy the last pair into the removed pair's spot.
	m_overlappingPairArray[pairIndex] = m_overlappingPairArray[lastPairIndex];

	// Insert the last pair into the hash table.
	m_next[pairIndex] = m_hashTable[lastHash];
	m_hashTable[lastHash] = pairIndex;

	m_overlappingPairArray.pop_back();

	return 0;
}
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1 << BITS_PER_PASS);

	int tables[NUM_TABLES];
	int counter[NUM_TABLES];

	b3SortData* src = &inout[0];
	b3AlignedObjectArray<b3SortData> workbuffer;
	workbuffer.resize(inout.size());
	b3SortData* dst = &workbuffer[0];

	int count = 0;
	for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
	{
		for (int i = 0; i < NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n", NUM_TABLES);
		for (int i = 0; i < NUM_TABLES; i++)
		{
			if (tables[i] != 0)
			{
				printf("tables[%d]=%d\n", i, tables[i]);
			}
		}
#endif  //TEST

		// prefix scan
		int sum = 0;
		for (int i = 0; i < NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
			counter[i] = 0;
		}

		// distribute
		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
			counter[tableIdx]++;
		}

		b3Swap(src, dst);
		count++;
	}

	if (count & 1)
	{
		// odd number of passes: the sorted result lives in the work buffer,
		// so copy it back into the caller's array
		memcpy(&inout[0], src, sizeof(b3SortData) * n);
	}
}
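// Each pass of executeHost above is a counting sort on one 8-bit digit:
// histogram, exclusive prefix scan, then a stable distribution. A self-contained
// sketch of a single pass, using a minimal key/value pair instead of Bullet's
// b3SortData; countingSortPass and KV are illustrative names, not Bullet API.
#include <cstdint>
#include <vector>

struct KV
{
	uint32_t key;
	uint32_t value;
};

// One stable counting-sort pass over the 8-bit digit starting at 'startBit'.
// Stability is what makes chaining passes from low bits to high bits a
// correct radix sort.
void countingSortPass(const std::vector<KV>& src, std::vector<KV>& dst, int startBit)
{
	const int NUM_TABLES = 256;
	int tables[NUM_TABLES] = {0};

	for (const KV& e : src)  // histogram of this digit
		tables[(e.key >> startBit) & (NUM_TABLES - 1)]++;

	int sum = 0;  // exclusive prefix scan: tables[d] = first output slot for digit d
	for (int i = 0; i < NUM_TABLES; i++)
	{
		int c = tables[i];
		tables[i] = sum;
		sum += c;
	}

	dst.resize(src.size());
	for (const KV& e : src)  // stable distribution into dst
		dst[tables[(e.key >> startBit) & (NUM_TABLES - 1)]++] = e;
}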
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	int originalSize = keyValuesInOut.size();
	int workingSize = originalSize;

	int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
	b3AlignedObjectArray<b3SortData> test2;
	keyValuesInOut.copyToHost(test2);
	printf("numElem = %d\n", test2.size());
	for (int i = 0; i < test2.size(); i++)
	{
		printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
		printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
	}
#endif  //DEBUG_RADIXSORT2

	b3OpenCLArray<b3SortData>* src = 0;

	if (workingSize % dataAlignment)
	{
		workingSize += dataAlignment - (workingSize % dataAlignment);
		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
		m_workBuffer4->resize(workingSize);
		b3SortData fillValue;
		fillValue.m_key = 0xffffffff;
		fillValue.m_value = 0xffffffff;

#define USE_BTFILL
#ifdef USE_BTFILL
		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
		for (int i = originalSize; i < workingSize; i++)
		{
			m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
		}
#endif  //USE_BTFILL

		src = m_workBuffer4;
	}
	else
	{
		src = &keyValuesInOut;
		m_workBuffer4->resize(0);
	}

	b3Assert(workingSize % DATA_ALIGNMENT == 0);
	int minCap = NUM_BUCKET * NUM_WGS;

	int n = workingSize;

	m_workBuffer1->resize(minCap);
	m_workBuffer3->resize(workingSize);

	//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	b3Assert(BITS_PER_PASS == 4);
	b3Assert(WG_SIZE == 64);
	b3Assert((sortBits & 0x3) == 0);

	b3OpenCLArray<b3SortData>* dst = m_workBuffer3;

	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;

	int nWGs = NUM_WGS;
	b3ConstData cdata;

	{
		int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
		int nBlocks = (n + blockSize - 1) / (blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
		if (nBlocks < NUM_WGS)
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	int count = 0;
	for (int ib = 0; ib < sortBits; ib += 4)
	{
#ifdef DEBUG_RADIXSORT2
		keyValuesInOut.copyToHost(test2);
		printf("numElem = %d\n", test2.size());
		for (int i = 0; i < test2.size(); i++)
		{
			if (test2[i].m_key != test2[i].m_value)
			{
				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
			}
		}
#endif  //DEBUG_RADIXSORT2

		cdata.m_startBit = ib;

		if (src->size())
		{
			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);

			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
			launcher.setConst(cdata);

			int num = NUM_WGS * WG_SIZE;
			launcher.launch1D(num, WG_SIZE);
		}

#ifdef DEBUG_RADIXSORT
		b3AlignedObjectArray<unsigned int> testHist;
		srcHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
		for (int i = 0; i < testHist.size(); i++)
		{
			if (testHist[i] != 0)
				printf("testHist[%d]=%d\n", i, testHist[i]);
		}
#endif  //DEBUG_RADIXSORT

		//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
		bool fastScan = !m_deviceCPU;  //only use fast scan on GPU
#else
		bool fastScan = false;
#endif

		if (fastScan)
		{
			// prefix scan group histogram
			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
			b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel);
			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
			launcher.setConst(cdata);
			launcher.launch1D(128, 128);
			destHisto = srcHisto;
		}
		else
		{
			//unsigned int sum; //for debugging
			m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
		}

#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
		for (int i = 0; i < testHist.size(); i++)
		{
			if (testHist[i] != 0)
				printf("testHist[%d]=%d\n", i, testHist[i]);
		}

		for (int i = 0; i < testHist.size(); i += NUM_WGS)
		{
			printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
		}
#endif  //DEBUG_RADIXSORT

#define USE_GPU
#ifdef USE_GPU
		if (src->size())
		{
			// local sort and distribute
			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
			b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel);
			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
			launcher.setConst(cdata);
			launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
		}
#else
		{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
			int tables[NUM_TABLES];
			int startBit = ib;

			destHisto->copyToHost(testHist);
			b3AlignedObjectArray<b3SortData> srcHost;
			b3AlignedObjectArray<b3SortData> dstHost;
			dstHost.resize(src->size());

			src->copyToHost(srcHost);

			for (int i = 0; i < NUM_TABLES; i++)
			{
				tables[i] = testHist[i * NUM_WGS];
			}

			// distribute
			for (int i = 0; i < n; i++)
			{
				int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

				dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
				counter2[tableIdx]++;
			}
#else
			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
			int tables[NUM_TABLES];
			b3AlignedObjectArray<b3SortData> dstHostOK;
			dstHostOK.resize(src->size());

			destHisto->copyToHost(testHist);
			b3AlignedObjectArray<b3SortData> srcHost;
			src->copyToHost(srcHost);

			int blockSize = 256;
			int nBlocksPerWG = cdata.m_nBlocksPerWG;
			int startBit = ib;

			{
				for (int i = 0; i < NUM_TABLES; i++)
				{
					tables[i] = testHist[i * NUM_WGS];
				}

				// distribute
				for (int i = 0; i < n; i++)
				{
					int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

					dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
					counter2[tableIdx]++;
				}
			}

			b3AlignedObjectArray<b3SortData> dstHost;
			dstHost.resize(src->size());

			int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

			for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
			{
				int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

				int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;

				for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
				{
					for (int lIdx = 0; lIdx < 64; lIdx++)
					{
						int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;

						//	MY_HISTOGRAM( localKeys.x ) ++ is much more expensive than atomic add as it requires read and write while atomics can just add on AMD
						//	Using registers didn't perform well. It seems like using localKeys to address requires a lot of ALU ops
						//	AMD: AtomInc performs better while NV prefers ++
						for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
						{
							if (addr + j < n)
							{
								//	printf ("addr+j=%d\n", addr+j);
								int i = addr + j;
								int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

								int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];

								b3SortData ok = dstHostOK[destIndex];

								if (ok.m_key != srcHost[i].m_key)
								{
									printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
									printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
								}

								if (ok.m_value != srcHost[i].m_value)
								{
									printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
									printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
								}

								dstHost[destIndex] = srcHost[i];
								counter[tableIdx]++;
							}
						}
					}
				}
			}

#endif  //SEQUENTIAL

			dst->copyFromHost(dstHost);
		}
#endif  //USE_GPU

#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
		for (int i = 0; i < testHist.size(); i++)
		{
			if (testHist[i] != 0)
				printf("testHist[%d]=%d\n", i, testHist[i]);
		}
#endif  //DEBUG_RADIXSORT

		b3Swap(src, dst);
		b3Swap(srcHisto, destHisto);

#ifdef DEBUG_RADIXSORT2
		keyValuesInOut.copyToHost(test2);
		printf("numElem = %d\n", test2.size());
		for (int i = 0; i < test2.size(); i++)
		{
			if (test2[i].m_key != test2[i].m_value)
			{
				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
			}
		}
#endif  //DEBUG_RADIXSORT2

		count++;
	}
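// The pass loop above ping-pongs src/dst and srcHisto/destHisto with b3Swap
// instead of copying between buffers. A minimal sketch of the pattern with
// plain pointers; pingPongPasses is an illustrative name, not Bullet API.
// After an odd number of passes the result lives in the work buffer, so a
// final copy back is needed -- hence the 'count' bookkeeping in execute()
// and executeHost().
#include <utility>  // std::swap

// Each pass reads from 'src' and writes to 'dst'; swapping the pointers lets
// the next pass consume the fresh output without moving any data.
void pingPongPasses(int* bufA, int* bufB, int n, int numPasses)
{
	int* src = bufA;
	int* dst = bufB;
	for (int pass = 0; pass < numPasses; pass++)
	{
		for (int i = 0; i < n; i++)
			dst[i] = src[i] + 1;  // stand-in for one real sort pass
		std::swap(src, dst);
	}
	if (src != bufA)  // odd pass count: copy the result back into the caller's buffer
		for (int i = 0; i < n; i++)
			bufA[i] = src[i];
}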