///Read three small, possibly unaligned values from main memory in one call.
///Each value is DMA'd into a 16-byte aligned staging buffer at the same low-4-bit
///offset as its effective address (required by cellDmaSmallGet), then copied
///byte-wise into the caller's local-store destination.
SIMD_FORCE_INLINE void small_cache_read_triple(void* ls0, ppu_address_t ea0,
											   void* ls1, ppu_address_t ea1,
											   void* ls2, ppu_address_t ea2,
											   size_t size)
{
	btAssert(size < 16);
	ATTRIBUTE_ALIGNED16(char tmpBuffer0[32]);
	ATTRIBUTE_ALIGNED16(char tmpBuffer1[32]);
	ATTRIBUTE_ALIGNED16(char tmpBuffer2[32]);

	uint32_t i;

	///make sure the last 4 bits are the same, for cellDmaSmallGet
	char* localStore0 = (char*)ls0;
	uint32_t last4BitsOffset = ea0 & 0x0f;
	char* tmpTarget0 = tmpBuffer0 + last4BitsOffset;
#ifdef __SPU__
	cellDmaSmallGet(tmpTarget0, ea0, size, DMA_TAG(1), 0, 0);
#else
	tmpTarget0 = (char*)cellDmaSmallGetReadOnly(tmpTarget0, ea0, size, DMA_TAG(1), 0, 0);
#endif

	char* localStore1 = (char*)ls1;
	last4BitsOffset = ea1 & 0x0f;
	char* tmpTarget1 = tmpBuffer1 + last4BitsOffset;
#ifdef __SPU__
	cellDmaSmallGet(tmpTarget1, ea1, size, DMA_TAG(1), 0, 0);
#else
	tmpTarget1 = (char*)cellDmaSmallGetReadOnly(tmpTarget1, ea1, size, DMA_TAG(1), 0, 0);
#endif

	char* localStore2 = (char*)ls2;
	last4BitsOffset = ea2 & 0x0f;
	char* tmpTarget2 = tmpBuffer2 + last4BitsOffset;
#ifdef __SPU__
	cellDmaSmallGet(tmpTarget2, ea2, size, DMA_TAG(1), 0, 0);
#else
	tmpTarget2 = (char*)cellDmaSmallGetReadOnly(tmpTarget2, ea2, size, DMA_TAG(1), 0, 0);
#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i = 0; btLikely(i < size); i++)
	{
		localStore0[i] = tmpTarget0[i];
		localStore1[i] = tmpTarget1[i];
		localStore2[i] = tmpTarget2[i];
	}
}
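///Usage sketch (illustrative, not part of the original task code): fetch the three
///16-bit indices of one triangle, starting at a hypothetical main-memory address
///`indexEa`. small_cache_read_triple already waits on DMA_TAG(1), so the indices
///are valid as soon as the call returns.
SIMD_FORCE_INLINE void example_fetch_triangle_indices(ppu_address_t indexEa, int outIndices[3])
{
	ATTRIBUTE_ALIGNED16(short int tmpIndices[3]);

	small_cache_read_triple(&tmpIndices[0], indexEa,
							&tmpIndices[1], indexEa + sizeof(short int),
							&tmpIndices[2], indexEa + 2 * sizeof(short int),
							sizeof(short int));

	outIndices[0] = int(tmpIndices[0]);
	outIndices[1] = int(tmpIndices[1]);
	outIndices[2] = int(tmpIndices[2]);
}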
///This unaligned DMA should not be used frequently, and only for small data.
///It handles alignment for cellDmaSmallGet and asserts that the size stays below 32 bytes.
int stallingUnalignedDmaSmallGet(void* ls, uint64_t ea, uint32_t size)
{
	btAssert(size < 32);
	ATTRIBUTE_ALIGNED16(char tmpBuffer[32]);

	char* mainMem = (char*)ea;
	char* localStore = (char*)ls;
	uint32_t i;

	///make sure the last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;

#if defined (__SPU__) || defined (USE_LIBSPE2)
	int remainingSize = size;

//#define FORCE_cellDmaUnalignedGet 1
#ifdef FORCE_cellDmaUnalignedGet
	cellDmaUnalignedGet(tmpTarget, ea, size, DMA_TAG(1), 0, 0);
#else
	///split the transfer into naturally sized chunks (16, 8, 4, 2 or 1 bytes)
	char* remainingTmpTarget = tmpTarget;
	uint64_t remainingEa = ea;

	while (remainingSize)
	{
		switch (remainingSize)
		{
		case 1:
		case 2:
		case 4:
		case 8:
		case 16:
			{
				mfc_get(remainingTmpTarget, remainingEa, remainingSize, DMA_TAG(1), 0, 0);
				remainingSize = 0;
				break;
			}
		default:
			{
				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
				int actualSize = 0;

				if (remainingSize > 16)
					actualSize = 16;
				else if (remainingSize > 8)
					actualSize = 8;
				else if (remainingSize > 4)
					actualSize = 4;
				else if (remainingSize > 2)
					actualSize = 2;

				mfc_get(remainingTmpTarget, remainingEa, actualSize, DMA_TAG(1), 0, 0);
				remainingSize -= actualSize;
				remainingTmpTarget += actualSize;
				remainingEa += actualSize;
			}
		}
	}
#endif //FORCE_cellDmaUnalignedGet

#else
	//not on SPU: emulate the DMA by copying straight into the staging buffer
#ifdef USE_MEMCPY
	memcpy(tmpTarget, mainMem, size);
#else
	for (i = 0; i < size; i++)
	{
		tmpTarget[i] = mainMem[i];
	}
#endif //USE_MEMCPY
#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i = 0; btLikely(i < size); i++)
	{
		localStore[i] = tmpTarget[i];
	}

	return 0;
}
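///Usage sketch (illustrative): pull a small, possibly unaligned structure from main
///memory into local store. `ExampleHeader` and `headerEa` are hypothetical names; the
///only requirement is that the transfer stays below 32 bytes. The call stalls on
///DMA_MASK(1) internally, so no extra synchronization is needed afterwards.
struct ExampleHeader
{
	int   m_type;
	float m_length;
};

SIMD_FORCE_INLINE void example_fetch_header(uint64_t headerEa, ExampleHeader* outHeader)
{
	stallingUnalignedDmaSmallGet(outHeader, headerEa, sizeof(ExampleHeader));
}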
virtual void processNode(int subPart, int triangleIndex)
{
	///Create a triangle on the stack, call process collision, with GJK
	///DMA the vertices, can benefit from software caching
	//	spu_printf("processNode with triangleIndex %d\n",triangleIndex);

	// ugly solution to support both 16-bit and 32-bit indices
	if (m_lsMemPtr->bvhShapeData.gIndexMesh.m_indexType == PHY_SHORT)
	{
		short int* indexBasePtr = (short int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase + triangleIndex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
		ATTRIBUTE_ALIGNED16(short int tmpIndices[3]);

		small_cache_read_triple(&tmpIndices[0], (ppu_address_t)&indexBasePtr[0],
								&tmpIndices[1], (ppu_address_t)&indexBasePtr[1],
								&tmpIndices[2], (ppu_address_t)&indexBasePtr[2],
								sizeof(short int));

		m_lsMemPtr->spuIndices[0] = int(tmpIndices[0]);
		m_lsMemPtr->spuIndices[1] = int(tmpIndices[1]);
		m_lsMemPtr->spuIndices[2] = int(tmpIndices[2]);
	}
	else
	{
		int* indexBasePtr = (int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase + triangleIndex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);

		small_cache_read_triple(&m_lsMemPtr->spuIndices[0], (ppu_address_t)&indexBasePtr[0],
								&m_lsMemPtr->spuIndices[1], (ppu_address_t)&indexBasePtr[1],
								&m_lsMemPtr->spuIndices[2], (ppu_address_t)&indexBasePtr[2],
								sizeof(int));
	}

	//printf("%d %d %d\n", m_lsMemPtr->spuIndices[0], m_lsMemPtr->spuIndices[1], m_lsMemPtr->spuIndices[2]);
	//	spu_printf("SPU index0=%d ,",spuIndices[0]);
	//	spu_printf("SPU index1=%d ,",spuIndices[1]);
	//	spu_printf("SPU index2=%d ,",spuIndices[2]);
	//	spu_printf("SPU: indexBasePtr=%llx\n",indexBasePtr);

	const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();

	for (int j = 2; btLikely(j >= 0); j--)
	{
		int graphicsindex = m_lsMemPtr->spuIndices[j];
		//spu_printf("SPU index=%d ,",graphicsindex);
		btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase + graphicsindex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
		//	spu_printf("SPU graphicsbasePtr=%llx\n",graphicsbasePtr);

		///handle un-aligned vertices...
		//another DMA for each vertex
		small_cache_read_triple(&spuUnscaledVertex[0], (ppu_address_t)&graphicsbasePtr[0],
								&spuUnscaledVertex[1], (ppu_address_t)&graphicsbasePtr[1],
								&spuUnscaledVertex[2], (ppu_address_t)&graphicsbasePtr[2],
								sizeof(btScalar));

		//printf("%f %f %f\n", spuUnscaledVertex[0],spuUnscaledVertex[1],spuUnscaledVertex[2]);
		spuTriangleVertices[j] = btVector3(spuUnscaledVertex[0] * meshScaling.getX(),
										   spuUnscaledVertex[1] * meshScaling.getY(),
										   spuUnscaledVertex[2] * meshScaling.getZ());

		//spu_printf("SPU:triangle vertices:%f,%f,%f\n",spuTriangleVertices[j].x(),spuTriangleVertices[j].y(),spuTriangleVertices[j].z());
	}

	RaycastGatheredObjectData triangleGatheredObjectData(*m_gatheredObjectData);
	triangleGatheredObjectData.m_shapeType = TRIANGLE_SHAPE_PROXYTYPE;
	triangleGatheredObjectData.m_spuCollisionShape = &spuTriangleVertices[0];

	//printf("%f %f %f\n", spuTriangleVertices[0][0],spuTriangleVertices[0][1],spuTriangleVertices[0][2]);
	//printf("%f %f %f\n", spuTriangleVertices[1][0],spuTriangleVertices[1][1],spuTriangleVertices[1][2]);
	//printf("%f %f %f\n", spuTriangleVertices[2][0],spuTriangleVertices[2][1],spuTriangleVertices[2][2]);

	for (int i = 0; i < m_numWorkUnits; i++)
	{
		SpuRaycastTaskWorkUnitOut out;
		out.hitFraction = 1.0;

		performRaycastAgainstConvex(&triangleGatheredObjectData, m_workUnits[i], &out, m_lsMemPtr);

		/* XXX: For now only take the closest hit */
		if (out.hitFraction < m_workUnitsOut[i].hitFraction)
		{
			m_workUnitsOut[i].hitFraction = out.hitFraction;
			m_workUnitsOut[i].hitNormal = out.hitNormal;
		}
	}
}
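///Follow-up sketch (illustrative): a hitFraction of 1.0 means "no hit"; anything
///smaller parameterizes the closest intersection along the ray segment. Assuming the
///work unit exposes rayFrom/rayTo as used elsewhere in this raycast task, the hit
///point can be reconstructed from the work-unit output like this:
SIMD_FORCE_INLINE bool example_resolve_hit(const SpuRaycastTaskWorkUnit& workUnit,
										   const SpuRaycastTaskWorkUnitOut& workUnitOut,
										   btVector3& hitPointOut)
{
	if (workUnitOut.hitFraction >= btScalar(1.0))
		return false;  //ray reached rayTo without hitting the mesh

	hitPointOut.setInterpolate3(workUnit.rayFrom, workUnit.rayTo, workUnitOut.hitFraction);
	return true;       //hitPointOut and workUnitOut.hitNormal describe the nearest hit
}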
virtual void processNode(int subPart, int triangleIndex)
{
	///Create a triangle on the stack, call process collision, with GJK
	///DMA the vertices, can benefit from software caching
	//	spu_printf("processNode with triangleIndex %d\n",triangleIndex);

	if (m_lsMemPtr->bvhShapeData.gIndexMesh.m_indexType == PHY_SHORT)
	{
		unsigned short int* indexBasePtr = (unsigned short int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase + triangleIndex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
		ATTRIBUTE_ALIGNED16(unsigned short int tmpIndices[3]);

		small_cache_read_triple(&tmpIndices[0], (ppu_address_t)&indexBasePtr[0],
								&tmpIndices[1], (ppu_address_t)&indexBasePtr[1],
								&tmpIndices[2], (ppu_address_t)&indexBasePtr[2],
								sizeof(unsigned short int));

		m_lsMemPtr->spuIndices[0] = int(tmpIndices[0]);
		m_lsMemPtr->spuIndices[1] = int(tmpIndices[1]);
		m_lsMemPtr->spuIndices[2] = int(tmpIndices[2]);
	}
	else
	{
		unsigned int* indexBasePtr = (unsigned int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase + triangleIndex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);

		small_cache_read_triple(&m_lsMemPtr->spuIndices[0], (ppu_address_t)&indexBasePtr[0],
								&m_lsMemPtr->spuIndices[1], (ppu_address_t)&indexBasePtr[1],
								&m_lsMemPtr->spuIndices[2], (ppu_address_t)&indexBasePtr[2],
								sizeof(int));
	}

	//	spu_printf("SPU index0=%d ,",spuIndices[0]);
	//	spu_printf("SPU index1=%d ,",spuIndices[1]);
	//	spu_printf("SPU index2=%d ,",spuIndices[2]);
	//	spu_printf("SPU: indexBasePtr=%llx\n",indexBasePtr);

	const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();

	for (int j = 2; btLikely(j >= 0); j--)
	{
		int graphicsindex = m_lsMemPtr->spuIndices[j];
		//	spu_printf("SPU index=%d ,",graphicsindex);
		btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase + graphicsindex * m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
		//	spu_printf("SPU graphicsbasePtr=%llx\n",graphicsbasePtr);

		///handle un-aligned vertices...
		//another DMA for each vertex
		small_cache_read_triple(&spuUnscaledVertex[0], (ppu_address_t)&graphicsbasePtr[0],
								&spuUnscaledVertex[1], (ppu_address_t)&graphicsbasePtr[1],
								&spuUnscaledVertex[2], (ppu_address_t)&graphicsbasePtr[2],
								sizeof(btScalar));

		m_tmpTriangleShape.getVertexPtr(j).setValue(spuUnscaledVertex[0] * meshScaling.getX(),
													spuUnscaledVertex[1] * meshScaling.getY(),
													spuUnscaledVertex[2] * meshScaling.getZ());

		//	spu_printf("SPU:triangle vertices:%f,%f,%f\n",spuTriangleVertices[j].x(),spuTriangleVertices[j].y(),spuTriangleVertices[j].z());
	}

	SpuCollisionPairInput triangleConcaveInput(*m_wuInput);
	//	triangleConcaveInput.m_spuCollisionShapes[1] = &spuTriangleVertices[0];
	triangleConcaveInput.m_spuCollisionShapes[1] = &m_tmpTriangleShape;
	triangleConcaveInput.m_shapeType1 = TRIANGLE_SHAPE_PROXYTYPE;

	m_spuContacts.setShapeIdentifiersB(subPart, triangleIndex);

	//	m_spuContacts.flush();
	ProcessSpuConvexConvexCollision(&triangleConcaveInput, m_lsMemPtr, m_spuContacts);

	///this flush should be automatic
	//	m_spuContacts.flush();
}
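///Sketch (illustrative, PPU side): the (subPart, triangleIndex) pair recorded via
///setShapeIdentifiersB above is assumed to end up in the contact points, mirroring
///btManifoldResult's convention, so the triangle that produced a contact can be
///recovered later from the persistent manifold (requires btPersistentManifold.h).
void example_identify_triangles(const btPersistentManifold* manifold)
{
	for (int p = 0; p < manifold->getNumContacts(); p++)
	{
		const btManifoldPoint& pt = manifold->getContactPoint(p);
		int subPart       = pt.m_partId1;  //subPart passed to processNode
		int triangleIndex = pt.m_index1;   //triangleIndex passed to processNode
		//... look the triangle up in the original striding mesh interface ...
		(void)subPart;
		(void)triangleIndex;
	}
}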