void GatherCollisionObjectAndShapeData (RaycastGatheredObjectData* gatheredObjectData, RaycastTask_LocalStoreMemory* lsMemPtr, ppu_address_t objectWrapper) { register int dmaSize; register ppu_address_t dmaPpuAddress2; /* DMA Collision object wrapper into local store */ dmaSize = sizeof(SpuCollisionObjectWrapper); dmaPpuAddress2 = objectWrapper; cellDmaGet(&lsMemPtr->gCollisionObjectWrapper, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); /* DMA Collision object into local store */ dmaSize = sizeof(btCollisionObject); dmaPpuAddress2 = lsMemPtr->getCollisionObjectWrapper()->getCollisionObjectPtr(); cellDmaGet(&lsMemPtr->gColObj, dmaPpuAddress2 , dmaSize, DMA_TAG(2), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(2)); /* Gather information about collision object and shape */ gatheredObjectData->m_worldTransform = lsMemPtr->getColObj()->getWorldTransform(); gatheredObjectData->m_collisionMargin = lsMemPtr->getCollisionObjectWrapper()->getCollisionMargin (); gatheredObjectData->m_shapeType = lsMemPtr->getCollisionObjectWrapper()->getShapeType (); gatheredObjectData->m_collisionShape = (ppu_address_t)lsMemPtr->getColObj()->getCollisionShape(); gatheredObjectData->m_spuCollisionShape = (void*)&lsMemPtr->gCollisionShape.collisionShape; /* DMA shape data */ dmaCollisionShape (gatheredObjectData->m_spuCollisionShape, gatheredObjectData->m_collisionShape, 1, gatheredObjectData->m_shapeType); cellDmaWaitTagStatusAll(DMA_MASK(1)); if (btBroadphaseProxy::isConvex (gatheredObjectData->m_shapeType)) { btConvexInternalShape* spuConvexShape = (btConvexInternalShape*)gatheredObjectData->m_spuCollisionShape; gatheredObjectData->m_primitiveDimensions = spuConvexShape->getImplicitShapeDimensions (); } else { gatheredObjectData->m_primitiveDimensions = btVector3(1.0, 1.0, 1.0); } }
void processDecodeSet(unsigned int uiPtr) { SpursSpeexTaskOutput spuOutput; cellDmaGet(&gviSpursSpeexTaskDesc, uiPtr, sizeof(SpursSpeexTaskDesc), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] CMD_SAMPLE_TASK_DECODESET_COMMAND\n"); if (gviSpursSpeexTaskDesc.mDebugPause) { snPause(); } cellDmaLargeGet(gviSpursSpeexStateBuffer, (uint64_t)gviSpursSpeexTaskDesc.mSpeexStateBuffer, SPEEX_DECODER_STATE_BUFFER_SIZE, DMA_TAG(1), 0,0); cellDmaWaitTagStatusAll(DMA_MASK(1)); gviSpursSpeexDecodeSet(&spuOutput); if (spuOutput.mSpeexReturnCode < 0) { spuDebugPrintf("SPU: failed to encode, ret = %d\n", spuOutput.mSpeexReturnCode); } cellDmaPut(&spuOutput, (uint64_t)gviSpursSpeexTaskDesc.mSpeexTaskOutput, sizeof(SpursSpeexTaskOutput), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); cellDmaLargePut(gviSpursSpeexStateBuffer, (uint64_t)gviSpursSpeexTaskDesc.mSpeexStateBuffer, SPEEX_DECODER_STATE_BUFFER_SIZE, DMA_TAG(1), 0,0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] buffer dma done\n"); }
void processDecodeInit(unsigned int uiPtr) { SpursSpeexTaskOutput spuOutput; cellDmaGet(&gviSpursSpeexTaskDesc, uiPtr, sizeof(SpursSpeexTaskDesc), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] CMD_SAMPLE_TASK_DECODE_INIT_COMMAND\n"); if (gviSpursSpeexTaskDesc.mDebugPause) { snPause(); } gviSpursSpeexDecoderInitialize(&spuOutput); if (spuOutput.mSpeexReturnCode < 0) { spuDebugPrintf("[Speex][SPU] failed to initialize decoder, ret = %d\n", spuOutput.mSpeexReturnCode); } cellDmaPut(&spuOutput, (uint64_t)gviSpursSpeexTaskDesc.mSpeexTaskOutput, sizeof(SpursSpeexTaskOutput), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); cellDmaLargePut(gviSpursSpeexStateBuffer, (uint64_t)gviSpursSpeexTaskDesc.mSpeexStateBuffer, gviSpursSpeexTaskDesc.mSpeexStateBufferSize, DMA_TAG(1), 0,0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] buffer dma done\n"); }
void procesEncodeInit(unsigned int uiPtr) { SpursSpeexTaskOutput spuOutput; //spuDebugPrintf("[Speex][SPU] CMD_SAMPLE_TASK_ENCODE_INIT_COMMAND\n"); cellDmaGet(&gviSpursSpeexTaskDesc, uiPtr, sizeof(SpursSpeexTaskDesc), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); if (gviSpursSpeexTaskDesc.mDebugPause) { snPause(); } gviSpursSpeexEncoderInitialize(&spuOutput); if (spuOutput.mSpeexReturnCode < 0) { spuDebugPrintf("[Speex][SPU] failed to initialize encoder, ret = %d\n", spuOutput.mSpeexReturnCode); } //spuDebugPrintf("[Speex][SPU] done with initializing things for speex, now returning data via DMA put\n"); //printGlobalTaskDescData(); cellDmaPut(&spuOutput, (uint64_t)gviSpursSpeexTaskDesc.mSpeexTaskOutput, sizeof(SpursSpeexTaskOutput), DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] task dma done\n"); cellDmaLargePut(gviSpursSpeexStateBuffer, (uint64_t)gviSpursSpeexTaskDesc.mSpeexStateBuffer, SPEEX_ENCODER_STATE_BUFFER_SIZE, DMA_TAG(1), 0,0); cellDmaWaitTagStatusAll(DMA_MASK(1)); //spuDebugPrintf("[Speex][SPU] buffer dma done\n"); }
/**
 * Encodes one frame described by gviSpursSpeexTaskDesc.
 *
 * Input samples (shorts) are fetched from main memory, widened to float,
 * passed through speex_encode using gviSpursSpeexStateBuffer, and the bits are
 * written to a local output buffer which is then sent to the descriptor's
 * output address. Temporary buffers are 16-byte aligned heap allocations that
 * are released before returning.
 *
 * @param spuTaskOut  Result record. mSpeexOutBufferSize receives the value
 *                    returned by speex_bits_write; mSpeexInitialized is set to
 *                    1; the remaining fields touched here are set to 0.
 */
void gviSpursSpeexEncode(SpursSpeexTaskOutput *spuTaskOut)
{
    // Start from a clean result record.
    spuTaskOut->mSpeexEncodedFrameSize = 0;
    spuTaskOut->mSpeexInitialized      = 1;
    spuTaskOut->mSpeexSamplesPerFrame  = 0;
    spuTaskOut->mSpeexReturnCode       = 0;
    spuTaskOut->mSpeexOutBufferSize    = 0;

    // Scratch storage: float working copy, raw input samples, encoded output.
    const size_t floatBytes = gviSpursSpeexTaskDesc.mInputBufferSize * sizeof(float);
    const size_t pcmBytes   = gviSpursSpeexTaskDesc.mInputBufferSize * sizeof(short);

    float *floatSamples = (float *)memalign(16, floatBytes);
    short *pcmSamples   = (short *)memalign(16, pcmBytes);
    char  *encodedBytes = (char *)memalign(16, gviSpursSpeexTaskDesc.mOutputBufferSize);

    memset(floatSamples, 0, floatBytes);
    memset(pcmSamples, 0, pcmBytes);
    memset(encodedBytes, 0, gviSpursSpeexTaskDesc.mOutputBufferSize);

    // Fetch the raw samples.
    cellDmaGet(pcmSamples, (uint64_t)gviSpursSpeexTaskDesc.mInputBuffer, pcmBytes, DMA_TAG(1), 0,0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    // speex_encode is handed floats, so widen each sample.
    for (unsigned int sampleIdx = 0; sampleIdx < gviSpursSpeexTaskDesc.mInputBufferSize; ++sampleIdx)
    {
        floatSamples[sampleIdx] = pcmSamples[sampleIdx];
    }

    // Re-bind the bit packer to its static buffer and clear it.
    speex_bits_init_buffer(&gviSpursSpeexBits, gviSpursSpeexBitsBuffer, sizeof(gviSpursSpeexBitsBuffer));
    speex_bits_reset(&gviSpursSpeexBits);

    // Encode, then serialise the bits into the output buffer.
    speex_encode(gviSpursSpeexStateBuffer, floatSamples, &gviSpursSpeexBits);
    spuTaskOut->mSpeexOutBufferSize = speex_bits_write(&gviSpursSpeexBits, (char *)encodedBytes, gviSpursSpeexTaskDesc.mEncodedFrameSize);

    // Ship the whole output buffer back.
    cellDmaPut(encodedBytes, (uint64_t)gviSpursSpeexTaskDesc.mOutputBuffer, gviSpursSpeexTaskDesc.mOutputBufferSize, DMA_TAG(1), 0, 0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    free(floatSamples);
    free(pcmSamples);
    free(encodedBytes);

    spuTaskOut->mSpeexReturnCode = 0;
}
/**
 * Decodes one encoded frame described by gviSpursSpeexTaskDesc.
 *
 * The encoded bytes are fetched from the descriptor's input address, read
 * into the Speex bit reader, decoded to floats with speex_decode using
 * gviSpursSpeexStateBuffer, narrowed to shorts and sent to the descriptor's
 * output address. Temporary buffers are 16-byte aligned heap allocations that
 * are released before returning.
 *
 * @param spuTaskOut  Result record; mSpeexReturnCode is set to 0 on return.
 *                    A non-zero speex_decode result trips the assert instead.
 */
void gviSpursSpeexDecodeAdd(SpursSpeexTaskOutput *spuTaskOut)
{
    char *inBuffer;
    float *speexBuffer;
    short *outBuffer;
    int rcode;
    unsigned int i;

    // One size per buffer, used for both allocation and clearing so the two
    // can never disagree. (Previously inBuffer was cleared with twice its
    // allocated size, overrunning the heap block, and outBuffer with half.)
    const size_t speexBufferBytes = gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(float);
    const size_t outBufferBytes   = gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(short);
    const size_t inBufferBytes    = gviSpursSpeexTaskDesc.mInputBufferSize;

    //spuDebugPrintf("[Speex][SPU] allocating buffers for decoding\n");
    speexBuffer = (float *)memalign(16, speexBufferBytes);
    outBuffer = (short *)memalign(16, outBufferBytes);
    inBuffer = (char *)memalign(16, inBufferBytes);
    memset(speexBuffer, 0, speexBufferBytes);
    memset(outBuffer, 0, outBufferBytes);
    memset(inBuffer, 0, inBufferBytes);

    // Fetch the encoded frame.
    cellDmaGet(inBuffer, (uint64_t)gviSpursSpeexTaskDesc.mInputBuffer, gviSpursSpeexTaskDesc.mInputBufferSize, DMA_TAG(1), 0,0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    // (re)initialize the bits struct, then read the data into the bits
    speex_bits_init_buffer(&gviSpursSpeexBits,gviSpursSpeexBitsBuffer,sizeof(gviSpursSpeexBitsBuffer));
    speex_bits_read_from(&gviSpursSpeexBits, (char *)inBuffer, gviSpursSpeexTaskDesc.mEncodedFrameSize);

    // decode it
    rcode = speex_decode((void *)gviSpursSpeexStateBuffer, &gviSpursSpeexBits, speexBuffer);
    assert(rcode == 0);

    // convert the output from floats
    for(i = 0 ; i < gviSpursSpeexTaskDesc.mOutputBufferSize ; i++)
        outBuffer[i] = (short)speexBuffer[i];

    // Send the decoded samples back.
    cellDmaPut(outBuffer, (uint64_t)gviSpursSpeexTaskDesc.mOutputBuffer, gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(short), DMA_TAG(1), 0, 0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    free(speexBuffer);
    free(inBuffer);
    free(outBuffer);
    spuTaskOut->mSpeexReturnCode = 0;
}
/**
 * Reads three small items of identical size from main memory in one go.
 *
 * Each item is first transferred into its own 16-byte aligned staging buffer,
 * at an offset equal to the low four bits of its source address, and all
 * three transfers share DMA tag 1 so a single wait covers them. The staged
 * bytes are then copied to ls0 / ls1 / ls2.
 *
 * @param ls0,ls1,ls2  Destinations for the three items.
 * @param ea0,ea1,ea2  Main-memory source addresses.
 * @param size         Byte count of each item; asserted to be below 16.
 */
SIMD_FORCE_INLINE void small_cache_read_triple( void* ls0, ppu_address_t ea0, void* ls1, ppu_address_t ea1, void* ls2, ppu_address_t ea2, size_t size)
{
    btAssert(size<16);

    ATTRIBUTE_ALIGNED16(char stagingA[32]);
    ATTRIBUTE_ALIGNED16(char stagingB[32]);
    ATTRIBUTE_ALIGNED16(char stagingC[32]);

    char* destA = (char*)ls0;
    char* destB = (char*)ls1;
    char* destC = (char*)ls2;

    // The staging address keeps the same low 4 bits as the source address,
    // as required by cellDmaSmallGet.
    uint32_t lowBits = ea0 & 0x0f;
    char* stagedA = stagingA + lowBits;
#ifdef __SPU__
    cellDmaSmallGet(stagedA,ea0,size,DMA_TAG(1),0,0);
#else
    stagedA = (char*)cellDmaSmallGetReadOnly(stagedA,ea0,size,DMA_TAG(1),0,0);
#endif

    lowBits = ea1 & 0x0f;
    char* stagedB = stagingB + lowBits;
#ifdef __SPU__
    cellDmaSmallGet(stagedB,ea1,size,DMA_TAG(1),0,0);
#else
    stagedB = (char*)cellDmaSmallGetReadOnly(stagedB,ea1,size,DMA_TAG(1),0,0);
#endif

    lowBits = ea2 & 0x0f;
    char* stagedC = stagingC + lowBits;
#ifdef __SPU__
    cellDmaSmallGet(stagedC,ea2,size,DMA_TAG(1),0,0);
#else
    stagedC = (char*)cellDmaSmallGetReadOnly(stagedC,ea2,size,DMA_TAG(1),0,0);
#endif

    // One wait covers all three transfers.
    cellDmaWaitTagStatusAll( DMA_MASK(1) );

    // Byte-wise copy out of staging, the three items interleaved.
    uint32_t byteIdx = 0;
    while (btLikely( byteIdx<size ))
    {
        destA[byteIdx] = stagedA[byteIdx];
        destB[byteIdx] = stagedB[byteIdx];
        destC[byteIdx] = stagedC[byteIdx];
        byteIdx++;
    }
}
void performRaycastAgainstConvex (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr) { SpuVoronoiSimplexSolver simplexSolver; btTransform rayFromTrans, rayToTrans; rayFromTrans.setIdentity (); rayFromTrans.setOrigin (workUnit.rayFrom); rayToTrans.setIdentity (); rayToTrans.setOrigin (workUnit.rayTo); SpuCastResult result; /* Load the vertex data if the shape is a convex hull */ /* XXX: We might be loading the shape twice */ ATTRIBUTE_ALIGNED16(char convexHullShape[sizeof(btConvexHullShape)]); if (gatheredObjectData->m_shapeType == CONVEX_HULL_SHAPE_PROXYTYPE) { register int dmaSize; register ppu_address_t dmaPpuAddress2; dmaSize = sizeof(btConvexHullShape); dmaPpuAddress2 = gatheredObjectData->m_collisionShape; cellDmaGet(&convexHullShape, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); dmaConvexVertexData (&lsMemPtr->convexVertexData, (btConvexHullShape*)&convexHullShape); cellDmaWaitTagStatusAll(DMA_MASK(2)); // dmaConvexVertexData uses dma channel 2! lsMemPtr->convexVertexData.gSpuConvexShapePtr = gatheredObjectData->m_spuCollisionShape; lsMemPtr->convexVertexData.gConvexPoints = &lsMemPtr->convexVertexData.g_convexPointBuffer[0]; } /* performRaycast */ SpuSubsimplexRayCast caster (gatheredObjectData->m_spuCollisionShape, &lsMemPtr->convexVertexData, gatheredObjectData->m_shapeType, gatheredObjectData->m_collisionMargin, &simplexSolver); bool r = caster.calcTimeOfImpact (rayFromTrans, rayToTrans, gatheredObjectData->m_worldTransform, gatheredObjectData->m_worldTransform,result); if (r) { workUnitOut->hitFraction = result.m_fraction; workUnitOut->hitNormal = result.m_normal; } }
/**
 * Decodes one encoded frame described by gviSpursSpeexTaskDesc.
 *
 * The encoded bytes are fetched from the descriptor's input address, read
 * into gviSpursSpeexBits, decoded to floats with speex_decode using
 * gviSpursSpeexStateBuffer, narrowed to shorts and sent to the descriptor's
 * output address. Temporary buffers are 16-byte aligned heap allocations that
 * are released before returning.
 *
 * @param spuTaskOut  Result record; mSpeexReturnCode is set to 0 on return.
 *                    A non-zero speex_decode result trips the assert instead.
 */
void gviSpursSpeexDecodeSet(SpursSpeexTaskOutput *spuTaskOut)
{
    char *inBuffer;
    float *speexBuffer;
    short *outBuffer;
    int rcode;
    unsigned int i;

    // One size per buffer, used for both allocation and clearing. The clear
    // sizes of inBuffer and outBuffer used to be swapped, so inBuffer was
    // overrun whenever the decoded frame is larger than the encoded one.
    const size_t speexBufferBytes = gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(float);
    const size_t outBufferBytes   = gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(short);
    const size_t inBufferBytes    = gviSpursSpeexTaskDesc.mInputBufferSize;

    speexBuffer = (float *)memalign(16, speexBufferBytes);
    outBuffer = (short *)memalign(16, outBufferBytes);
    inBuffer = (char *)memalign(16, inBufferBytes);
    memset(speexBuffer, 0, speexBufferBytes);
    memset(inBuffer, 0, inBufferBytes);
    memset(outBuffer, 0, outBufferBytes);

    // Fetch the encoded frame.
    cellDmaGet(inBuffer, (uint64_t)gviSpursSpeexTaskDesc.mInputBuffer, gviSpursSpeexTaskDesc.mInputBufferSize, DMA_TAG(1), 0,0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    // read the data into the bits
    speex_bits_read_from(&gviSpursSpeexBits, (char *)inBuffer, gviSpursSpeexTaskDesc.mEncodedFrameSize);

    // decode it
    rcode = speex_decode((void *)gviSpursSpeexStateBuffer, &gviSpursSpeexBits, speexBuffer);
    assert(rcode == 0);

    // convert the output from floats
    for(i = 0 ; i < gviSpursSpeexTaskDesc.mOutputBufferSize ; i++)
        // Expanded to remove warnings in VS2K5
        outBuffer[i] = (short)speexBuffer[i];

    // Send the decoded samples back.
    cellDmaPut(outBuffer, (uint64_t)gviSpursSpeexTaskDesc.mOutputBuffer, gviSpursSpeexTaskDesc.mOutputBufferSize * sizeof(short), DMA_TAG(1), 0, 0);
    cellDmaWaitTagStatusAll(DMA_MASK(1));

    free(speexBuffer);
    free(inBuffer);
    free(outBuffer);
    spuTaskOut->mSpeexReturnCode = 0;
}
///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<16 bytes) int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size) { btAssert(size<32); ATTRIBUTE_ALIGNED16(char tmpBuffer[32]); char* mainMem = (char*)ea; char* localStore = (char*)ls; uint32_t i; ///make sure last 4 bits are the same, for cellDmaSmallGet uint32_t last4BitsOffset = ea & 0x0f; char* tmpTarget = tmpBuffer + last4BitsOffset; #if defined (__SPU__) || defined (USE_LIBSPE2) int remainingSize = size; //#define FORCE_cellDmaUnalignedGet 1 #ifdef FORCE_cellDmaUnalignedGet cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0); #else char* remainingTmpTarget = tmpTarget; uint64_t remainingEa = ea; while (remainingSize) { switch (remainingSize) { case 1: case 2: case 4: case 8: case 16: { mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0); remainingSize=0; break; } default: { //spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize); int actualSize = 0; if (remainingSize > 16) actualSize = 16; else if (remainingSize >8) actualSize=8; else if (remainingSize >4) actualSize=4; else if (remainingSize >2) actualSize=2; mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0); remainingSize-=actualSize; remainingTmpTarget+=actualSize; remainingEa += actualSize; } } } #endif//FORCE_cellDmaUnalignedGet #else //copy into final destination #ifdef USE_MEMCPY memcpy(tmpTarget,mainMem,size); #else for ( i=0;i<size;i++) { tmpTarget[i] = mainMem[i]; } #endif //USE_MEMCPY #endif cellDmaWaitTagStatusAll(DMA_MASK(1)); //this is slowish, perhaps memcpy on SPU is smarter? for (i=0; btLikely( i<size );i++) { localStore[i] = tmpTarget[i]; } return 0; }
void processRaycastTask(void* userPtr, void* lsMemory) { RaycastTask_LocalStoreMemory* localMemory = (RaycastTask_LocalStoreMemory*)lsMemory; SpuRaycastTaskDesc* taskDescPtr = (SpuRaycastTaskDesc*)userPtr; SpuRaycastTaskDesc& taskDesc = *taskDescPtr; SpuCollisionObjectWrapper* cows = (SpuCollisionObjectWrapper*)taskDesc.spuCollisionObjectsWrappers; //spu_printf("in processRaycastTask %d\n", taskDesc.numSpuCollisionObjectWrappers); /* for each object */ RaycastGatheredObjectData gatheredObjectData; for (int objectId = 0; objectId < taskDesc.numSpuCollisionObjectWrappers; objectId++) { //spu_printf("%d / %d\n", objectId, taskDesc.numSpuCollisionObjectWrappers); /* load initial collision shape */ GatherCollisionObjectAndShapeData (&gatheredObjectData, localMemory, (ppu_address_t)&cows[objectId]); if (btBroadphaseProxy::isConcave (gatheredObjectData.m_shapeType)) { SpuRaycastTaskWorkUnitOut tWorkUnitsOut[SPU_RAYCAST_WORK_UNITS_PER_TASK]; for (int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++) { tWorkUnitsOut[rayId].hitFraction = 1.0; } performRaycastAgainstConcave (&gatheredObjectData, &taskDesc.workUnits[0], &tWorkUnitsOut[0], taskDesc.numWorkUnits, localMemory); for (int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++) { const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId]; if (tWorkUnitsOut[rayId].hitFraction == 1.0) continue; ATTRIBUTE_ALIGNED16(SpuRaycastTaskWorkUnitOut workUnitOut); dmaLoadRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); /* XXX Only support taking the closest hit for now */ if (tWorkUnitsOut[rayId].hitFraction < workUnitOut.hitFraction) { workUnitOut.hitFraction = tWorkUnitsOut[rayId].hitFraction; workUnitOut.hitNormal = tWorkUnitsOut[rayId].hitNormal; } /* write ray cast data back */ dmaStoreRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); } } else if (btBroadphaseProxy::isConvex (gatheredObjectData.m_shapeType)) { btVector3 
objectBoxMin, objectBoxMax; computeAabb (objectBoxMin, objectBoxMax, (btConvexInternalShape*)gatheredObjectData.m_spuCollisionShape, gatheredObjectData.m_collisionShape, gatheredObjectData.m_shapeType, gatheredObjectData.m_worldTransform); for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++) { const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId]; btScalar ignored_param = 1.0; btVector3 ignored_normal; if (btRayAabb(workUnit.rayFrom, workUnit.rayTo, objectBoxMin, objectBoxMax, ignored_param, ignored_normal)) { ATTRIBUTE_ALIGNED16(SpuRaycastTaskWorkUnitOut workUnitOut); SpuRaycastTaskWorkUnitOut tWorkUnitOut; tWorkUnitOut.hitFraction = 1.0; performRaycastAgainstConvex (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory); if (tWorkUnitOut.hitFraction == 1.0) continue; dmaLoadRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); /* XXX Only support taking the closest hit for now */ if (tWorkUnitOut.hitFraction < workUnitOut.hitFraction) { workUnitOut.hitFraction = tWorkUnitOut.hitFraction; workUnitOut.hitNormal = tWorkUnitOut.hitNormal; /* write ray cast data back */ dmaStoreRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); } } } } else if (btBroadphaseProxy::isCompound (gatheredObjectData.m_shapeType)) { for (unsigned int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++) { const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId]; ATTRIBUTE_ALIGNED16(SpuRaycastTaskWorkUnitOut workUnitOut); SpuRaycastTaskWorkUnitOut tWorkUnitOut; tWorkUnitOut.hitFraction = 1.0; performRaycastAgainstCompound (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory); if (tWorkUnitOut.hitFraction == 1.0) continue; dmaLoadRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); /* XXX Only support taking the closest hit for now */ if (tWorkUnitOut.hitFraction < workUnitOut.hitFraction) { 
workUnitOut.hitFraction = tWorkUnitOut.hitFraction; workUnitOut.hitNormal = tWorkUnitOut.hitNormal; } /* write ray cast data back */ dmaStoreRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); } } } }
void performRaycastAgainstConcave (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit* workUnits, SpuRaycastTaskWorkUnitOut* workUnitsOut, int numWorkUnits, RaycastTask_LocalStoreMemory* lsMemPtr) { //order: first collision shape is convex, second concave. m_isSwapped is true, if the original order was opposite register int dmaSize; register ppu_address_t dmaPpuAddress2; btBvhTriangleMeshShape* trimeshShape = (btBvhTriangleMeshShape*)gatheredObjectData->m_spuCollisionShape; //need the mesh interface, for access to triangle vertices dmaBvhShapeData (&(lsMemPtr->bvhShapeData), trimeshShape); unsigned short int quantizedQueryAabbMin[SPU_RAYCAST_WORK_UNITS_PER_TASK][3]; unsigned short int quantizedQueryAabbMax[SPU_RAYCAST_WORK_UNITS_PER_TASK][3]; btVector3 rayFromInTriangleSpace[SPU_RAYCAST_WORK_UNITS_PER_TASK]; btVector3 rayToInTriangleSpace[SPU_RAYCAST_WORK_UNITS_PER_TASK]; /* Calculate the AABB for the ray in the triangle mesh shape */ btTransform rayInTriangleSpace; rayInTriangleSpace = gatheredObjectData->m_worldTransform.inverse(); for (int i = 0; i < numWorkUnits; i++) { btVector3 aabbMin; btVector3 aabbMax; rayFromInTriangleSpace[i] = rayInTriangleSpace(workUnits[i].rayFrom); rayToInTriangleSpace[i] = rayInTriangleSpace(workUnits[i].rayTo); aabbMin = rayFromInTriangleSpace[i]; aabbMin.setMin (rayToInTriangleSpace[i]); aabbMax = rayFromInTriangleSpace[i]; aabbMax.setMax (rayToInTriangleSpace[i]); lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin[i],aabbMin,0); lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax[i],aabbMax,1); } QuantizedNodeArray& nodeArray = lsMemPtr->bvhShapeData.getOptimizedBvh()->getQuantizedNodeArray(); //spu_printf("SPU: numNodes = %d\n",nodeArray.size()); BvhSubtreeInfoArray& subTrees = lsMemPtr->bvhShapeData.getOptimizedBvh()->getSubtreeInfoArray(); #ifdef CALLBACK_ALL spuRaycastNodeCallback nodeCallback (gatheredObjectData, workUnits, workUnitsOut, 
numWorkUnits, lsMemPtr); #else spuRaycastNodeCallback1 nodeCallback (gatheredObjectData, workUnits, workUnitsOut, lsMemPtr); #endif IndexedMeshArray& indexArray = lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getIndexedMeshArray(); //spu_printf("SPU:indexArray.size() = %d\n",indexArray.size()); // spu_printf("SPU: numSubTrees = %d\n",subTrees.size()); //not likely to happen if (subTrees.size() && indexArray.size() == 1) { ///DMA in the index info dmaBvhIndexedMesh (&lsMemPtr->bvhShapeData.gIndexMesh, indexArray, 0 /* index into indexArray */, 1 /* dmaTag */); cellDmaWaitTagStatusAll(DMA_MASK(1)); //display the headers int numBatch = subTrees.size(); for (int i=0;i<numBatch;) { // BEN: TODO - can reorder DMA transfers for less stall int remaining = subTrees.size() - i; int nextBatch = remaining < MAX_SPU_SUBTREE_HEADERS ? remaining : MAX_SPU_SUBTREE_HEADERS; dmaBvhSubTreeHeaders (&lsMemPtr->bvhShapeData.gSubtreeHeaders[0], (ppu_address_t)(&subTrees[i]), nextBatch, 1); cellDmaWaitTagStatusAll(DMA_MASK(1)); // spu_printf("nextBatch = %d\n",nextBatch); for (int j=0;j<nextBatch;j++) { const btBvhSubtreeInfo& subtree = lsMemPtr->bvhShapeData.gSubtreeHeaders[j]; unsigned int overlap = 1; for (int boxId = 0; boxId < numWorkUnits; boxId++) { overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin[boxId],quantizedQueryAabbMax[boxId],subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); if (overlap) break; } if (overlap) { btAssert(subtree.m_subtreeSize); //dma the actual nodes of this subtree dmaBvhSubTreeNodes (&lsMemPtr->bvhShapeData.gSubtreeNodes[0], subtree, nodeArray, 2); cellDmaWaitTagStatusAll(DMA_MASK(2)); /* Walk this subtree */ { spuWalkStacklessQuantizedTreeAgainstRays(lsMemPtr, &nodeCallback, &rayFromInTriangleSpace[0], &rayToInTriangleSpace[0], numWorkUnits, &quantizedQueryAabbMin[0][0],&quantizedQueryAabbMax[0][0], &lsMemPtr->bvhShapeData.gSubtreeNodes[0], 0, subtree.m_subtreeSize); } } // spu_printf("subtreeSize = 
%d\n",gSubtreeHeaders[j].m_subtreeSize); } // unsigned short int m_quantizedAabbMin[3]; // unsigned short int m_quantizedAabbMax[3]; // int m_rootNodeIndex; // int m_subtreeSize; i+=nextBatch; } //pre-fetch first tree, then loop and double buffer } }
//-- MAIN METHOD void processSampleTask(void* userPtr, void* lsMemory) { // BT_PROFILE("processSampleTask"); SampleTask_LocalStoreMemory* localMemory = (SampleTask_LocalStoreMemory*)lsMemory; SpuSampleTaskDesc* taskDescPtr = (SpuSampleTaskDesc*)userPtr; SpuSampleTaskDesc& taskDesc = *taskDescPtr; switch (taskDesc.m_sampleCommand) { case CMD_SAMPLE_INTEGRATE_BODIES: { btTransform predictedTrans; btCollisionObject** eaPtr = (btCollisionObject**)taskDesc.m_mainMemoryPtr; int batchSize = taskDesc.m_sampleValue; if (batchSize>MAX_NUM_BODIES) { spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n"); break; } int dmaArraySize = batchSize*sizeof(void*); uint64_t ppuArrayAddress = reinterpret_cast<uint64_t>(eaPtr); // spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize); if (dmaArraySize>=16) { cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress , dmaArraySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); } else { stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress , dmaArraySize); } for ( int i=0;i<batchSize;i++) { ///DMA rigid body void* localPtr = &localMemory->gLocalRigidBody[0]; void* shortAdd = localMemory->gPointerArray[i]; uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(shortAdd); // spu_printf("cellDmaGet at CMD_SAMPLE_INTEGRATE_BODIES from %llx to %llx\n",ppuRigidBodyAddress,localPtr); int dmaBodySize = sizeof(btRigidBody); cellDmaGet((void*)localPtr, ppuRigidBodyAddress , dmaBodySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); float timeStep = 1.f/60.f; btRigidBody* body = (btRigidBody*) localPtr;//btRigidBody::upcast(colObj); if (body) { if (body->isActive() && (!body->isStaticOrKinematicObject())) { body->predictIntegratedTransform(timeStep, predictedTrans); body->proceedToTransform( predictedTrans); void* ptr = (void*)localPtr; // spu_printf("cellDmaLargePut from %llx to LS 
%llx\n",ptr,ppuRigidBodyAddress); cellDmaLargePut(ptr, ppuRigidBodyAddress , dmaBodySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); } } } break; } case CMD_SAMPLE_PREDICT_MOTION_BODIES: { btTransform predictedTrans; btCollisionObject** eaPtr = (btCollisionObject**)taskDesc.m_mainMemoryPtr; int batchSize = taskDesc.m_sampleValue; int dmaArraySize = batchSize*sizeof(void*); if (batchSize>MAX_NUM_BODIES) { spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n"); break; } uint64_t ppuArrayAddress = reinterpret_cast<uint64_t>(eaPtr); // spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize); if (dmaArraySize>=16) { cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress , dmaArraySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); } else { stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress , dmaArraySize); } for ( int i=0;i<batchSize;i++) { ///DMA rigid body void* localPtr = &localMemory->gLocalRigidBody[0]; void* shortAdd = localMemory->gPointerArray[i]; uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(shortAdd); // spu_printf("cellDmaGet at CMD_SAMPLE_INTEGRATE_BODIES from %llx to %llx\n",ppuRigidBodyAddress,localPtr); int dmaBodySize = sizeof(btRigidBody); cellDmaGet((void*)localPtr, ppuRigidBodyAddress , dmaBodySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); float timeStep = 1.f/60.f; btRigidBody* body = (btRigidBody*) localPtr;//btRigidBody::upcast(colObj); if (body) { if (!body->isStaticOrKinematicObject()) { if (body->isActive()) { body->integrateVelocities( timeStep); //damping body->applyDamping(timeStep); body->predictIntegratedTransform(timeStep,body->getInterpolationWorldTransform()); void* ptr = (void*)localPtr; cellDmaLargePut(ptr, ppuRigidBodyAddress , dmaBodySize, DMA_TAG(1), 0, 0); cellDmaWaitTagStatusAll(DMA_MASK(1)); } } } } break; } default: { } }; }