/*
 * Publish the triangle data written into the current triangle buffer.
 *
 * endTriangle points just past the data the caller appended, starting at
 * _currentTriangle.  If it equals _currentTriangle nothing was written and
 * the function does nothing.  Otherwise it:
 *   1. stores the link to the following triangle in the first triangle,
 *   2. DMAs the staging buffer out to _currentTriangleBufferEA,
 *   3. DMAs the new 16-bit end offset to the effective address recorded in
 *      _currentTriangleCacheEndTriangleEAH/EAL,
 *   4. clears _currentTriangle and waits for tag group 0 to finish.
 */
void writeTriangleBuffer(Triangle* endTriangle)
{
    /* Nothing appended since the buffer was handed out. */
    if (endTriangle == _currentTriangle) {
        return;
    }

    /* Size from the start of the staging buffer to the end of the new data,
     * rounded up to a multiple of 128 bytes. */
    int put_bytes = ( ((char*)endTriangle) - _currentTriangleBuffer + 127) & ~127;

    /* Offset of the end of the new data: bytes written plus the offset the
     * buffer was allocated at. */
    unsigned short end_offset =
        (((char*)endTriangle) - ((char*)_currentTriangle)) + _currentTriangleOffset;
    vec_ushort8 end_offset_vec = spu_promote(end_offset, 1);

    /* Mask the end offset with the rewind flags:
     * rewind == 0 keeps the offset, rewind != 0 forces it to 0. */
    unsigned short link = spu_extract( spu_andc( end_offset_vec, _currentTriangleRewind ), 1 );
    _currentTriangle->next_triangle = link;

    /* Send the triangle data out on tag 0. */
    spu_mfcdma64(_currentTriangleBuffer,
                 mfc_ea2h(_currentTriangleBufferEA),
                 mfc_ea2l(_currentTriangleBufferEA),
                 put_bytes, 0, MFC_PUT_CMD);

    /* _currentTriangleRewind is no longer needed, so it is reused as the
     * source for the end-pointer update: the value is replicated into every
     * halfword, and the source address is chosen so that its low four bits
     * equal those of the destination effective address. */
    _currentTriangleRewind = spu_splats(link);
    char* put_src = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);

    /* Issued as MFC_PUTB_CMD on the same tag as the data put above. */
    spu_mfcdma64(put_src,
                 _currentTriangleCacheEndTriangleEAH,
                 _currentTriangleCacheEndTriangleEAL,
                 sizeof(short), 0, MFC_PUTB_CMD);

    /* The buffer has been handed off; force the next request to allocate. */
    _currentTriangle = NULL;

    /* Block until everything queued on tag 0 has completed. */
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();
}
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp) { int i; int tag = 1; /* DMA Transfer 1 : GET input/output parameters */ spu_mfcdma64(&abs_params, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(abs_params_t), tag, MFC_GET_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); /* DMA Transfer 2 : GET input data */ spu_mfcdma64(in_spe, mfc_ea2h(abs_params.ea_in), mfc_ea2l(abs_params.ea_in), abs_params.size * sizeof(float), tag, MFC_GET_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); /* Calculate absolute values */ for (i = 0; i < abs_params.size; i++) { if (in_spe[i] > 0) { out_spe[i] = in_spe[i]; } else { out_spe[i] = in_spe[i] * -1; } } /* DMA Transfer 3 : PUT output data */ spu_mfcdma64(out_spe, mfc_ea2h(abs_params.ea_out), mfc_ea2l(abs_params.ea_out), abs_params.size * sizeof(float), tag, MFC_PUT_CMD); spu_writech(MFC_WrTagMask, 1 << tag); spu_mfcstat(MFC_TAG_UPDATE_ALL); return 0; }
/*
 * Attach `block` to the pixel storage of `active` and, unless `active`
 * already holds the same screen location, queue the DMA list transfers that
 * write out the previous contents and read in the new ones.
 *
 * block  - supplies the tile coordinates (bx, by); its pixels pointer is set
 *          to active->pixels.
 * active - holds the pixel storage, two DMA lists (current_dma / new_dma)
 *          and the effective address currently mirrored (ea_copy).
 * tag    - tag group used for both transfers.
 *
 * The function does not wait for the transfers to complete.
 */
void blockActivater(Block* block, ActiveBlock* active, int tag)
{
    unsigned int tile_x = block->bx;
    unsigned int tile_y = block->by;

    /* Effective address of the tile: 32 screen lines per tile row and
     * 128 bytes per tile column. */
    unsigned long long tile_ea =
        screen.address + screen.bytes_per_line*tile_y*32+tile_x*128;

    block->pixels = (vec_uint4*) ((void*)&active->pixels[0]);

    /* Same location as last time: the pixel storage is already current. */
    if (active->ea_copy == tile_ea) {
        return;
    }
    active->ea_copy = tile_ea;

    unsigned long line_stride = screen.bytes_per_line;
    unsigned int line_count = 32;

    unsigned long ea_hi = tile_ea >> 32;
    unsigned long ea_lo = ((unsigned long) (tile_ea&0xffffffff));

    /* Fill the spare DMA list for the new location. */
    build_blit_list(active->new_dma, ea_lo, line_stride);

    unsigned long out_bytes = active->current_length;
    unsigned long half_list_bytes = line_count * 4;
    unsigned long in_bytes = half_list_bytes * 2;
    unsigned long kept_list_bytes = in_bytes;
    unsigned long out_list = (unsigned long) ((void*)active->current_dma);
    unsigned long in_list = (unsigned long) ((void*)active->new_dma);

    /*
     * The branch-free selection below was written for the case where an
     * unused block (nothing to write out) splits the read into two halves.
     * That test is currently disabled -- the flag is hard-wired to 0 where
     * it used to be cmp_eq(old_size, 0).
     * TODO (carried over from the original): why does this fix work?
     */
    unsigned long unused_block = 0;
    unsigned long first_cmd = if_then_else(unused_block, MFC_GETL_CMD, MFC_PUTLF_CMD);
    out_list = if_then_else(unused_block, in_list+half_list_bytes, out_list);
    in_bytes = if_then_else(unused_block, half_list_bytes, in_bytes);
    out_bytes = if_then_else(unused_block, half_list_bytes, out_bytes);

#ifdef DEBUG_2
    printf("old_size %d, is_new %d store_new %d\n",
           active->current_length, unused_block&1, kept_list_bytes);
    printf("DMA[%02X]: ls=%lx eah=%lx list=%lx, size=%d, tag=%d\n",
           first_cmd, &active->pixels[0],ea_hi,out_list,out_bytes,tag);
    printf("DMA[%02X]: ls=%lx eah=%lx list=%lx, size=%d, tag=%d\n",
           MFC_GETLF_CMD, &active->pixels[0],ea_hi,in_list,in_bytes,tag);
#endif

    /* First the transfer for the old list, then the fenced list-get for the
     * new one; both use the same pixel storage and tag. */
    spu_mfcdma64(&active->pixels[0],ea_hi,out_list,out_bytes,tag, first_cmd);
    spu_mfcdma64(&active->pixels[0],ea_hi,in_list,in_bytes,tag, MFC_GETLF_CMD);

    /* The list just used for reading becomes the current list; the old
     * current list becomes the spare. */
    active->current_length = kept_list_bytes;
    vec_uint4* previous_list = active->current_dma;
    active->current_dma = active->new_dma;
    active->new_dma = previous_list;
    active->eah = ea_hi;
}
/*
 * Fetch the program info structure from effective address `ea` into `info`
 * and block until the transfer has completed.
 *
 * Also copies info->speId into the global `speid` (kept for debugging).
 * When _DEBUG is greater than 1 the fetched fields are printed.
 */
void load_program_info(unsigned long long ea, spe_program_info_t *info)
{
    /* Queue the get on the tag reserved for program info. */
    spu_mfcdma64(info,
                 mfc_ea2h(ea),
                 mfc_ea2l(ea),
                 sizeof(spe_program_info_t),
                 SPUDMA_PROGRAMINFO,
                 MFC_GET_CMD);

    /* Select that tag and wait for it to finish. */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_PROGRAMINFO);
    mfc_read_tag_status_all();

    /* Mirror the SPE id into the global for debugging purposes. */
    speid = info->speId;

#if defined(_DEBUG) && _DEBUG > 1
    printf("Program info:\n\tSpe ID: %d\n\tNum Pixels: %d\n\tSpp: %d\n\tNum Spes %d\n\tDepth: %d\n",
           info->speId,
           info->numPixels,
           info->samplesPerPixel,
           info->numSpes,
           info->depth);
#endif
}
/*
 * SPU entry point: increments the integer at effective address `argp`
 * 10000 times.
 *
 * Each round fetches the value into the global `counter`, waits, adds one,
 * stores it back and waits again.  spe and envp are not used.
 *
 * NOTE(review): the get / increment / put sequence is not atomic.  If other
 * processors update the same address concurrently, increments can be lost
 * -- confirm whether that is the intended demonstration.
 */
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    int remaining;

    for (remaining = 10000; remaining > 0; --remaining) {
        /* Fetch the current value and wait for it to arrive. */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp),
                     sizeof(int), 0, MFC_GET_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        counter++;

        /* Store the new value and wait for the put to finish. */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp),
                     sizeof(int), 0, MFC_PUT_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    return 0;
}
/*
 * Queue a write-back of the pixel storage of `active` using its current DMA
 * list, then mark the block as having nothing left to write.
 *
 * Does nothing when active->current_length is zero.  The transfer is queued
 * on `tag` as MFC_PUTLF_CMD; the function does not wait for it.
 */
void activeBlockFlush(ActiveBlock* active, int tag)
{
    unsigned long list_bytes = active->current_length;

    /* No list recorded, nothing to write back. */
    if (!list_bytes) {
        return;
    }

    unsigned long ea_hi = active->eah;
    /* The list itself lives in local store; its address is passed in the
     * low-EA argument. */
    unsigned long list_addr = (unsigned long) ((void*)active->current_dma);

    spu_mfcdma64(&active->pixels[0], ea_hi, list_addr, list_bytes, tag, MFC_PUTLF_CMD);
    active->current_length = 0;
}
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks) { const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) { // merge lo bytes from unsigned shorts (array) 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) { // get busy flag with ones in unused bytes 0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0 }; const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0); char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ]; char sync_buffer[128+127]; void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 ); RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer; unsigned long long cache_ea; spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); while (cache_ea) { // terminate immediately if possible if (spu_stat_in_mbox()) return; // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); unsigned int endTriangle = cache->endTriangle; vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle); // first look for short chunks vec_uchar16 next = cache->chunkNext; vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 start0 = cache->chunkStart[0]; vec_ushort8 start1 = cache->chunkStart[1]; vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 starteq0 = spu_cmpeq( nstart0, 
spu_splats((unsigned short)0) ); vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) ); vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0); vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1); vec_ushort8 len0 = spu_sub( end0, start0); vec_ushort8 len1 = spu_sub( end1, start1); vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0); vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1); vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE ); vec_uint4 smallChunkGather = spu_gather(small); // check to see if chunk is already at the last triangle vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle( (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]), (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]), SHUFFLE_MERGE_BYTES) ); // check if the chunk is free vec_uint4 freeChunkGather = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); // check to see if the chunk is being processed vec_uint4 busyChunkGather = spu_gather( spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK), spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) ); // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0 // note that if freeChunkGather is true then busyChunkGather must also be true // done=false, free=false, busy=false -> can process // free=false, busy=false -> can be merged // decide which chunk to process vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather ); vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather ); vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) ); vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask ); /* if (!spu_extract(shortSelMask, 0)) printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x 
sel=%04x\n", spu_extract(mayProcessGather, 0) & 0xffff, spu_extract(smallChunkGather, 0), spu_extract(mayProcessShortGather, 0), spu_extract(shortSelMask, 0) & 0xffff, spu_extract(mayProcessSelection, 0) & 0xffff ); */ vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16); unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0); unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0); // if there's nothing to process, try the next cache line in the rendering tasks list if (!spu_extract(mayProcessBits, 0)) { trynextcacheline: cache_ea = cache->next; // sleep(); continue; } unsigned int chunkStart = cache->chunkStartArray [chunkToProcess]; unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess]; unsigned int chunkNext = cache->chunkNextArray [chunkToProcess] & CHUNKNEXT_MASK; unsigned int chunkEnd = (cache->chunkStartArray [chunkNext]-1) & (NUMBER_OF_TILES-1); unsigned int chunkLength = 1 + chunkEnd-chunkStart; // only need an extra block if the block is especially long if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) { freeChunk = 32; } // mark this block as busy cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT; // if there's at least one free chunk, claim it if (freeChunk != 32) { cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED; cache->chunkTriangleArray[freeChunk] = chunkTriangle; } // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) continue; #ifdef INFO printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle, freeChunk!=32 ? 
freeChunk : -1 ); // debug_render_tasks(cache); #endif Triangle* triangle; int firstTile; do { // read the triangle data for the current triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // get the triangle deltas firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd); if (firstTile>=0) break; // no match, try next triangle chunkTriangle = triangle->next_triangle; } while (chunkTriangle != endTriangle); // if we actually have something to process... if (firstTile>=0) { // the "normal" splitting will now become: // chunkStart .. (firstTile-1) -> triangle->next_triangle // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY) // (firstTile+NUM) .. 
chunkEnd -> chunkTriangle (FREE) int tailChunk; int thisChunk; int nextBlockStart; int thisBlockStart; int realBlockStart; do { retry: // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // calculate start of next block nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK; if (nextBlockStart > chunkEnd) nextBlockStart = chunkEnd+1; // calculate start of block to mark as busy thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK; if (thisBlockStart < chunkStart) thisBlockStart = chunkStart; realBlockStart = thisBlockStart; #ifdef INFO printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart); #endif // allocate some more free chunks vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq( spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16); unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); if (freeChunk == 32) { // if we didn't have one before, try again freeChunk = freeChunk2; // and try to get the second one freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) ); freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); } else { // speculatively clear the free chunk just in case we don't need it cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK; } #ifdef INFO printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n", _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart ); #endif // mark region after as available for processing if required if (nextBlockStart < chunkEnd) { if (freeChunk==32) { // if no free chunk, relinquish entire block and write back cache->chunkNextArray[chunkToProcess] = chunkNext; spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); // if writeback failed, we *might* have a free block, 
retry if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) goto retry; // otherwise give up and try the next cache line goto trynextcacheline; } cache->chunkStartArray[freeChunk] = nextBlockStart; cache->chunkNextArray[freeChunk] = chunkNext; cache->chunkTriangleArray[freeChunk] = chunkTriangle; cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT; tailChunk = freeChunk; #ifdef INFO printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess); debug_render_tasks(cache); #endif } else { // we're gonna use freeChunk2 for the "in front" block, as we've not // used freeChunk, let's use it as it's more likely to have a free chunk freeChunk2 = freeChunk; tailChunk = chunkNext; } // mark region before as available if required and possible thisChunk = chunkToProcess; if (thisBlockStart > chunkStart) { if (freeChunk2 != 32) { // mark this region as busy cache->chunkStartArray[freeChunk2]=thisBlockStart; cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT; cache->chunkTriangleArray[freeChunk2]=chunkTriangle; // mark region before as available for processing cache->chunkNextArray[chunkToProcess]=freeChunk2; cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle; thisChunk = freeChunk2; #ifdef INFO printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #endif } else { // need to keep whole block, update info and mark bust cache->chunkTriangleArray[chunkToProcess]=chunkTriangle; cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT; realBlockStart = chunkStart; printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #ifdef INFO #endif sleep(); } } // merge chunks merge_cache_blocks(cache); // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, 
MFC_PUTLLC_CMD); } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS); // finally after the write succeeded, update the variables chunkNext = tailChunk; chunkToProcess = thisChunk; chunkStart = firstTile; //thisBlockStart; chunkLength = nextBlockStart - firstTile; chunkEnd = chunkStart + chunkLength - 1; freeChunk = 32; // now we can process the block up to endTriangle initTileBuffers(thisBlockStart, chunkEnd); int ok=0; while (chunkTriangle != endTriangle) { #ifdef INFO printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n", _SPUID, chunkToProcess, chunkStart, chunkLength, chunkTriangle, firstTile, thisBlockStart); #endif // and actually process that triangle on these chunks processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok); ok=1; #ifdef PAUSE sleep(); #endif // and advance to the next-triangle chunkTriangle = triangle->next_triangle; // this should only ever happen if we're running really low on cache line slots // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles. 
// in this case, we process from thisBlockStart only (because we know that from // chunkStart to there has no result) and then we only process one triangle if (chunkStart != realBlockStart) { /* printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, " "firstTile=%d chunk=%d\n", _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess); debug_render_tasks(cache); */ // abort the while loop break; } // read the next triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // until chunkTriangle == endTriangle // flush any output buffers flushTileBuffers(thisBlockStart, chunkEnd); } // firstTile>=0
/*
 * Load a scene from main memory into local store; blocks until every
 * transfer it started has completed.
 *
 * ea    - effective address of the scene_t to fetch.
 * scene - local-store destination for the scene header.  On return
 *         scene->objects and scene->lights point at freshly allocated local
 *         copies, and every object whose geometryType is GEOMETRY_POLYGON has
 *         poly_obj.vertex pointing at a local copy of its vertices.
 *
 * The buffers come from _malloc_align and are not freed here; ownership
 * passes to the caller through `scene`.
 *
 * NOTE(review): allocation results are not checked, and the transfer sizes
 * (element size * count) are not checked against the maximum size of a
 * single DMA transfer -- confirm the scene sizes make both safe.
 */
void load_scene(unsigned long long ea, scene_t *scene)
{
    unsigned int i = 0;
    object3d_t *objects = 0;
    pointlight_t *lights = 0;
    point_t *v = 0;

#if defined(_DEBUG) && _DEBUG > 2
    /* Print the DMA destination itself (scene), not the address of the
     * parameter; arguments are cast to match the conversion specifiers. */
    printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for SCENE\n",
           (int)sizeof(scene_t),
           (unsigned long)scene,
           (unsigned long)mfc_ea2h(ea),
           (unsigned long)mfc_ea2l(ea));
#endif

    /* DMA request for scene */
    spu_mfcdma64(scene, mfc_ea2h(ea), mfc_ea2l(ea),
                 sizeof(scene_t), SPUDMA_SCENE, MFC_GET_CMD);

    /* The header carries the counts and addresses used below, so it has to
     * be complete before anything else is queued. */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_SCENE);
    mfc_read_tag_status_all();

    /* copy over objects */
    objects = _malloc_align(sizeof(object3d_t) * scene->nObjects, 4);

#if defined(_DEBUG) && _DEBUG > 2
    printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for OBJECTS\n",
           (int)(sizeof(object3d_t) * scene->nObjects),
           (unsigned long)objects,
           (unsigned long)mfc_ea2h(scene->objects_ea),
           (unsigned long)mfc_ea2l(scene->objects_ea));
#endif

    /* initiate DMA */
    spu_mfcdma64(objects, mfc_ea2h(scene->objects_ea), mfc_ea2l(scene->objects_ea),
                 sizeof(object3d_t) * scene->nObjects, SPUDMA_OBJECTS, MFC_GET_CMD);

    /* copy over lights */
    lights = _malloc_align(sizeof(pointlight_t) * scene->nLights, 4);

#if defined(_DEBUG) && _DEBUG > 2
    printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for LIGHTS\n",
           (int)(sizeof(pointlight_t) * scene->nLights),
           (unsigned long)lights,
           (unsigned long)mfc_ea2h(scene->lights_ea),
           (unsigned long)mfc_ea2l(scene->lights_ea));
#endif

    /* initiate DMA for lights; it is only waited for at the very end */
    spu_mfcdma64(lights, mfc_ea2h(scene->lights_ea), mfc_ea2l(scene->lights_ea),
                 sizeof(pointlight_t) * scene->nLights, SPUDMA_LIGHTS, MFC_GET_CMD);

    /* The object array is read below, so wait for it now. */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_OBJECTS);
    mfc_read_tag_status_all();

    /* assign local store pointer to objects */
    scene->objects = objects;

    /* Queue a vertex fetch for every polygon object. */
    for (; i < scene->nObjects; ++i) {
        if (objects[i].geometryType == GEOMETRY_POLYGON) {
            /* allocate memory for vertex */
            v = _malloc_align(sizeof(point_t) * objects[i].poly_obj.nVerticies, 4);

            /* initiate DMA to get verticies */
            spu_mfcdma64(v,
                         mfc_ea2h(objects[i].poly_obj.vertex_ea),
                         mfc_ea2l(objects[i].poly_obj.vertex_ea),
                         sizeof(point_t) * objects[i].poly_obj.nVerticies,
                         SPUDMA_VERTEXES, MFC_GET_CMD);

            /* The pointer is published while the transfer may still be in
             * flight; the data is only complete after the wait below. */
            objects[i].poly_obj.vertex = v;
        }
    }

    /* wait for all DMA to finish (vertexes, lights) */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_LIGHTS | 1 << SPUDMA_VERTEXES );
    mfc_read_tag_status_all();

    /* assign local store lights pointer */
    scene->lights = lights;
}
/*
 * Return a buffer into which the next triangle for `context` can be written,
 * or NULL if none can be provided right now.
 *
 * If a buffer has already been handed out for the same context it is
 * returned again.  Otherwise the renderable cache line of the context is
 * read and the 16 chunk read pointers it holds are checked to make sure that
 * advancing the write pointer would not overwrite triangle data that is
 * still in use.  On success the bookkeeping globals (_currentTriangle*,
 * read later by writeTriangleBuffer) are initialised.
 *
 * Returns NULL when the context has no renderable cache line or when any
 * non-free chunk blocks the extension.  In both cases _currentTriangle is
 * left NULL and _currentTriangleContext is set to `context`.
 */
Triangle* getTriangleBuffer(Context* context)
{
    /* Same context and a buffer is already outstanding: reuse it. */
    if (context == _currentTriangleContext && _currentTriangle) {
        return _currentTriangle;
    }

    /* Discard the previous state. */
    _currentTriangleContext = context;
    _currentTriangle = NULL;

    unsigned long long line_ea = context->renderableCacheLine;
    if (line_ea == 0) {
        return NULL;
    }

    /* Over-allocate on the stack and round up to get a 128-byte aligned
     * landing area for the cache line. */
    char raw_line[128+127];
    RenderableCacheLine* line = (RenderableCacheLine*) ( ((unsigned int)raw_line+127) & ~127 );

    /* Read the cache line (issued as GETLLAR) and consume its status. */
    spu_mfcdma64(line, mfc_ea2h(line_ea), mfc_ea2l(line_ea), 128, 0, MFC_GETLLAR_CMD);
    spu_readch(MFC_RdAtomicStat);

    /*
     * Per-chunk test, with read = chunk triangle pointer, write = endTriangle:
     *
     *   extendvalid   = (read >  write) ? (read > test) : (end > test)
     *   rewind        = next > end           (same for every chunk)
     *   rewindinvalid = (read == 0)
     *   invalid       = !extendvalid || (rewind && rewindinvalid)
     */
    vec_ushort8 write_ptr = spu_splats( line->endTriangle );
    vec_ushort8 read_lo = line->chunkTriangle[0];
    vec_ushort8 read_hi = line->chunkTriangle[1];
    vec_ushort8 test_ptr = spu_add(write_ptr, TRIANGLE_MAX_SIZE);
    vec_ushort8 next_ptr = spu_add(write_ptr, 2*TRIANGLE_MAX_SIZE);
    vec_ushort8 end_ptr = spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);
    vec_ushort8 zero16 = spu_splats( (unsigned short) 0 );

    /* Shuffle pattern that keeps the low byte of each of the 16 halfwords. */
    vec_uchar16 low_bytes = (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

    /* Upper bound per chunk: the read pointer when it is ahead of the write
     * pointer, otherwise the end of the buffer. */
    vec_ushort8 limit_lo = spu_sel( end_ptr, read_lo, spu_cmpgt( read_lo, write_ptr ) );
    vec_ushort8 limit_hi = spu_sel( end_ptr, read_hi, spu_cmpgt( read_hi, write_ptr ) );

    vec_ushort8 extend_ok_lo = spu_cmpgt( limit_lo, test_ptr );
    vec_ushort8 extend_ok_hi = spu_cmpgt( limit_hi, test_ptr );

    vec_ushort8 rewind_bad_lo = spu_cmpeq( read_lo, zero16 );
    vec_ushort8 rewind_bad_hi = spu_cmpeq( read_hi, zero16 );

    vec_ushort8 rewind16 = spu_cmpgt( next_ptr, end_ptr );

    /* Narrow the two 8x16-bit results to one 16x8-bit vector each. */
    vec_uchar16 extend_ok = (vec_uchar16) spu_shuffle( extend_ok_lo, extend_ok_hi, low_bytes );
    vec_uchar16 rewind_bad = (vec_uchar16) spu_shuffle( rewind_bad_lo, rewind_bad_hi, low_bytes );

    /* invalid = (rewind_bad & rewind) | ~extend_ok */
    vec_uchar16 rewind_blocked = spu_and( rewind_bad, (vec_uchar16) rewind16 );
    vec_uchar16 invalid = spu_orc( rewind_blocked, extend_ok );

    /* Chunks marked CHUNKNEXT_FREE_BLOCK are excluded from the check. */
    vec_uint4 free_bits = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), line->chunkNext ) );
    vec_uint4 blocking_bits = spu_andc( spu_gather( invalid ), (vec_uint4) free_bits );

    /* Any remaining bit means some chunk forbids the extension. */
    if ( spu_extract(blocking_bits, 0) ) {
        return NULL;
    }

    /* The new triangle starts at the current write offset.  Split it into a
     * 128-byte aligned base and the remainder within that block. */
    unsigned int write_offset = line->endTriangle;
    _currentTriangleBufferExtra = write_offset & 127;
    unsigned long long buffer_ea = line_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (write_offset & ~127);

    /* If the triangle does not start on a 128-byte boundary, fetch the
     * 128-byte block it starts in first and wait for the transfer. */
    if (_currentTriangleBufferExtra) {
        spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(buffer_ea), mfc_ea2l(buffer_ea), 128, 0, MFC_GET_CMD);
        mfc_write_tag_mask(1<<0);
        mfc_read_tag_status_all();
    }

    /* Record everything the write-out will need. */
    _currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
    _currentTriangleOffset = write_offset;
    _currentTriangleRewind = rewind16;
    /* Effective address of the endTriangle field inside the cache line. */
    _currentTriangleCacheEndTriangleEAL = mfc_ea2l(line_ea) + (((char*)&line->endTriangle) - ((char*)line));
    _currentTriangleCacheEndTriangleEAH = mfc_ea2h(line_ea);
    _currentTriangleBufferEA = buffer_ea;

    return _currentTriangle;
}