/*
 * Blocks until the PPU posts BEGIN in the inbound mailbox, i.e. until work on
 * the current frame may start.
 *
 * mbox_message - receives every word read from the mailbox; on return it
 *                holds BEGIN.  Words other than BEGIN are discarded.
 */
static void wait_for_begin(uint32_t *mbox_message)
{
    for (;;) {
        /* Spin until the inbound mailbox reports a pending entry. */
        while (spu_stat_in_mbox() <= 0) {
        }

        *mbox_message = spu_read_in_mbox();
        if (*mbox_message == BEGIN) {
            return;
        }
    }
}
int main(uint64_t speid, uint64_t argp, uint64_t envp){ unsigned int data[NUM_STREAMS]; unsigned int num_spus = (unsigned int)argp, i, num_images; struct image my_image __attribute__ ((aligned(16))); int mode = (int)envp; speid = speid; //get rid of warning while(1){ num_images = 0; for (i = 0; i < NUM_STREAMS / num_spus; i++){ //assume NUM_STREAMS is a multiple of num_spus while(spu_stat_in_mbox() == 0); data[i] = spu_read_in_mbox(); if (!data[i]) return 0; num_images++; } for (i = 0; i < num_images; i++){ mfc_get(&my_image, data[i], sizeof(struct image), MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); switch(mode){ default: case MODE_SIMPLE: process_image_simple(&my_image); break; case MODE_2LINES: process_image_2lines(&my_image); break; case MODE_DOUBLE: process_image_double(&my_image); break; case MODE_DMALIST: process_image_dmalist(&my_image); break; } } data[0] = DONE; spu_write_out_intr_mbox(data[0]); } return 0; }
// -- is there something available in my mailbox? -----------------------------
// Returns 1 when spu_stat_in_mbox() reports at least one pending entry, else 0.
int as_mbx_avail ()
{
    if (spu_stat_in_mbox() > 0) {
        return 1;
    }
    return 0;
}
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks) { const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) { // merge lo bytes from unsigned shorts (array) 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) { // get busy flag with ones in unused bytes 0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0 }; const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0); char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ]; char sync_buffer[128+127]; void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 ); RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer; unsigned long long cache_ea; spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); while (cache_ea) { // terminate immediately if possible if (spu_stat_in_mbox()) return; // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); unsigned int endTriangle = cache->endTriangle; vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle); // first look for short chunks vec_uchar16 next = cache->chunkNext; vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 start0 = cache->chunkStart[0]; vec_ushort8 start1 = cache->chunkStart[1]; vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 starteq0 = spu_cmpeq( nstart0, 
spu_splats((unsigned short)0) ); vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) ); vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0); vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1); vec_ushort8 len0 = spu_sub( end0, start0); vec_ushort8 len1 = spu_sub( end1, start1); vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0); vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1); vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE ); vec_uint4 smallChunkGather = spu_gather(small); // check to see if chunk is already at the last triangle vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle( (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]), (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]), SHUFFLE_MERGE_BYTES) ); // check if the chunk is free vec_uint4 freeChunkGather = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); // check to see if the chunk is being processed vec_uint4 busyChunkGather = spu_gather( spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK), spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) ); // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0 // note that if freeChunkGather is true then busyChunkGather must also be true // done=false, free=false, busy=false -> can process // free=false, busy=false -> can be merged // decide which chunk to process vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather ); vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather ); vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) ); vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask ); /* if (!spu_extract(shortSelMask, 0)) printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x 
sel=%04x\n", spu_extract(mayProcessGather, 0) & 0xffff, spu_extract(smallChunkGather, 0), spu_extract(mayProcessShortGather, 0), spu_extract(shortSelMask, 0) & 0xffff, spu_extract(mayProcessSelection, 0) & 0xffff ); */ vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16); unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0); unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0); // if there's nothing to process, try the next cache line in the rendering tasks list if (!spu_extract(mayProcessBits, 0)) { trynextcacheline: cache_ea = cache->next; // sleep(); continue; } unsigned int chunkStart = cache->chunkStartArray [chunkToProcess]; unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess]; unsigned int chunkNext = cache->chunkNextArray [chunkToProcess] & CHUNKNEXT_MASK; unsigned int chunkEnd = (cache->chunkStartArray [chunkNext]-1) & (NUMBER_OF_TILES-1); unsigned int chunkLength = 1 + chunkEnd-chunkStart; // only need an extra block if the block is especially long if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) { freeChunk = 32; } // mark this block as busy cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT; // if there's at least one free chunk, claim it if (freeChunk != 32) { cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED; cache->chunkTriangleArray[freeChunk] = chunkTriangle; } // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) continue; #ifdef INFO printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle, freeChunk!=32 ? 
freeChunk : -1 ); // debug_render_tasks(cache); #endif Triangle* triangle; int firstTile; do { // read the triangle data for the current triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // get the triangle deltas firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd); if (firstTile>=0) break; // no match, try next triangle chunkTriangle = triangle->next_triangle; } while (chunkTriangle != endTriangle); // if we actually have something to process... if (firstTile>=0) { // the "normal" splitting will now become: // chunkStart .. (firstTile-1) -> triangle->next_triangle // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY) // (firstTile+NUM) .. 
chunkEnd -> chunkTriangle (FREE) int tailChunk; int thisChunk; int nextBlockStart; int thisBlockStart; int realBlockStart; do { retry: // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // calculate start of next block nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK; if (nextBlockStart > chunkEnd) nextBlockStart = chunkEnd+1; // calculate start of block to mark as busy thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK; if (thisBlockStart < chunkStart) thisBlockStart = chunkStart; realBlockStart = thisBlockStart; #ifdef INFO printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart); #endif // allocate some more free chunks vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq( spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16); unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); if (freeChunk == 32) { // if we didn't have one before, try again freeChunk = freeChunk2; // and try to get the second one freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) ); freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); } else { // speculatively clear the free chunk just in case we don't need it cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK; } #ifdef INFO printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n", _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart ); #endif // mark region after as available for processing if required if (nextBlockStart < chunkEnd) { if (freeChunk==32) { // if no free chunk, relinquish entire block and write back cache->chunkNextArray[chunkToProcess] = chunkNext; spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); // if writeback failed, we *might* have a free block, 
retry if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) goto retry; // otherwise give up and try the next cache line goto trynextcacheline; } cache->chunkStartArray[freeChunk] = nextBlockStart; cache->chunkNextArray[freeChunk] = chunkNext; cache->chunkTriangleArray[freeChunk] = chunkTriangle; cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT; tailChunk = freeChunk; #ifdef INFO printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess); debug_render_tasks(cache); #endif } else { // we're gonna use freeChunk2 for the "in front" block, as we've not // used freeChunk, let's use it as it's more likely to have a free chunk freeChunk2 = freeChunk; tailChunk = chunkNext; } // mark region before as available if required and possible thisChunk = chunkToProcess; if (thisBlockStart > chunkStart) { if (freeChunk2 != 32) { // mark this region as busy cache->chunkStartArray[freeChunk2]=thisBlockStart; cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT; cache->chunkTriangleArray[freeChunk2]=chunkTriangle; // mark region before as available for processing cache->chunkNextArray[chunkToProcess]=freeChunk2; cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle; thisChunk = freeChunk2; #ifdef INFO printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #endif } else { // need to keep whole block, update info and mark bust cache->chunkTriangleArray[chunkToProcess]=chunkTriangle; cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT; realBlockStart = chunkStart; printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #ifdef INFO #endif sleep(); } } // merge chunks merge_cache_blocks(cache); // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, 
MFC_PUTLLC_CMD); } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS); // finally after the write succeeded, update the variables chunkNext = tailChunk; chunkToProcess = thisChunk; chunkStart = firstTile; //thisBlockStart; chunkLength = nextBlockStart - firstTile; chunkEnd = chunkStart + chunkLength - 1; freeChunk = 32; // now we can process the block up to endTriangle initTileBuffers(thisBlockStart, chunkEnd); int ok=0; while (chunkTriangle != endTriangle) { #ifdef INFO printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n", _SPUID, chunkToProcess, chunkStart, chunkLength, chunkTriangle, firstTile, thisBlockStart); #endif // and actually process that triangle on these chunks processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok); ok=1; #ifdef PAUSE sleep(); #endif // and advance to the next-triangle chunkTriangle = triangle->next_triangle; // this should only ever happen if we're running really low on cache line slots // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles. 
// in this case, we process from thisBlockStart only (because we know that from // chunkStart to there has no result) and then we only process one triangle if (chunkStart != realBlockStart) { /* printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, " "firstTile=%d chunk=%d\n", _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess); debug_render_tasks(cache); */ // abort the while loop break; } // read the next triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // until chunkTriangle == endTriangle // flush any output buffers flushTileBuffers(thisBlockStart, chunkEnd); } // firstTile>=0
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) { int tgiy0[2]; int tgiy1[2]; int tgiu0[2]; int tgiu1[2]; int tgiv0[2]; int tgiv1[2]; int tgo0[2]; int tgo1[2]; tgiu1[0]=1; tgiu1[1]=2; tgo0[0]=3; tgo0[1]=4; tgiy0[0]=5; tgiy0[1]=6; tgiy1[0]=7; tgiy1[1]=8; tgiu0[0]=9; tgiu0[1]=10; tgiv0[0]=11; tgiv0[1]=12; tgiv1[1]=13; tgiv1[1]=14; tgo1[0]=15; tgo1[1]=16; int selOut = 0; int selIn = 0; int tag = 31; int LineSelIn=0; int LineSelOut=0; int selY0In = 0; int selY1In = 0; int selCrIn = 0; struct img_args *iargs; iargs =(struct img_args*)memalign(128,sizeof(*iargs)); unsigned long long Cp; int first=1; int waiting=0; unsigned long long Op; unsigned int msg; unsigned long long YIp,UIp,VIp,YOp; int crblock0; int crblock1; int srcsmallcroma=0; ; int noscale=1; static int crblockdst1; static int crblockdst0; scaler_settings_t sc; while (spu_stat_in_mbox() == 0); msg=spu_read_in_mbox(); if (msg==RUN){ fprintf(stderr,"spu_yuv2argb_scaler: Starting Up\n"); } dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process image printf("spu_yuv2argb_scaler: SRC width %d,DST width %d\n",iargs->srcW,iargs->dstW); printf("spu_yuv2argb_scaler: SRC height %d,DST height %d\n",iargs->srcH,iargs->dstH); printf("spu_yuv2argb_scaler: DST offset %d\n",iargs->offset); // bad fix for centering image on 1080p) //iargs->offset=(iargs->maxwidth-iargs->dstW)/2 + iargs->maxwidth*(1080-iargs->dstH)/2; vector unsigned char *widthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*4+16); vector unsigned char *widthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*4+16); vector unsigned char *crwidthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*2+16); vector unsigned char *crwidthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*2+16); vector float * weightWfilter0=(vector float*)memalign(128,MAXWIDTH*4+16); vector float * weightWfilter1=(vector float*)memalign(128,MAXWIDTH*4+16); float weightHfilter[MAXHEIGHT+1]; unsigned long 
long dmapos[MAXHEIGHT+2]; unsigned long long dmacromapos[MAXHEIGHT+2]; vector float * Ytemp0=(vector float *)memalign(128,MAXWIDTH*4+16); vector float * Ytemp1=(vector float *)memalign(128,MAXWIDTH*4+16); vector float * Utemp=(vector float *)memalign(128,MAXWIDTH*2+16); vector float * Vtemp=(vector float *)memalign(128,MAXWIDTH*2+16); int wfilterpos[MAXWIDTH+2]; int hfilterpos0[MAXHEIGHT+2]; int hfilterpos1[MAXHEIGHT+2]; int crwfilterpos[MAXWIDTH/2+2]; vector unsigned char *InputY0[2]; InputY0[0]=(vector unsigned char*)memalign(128,MAXWIDTH); InputY0[1]=(vector unsigned char*)memalign(128,MAXWIDTH); vector unsigned char *InputU0[2]; InputU0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); InputU0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); vector unsigned char *InputV0[2]; InputV0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); InputV0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); vector unsigned char *InputY1[2]; InputY1[0]=(vector unsigned char*)memalign(128,MAXWIDTH); InputY1[1]=(vector unsigned char*)memalign(128,MAXWIDTH); vector unsigned char *InputU1[2]; InputU1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); InputU1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); vector unsigned char *InputV1[2]; InputV1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); InputV1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); vector unsigned char* Output0[2]; Output0[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4); // 1line output Output0[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4); // 1line output vector unsigned char* Output1[2]; Output1[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4); // 1line output Output1[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4); // 1line output while (msg!=STOP) { int h=0; int i; if (first) { crblock0=(iargs->srcW>>1)&~15; // rounded down crblock1=((iargs->srcW>>1) + 15)&~15; //rounded up crblockdst1=((iargs->dstW>>1) + 15)&~15;//destination size rounded up. 
crblockdst0=((iargs->dstW>>1) + 7)&~7;//destination size rounded up. initHFilter(iargs->srcW,iargs->srcH,iargs->dstH,hfilterpos0,hfilterpos1,weightHfilter,dmapos,dmacromapos); // printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[1]]/16.0,dmacromapos[hfilterpos1[0]]/16.0); // printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[1]]/16.0,dmacromapos[hfilterpos1[1]]/16.0); // // for (i=0;i < iargs->dstH>>1;i++) // { // // printf("Hfilterpos0 dst: %d, src:%d, weight:%f\n",i,hfilterpos0[i],weightHfilter[i]); // // printf("Hfilterpos1 dst: %d, src:%d, weight:%f\n",i,hfilterpos1[i],1.0-weightHfilter[i]); // printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[2*i+2]]/16.0,dmacromapos[hfilterpos1[2*i+2]]/16.0); // printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[2*i+3]]/16.0,dmacromapos[hfilterpos1[2*i+3]]/16.0); // } if ((iargs->srcW==iargs->dstW)&&(iargs->srcH==iargs->dstH)) { printf("spu_yuv2argb_scaler: No scaling proceeding with direct csc\n"); noscale=1; if ((iargs->srcW%32) != 0) { srcsmallcroma=1; sc.smallcroma=1; } } else { noscale=0; printf("spu_yuv2argb_scaler: Scaling, computing shuffle filters\n"); initWFilter(iargs->srcW,iargs->dstW,1,wfilterpos,widthfilter0,widthfilter1,weightWfilter0,weightWfilter1); /* for (i=0;i < iargs->dstW/4;i++) { printf("filterpos dst: %d, src:%d\n",i,wfilterpos[i]); printcharvec("widthfilter0",widthfilter0[i]); printcharvec("widthfilter1",widthfilter1[i]); printfvec("weightWfilter0",weightWfilter0[i]); printfvec("weightWfilter1",weightWfilter1[i]); }*/ srcsmallcroma=0; sc.smallcroma=0; if ((iargs->srcW%32) != 0) { sc.smallcroma=1; srcsmallcroma=1; initWcrFilter(iargs->srcW/2,iargs->dstW/2,1,crwfilterpos,crwidthfilter0,crwidthfilter1); printf("spu_yuv2argb_scaler: Computing Crshuffle filter\n"); // for (i=0;i < (iargs->dstW>>1)/4;i++) // { // printf("crwfilterpos dst: %d, src:%d, weight:%f\n",i,crwfilterpos[i]); // 
printcharvec("crwidthfilter0",crwidthfilter0[i]); // printcharvec("crwidthfilter1",crwidthfilter1[i]); // printfvec("weightWfilter0",weightWfilter0[i]); // printfvec("weightWfilter1",weightWfilter1[i]); // // } } sc.wWfilter0=weightWfilter0; sc.wWfilter1=weightWfilter1; sc.wfilterpos=wfilterpos; sc.sWfilter0=widthfilter0; sc.sWfilter1=widthfilter1; sc.crsWfilter0=crwidthfilter0; sc.crsWfilter1=crwidthfilter1; sc.crfilterpos=crwfilterpos; sc.smallcromaline0=0; sc.smallcromaline1=0; } first=0; printf("spu_yuv2argb_scaler: Initiation completed\n"); } YIp = iargs->Ystart[selIn]; UIp = iargs->Ustart[selIn]; VIp = iargs->Vstart[selIn]; Op = iargs->Output[selOut] + iargs->offset*4; LineSelOut=0; selY0In=0; selY1In=0; selCrIn=0; dmaGet(InputY0[0],YIp+dmapos[hfilterpos0[0]],iargs->srcW,tgiy0[0]); dmaGet(InputY1[0],YIp+dmapos[hfilterpos1[0]],iargs->srcW,tgiy1[0]); dmaGet(InputY0[1],YIp+dmapos[hfilterpos0[1]],iargs->srcW,tgiy0[1]); dmaGet(InputY1[1],YIp+dmapos[hfilterpos1[1]],iargs->srcW,tgiy1[1]); dmaGet(InputU0[0],UIp+dmacromapos[hfilterpos0[0]],crblock1,tgiu0[0]); dmaGet(InputU0[1],UIp+dmacromapos[hfilterpos0[1]],crblock1,tgiu0[1]); dmaGet(InputU1[0],UIp+dmacromapos[hfilterpos1[0]],crblock1,tgiu1[0]); dmaGet(InputU1[1],UIp+dmacromapos[hfilterpos1[1]],crblock1,tgiu1[1]); // dmaGet(InputV0[0],VIp+dmacromapos[hfilterpos0[0]],crblock1,tgiv0[0]); dmaGet(InputV0[1],VIp+dmacromapos[hfilterpos0[1]],crblock1,tgiv0[1]); dmaGet(InputV1[0],VIp+dmacromapos[hfilterpos1[0]],crblock1,tgiv1[0]); dmaGet(InputV1[1],VIp+dmacromapos[hfilterpos1[1]],crblock1,tgiv1[1]); LineSelOut=0; selY0In=0; selY1In=0; selCrIn=0; // printf("New image\n"); for (h=0; h < iargs->dstH>>1; h++) //we asume that output is allways h/2 { sc.width=iargs->dstW; sc.smallcroma=0; sc.smallcromaline0=0; sc.smallcromaline1=0; sc.wHfilter=weightHfilter[2*h]; dmaWaitTag(tgiy0[selY0In]); // printf("dma: %d\n",2*h+2); dmaWaitTag(tgiy1[selY1In]); // printf("dma: %d\n",2*h+2); sc.source00=InputY0[selY0In]; 
sc.source01=InputY1[selY1In]; sc.Output=Ytemp0; if (noscale) { unpack(&sc); } else { scale(&sc); } //first Y line scaled dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+2]],iargs->srcW,tgiy0[selY0In]); // printf("dma: %d\n",2*h+2); if (!noscale) { //if we are scaling we also need the second line dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+2]],iargs->srcW,tgiy1[selY1In]); } // printf("dma: %d\n",2*h+2); selY0In=selY0In^1; selY1In=selY1In^1; sc.wHfilter=weightHfilter[2*h+1]; dmaWaitTag(tgiy0[selY0In]); dmaWaitTag(tgiy1[selY0In]); sc.source00=InputY0[selY0In]; sc.source01=InputY1[selY0In]; sc.Output=Ytemp1; if (noscale) { unpack(&sc); } else { scale(&sc); } //second Y line scaled dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+3]],iargs->srcW,tgiy0[selY0In]); if(!noscale) { //if we are scaling we also need the second line dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+3]],iargs->srcW,tgiy1[selY1In]); } selY0In=selY0In^1; selY1In=selY1In^1; // printf("dma: %d\n",2*h+3); if (srcsmallcroma) //these settings applly for both U and V { sc.smallcroma=1; if ((hfilterpos0[h]&1)==1) { sc.smallcromaline0=1; } else { sc.smallcromaline0=0; } if ((hfilterpos1[h]&1)==1){ sc.smallcromaline1=1; } else { sc.smallcromaline1=0; } if (((hfilterpos0[h]&1)==0)&&((hfilterpos1[h]&1)==0)) { sc.smallcroma=0; //both lines are 128 bit alligned only when doing extreme downscaling can this happen } } // if (noscale) { // sc.width=crblockdst0;//crblockdst1; // } else { // sc.width=crblockdst0; // } sc.width=iargs->dstW>>1; sc.wHfilter=weightHfilter[h]; dmaWaitTag(tgiu0[selCrIn]); dmaWaitTag(tgiu1[selCrIn]); sc.Output=Utemp; sc.source00=InputU0[selCrIn]; sc.source01=InputU1[selCrIn]; if (noscale) { unpack(&sc); } else { scale(&sc); } dmaWaitTag(tgiv0[selCrIn]); dmaWaitTag(tgiv1[selCrIn]); sc.Output=Vtemp; sc.source00=InputV0[selCrIn]; sc.source01=InputV1[selCrIn]; if (noscale) { unpack(&sc); } else { scale(&sc); } 
dmaGet(InputV0[selCrIn],VIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiu0[selCrIn]); //this is allways pos 0 dmaGet(InputU0[selCrIn],UIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiv0[selCrIn]); if(!noscale) { //if we are scaling we also need the second line dmaGet(InputV1[selCrIn],VIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiu1[selCrIn]); dmaGet(InputU1[selCrIn],UIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiv1[selCrIn]); } selCrIn=selCrIn^1; dmaWaitTag(tgo0[LineSelOut]); dmaWaitTag(tgo1[LineSelOut]); yuv420toARGBfloat(Ytemp0,Ytemp1,Utemp,Vtemp,Output0[LineSelOut],Output1[LineSelOut],iargs->dstW,iargs->maxwidth); //colorspace convert results dmaPut(Output0[LineSelOut],Op,iargs->dstW*4,tgo0[LineSelOut]); Op=Op+iargs->maxwidth*4; dmaPut(Output1[LineSelOut],Op,iargs->dstW*4,tgo1[LineSelOut]); Op=Op+iargs->maxwidth*4; LineSelOut=LineSelOut^1; } dmaWaitTag(tgo0[LineSelOut^1]); //wait for last write. dmaWaitTag(tgo1[LineSelOut^1]); //wait for last write. // printf("Image done\n"); if (iargs->MessageForm==INTR) { while (spu_stat_out_intr_mbox() == 0); msg=RDY; spu_writech(SPU_WrOutIntrMbox, msg); waiting=1; } if (iargs->MessageForm==HARD) { while (spu_stat_out_mbox() == 0); msg=RDY; spu_write_out_mbox(msg); waiting=1; } // fprintf(stderr,"spu_yuvscaler: Waiting\n"); while (waiting){ while (spu_stat_in_mbox() == 0); msg=spu_read_in_mbox(); if (msg == RUN){ selOut = selOut ^ 1; // flips the output buffer pointers selIn = selIn ^ 1; // flips the input buffer pointers waiting=0; } else if (msg == STOP) { // fprintf(stderr,"spu_yuvscaler: Stopping\n"); waiting=0; } else if (msg == UPDATE) { // fprintf(stderr,"spu_yuvscaler: Update\n"); dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image first=1; // update filters to reflect the new image! // selOut=0; // no need to change these. that can be done by the run. // selIn=0; } } } return 0; }
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) { int tgi0[2]; int tgo0[2]; int tgio0[2]; tgi0[0]=1; tgi0[1]=2; tgio0[0]=11; tgio0[1]=12; tgo0[0]=13; tgo0[1]=14; /* tgo1[0]=15; tgo1[1]=16;*/ int selOut = 0; int selIn = 0; int msg=RUN; int waiting=0; int tag = 31; struct img_args *iargs; iargs =(struct img_args*)memalign(128,sizeof(*iargs)); dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); printf("spu_blit_yuv422_to_argb: SRC width %d,DST width %d\n",iargs->src_w,iargs->drw_w); printf("spu_blit_yuv422_to_argb: SRC height %d,DST height %d\n",iargs->src_h,iargs->drw_h); while (spu_stat_in_mbox() == 0); msg=spu_read_in_mbox(); // first=0; vector unsigned char *InOutbuffer[2]; vector unsigned char *Inbuffer[2]; vector unsigned char *Outbuffer[2]; int Outwidth=(4*iargs->drw_w+3)&~3; int Inwidth=(2*iargs->src_w+7)&~7; Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth); Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth); if (iargs->BLEND) { InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth); InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth); } Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth); Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth); unsigned long long Inp,Outp,InOutp; int i=0; // int update=1; while (msg!=STOP) { selOut = 0; selIn = 0; Inp=iargs->Inp0[0]; InOutp=iargs->Outp0[0]; Outp=iargs->Outp0[0]; dmaGet(Inbuffer[0],Inp,Inwidth,tgi0[0]); Inp=Inp+iargs->Istride[0]*2; dmaGet(Inbuffer[1],Inp,Inwidth,tgi0[1]); Inp=Inp+iargs->Istride[0]*2; // if (iargs->BLEND) // { // dmaGet(InOutbuffer[0],InOutp,Outwidth,tgio0[0]); // InOutp=InOutp+iargs->Ostride[0]*4; // dmaGet(InOutbuffer[1],InOutp,Outwidth,tgio0[1]); // InOutp=InOutp+iargs->Ostride[0]*4; // } selIn=0; selOut=0; for (i=0;i < iargs->drw_h ;i++) { dmaWaitTag(tgi0[selIn]); // if (iargs->BLEND) // dmaWaitTag(tgio0[selIn]); dmaWaitTag(tgo0[selOut]); if (iargs->SourceFormat==YUY2||iargs->SourceFormat==YUYV422) { 
yuv422_to_argb(Inbuffer[selIn],Outbuffer[selOut],iargs->drw_w); // printf("spe_blitter: YUV422->ARGB\n"); } //yuv420_to_yuv2(Yinbuffer[selIn],Uinbuffer[selIn],Vinbuffer[selIn],Outbuffer[selOut],iargs->Istride[0]); // if (iargs->BLEND) // blend(InOutbuffer[selIn],OutBuffer[selOut],iargs->ALPHA,iargs->SourceFormat); dmaPut(Outbuffer[selOut],Outp,Outwidth,tgo0[selOut]); // if (iargs->BLEND){ // dmaGet(InOutbuffer[selIn],InOutp,Outwidth,tgio0[selIn]); // InOutp=InOutp+iargs->Ostride[0]; // // } dmaGet(Inbuffer[selIn],Inp,Inwidth,tgi0[selIn]); Inp=Inp+iargs->Istride[0]*2; Outp=Outp+iargs->Ostride[0]*4; selIn=selIn^1; selOut=selOut^1; } while (spu_stat_out_intr_mbox() == 0); msg=RDY; spu_writech(SPU_WrOutIntrMbox, msg); waiting=1; while (waiting){ while (spu_stat_in_mbox() == 0); msg=spu_read_in_mbox(); if (msg == RUN){ waiting=0; } else if (msg == STOP) { waiting=0; } else if (msg == UPDATE) { tag=30; dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image // // update=1; // update filters to reflect the new image! // Outwidth=(iargs->drw_w+3)&~3; // Inwidth=(iargs->src_w+7)&~7; // free(Inbuffer[0]); // free(Inbuffer[1]); // // free(Outbuffer[0]); // free(Outbuffer[1]); // // Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth); // Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth); // // if (iargs->BLEND) // { // free(InOutbuffer[0]); // free(InOutbuffer[1]); // InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth); // InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth); // } // // Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth); // Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth); } } } return 0; }