/*
 * Spin on the inbound mailbox until the BEGIN message shows up.
 *
 * Every message read is stored through mbox_message, so on return
 * *mbox_message holds BEGIN. Any other message is read and dropped.
 */
static void wait_for_begin(uint32_t *mbox_message) {
	for (;;) {
		/* busy-wait until the mailbox reports at least one entry */
		while (spu_stat_in_mbox() <= 0) {
		}

		*mbox_message = spu_read_in_mbox();
		if (*mbox_message == BEGIN)
			return;
	}
}
Ejemplo n.º 2
0
/*
 * SPU worker: repeatedly collects a batch of image descriptors' addresses
 * from the inbound mailbox, DMAs each descriptor into local store and runs
 * the processing routine selected by envp on it.
 *
 * speid - unused.
 * argp  - number of SPUs sharing NUM_STREAMS (NUM_STREAMS is assumed to be
 *         a multiple of it).
 * envp  - processing mode (MODE_SIMPLE, MODE_2LINES, MODE_DOUBLE,
 *         MODE_DMALIST); any other value is handled like MODE_SIMPLE.
 *
 * Returns 0 as soon as a zero word is read from the mailbox.
 */
int main(uint64_t speid, uint64_t argp, uint64_t envp){
	/* 16-byte aligned local-store target for the mfc_get below */
	struct image my_image __attribute__ ((aligned(16)));
	unsigned int data[NUM_STREAMS];
	const unsigned int num_spus = (unsigned int)argp;
	const int mode = (int)envp;

	(void)speid; /* silence the unused-parameter warning */

	for (;;) {
		/* this SPU's share of the streams for one round */
		const unsigned int quota = NUM_STREAMS / num_spus;
		unsigned int num_images = 0;
		unsigned int done_msg;
		unsigned int k;

		/* gather one address per stream; a zero word terminates the program */
		while (num_images < quota) {
			while (spu_stat_in_mbox() == 0) {
			}
			data[num_images] = spu_read_in_mbox();
			if (data[num_images] == 0)
				return 0;
			num_images++;
		}

		for (k = 0; k < num_images; k++) {
			/* fetch the descriptor and block until the transfer has finished */
			mfc_get(&my_image, data[k], sizeof(struct image), MY_TAG, 0, 0);
			mfc_write_tag_mask(1 << MY_TAG);
			mfc_read_tag_status_all();

			switch (mode) {
				case MODE_2LINES:
					process_image_2lines(&my_image);
					break;
				case MODE_DOUBLE:
					process_image_double(&my_image);
					break;
				case MODE_DMALIST:
					process_image_dmalist(&my_image);
					break;
				case MODE_SIMPLE:
				default:
					process_image_simple(&my_image);
					break;
			}
		}

		/* report completion of this round through the interrupt mailbox */
		done_msg = DONE;
		spu_write_out_intr_mbox(done_msg);
	}

	return 0;
}
Ejemplo n.º 3
0
// -- mailbox poll: returns 1 when the inbound mailbox holds at least one
// -- entry, 0 otherwise ------------------------------------------------------
int as_mbx_avail ()
{
  if (spu_stat_in_mbox () > 0)
    return 1;

  return 0;
}
Ejemplo n.º 4
0
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // we're gonna use freeChunk2 for the "in front" block, as we've not
                    // used freeChunk, let's use it as it's more likely to have a free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep whole block, update info and mark bust
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#ifdef INFO
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on cache line slots
                // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
                // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
                // in this case, we process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0
Ejemplo n.º 5
0
/*
 * SPU entry point of the YUV420 -> ARGB scaler / colourspace converter.
 *
 * Waits for the first inbound mailbox message, fetches the img_args block
 * with dmaGetnWait() (argp and envp are forwarded to it; presumably the
 * effective address and size -- confirm against dmaGetnWait) and then
 * converts frames until a STOP message is received.
 *
 * Per frame, every loop iteration produces two output lines: two luma lines
 * and one U and one V line are unpacked (no scaling) or scaled, then
 * converted with yuv420toARGBfloat() and written back with dmaPut().
 * Input and output lines are double-buffered; each buffer slot has its own
 * DMA tag so the next line is fetched while the current one is processed.
 *
 * After a frame, RDY is posted on the interrupt mailbox (MessageForm==INTR)
 * or the normal outbound mailbox (MessageForm==HARD), and the SPU waits for:
 *   RUN    - flip input/output buffer selection and process the next frame
 *   STOP   - leave the main loop
 *   UPDATE - re-fetch img_args and rebuild the filters on the next frame
 *
 * speid is unused. Returns 0 after STOP.
 */
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp)
{
	/* DMA tag ids: one per double-buffer slot of each input/output stream. */
	int tgiy0[2];
	int tgiy1[2];
	int tgiu0[2];
	int tgiu1[2];
	int tgiv0[2];
	int tgiv1[2];
	int tgo0[2];
	int tgo1[2];

	tgiu1[0]=1;
	tgiu1[1]=2;
	tgo0[0]=3;
	tgo0[1]=4;
	tgiy0[0]=5;
	tgiy0[1]=6;
	tgiy1[0]=7;
	tgiy1[1]=8;
	tgiu0[0]=9;
	tgiu0[1]=10;
	tgiv0[0]=11;
	tgiv0[1]=12;
	tgiv1[0]=13;	/* slot 0 was never assigned before (index 1 was written twice) */
	tgiv1[1]=14;
	tgo1[0]=15;
	tgo1[1]=16;

	int selOut = 0;
	int selIn = 0;
	int tag = 31;
	int LineSelOut=0;

	int selY0In = 0;
	int selY1In = 0;
	int selCrIn = 0;
	struct img_args *iargs;

	(void)speid;

	iargs =(struct img_args*)memalign(128,sizeof(*iargs));

	int first=1;
	int waiting=0;
	unsigned long long Op;
	unsigned int msg;
	unsigned long long YIp,UIp,VIp;

	int crblock1;
	int srcsmallcroma=0;
	int noscale=1;

	scaler_settings_t sc;

	/* block until the controlling side sends the first message */
	while (spu_stat_in_mbox() == 0)
		;
	msg=spu_read_in_mbox();
	if (msg==RUN){
		fprintf(stderr,"spu_yuv2argb_scaler: Starting Up\n");
	}

	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); /* fetch the data needed to process the image */
	printf("spu_yuv2argb_scaler: SRC width %d,DST width %d\n",iargs->srcW,iargs->dstW);
	printf("spu_yuv2argb_scaler: SRC height %d,DST height %d\n",iargs->srcH,iargs->dstH);

	printf("spu_yuv2argb_scaler: DST offset %d\n",iargs->offset);

	/* horizontal shuffle patterns and weights, filled by initWFilter/initWcrFilter */
	vector unsigned char *widthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);
	vector unsigned char *widthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);

	vector unsigned char *crwidthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);
	vector unsigned char *crwidthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);

	vector float * weightWfilter0=(vector float*)memalign(128,MAXWIDTH*4+16);
	vector float * weightWfilter1=(vector float*)memalign(128,MAXWIDTH*4+16);

	/* vertical filter tables, filled by initHFilter */
	float weightHfilter[MAXHEIGHT+1];

	unsigned long long dmapos[MAXHEIGHT+2];
	unsigned long long dmacromapos[MAXHEIGHT+2];

	/* intermediate line buffers handed to yuv420toARGBfloat */
	vector float * Ytemp0=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Ytemp1=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Utemp=(vector float *)memalign(128,MAXWIDTH*2+16);
	vector float * Vtemp=(vector float *)memalign(128,MAXWIDTH*2+16);

	int wfilterpos[MAXWIDTH+2];
	int hfilterpos0[MAXHEIGHT+2];
	int hfilterpos1[MAXHEIGHT+2];
	int crwfilterpos[MAXWIDTH/2+2];

	/* double-buffered input lines; the 0/1 suffix selects the two source lines that are blended */
	vector unsigned char *InputY0[2];
	InputY0[0]=(vector unsigned char*)memalign(128,MAXWIDTH);
	InputY0[1]=(vector unsigned char*)memalign(128,MAXWIDTH);

	vector unsigned char *InputU0[2];
	InputU0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);
	InputU0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputV0[2];
	InputV0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);
	InputV0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputY1[2];
	InputY1[0]=(vector unsigned char*)memalign(128,MAXWIDTH);
	InputY1[1]=(vector unsigned char*)memalign(128,MAXWIDTH);

	vector unsigned char *InputU1[2];
	InputU1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);
	InputU1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputV1[2];
	InputV1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);
	InputV1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	/* double-buffered output lines, one ARGB line each */
	vector unsigned char* Output0[2];
	Output0[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);
	Output0[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);

	vector unsigned char* Output1[2];
	Output1[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);
	Output1[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);

	while (msg!=STOP)
	{
		int h=0;

		if (first)
		{
			/* (re)build the filters for the current img_args */
			crblock1=((iargs->srcW>>1) + 15)&~15; /* chroma line length rounded up to 16 bytes */

			initHFilter(iargs->srcW,iargs->srcH,iargs->dstH,hfilterpos0,hfilterpos1,weightHfilter,dmapos,dmacromapos);

			/* reset first so a value left over from before an UPDATE cannot leak through */
			srcsmallcroma=0;
			sc.smallcroma=0;

			if ((iargs->srcW==iargs->dstW)&&(iargs->srcH==iargs->dstH))
			{
				printf("spu_yuv2argb_scaler: No scaling proceeding with direct csc\n");
				noscale=1;
				if ((iargs->srcW%32) != 0)
				{
					srcsmallcroma=1;
					sc.smallcroma=1;
				}
			} else {
				noscale=0;
				printf("spu_yuv2argb_scaler: Scaling, computing shuffle filters\n");
				initWFilter(iargs->srcW,iargs->dstW,1,wfilterpos,widthfilter0,widthfilter1,weightWfilter0,weightWfilter1);

				if ((iargs->srcW%32) != 0)
				{
					sc.smallcroma=1;
					srcsmallcroma=1;
					initWcrFilter(iargs->srcW/2,iargs->dstW/2,1,crwfilterpos,crwidthfilter0,crwidthfilter1);
					printf("spu_yuv2argb_scaler: Computing Crshuffle filter\n");
				}

				sc.wWfilter0=weightWfilter0;
				sc.wWfilter1=weightWfilter1;
				sc.wfilterpos=wfilterpos;
				sc.sWfilter0=widthfilter0;
				sc.sWfilter1=widthfilter1;
				sc.crsWfilter0=crwidthfilter0;
				sc.crsWfilter1=crwidthfilter1;
				sc.crfilterpos=crwfilterpos;

				sc.smallcromaline0=0;
				sc.smallcromaline1=0;
			}
			first=0;
			printf("spu_yuv2argb_scaler: Initiation completed\n");
		}

		YIp = iargs->Ystart[selIn];
		UIp = iargs->Ustart[selIn];
		VIp = iargs->Vstart[selIn];
		Op = iargs->Output[selOut] + iargs->offset*4;

		LineSelOut=0;
		selY0In=0;
		selY1In=0;
		selCrIn=0;

		/* prime both double-buffer slots of every input stream */
		dmaGet(InputY0[0],YIp+dmapos[hfilterpos0[0]],iargs->srcW,tgiy0[0]);
		dmaGet(InputY1[0],YIp+dmapos[hfilterpos1[0]],iargs->srcW,tgiy1[0]);
		dmaGet(InputY0[1],YIp+dmapos[hfilterpos0[1]],iargs->srcW,tgiy0[1]);
		dmaGet(InputY1[1],YIp+dmapos[hfilterpos1[1]],iargs->srcW,tgiy1[1]);

		dmaGet(InputU0[0],UIp+dmacromapos[hfilterpos0[0]],crblock1,tgiu0[0]);
		dmaGet(InputU0[1],UIp+dmacromapos[hfilterpos0[1]],crblock1,tgiu0[1]);
		dmaGet(InputU1[0],UIp+dmacromapos[hfilterpos1[0]],crblock1,tgiu1[0]);
		dmaGet(InputU1[1],UIp+dmacromapos[hfilterpos1[1]],crblock1,tgiu1[1]);

		dmaGet(InputV0[0],VIp+dmacromapos[hfilterpos0[0]],crblock1,tgiv0[0]);
		dmaGet(InputV0[1],VIp+dmacromapos[hfilterpos0[1]],crblock1,tgiv0[1]);
		dmaGet(InputV1[0],VIp+dmacromapos[hfilterpos1[0]],crblock1,tgiv1[0]);
		dmaGet(InputV1[1],VIp+dmacromapos[hfilterpos1[1]],crblock1,tgiv1[1]);

		/* each iteration emits two output lines, hence dstH/2 iterations */
		for (h=0; h < (iargs->dstH>>1); h++)
		{
			sc.width=iargs->dstW;
			sc.smallcroma=0;
			sc.smallcromaline0=0;
			sc.smallcromaline1=0;

			/* first luma line of the pair */
			sc.wHfilter=weightHfilter[2*h];
			dmaWaitTag(tgiy0[selY0In]);
			dmaWaitTag(tgiy1[selY1In]);
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY1In];
			sc.Output=Ytemp0;

			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);
			}

			/* refill the slot just consumed with the line needed two steps ahead */
			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+2]],iargs->srcW,tgiy0[selY0In]);
			if (!noscale) { /* when scaling the second source line is needed too */
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+2]],iargs->srcW,tgiy1[selY1In]);
			}
			selY0In=selY0In^1;
			selY1In=selY1In^1;

			/* second luma line of the pair */
			sc.wHfilter=weightHfilter[2*h+1];
			dmaWaitTag(tgiy0[selY0In]);
			dmaWaitTag(tgiy1[selY1In]);
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY1In];
			sc.Output=Ytemp1;
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);
			}
			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+3]],iargs->srcW,tgiy0[selY0In]);
			if (!noscale) { /* when scaling the second source line is needed too */
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+3]],iargs->srcW,tgiy1[selY1In]);
			}
			selY0In=selY0In^1;
			selY1In=selY1In^1;

			if (srcsmallcroma) /* these settings apply to both U and V */
			{
				sc.smallcroma=1;
				if ((hfilterpos0[h]&1)==1) {
					sc.smallcromaline0=1;
				} else {
					sc.smallcromaline0=0;
				}
				if ((hfilterpos1[h]&1)==1){
					sc.smallcromaline1=1;
				} else {
					sc.smallcromaline1=0;
				}
				if (((hfilterpos0[h]&1)==0)&&((hfilterpos1[h]&1)==0))
				{
					sc.smallcroma=0; /* both lines are 128-bit aligned; only happens with extreme downscaling */
				}
			}

			sc.width=iargs->dstW>>1;
			sc.wHfilter=weightHfilter[h];

			/* U line */
			dmaWaitTag(tgiu0[selCrIn]);
			dmaWaitTag(tgiu1[selCrIn]);
			sc.Output=Utemp;
			sc.source00=InputU0[selCrIn];
			sc.source01=InputU1[selCrIn];

			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);
			}

			/* V line */
			dmaWaitTag(tgiv0[selCrIn]);
			dmaWaitTag(tgiv1[selCrIn]);
			sc.Output=Vtemp;
			sc.source00=InputV0[selCrIn];
			sc.source01=InputV1[selCrIn];

			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);
			}

			/*
			 * Refill the chroma slot just consumed.  Each buffer must be fetched
			 * under the tag that is waited on before that buffer is read
			 * (tgiu* for InputU*, tgiv* for InputV*); the tags used to be
			 * swapped here, so the waits above did not cover the right buffer.
			 */
			dmaGet(InputV0[selCrIn],VIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiv0[selCrIn]);
			dmaGet(InputU0[selCrIn],UIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiu0[selCrIn]);

			if (!noscale) {	/* when scaling the second source line is needed too */
				dmaGet(InputV1[selCrIn],VIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiv1[selCrIn]);
				dmaGet(InputU1[selCrIn],UIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiu1[selCrIn]);
			}

			selCrIn=selCrIn^1;

			/* the output slot must have been written back before it is overwritten */
			dmaWaitTag(tgo0[LineSelOut]);
			dmaWaitTag(tgo1[LineSelOut]);

			yuv420toARGBfloat(Ytemp0,Ytemp1,Utemp,Vtemp,Output0[LineSelOut],Output1[LineSelOut],iargs->dstW,iargs->maxwidth); /* colourspace-convert the results */

			dmaPut(Output0[LineSelOut],Op,iargs->dstW*4,tgo0[LineSelOut]);
			Op=Op+iargs->maxwidth*4;

			dmaPut(Output1[LineSelOut],Op,iargs->dstW*4,tgo1[LineSelOut]);
			Op=Op+iargs->maxwidth*4;

			LineSelOut=LineSelOut^1;
		}

		/* drain every outstanding output write before announcing the frame */
		dmaWaitTag(tgo0[LineSelOut^1]);
		dmaWaitTag(tgo1[LineSelOut^1]);
		dmaWaitTag(tgo0[LineSelOut]);
		dmaWaitTag(tgo1[LineSelOut]);

		if (iargs->MessageForm==INTR)
		{
			while (spu_stat_out_intr_mbox() == 0)
				;
			msg=RDY;
			spu_writech(SPU_WrOutIntrMbox, msg);
			waiting=1;
		}

		if (iargs->MessageForm==HARD)
		{
			while (spu_stat_out_mbox() == 0)
				;
			msg=RDY;
			spu_write_out_mbox(msg);
			waiting=1;
		}

		while (waiting){

			while (spu_stat_in_mbox() == 0)
				;
			msg=spu_read_in_mbox();

			if (msg == RUN){
				selOut = selOut ^ 1; /* flip the output buffer pointers */
				selIn = selIn ^ 1; /* flip the input buffer pointers */
				waiting=0;
			}
			else if (msg == STOP)
			{
				waiting=0;
			}
			else if (msg == UPDATE)
			{
				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); /* fetch the data describing the new image */
				first=1; /* rebuild the filters for the new image; keep waiting for RUN */
			}
		}
	}

	return 0;
}
Ejemplo n.º 6
0
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) 
{
	int tgi0[2];

	int tgo0[2];

	int tgio0[2];

	tgi0[0]=1;
	tgi0[1]=2;

	
	tgio0[0]=11;
	tgio0[1]=12;

	tgo0[0]=13;
	tgo0[1]=14;
/*	tgo1[0]=15;
	tgo1[1]=16;*/	

	
	int selOut = 0;
	int selIn = 0;
	int msg=RUN;
	int waiting=0;
	int tag = 31;
	struct img_args *iargs;
	iargs =(struct img_args*)memalign(128,sizeof(*iargs));
	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag);
	
	printf("spu_blit_yuv422_to_argb: SRC width %d,DST width %d\n",iargs->src_w,iargs->drw_w);
	printf("spu_blit_yuv422_to_argb: SRC height %d,DST height %d\n",iargs->src_h,iargs->drw_h);
	
	while (spu_stat_in_mbox() == 0);
		msg=spu_read_in_mbox();
//	first=0;

	vector unsigned char *InOutbuffer[2];

	vector unsigned char *Inbuffer[2];

	vector unsigned char *Outbuffer[2];

	int Outwidth=(4*iargs->drw_w+3)&~3;
	int Inwidth=(2*iargs->src_w+7)&~7;

	Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
	Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);

	if (iargs->BLEND)
	{
		InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
		InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
	}

	Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
	Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);

	unsigned long long Inp,Outp,InOutp;
	
	int i=0;
//	int update=1;


	while (msg!=STOP)
	{
		selOut = 0;
		selIn = 0;

		Inp=iargs->Inp0[0];
		InOutp=iargs->Outp0[0];
		Outp=iargs->Outp0[0];

		dmaGet(Inbuffer[0],Inp,Inwidth,tgi0[0]);
		Inp=Inp+iargs->Istride[0]*2;

		dmaGet(Inbuffer[1],Inp,Inwidth,tgi0[1]);
		Inp=Inp+iargs->Istride[0]*2;

// 		if (iargs->BLEND)
// 		{
// 			dmaGet(InOutbuffer[0],InOutp,Outwidth,tgio0[0]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 			dmaGet(InOutbuffer[1],InOutp,Outwidth,tgio0[1]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 		}



		selIn=0;
		selOut=0;

		for (i=0;i < iargs->drw_h ;i++) {
			dmaWaitTag(tgi0[selIn]);
			
// 			if (iargs->BLEND)
// 				dmaWaitTag(tgio0[selIn]); 
			dmaWaitTag(tgo0[selOut]);
			if (iargs->SourceFormat==YUY2||iargs->SourceFormat==YUYV422)
			{
				yuv422_to_argb(Inbuffer[selIn],Outbuffer[selOut],iargs->drw_w);
			//	printf("spe_blitter: YUV422->ARGB\n");
			}
			//yuv420_to_yuv2(Yinbuffer[selIn],Uinbuffer[selIn],Vinbuffer[selIn],Outbuffer[selOut],iargs->Istride[0]);
			
		//	if (iargs->BLEND)
			//	blend(InOutbuffer[selIn],OutBuffer[selOut],iargs->ALPHA,iargs->SourceFormat);
			
			dmaPut(Outbuffer[selOut],Outp,Outwidth,tgo0[selOut]);
		
// 			if (iargs->BLEND){
// 				dmaGet(InOutbuffer[selIn],InOutp,Outwidth,tgio0[selIn]);
// 				InOutp=InOutp+iargs->Ostride[0];
// 					
// 			}

			dmaGet(Inbuffer[selIn],Inp,Inwidth,tgi0[selIn]);
			
			Inp=Inp+iargs->Istride[0]*2;
			Outp=Outp+iargs->Ostride[0]*4;
			selIn=selIn^1;
			selOut=selOut^1;
		}
	

		while (spu_stat_out_intr_mbox() == 0);
		msg=RDY;
		spu_writech(SPU_WrOutIntrMbox, msg);
		waiting=1;
		
		while (waiting){
			
			while (spu_stat_in_mbox() == 0);
			msg=spu_read_in_mbox();
			
			if (msg == RUN){
				waiting=0;
			}
			else if (msg == STOP)
			{
				waiting=0;
			}
			else if (msg == UPDATE)
			{
				tag=30;
 				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image	
//  			//	update=1; // update filters to reflect the new image!
// 				Outwidth=(iargs->drw_w+3)&~3;
// 				Inwidth=(iargs->src_w+7)&~7;
// 				free(Inbuffer[0]);
// 				free(Inbuffer[1]);
// 	
// 				free(Outbuffer[0]);
// 				free(Outbuffer[1]);
// 				
// 				Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
// 				Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);
// 			
// 				if (iargs->BLEND)
// 				{
// 					free(InOutbuffer[0]);
// 					free(InOutbuffer[1]);	
// 					InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 					InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
// 				}
// 	
// 				Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 				Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
			}
		}
	}
		 

	return 0;
}