/* Sends a message to PPU that the current frame has been processed */
static void send_frame_done(ppu_data_t ppu_data, int frame_id) {
	uint32_t mbox_message = SPU_FRAME_DONE;

	while (spu_stat_out_intr_mbox() <= 0)
		;
	spu_write_out_intr_mbox(mbox_message);
	dprintf("SPU[%d]: Finished frame %d. Informed PPU\n",
		ppu_data.spe_id, frame_id + 1);
	/* Suppress warnings of unused */
	ppu_data.spe_id = ppu_data.spe_id;
	frame_id = frame_id;
}
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) 
{
	int tgiy0[2];
	int tgiy1[2];
	int tgiu0[2];
	int tgiu1[2];
	int tgiv0[2];
	int tgiv1[2];
	int tgo0[2];
	int tgo1[2];

	tgiu1[0]=1;
	tgiu1[1]=2;
	tgo0[0]=3;
	tgo0[1]=4;
	tgiy0[0]=5;
	tgiy0[1]=6;
	tgiy1[0]=7;
	tgiy1[1]=8;
	tgiu0[0]=9;
	tgiu0[1]=10;
	tgiv0[0]=11;
	tgiv0[1]=12;
	tgiv1[1]=13;
	tgiv1[1]=14;
	tgo1[0]=15;
	tgo1[1]=16;
	
	int selOut = 0;
	int selIn = 0;
	int tag = 31;
	int LineSelIn=0;
	int LineSelOut=0;
		
	int selY0In = 0;
	int selY1In = 0;
	int selCrIn = 0;
	struct img_args *iargs;
	
	iargs =(struct img_args*)memalign(128,sizeof(*iargs));

	unsigned long long Cp;

	int first=1;
	int waiting=0;
	unsigned long long Op;
	unsigned int msg;
	unsigned long long YIp,UIp,VIp,YOp;

	int crblock0;
	int crblock1;
	int srcsmallcroma=0;
;
	int noscale=1;

 	static	int crblockdst1;
	static	int crblockdst0;
	scaler_settings_t sc;
	
	while (spu_stat_in_mbox() == 0);
		msg=spu_read_in_mbox();
	if (msg==RUN){	
		fprintf(stderr,"spu_yuv2argb_scaler: Starting Up\n");
	}

	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process image
	printf("spu_yuv2argb_scaler: SRC width %d,DST width %d\n",iargs->srcW,iargs->dstW);
	printf("spu_yuv2argb_scaler: SRC height %d,DST height %d\n",iargs->srcH,iargs->dstH);
	
	printf("spu_yuv2argb_scaler: DST offset %d\n",iargs->offset);
	
	// bad fix for centering image on 1080p)
	//iargs->offset=(iargs->maxwidth-iargs->dstW)/2 + iargs->maxwidth*(1080-iargs->dstH)/2;	
	

	vector unsigned char *widthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);
	vector unsigned char *widthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);

	vector unsigned char *crwidthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);
	vector unsigned char *crwidthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);	

	vector float * weightWfilter0=(vector float*)memalign(128,MAXWIDTH*4+16);
	vector float * weightWfilter1=(vector float*)memalign(128,MAXWIDTH*4+16);

	float weightHfilter[MAXHEIGHT+1];

	unsigned long long dmapos[MAXHEIGHT+2];
	unsigned long long dmacromapos[MAXHEIGHT+2];

	
	vector float * Ytemp0=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Ytemp1=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Utemp=(vector float *)memalign(128,MAXWIDTH*2+16);
	vector float * Vtemp=(vector float *)memalign(128,MAXWIDTH*2+16);

	int wfilterpos[MAXWIDTH+2];
	int hfilterpos0[MAXHEIGHT+2];
	int hfilterpos1[MAXHEIGHT+2];
	int crwfilterpos[MAXWIDTH/2+2];

	vector unsigned char *InputY0[2];
	InputY0[0]=(vector unsigned char*)memalign(128,MAXWIDTH); 
	InputY0[1]=(vector unsigned char*)memalign(128,MAXWIDTH); 

	vector unsigned char *InputU0[2];
	InputU0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputU0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	
	vector unsigned char *InputV0[2];
	InputV0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputV0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputY1[2];
	InputY1[0]=(vector unsigned char*)memalign(128,MAXWIDTH); 
	InputY1[1]=(vector unsigned char*)memalign(128,MAXWIDTH); 

	vector unsigned char *InputU1[2];
	InputU1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputU1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputV1[2];
	InputV1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputV1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 	

	vector unsigned char* Output0[2];
	Output0[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	Output0[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output

	vector unsigned char* Output1[2];
	Output1[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	Output1[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	

	
	while (msg!=STOP) 
	{
		int h=0;
		int i;
		
		if (first)
		{
			crblock0=(iargs->srcW>>1)&~15; // rounded down
			crblock1=((iargs->srcW>>1) + 15)&~15; //rounded up
			crblockdst1=((iargs->dstW>>1) + 15)&~15;//destination size rounded up.
			crblockdst0=((iargs->dstW>>1) + 7)&~7;//destination size rounded up.

			
			initHFilter(iargs->srcW,iargs->srcH,iargs->dstH,hfilterpos0,hfilterpos1,weightHfilter,dmapos,dmacromapos);
// 			printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[1]]/16.0,dmacromapos[hfilterpos1[0]]/16.0);
// 			printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[1]]/16.0,dmacromapos[hfilterpos1[1]]/16.0);
// 			
// 			for (i=0;i < iargs->dstH>>1;i++)
// 			{
//  			//	printf("Hfilterpos0 dst: %d, src:%d, weight:%f\n",i,hfilterpos0[i],weightHfilter[i]);
//  			//	printf("Hfilterpos1 dst: %d, src:%d, weight:%f\n",i,hfilterpos1[i],1.0-weightHfilter[i]);
// 				printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[2*i+2]]/16.0,dmacromapos[hfilterpos1[2*i+2]]/16.0);
// 				printf("line :%d, dmapos :%f, dmacromapos :%f \n",i,dmapos[hfilterpos1[2*i+3]]/16.0,dmacromapos[hfilterpos1[2*i+3]]/16.0);
// 			}
			
			if ((iargs->srcW==iargs->dstW)&&(iargs->srcH==iargs->dstH))
			{
				
				printf("spu_yuv2argb_scaler: No scaling proceeding with direct csc\n");
				noscale=1;
				if ((iargs->srcW%32) != 0)
				{
					srcsmallcroma=1;
					sc.smallcroma=1;
				}
				
			} else {
				
			
				noscale=0;
				printf("spu_yuv2argb_scaler: Scaling, computing shuffle filters\n");
				initWFilter(iargs->srcW,iargs->dstW,1,wfilterpos,widthfilter0,widthfilter1,weightWfilter0,weightWfilter1);

/*				for (i=0;i < iargs->dstW/4;i++)
				{
					printf("filterpos dst: %d, src:%d\n",i,wfilterpos[i]);
					printcharvec("widthfilter0",widthfilter0[i]);
					printcharvec("widthfilter1",widthfilter1[i]);
					printfvec("weightWfilter0",weightWfilter0[i]);
					printfvec("weightWfilter1",weightWfilter1[i]);
				}*/				

				srcsmallcroma=0;
				sc.smallcroma=0;
				if ((iargs->srcW%32) != 0)
				{
					sc.smallcroma=1;
					srcsmallcroma=1;	
					initWcrFilter(iargs->srcW/2,iargs->dstW/2,1,crwfilterpos,crwidthfilter0,crwidthfilter1);	
					printf("spu_yuv2argb_scaler: Computing Crshuffle filter\n");
	
// 					for (i=0;i < (iargs->dstW>>1)/4;i++)
// 					{
// 						printf("crwfilterpos dst: %d, src:%d, weight:%f\n",i,crwfilterpos[i]);
// 						printcharvec("crwidthfilter0",crwidthfilter0[i]);
// 						printcharvec("crwidthfilter1",crwidthfilter1[i]);
// 						printfvec("weightWfilter0",weightWfilter0[i]);
// 						printfvec("weightWfilter1",weightWfilter1[i]);
// 					
// 					}
							
				}
				
				sc.wWfilter0=weightWfilter0;
				sc.wWfilter1=weightWfilter1;
				sc.wfilterpos=wfilterpos;
				sc.sWfilter0=widthfilter0; 
				sc.sWfilter1=widthfilter1;
				sc.crsWfilter0=crwidthfilter0;
				sc.crsWfilter1=crwidthfilter1;
				sc.crfilterpos=crwfilterpos;

				sc.smallcromaline0=0;
				sc.smallcromaline1=0;
				
			}
			first=0;
			printf("spu_yuv2argb_scaler: Initiation completed\n");
		}

	
		YIp = iargs->Ystart[selIn];
		UIp = iargs->Ustart[selIn];
		VIp = iargs->Vstart[selIn];
		Op = iargs->Output[selOut] + iargs->offset*4;

		
		LineSelOut=0;
		selY0In=0; 
		selY1In=0;
		selCrIn=0;

	
		dmaGet(InputY0[0],YIp+dmapos[hfilterpos0[0]],iargs->srcW,tgiy0[0]); 
		dmaGet(InputY1[0],YIp+dmapos[hfilterpos1[0]],iargs->srcW,tgiy1[0]); 
		dmaGet(InputY0[1],YIp+dmapos[hfilterpos0[1]],iargs->srcW,tgiy0[1]); 
		dmaGet(InputY1[1],YIp+dmapos[hfilterpos1[1]],iargs->srcW,tgiy1[1]); 


		dmaGet(InputU0[0],UIp+dmacromapos[hfilterpos0[0]],crblock1,tgiu0[0]);
		dmaGet(InputU0[1],UIp+dmacromapos[hfilterpos0[1]],crblock1,tgiu0[1]);
		dmaGet(InputU1[0],UIp+dmacromapos[hfilterpos1[0]],crblock1,tgiu1[0]);	
		dmaGet(InputU1[1],UIp+dmacromapos[hfilterpos1[1]],crblock1,tgiu1[1]); 
// 
		dmaGet(InputV0[0],VIp+dmacromapos[hfilterpos0[0]],crblock1,tgiv0[0]);
		dmaGet(InputV0[1],VIp+dmacromapos[hfilterpos0[1]],crblock1,tgiv0[1]);
		dmaGet(InputV1[0],VIp+dmacromapos[hfilterpos1[0]],crblock1,tgiv1[0]);	
		dmaGet(InputV1[1],VIp+dmacromapos[hfilterpos1[1]],crblock1,tgiv1[1]);


		LineSelOut=0;
		selY0In=0; 
		selY1In=0;
		selCrIn=0;
	//	printf("New image\n");
		for (h=0; h < iargs->dstH>>1; h++) //we asume that output is allways h/2
		{

			sc.width=iargs->dstW;
			sc.smallcroma=0;
			sc.smallcromaline0=0;
			sc.smallcromaline1=0;

			sc.wHfilter=weightHfilter[2*h];
			dmaWaitTag(tgiy0[selY0In]);
		//	printf("dma: %d\n",2*h+2);
			dmaWaitTag(tgiy1[selY1In]);
		//	printf("dma: %d\n",2*h+2);
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY1In];
			sc.Output=Ytemp0;
			
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}	
								//first Y line scaled
			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+2]],iargs->srcW,tgiy0[selY0In]); 
		//	printf("dma: %d\n",2*h+2);
			if (!noscale) { //if we are scaling we also need the second line
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+2]],iargs->srcW,tgiy1[selY1In]); 
			}
		//	printf("dma: %d\n",2*h+2);
			selY0In=selY0In^1;
			selY1In=selY1In^1;
			

			sc.wHfilter=weightHfilter[2*h+1];
			dmaWaitTag(tgiy0[selY0In]);
			dmaWaitTag(tgiy1[selY0In]);
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY0In];
			sc.Output=Ytemp1;
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}								//second Y line scaled
			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+3]],iargs->srcW,tgiy0[selY0In]); 
			if(!noscale) { //if we are scaling we also need the second line
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+3]],iargs->srcW,tgiy1[selY1In]); 
		
			}
			selY0In=selY0In^1;
			selY1In=selY1In^1;
		//	printf("dma: %d\n",2*h+3);
			if (srcsmallcroma) //these settings applly for both U and V
			{	
				sc.smallcroma=1;
				if ((hfilterpos0[h]&1)==1) {
					sc.smallcromaline0=1;	
				} else {
					sc.smallcromaline0=0;
				}
				if ((hfilterpos1[h]&1)==1){
					sc.smallcromaline1=1;
				} else {
					sc.smallcromaline1=0;
				} 	
				if (((hfilterpos0[h]&1)==0)&&((hfilterpos1[h]&1)==0))
				{
					sc.smallcroma=0; //both lines are 128 bit alligned only when doing extreme downscaling can this happen
				}
			}
// 			if (noscale) {
// 				sc.width=crblockdst0;//crblockdst1;
// 			} else {
// 				sc.width=crblockdst0;
// 			}
			sc.width=iargs->dstW>>1;
			sc.wHfilter=weightHfilter[h];
			
	
			dmaWaitTag(tgiu0[selCrIn]);
			dmaWaitTag(tgiu1[selCrIn]);
			sc.Output=Utemp;
			sc.source00=InputU0[selCrIn];
			sc.source01=InputU1[selCrIn];
		
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}

			dmaWaitTag(tgiv0[selCrIn]);
			dmaWaitTag(tgiv1[selCrIn]);
			sc.Output=Vtemp;
			sc.source00=InputV0[selCrIn];
			sc.source01=InputV1[selCrIn];
			
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}

			dmaGet(InputV0[selCrIn],VIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiu0[selCrIn]); 		//this is allways pos 0 
			dmaGet(InputU0[selCrIn],UIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiv0[selCrIn]); 

			if(!noscale) {	//if we are scaling we also need the second line
				dmaGet(InputV1[selCrIn],VIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiu1[selCrIn]);
				dmaGet(InputU1[selCrIn],UIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiv1[selCrIn]); 
			} 

			selCrIn=selCrIn^1;
			dmaWaitTag(tgo0[LineSelOut]);
			dmaWaitTag(tgo1[LineSelOut]);
							
			yuv420toARGBfloat(Ytemp0,Ytemp1,Utemp,Vtemp,Output0[LineSelOut],Output1[LineSelOut],iargs->dstW,iargs->maxwidth); //colorspace convert results
			
			dmaPut(Output0[LineSelOut],Op,iargs->dstW*4,tgo0[LineSelOut]);
			Op=Op+iargs->maxwidth*4;
			
			dmaPut(Output1[LineSelOut],Op,iargs->dstW*4,tgo1[LineSelOut]);
			Op=Op+iargs->maxwidth*4;
			
			LineSelOut=LineSelOut^1;
		} 
		dmaWaitTag(tgo0[LineSelOut^1]); //wait for last write.
		dmaWaitTag(tgo1[LineSelOut^1]); //wait for last write.

	//	printf("Image done\n");
		if (iargs->MessageForm==INTR)
		{
			while (spu_stat_out_intr_mbox() == 0);
			msg=RDY;
			spu_writech(SPU_WrOutIntrMbox, msg);
			waiting=1;
		}

		if (iargs->MessageForm==HARD)
		{
			while (spu_stat_out_mbox() == 0);
			msg=RDY;
			spu_write_out_mbox(msg);
			waiting=1;
		}
//  		fprintf(stderr,"spu_yuvscaler: Waiting\n");		
		while (waiting){
			
			while (spu_stat_in_mbox() == 0);
			msg=spu_read_in_mbox();
			
			if (msg == RUN){
				selOut = selOut ^ 1; // flips the output buffer pointers
				selIn = selIn ^ 1; // flips the input buffer pointers	
				waiting=0;
			}
			else if (msg == STOP)
			{
// 				fprintf(stderr,"spu_yuvscaler: Stopping\n");
				waiting=0;
			}
			else if (msg == UPDATE)
			{
// 				fprintf(stderr,"spu_yuvscaler: Update\n");
				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image	
				first=1; // update filters to reflect the new image!
			//	selOut=0; // no need to change these. that can be done by the run.
			//	selIn=0;
			}
		}
		
		
		

	}
	
	return 0;
}
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) 
{
	int tgi0[2];

	int tgo0[2];

	int tgio0[2];

	tgi0[0]=1;
	tgi0[1]=2;

	
	tgio0[0]=11;
	tgio0[1]=12;

	tgo0[0]=13;
	tgo0[1]=14;
/*	tgo1[0]=15;
	tgo1[1]=16;*/	

	
	int selOut = 0;
	int selIn = 0;
	int msg=RUN;
	int waiting=0;
	int tag = 31;
	struct img_args *iargs;
	iargs =(struct img_args*)memalign(128,sizeof(*iargs));
	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag);
	
	printf("spu_blit_yuv422_to_argb: SRC width %d,DST width %d\n",iargs->src_w,iargs->drw_w);
	printf("spu_blit_yuv422_to_argb: SRC height %d,DST height %d\n",iargs->src_h,iargs->drw_h);
	
	while (spu_stat_in_mbox() == 0);
		msg=spu_read_in_mbox();
//	first=0;

	vector unsigned char *InOutbuffer[2];

	vector unsigned char *Inbuffer[2];

	vector unsigned char *Outbuffer[2];

	int Outwidth=(4*iargs->drw_w+3)&~3;
	int Inwidth=(2*iargs->src_w+7)&~7;

	Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
	Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);

	if (iargs->BLEND)
	{
		InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
		InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
	}

	Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
	Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);

	unsigned long long Inp,Outp,InOutp;
	
	int i=0;
//	int update=1;


	while (msg!=STOP)
	{
		selOut = 0;
		selIn = 0;

		Inp=iargs->Inp0[0];
		InOutp=iargs->Outp0[0];
		Outp=iargs->Outp0[0];

		dmaGet(Inbuffer[0],Inp,Inwidth,tgi0[0]);
		Inp=Inp+iargs->Istride[0]*2;

		dmaGet(Inbuffer[1],Inp,Inwidth,tgi0[1]);
		Inp=Inp+iargs->Istride[0]*2;

// 		if (iargs->BLEND)
// 		{
// 			dmaGet(InOutbuffer[0],InOutp,Outwidth,tgio0[0]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 			dmaGet(InOutbuffer[1],InOutp,Outwidth,tgio0[1]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 		}



		selIn=0;
		selOut=0;

		for (i=0;i < iargs->drw_h ;i++) {
			dmaWaitTag(tgi0[selIn]);
			
// 			if (iargs->BLEND)
// 				dmaWaitTag(tgio0[selIn]); 
			dmaWaitTag(tgo0[selOut]);
			if (iargs->SourceFormat==YUY2||iargs->SourceFormat==YUYV422)
			{
				yuv422_to_argb(Inbuffer[selIn],Outbuffer[selOut],iargs->drw_w);
			//	printf("spe_blitter: YUV422->ARGB\n");
			}
			//yuv420_to_yuv2(Yinbuffer[selIn],Uinbuffer[selIn],Vinbuffer[selIn],Outbuffer[selOut],iargs->Istride[0]);
			
		//	if (iargs->BLEND)
			//	blend(InOutbuffer[selIn],OutBuffer[selOut],iargs->ALPHA,iargs->SourceFormat);
			
			dmaPut(Outbuffer[selOut],Outp,Outwidth,tgo0[selOut]);
		
// 			if (iargs->BLEND){
// 				dmaGet(InOutbuffer[selIn],InOutp,Outwidth,tgio0[selIn]);
// 				InOutp=InOutp+iargs->Ostride[0];
// 					
// 			}

			dmaGet(Inbuffer[selIn],Inp,Inwidth,tgi0[selIn]);
			
			Inp=Inp+iargs->Istride[0]*2;
			Outp=Outp+iargs->Ostride[0]*4;
			selIn=selIn^1;
			selOut=selOut^1;
		}
	

		while (spu_stat_out_intr_mbox() == 0);
		msg=RDY;
		spu_writech(SPU_WrOutIntrMbox, msg);
		waiting=1;
		
		while (waiting){
			
			while (spu_stat_in_mbox() == 0);
			msg=spu_read_in_mbox();
			
			if (msg == RUN){
				waiting=0;
			}
			else if (msg == STOP)
			{
				waiting=0;
			}
			else if (msg == UPDATE)
			{
				tag=30;
 				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image	
//  			//	update=1; // update filters to reflect the new image!
// 				Outwidth=(iargs->drw_w+3)&~3;
// 				Inwidth=(iargs->src_w+7)&~7;
// 				free(Inbuffer[0]);
// 				free(Inbuffer[1]);
// 	
// 				free(Outbuffer[0]);
// 				free(Outbuffer[1]);
// 				
// 				Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
// 				Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);
// 			
// 				if (iargs->BLEND)
// 				{
// 					free(InOutbuffer[0]);
// 					free(InOutbuffer[1]);	
// 					InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 					InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
// 				}
// 	
// 				Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 				Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
			}
		}
	}
		 

	return 0;
}