Exemple #1
0
/*
 * SPE entry point.
 *
 * Fetches the abs_params block addressed by argp, pulls
 * abs_params.size floats' worth of bytes into in_spe, stores the absolute
 * value of every element in out_spe and writes out_spe back to
 * abs_params.ea_out.  Each DMA is waited on before the next step starts.
 * The spe and envp arguments are not used.
 */
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    const int dma_tag = 1;
    int idx;

    /* DMA Transfer 1 : GET input/output parameters */
    spu_mfcdma64(&abs_params, mfc_ea2h(argp), mfc_ea2l(argp),
                 sizeof(abs_params_t), dma_tag, MFC_GET_CMD);
    spu_writech(MFC_WrTagMask, 1 << dma_tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    /* DMA Transfer 2 : GET input data */
    spu_mfcdma64(in_spe, mfc_ea2h(abs_params.ea_in), mfc_ea2l(abs_params.ea_in),
                 abs_params.size * sizeof(float), dma_tag, MFC_GET_CMD);
    spu_writech(MFC_WrTagMask, 1 << dma_tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    /* Positive values are copied, everything else is negated. */
    for (idx = 0; idx < abs_params.size; idx++) {
        out_spe[idx] = (in_spe[idx] > 0) ? in_spe[idx] : in_spe[idx] * -1;
    }

    /* DMA Transfer 3 : PUT output data */
    spu_mfcdma64(out_spe, mfc_ea2h(abs_params.ea_out), mfc_ea2l(abs_params.ea_out),
                 abs_params.size * sizeof(float), dma_tag, MFC_PUT_CMD);
    spu_writech(MFC_WrTagMask, 1 << dma_tag);
    spu_mfcstat(MFC_TAG_UPDATE_ALL);

    return 0;
}
Exemple #2
0
/*
 * Save, Step 14:
 *    Enqueue an MFC_SYNC command (tag 0).
 *
 * The lscsa_ea argument is not referenced by this step.
 */
static inline void enqueue_sync(addr64 lscsa_ea)
{
	const unsigned int sync_tag = 0;
	const unsigned int sync_cmd = 0xCC;	/* opcode written to MFC_Cmd */

	spu_writech(MFC_TagID, sync_tag);
	spu_writech(MFC_Cmd, sync_cmd);
}
/*
 * Send an event through port `spup`.
 *
 * Returns 0x80010002 when spup exceeds EVENT_PORT_MAX_NUM, 0x8001000A when
 * the inbound mailbox already holds data; otherwise writes data1 to the
 * outbound mailbox, the packed port/data0 word to the outbound interrupt
 * mailbox, and returns the next word read from the inbound mailbox.
 */
int spu_thread_send_event(uint8_t spup,uint32_t data0,uint32_t data1)
{
	uint32_t packed;

	if (spup > EVENT_PORT_MAX_NUM)
		return 0x80010002;

	if (spu_readchcnt(SPU_RdInMbox) > 0)
		return 0x8001000A;

	/* Port number in the upper bits, masked data0 in the lower bits. */
	packed = ((spup<<EVENT_PORT_SHIFT) | (data0&EVENT_DATA0_MASK));

	spu_writech(SPU_WrOutMbox, data1);
	spu_writech(SPU_WrOutIntrMbox, packed);

	return (int)spu_readch(SPU_RdInMbox);
}
Exemple #4
0
/* loads program info - blocks until done */
/*
 * Fetch the program-info structure at effective address `ea` into `info`
 * and block until the transfer on tag SPUDMA_PROGRAMINFO has completed.
 * The SPE id from the fetched structure is also copied to the global
 * `speid` for debugging.
 */
void load_program_info(unsigned long long ea, spe_program_info_t *info)
{
	const unsigned int ea_hi = mfc_ea2h(ea);
	const unsigned int ea_lo = mfc_ea2l(ea);

	/* argument order: ls_addr, ea_h, ea_l, size, tag_id, cmd */
	spu_mfcdma64(info, ea_hi, ea_lo, sizeof(spe_program_info_t),
		SPUDMA_PROGRAMINFO, MFC_GET_CMD);

	/* block on the tag group used above */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_PROGRAMINFO);
	mfc_read_tag_status_all();

	/* keep a global copy of the id for debugging */
	speid = info->speId;

#if defined(_DEBUG) && _DEBUG > 1
	printf("Program info:\n\tSpe ID:       %d\n\tNum Pixels:   %d\n\tSpp:          %d\n\tNum Spes      %d\n\tDepth:        %d\n",
		info->speId,
		info->numPixels,
		info->samplesPerPixel,
		info->numSpes,
		info->depth);
#endif
}
Exemple #5
0
/*
 * SPE entry point: 10000 times, GET the int at effective address argp into
 * `counter`, increment it locally and PUT it back, waiting on tag 0 after
 * each transfer.  The spe and envp arguments are not used.
 */
int main(unsigned long long spe, unsigned long long argp, unsigned long long envp)
{
    int remaining = 10000;

    while (remaining-- > 0) {
        /* read the current value */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(int), 0, MFC_GET_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);

        counter++;

        /* write the incremented value back */
        spu_mfcdma64(&counter, mfc_ea2h(argp), mfc_ea2l(argp), sizeof(int), 0, MFC_PUT_CMD);
        spu_writech(MFC_WrTagMask, 1 << 0);
        spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }

    return 0;
}
static inline void restore_srr0(void)
{
	unsigned int offset;
	unsigned int srr0;

	/* Restore, Step 14:
	 *    Restore the SPU SRR0 data from the LSCSA.
	 */
	offset = LSCSA_QW_OFFSET(srr0);
	srr0 = regs_spill[offset].slot[0];
	spu_writech(SPU_WrSRR0, srr0);
}
static inline void restore_event_mask(void)
{
	unsigned int offset;
	unsigned int event_mask;

	/* Restore, Step 15:
	 *    Restore the SPU_RdEventMsk data from the LSCSA.
	 */
	offset = LSCSA_QW_OFFSET(event_mask);
	event_mask = regs_spill[offset].slot[0];
	spu_writech(SPU_WrEventMask, event_mask);
}
static inline void restore_tag_mask(void)
{
	unsigned int offset;
	unsigned int tag_mask;

	/* Restore, Step 16:
	 *    Restore the SPU_RdTagMsk data from the LSCSA.
	 */
	offset = LSCSA_QW_OFFSET(tag_mask);
	tag_mask = regs_spill[offset].slot[0];
	spu_writech(MFC_WrTagMask, tag_mask);
}
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id;
  vector float dt_v, dt_inv_mass_v;

  // Reserve a tag ID
  tag_id = mfc_tag_reserve();

  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle parameter context.
  // Fetch the context, waiting for it to complete.
  
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity and inverse_mass. Wait for the DMA to complete 
      // before performing computation.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_GETB_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i), cnt * sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j=0; j<cnt; j++) {
	pos[j] = spu_madd(vel[j], dt_v, pos[j]);
	dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
	vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into system memory
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
    }
  }
  // Wait for final DMAs to complete before terminating SPU thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  return (0);
}
/*
 * Restore, Step 12:
 *    Write the PPUINT_MB data saved in the LSCSA to the
 *    SPU_WrOutIntrMbox channel.
 */
static inline void write_ppuint_mb(void)
{
	/* Quadword index of the ppuint_mb field inside regs_spill. */
	const unsigned int qw = LSCSA_QW_OFFSET(ppuint_mb);
	unsigned int saved = regs_spill[qw].slot[0];

	spu_writech(SPU_WrOutIntrMbox, saved);
}
Exemple #11
0
/*
 * Fetch 16 bytes from effective address `ea` into `PPE_addr` on DMA tag 30
 * and block until the transfer has completed.
 */
void GetSPEAddr( unsigned int ea, unsigned int *PPE_addr )
{
	enum { XFER_BYTES = 16, XFER_TAG = 30 };

	Printf1( "SPE[%u]: Getting SPE address @ %#x to %#x\n", SPE_id, ea, (unsigned int)PPE_addr );

	// Queue the GET, then wait on its tag group
	spu_mfcdma32( PPE_addr, ea, XFER_BYTES, XFER_TAG, MFC_GET_CMD );
	spu_writech( MFC_WrTagMask, 1 << XFER_TAG );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
Exemple #12
0
/*
 * Fetch a shader of `size` bytes from effective address `EA` into `shader`
 * on DMA tag 29 and block until the transfer has completed.  The transfer
 * length is `size` rounded up to the next multiple of 16; the log line
 * reports the size as passed in.
 */
void GetShader( unsigned int EA, unsigned int size, unsigned int *shader )
{
	enum { XFER_TAG = 29 };
	unsigned int padded;

	Printf1( "SPE[%u]: Getting shader @ %#x to %#x(%u)\n", SPE_id, EA, (unsigned int)shader, size	 );

	// Round the length up to a multiple of 16 bytes
	padded = ( size + 15 ) &~ 15;

	// Queue the GET, then wait on its tag group
	spu_mfcdma32( shader, EA, padded, XFER_TAG, MFC_GET_CMD );
	spu_writech( MFC_WrTagMask, 1 << XFER_TAG );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
/*
 * Receive an event from queue `spuq`.
 *
 * Returns 0x8001000A when the inbound mailbox already holds data.
 * Otherwise writes spuq to the outbound mailbox, executes stop 0x110 and
 * reads a status word from the inbound mailbox.  A non-zero status is
 * returned as is; on zero the next three mailbox words are stored through
 * data0, data1 and data2 and 0 is returned.
 */
int spu_thread_receive_event(uint32_t spuq,uint32_t *data0,uint32_t *data1,uint32_t *data2)
{
	int status;

	if (spu_readchcnt(SPU_RdInMbox) > 0)
		return 0x8001000A;

	spu_writech(SPU_WrOutMbox, spuq);
	spu_stop(0x110);

	status = spu_readch(SPU_RdInMbox);
	if (status != 0)
		return status;

	/* Payload words arrive in order: data0, data1, data2. */
	*data0 = spu_readch(SPU_RdInMbox);
	*data1 = spu_readch(SPU_RdInMbox);
	*data2 = spu_readch(SPU_RdInMbox);

	return status;
}
static inline void restore_decr(void)
{
	unsigned int offset;
	unsigned int decr_running;
	unsigned int decr;

	/* Restore, Step 6(moved):
	 *    If the LSCSA "decrementer running" flag is set
	 *    then write the SPU_WrDec channel with the
	 *    decrementer value from LSCSA.
	 */
	offset = LSCSA_QW_OFFSET(decr_status);
	decr_running = regs_spill[offset].slot[0] & SPU_DECR_STATUS_RUNNING;
	if (decr_running) {
		offset = LSCSA_QW_OFFSET(decr);
		decr = regs_spill[offset].slot[0];
		spu_writech(SPU_WrDec, decr);
	}
}
/*
 * Enqueue a GET command (tag 0) that copies sizeof(regs_spill) bytes from
 * the effective address in lscsa_ea into regs_spill.  The command is only
 * queued here; no wait for completion is performed.
 */
static inline void fetch_regs_from_mem(addr64 lscsa_ea)
{
	const unsigned int get_cmd = 0x40;	/* GET */
	const unsigned int dma_tag = 0;
	const unsigned int ls_dest = (unsigned int)&regs_spill[0];
	const unsigned int nbytes = sizeof(regs_spill);

	spu_writech(MFC_LSA, ls_dest);
	spu_writech(MFC_EAH, lscsa_ea.ui[0]);
	spu_writech(MFC_EAL, lscsa_ea.ui[1]);
	spu_writech(MFC_Size, nbytes);
	spu_writech(MFC_TagID, dma_tag);
	spu_writech(MFC_Cmd, get_cmd);
}
Exemple #16
0
/*
 * Fetch an operation descriptor from effective address `ea` into `data` on
 * DMA tag 30 and block until the transfer has completed.  The transfer
 * length is the literal 32 bytes; the log line prints sizeof( Operation_t ).
 */
void GetOperation( unsigned int ea, Operation_t *data )
{
	enum { XFER_BYTES = 32, XFER_TAG = 30 };

	Printf1( "SPE[%u]: Getting operation @ %#x to %#x(%u)\n", SPE_id, ea, (unsigned int)data, sizeof( Operation_t ) );

	// Queue the GET
	spu_mfcdma32( data, ea, XFER_BYTES, XFER_TAG, MFC_GET_CMD );

	// Wait on its tag group
	spu_writech( MFC_WrTagMask, 1 << XFER_TAG );
	spu_mfcstat( MFC_TAG_UPDATE_ALL );
}
Exemple #17
0
/*
 * Save, Step 13:
 *    Enqueue a PUT command (tag 0) to send the LSCSA
 *    to the CSA.
 *
 * The source is regs_spill (sizeof(regs_spill) bytes) and the destination
 * is the effective address held in lscsa_ea.
 */
static inline void spill_regs_to_mem(addr64 lscsa_ea)
{
	const unsigned int put_cmd = 0x20;	/* PUT */
	const unsigned int dma_tag = 0;
	const unsigned int ls_src = (unsigned int)&regs_spill[0];
	const unsigned int nbytes = sizeof(regs_spill);

	spu_writech(MFC_LSA, ls_src);
	spu_writech(MFC_EAH, lscsa_ea.ui[0]);
	spu_writech(MFC_EAL, lscsa_ea.ui[1]);
	spu_writech(MFC_Size, nbytes);
	spu_writech(MFC_TagID, dma_tag);
	spu_writech(MFC_Cmd, put_cmd);
}
/*
 * Restore, Step 4:
 *    Enqueue the GETL command (tag 0) to the MFC SPU command
 *    queue to transfer the upper 240 kb of LS from CSA.
 *
 * The local-store target starts at offset 16384; MFC_EAL receives the
 * local-store address of dma_list and MFC_Size its size in bytes.
 */
static inline void restore_upper_240kb(addr64 lscsa_ea)
{
	const unsigned int getl_cmd = 0x44;	/* GETL */
	const unsigned int dma_tag = 0;
	const unsigned int ls_start = 16384;
	const unsigned int list_addr = (unsigned int)&dma_list[0];
	const unsigned int list_bytes = sizeof(dma_list);

	spu_writech(MFC_LSA, ls_start);
	spu_writech(MFC_EAH, lscsa_ea.ui[0]);
	spu_writech(MFC_EAL, list_addr);
	spu_writech(MFC_Size, list_bytes);
	spu_writech(MFC_TagID, dma_tag);
	spu_writech(MFC_Cmd, getl_cmd);
}
Exemple #19
0
/*
 * Save, Step 7:
 *    Enqueue the PUTL command (tag 0) to the MFC SPU command
 *    queue to transfer the remaining 240 kb of LS to CSA.
 *
 * The local-store source starts at offset 16384; MFC_EAL receives the
 * local-store address of dma_list and MFC_Size its size in bytes.
 */
static inline void save_upper_240kb(addr64 lscsa_ea)
{
	const unsigned int putl_cmd = 0x24;	/* PUTL */
	const unsigned int dma_tag = 0;
	const unsigned int ls_start = 16384;
	const unsigned int list_addr = (unsigned int)&dma_list[0];
	const unsigned int list_bytes = sizeof(dma_list);

	spu_writech(MFC_LSA, ls_start);
	spu_writech(MFC_EAH, lscsa_ea.ui[0]);
	spu_writech(MFC_EAL, list_addr);
	spu_writech(MFC_Size, list_bytes);
	spu_writech(MFC_TagID, dma_tag);
	spu_writech(MFC_Cmd, putl_cmd);
}
Exemple #20
0
/* loads the scene using DMA - blocks until done */
/*
 * Load a scene and everything it references into local store using DMA.
 * Blocks until every transfer has completed.
 *
 * ea    - effective address of the scene_t to fetch
 * scene - local-store destination for the scene_t
 *
 * After the scene itself has arrived, local buffers are allocated for its
 * objects and lights and filled from scene->objects_ea / scene->lights_ea.
 * For every object of type GEOMETRY_POLYGON a vertex buffer is allocated
 * and filled from its vertex_ea.  On return scene->objects, scene->lights
 * and each polygon's poly_obj.vertex point at the local copies.
 *
 * NOTE(review): the results of _malloc_align are not checked here.
 */
void load_scene(unsigned long long ea, scene_t *scene)
{
	unsigned int i = 0;
	object3d_t *objects = 0;
	pointlight_t *lights = 0;
	point_t *v = 0;

#if defined(_DEBUG) && _DEBUG > 2
	/* Report the DMA destination (scene), not the address of the parameter. */
	printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for SCENE\n",
		sizeof(scene_t),
		scene,
		mfc_ea2h(ea),
		mfc_ea2l(ea));
#endif
	/* DMA request for scene */
	spu_mfcdma64(scene,
		mfc_ea2h(ea),
		mfc_ea2l(ea),
		sizeof(scene_t),
		SPUDMA_SCENE,
		MFC_GET_CMD);
	
	/* wait for request to complete: the counts and addresses below come from it */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_SCENE);
	mfc_read_tag_status_all();
	
	
	/* copy over objects */
	objects = _malloc_align(sizeof(object3d_t) * scene->nObjects, 4);
#if defined(_DEBUG) && _DEBUG > 2
	printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for OBJECTS\n",
		sizeof(object3d_t) * scene->nObjects,
		objects,
		mfc_ea2h(scene->objects_ea),
		mfc_ea2l(scene->objects_ea));
#endif
	/* initiate DMA */
	spu_mfcdma64(objects,
		mfc_ea2h(scene->objects_ea),
		mfc_ea2l(scene->objects_ea),
		sizeof(object3d_t) * scene->nObjects,
		SPUDMA_OBJECTS,
		MFC_GET_CMD);
	
	/* copy over lights */
	lights = _malloc_align(sizeof(pointlight_t) * scene->nLights, 4);	
#if defined(_DEBUG) && _DEBUG > 2
	printf("Transferring %d bytes to LSaddr(%8X) from EAadd(%8lX:%8lX) for LIGHTS\n",
		sizeof(pointlight_t) * scene->nLights,
		lights,
		mfc_ea2h(scene->lights_ea),
		mfc_ea2l(scene->lights_ea));
#endif
	/* initiate DMA for lights; it is only waited on at the end */
	spu_mfcdma64(lights,
		mfc_ea2h(scene->lights_ea),
		mfc_ea2l(scene->lights_ea),
		sizeof(pointlight_t) * scene->nLights,
		SPUDMA_LIGHTS,
		MFC_GET_CMD);
	
	/* wait for objects to complete: the loop below reads them */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_OBJECTS);
	mfc_read_tag_status_all();
	/* assign local store pointer to objects */
	scene->objects = objects;

	/* iterate each object locally */
	for(; i < scene->nObjects; ++i)
	{
		if(objects[i].geometryType == GEOMETRY_POLYGON)
		{
			/* allocate memory for vertex */
			v = _malloc_align(sizeof(point_t) 
				* objects[i].poly_obj.nVerticies, 4);
			/* initiate DMA to get verticies */
			spu_mfcdma64(v,
				mfc_ea2h(objects[i].poly_obj.vertex_ea),
				mfc_ea2l(objects[i].poly_obj.vertex_ea),
				sizeof(point_t)
				* objects[i].poly_obj.nVerticies,
				SPUDMA_VERTEXES,
				MFC_GET_CMD);
			/* only the pointer is stored here; the vertex data itself is
			 * not read until after the wait below */
			objects[i].poly_obj.vertex = v;				
		}
	}
	
	/* wait for all DMA to finish (vertexes, lights) */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_LIGHTS |
				1 << SPUDMA_VERTEXES );
	mfc_read_tag_status_all();
	/* assign local store lights pointer */
	scene->lights = lights;
}
/*
 * Advance the particles held in local buffer `buffer` by one time step.
 *
 * buffer - index selecting which slice of pos/vel/inv_mass to update
 * cnt    - number of particles to process; the loop handles four per
 *          iteration and always touches elements i..i+3, so cnt is
 *          presumably expected to be a multiple of 4 (TODO confirm)
 * dt_v   - time step, supplied as a vector by the caller
 *
 * Per particle: pos += vel * dt and vel += (dt * inv_mass) * force, where
 * force is ctx.force_v.
 */
void process_buffer(int buffer, int cnt, vector float dt_v)
{
  int i;
  volatile vector float *p_inv_mass_v;
  vector float force_v, inv_mass_v;
  vector float pos0, pos1, pos2, pos3;
  vector float vel0, vel1, vel2, vel3;
  vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3;
  // Shuffle patterns: splat_word_N copies bytes 4N..4N+3 (word N) of the
  // first operand into all four word positions of the result.
  vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
  vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11};
  vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15};

  // View this buffer's inverse masses as vectors so that one load covers
  // the four particles handled per iteration.
  p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; 
  force_v = ctx.force_v;

  // Compute the step in time for the block of particles, four 
  // particle at a time.
  for (i=0; i<cnt; i+=4) {
    inv_mass_v = *p_inv_mass_v++;
    
    pos0 = pos[buffer][i+0];
    pos1 = pos[buffer][i+1];
    pos2 = pos[buffer][i+2];
    pos3 = pos[buffer][i+3];

    vel0 = vel[buffer][i+0];
    vel1 = vel[buffer][i+1];
    vel2 = vel[buffer][i+2];
    vel3 = vel[buffer][i+3];

    // dt * inv_mass for all four particles at once.
    dt_inv_mass_v = spu_mul(dt_v, inv_mass_v);

    // pos += vel * dt (uses the velocity from before this step's update).
    pos0 = spu_madd(vel0, dt_v, pos0);
    pos1 = spu_madd(vel1, dt_v, pos1);
    pos2 = spu_madd(vel2, dt_v, pos2);
    pos3 = spu_madd(vel3, dt_v, pos3);

    // Broadcast each particle's dt*inv_mass across a full vector.
    dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0);
    dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1);
    dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2);
    dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3);

    // vel += (dt * inv_mass) * force
    vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0);
    vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1);
    vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2);
    vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3);

    pos[buffer][i+0] = pos0;
    pos[buffer][i+1] = pos1;
    pos[buffer][i+2] = pos2;
    pos[buffer][i+3] = pos3;

    vel[buffer][i+0] = vel0;
    vel[buffer][i+1] = vel1;
    vel[buffer][i+2] = vel2;
    vel[buffer][i+3] = vel3;
  }
}


/*
 * SPE entry point for the double-buffered particle simulation.
 *
 * argv is the address of the parm_context, which is fetched into ctx
 * first.  For every time step (0 up to END_OF_TIME in increments of ctx.dt)
 * the particles are processed in blocks of at most PARTICLES_PER_BLOCK.
 * Two local buffers are used, each with its own DMA tag: while one buffer
 * is being processed by process_buffer(), the GETs for the next block are
 * already queued into the other buffer.
 */
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv)
{
  int buffer, next_buffer;
  int cnt, next_cnt, left;
  float time, dt;
  vector float dt_v;
  volatile vector float *ctx_pos_v, *ctx_vel_v;
  volatile vector float *next_ctx_pos_v, *next_ctx_vel_v;
  volatile float *ctx_inv_mass, *next_ctx_inv_mass;
  unsigned int tags[2];

  // Reserve a pair of DMA tag IDs
  // NOTE(review): the values returned by mfc_tag_reserve() are not checked.
  tags[0] = mfc_tag_reserve();
  tags[1] = mfc_tag_reserve();
  
  // Input parameter argv is a pointer to the particle context.
  // Fetch the parameter context, waiting for it to complete.
  spu_writech(MFC_WrTagMask, 1 << tags[0]);
  spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt = ctx.dt;
  dt_v = spu_splats(dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each double buffered block of particles
    left = ctx.particles;

    // Size of the first block; `left` still includes this block.
    cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

    ctx_pos_v = ctx.pos_v;
    ctx_vel_v = ctx.vel_v;
    ctx_inv_mass = ctx.inv_mass;

    // Prefetch first buffer of input data.
    // The destinations are the starts of pos/vel/inv_mass, i.e. buffer 0.
    buffer = 0;
    spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD);
    spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD);
    spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD);

    // Runs while at least one more block follows the current one.
    while (cnt < left) {
      left -= cnt;

      next_ctx_pos_v = ctx_pos_v + cnt;
      next_ctx_vel_v = ctx_vel_v + cnt;
      next_ctx_inv_mass = ctx_inv_mass + cnt;
      next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Prefetch next buffer so the data is available for computation on next loop iteration.
      // The first DMA is barriered so that we don't GET data before the previous iteration's
      // data is PUT.
      next_buffer = buffer^1;

      spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD);
      spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD);
      spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD);
      
      // Wait for previously prefetched data
      spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      process_buffer(buffer, cnt, dt_v);

      // Put the buffer's position and velocity data back into system memory
      // These PUTs are not waited on here; they share the tag of this buffer.
      spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
      
      // Advance to the block that was just prefetched.
      ctx_pos_v = next_ctx_pos_v;
      ctx_vel_v = next_ctx_vel_v;
      ctx_inv_mass = next_ctx_inv_mass;

      buffer = next_buffer;
      cnt = next_cnt;		  
    }

    // Last block of this time step: nothing further to prefetch.
    // Wait for previously prefetched data
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

    process_buffer(buffer, cnt, dt_v);

    // Put the buffer's position and velocity data back into system memory
    spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);
    spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD);

    // Wait for DMAs to complete before starting the next step in time.
    // NOTE(review): only tags[buffer] is waited on; PUTs queued on the other
    // tag by the previous loop iteration are not covered by this mask.
    spu_writech(MFC_WrTagMask, 1 << tags[buffer]);
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  }

  return (0);
}
/*
 * SPE entry point for the YUV -> ARGB colour-space converter / scaler.
 *
 * argp and envp are forwarded to dmaGetnWait() to fetch the img_args block
 * that describes the job.  speid is not used.
 *
 * Flow:
 *   1. Spin on the inbound mailbox for the first message, then fetch img_args.
 *   2. Allocate filter tables plus double-buffered input and output lines.
 *   3. Until a STOP message arrives:
 *        - on the first pass, and after every UPDATE message, rebuild the
 *          height/width filters for the current source and destination sizes;
 *        - prefetch the first two line sets of Y, U and V;
 *        - for each pair of output lines: unpack or scale two Y lines and
 *          one U and one V line, refill the buffers just consumed, convert
 *          to ARGB and PUT both output lines;
 *        - post RDY through the mailbox selected by iargs->MessageForm and
 *          wait for RUN (flip buffers), STOP or UPDATE (refetch img_args).
 *
 * Every DMA is issued on the tag belonging to the buffer it fills, and the
 * same tag is waited on before that buffer is read or reused.
 */
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) 
{
	/* DMA tags: one per buffer slot ([0]/[1]) of each input/output line. */
	int tgiy0[2];
	int tgiy1[2];
	int tgiu0[2];
	int tgiu1[2];
	int tgiv0[2];
	int tgiv1[2];
	int tgo0[2];
	int tgo1[2];

	tgiu1[0]=1;
	tgiu1[1]=2;
	tgo0[0]=3;
	tgo0[1]=4;
	tgiy0[0]=5;
	tgiy0[1]=6;
	tgiy1[0]=7;
	tgiy1[1]=8;
	tgiu0[0]=9;
	tgiu0[1]=10;
	tgiv0[0]=11;
	tgiv0[1]=12;
	tgiv1[0]=13;	/* was a second write to [1], which left [0] uninitialized */
	tgiv1[1]=14;
	tgo1[0]=15;
	tgo1[1]=16;
	
	int selOut = 0;
	int selIn = 0;
	int tag = 31;		/* tag used for the img_args fetches */
	int LineSelOut=0;
		
	int selY0In = 0;
	int selY1In = 0;
	int selCrIn = 0;
	struct img_args *iargs;
	
	iargs =(struct img_args*)memalign(128,sizeof(*iargs));

	int first=1;
	int waiting=0;
	unsigned long long Op;
	unsigned int msg;
	unsigned long long YIp,UIp,VIp;

	int crblock0;
	int crblock1;
	int srcsmallcroma=0;
	int noscale=1;

 	static	int crblockdst1;
	static	int crblockdst0;
	scaler_settings_t sc;
	
	/* Spin until the first mailbox message arrives. */
	while (spu_stat_in_mbox() == 0)
		;
	msg=spu_read_in_mbox();
	if (msg==RUN){	
		fprintf(stderr,"spu_yuv2argb_scaler: Starting Up\n");
	}

	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting necessary data to process image
	printf("spu_yuv2argb_scaler: SRC width %d,DST width %d\n",iargs->srcW,iargs->dstW);
	printf("spu_yuv2argb_scaler: SRC height %d,DST height %d\n",iargs->srcH,iargs->dstH);
	
	printf("spu_yuv2argb_scaler: DST offset %d\n",iargs->offset);
	
	/* Shuffle and weight tables for horizontal (width) filtering. */
	vector unsigned char *widthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);
	vector unsigned char *widthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*4+16);

	vector unsigned char *crwidthfilter0=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);
	vector unsigned char *crwidthfilter1=(vector unsigned char*)memalign(128,MAXWIDTH*2+16);	

	vector float * weightWfilter0=(vector float*)memalign(128,MAXWIDTH*4+16);
	vector float * weightWfilter1=(vector float*)memalign(128,MAXWIDTH*4+16);

	float weightHfilter[MAXHEIGHT+1];

	/* Per-line offsets added to the plane base addresses for the input DMAs. */
	unsigned long long dmapos[MAXHEIGHT+2];
	unsigned long long dmacromapos[MAXHEIGHT+2];

	/* Intermediate float lines handed to the colour-space conversion. */
	vector float * Ytemp0=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Ytemp1=(vector float *)memalign(128,MAXWIDTH*4+16);
	vector float * Utemp=(vector float *)memalign(128,MAXWIDTH*2+16);
	vector float * Vtemp=(vector float *)memalign(128,MAXWIDTH*2+16);

	int wfilterpos[MAXWIDTH+2];
	int hfilterpos0[MAXHEIGHT+2];
	int hfilterpos1[MAXHEIGHT+2];
	int crwfilterpos[MAXWIDTH/2+2];

	/* Double-buffered input lines. */
	vector unsigned char *InputY0[2];
	InputY0[0]=(vector unsigned char*)memalign(128,MAXWIDTH); 
	InputY0[1]=(vector unsigned char*)memalign(128,MAXWIDTH); 

	vector unsigned char *InputU0[2];
	InputU0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputU0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	
	vector unsigned char *InputV0[2];
	InputV0[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputV0[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputY1[2];
	InputY1[0]=(vector unsigned char*)memalign(128,MAXWIDTH); 
	InputY1[1]=(vector unsigned char*)memalign(128,MAXWIDTH); 

	vector unsigned char *InputU1[2];
	InputU1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputU1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16);

	vector unsigned char *InputV1[2];
	InputV1[0]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 
	InputV1[1]=(vector unsigned char*)memalign(128,MAXWIDTH/2+16); 	

	/* Double-buffered output lines. */
	vector unsigned char* Output0[2];
	Output0[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	Output0[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output

	vector unsigned char* Output1[2];
	Output1[0]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	Output1[1]=(vector unsigned char*)memalign(128,MAXWIDTH*4);	// 1line output
	
	while (msg!=STOP) 
	{
		int h=0;
		
		if (first)
		{
			/* (Re)build the filters for the current image geometry. */
			crblock0=(iargs->srcW>>1)&~15; // rounded down
			crblock1=((iargs->srcW>>1) + 15)&~15; //rounded up
			crblockdst1=((iargs->dstW>>1) + 15)&~15;//destination size rounded up.
			crblockdst0=((iargs->dstW>>1) + 7)&~7;//destination size rounded up.

			initHFilter(iargs->srcW,iargs->srcH,iargs->dstH,hfilterpos0,hfilterpos1,weightHfilter,dmapos,dmacromapos);
			
			if ((iargs->srcW==iargs->dstW)&&(iargs->srcH==iargs->dstH))
			{
				printf("spu_yuv2argb_scaler: No scaling proceeding with direct csc\n");
				noscale=1;
				if ((iargs->srcW%32) != 0)
				{
					srcsmallcroma=1;
					sc.smallcroma=1;
				}
			} else {
				noscale=0;
				printf("spu_yuv2argb_scaler: Scaling, computing shuffle filters\n");
				initWFilter(iargs->srcW,iargs->dstW,1,wfilterpos,widthfilter0,widthfilter1,weightWfilter0,weightWfilter1);

				srcsmallcroma=0;
				sc.smallcroma=0;
				if ((iargs->srcW%32) != 0)
				{
					sc.smallcroma=1;
					srcsmallcroma=1;	
					initWcrFilter(iargs->srcW/2,iargs->dstW/2,1,crwfilterpos,crwidthfilter0,crwidthfilter1);	
					printf("spu_yuv2argb_scaler: Computing Crshuffle filter\n");
				}
				
				sc.wWfilter0=weightWfilter0;
				sc.wWfilter1=weightWfilter1;
				sc.wfilterpos=wfilterpos;
				sc.sWfilter0=widthfilter0; 
				sc.sWfilter1=widthfilter1;
				sc.crsWfilter0=crwidthfilter0;
				sc.crsWfilter1=crwidthfilter1;
				sc.crfilterpos=crwfilterpos;

				sc.smallcromaline0=0;
				sc.smallcromaline1=0;
			}
			first=0;
			printf("spu_yuv2argb_scaler: Initiation completed\n");
		}

		/* Base addresses of the planes for the currently selected buffers. */
		YIp = iargs->Ystart[selIn];
		UIp = iargs->Ustart[selIn];
		VIp = iargs->Vstart[selIn];
		Op = iargs->Output[selOut] + iargs->offset*4;

		LineSelOut=0;
		selY0In=0; 
		selY1In=0;
		selCrIn=0;

		/* Prefetch both buffer slots of every input line. */
		dmaGet(InputY0[0],YIp+dmapos[hfilterpos0[0]],iargs->srcW,tgiy0[0]); 
		dmaGet(InputY1[0],YIp+dmapos[hfilterpos1[0]],iargs->srcW,tgiy1[0]); 
		dmaGet(InputY0[1],YIp+dmapos[hfilterpos0[1]],iargs->srcW,tgiy0[1]); 
		dmaGet(InputY1[1],YIp+dmapos[hfilterpos1[1]],iargs->srcW,tgiy1[1]); 

		dmaGet(InputU0[0],UIp+dmacromapos[hfilterpos0[0]],crblock1,tgiu0[0]);
		dmaGet(InputU0[1],UIp+dmacromapos[hfilterpos0[1]],crblock1,tgiu0[1]);
		dmaGet(InputU1[0],UIp+dmacromapos[hfilterpos1[0]],crblock1,tgiu1[0]);	
		dmaGet(InputU1[1],UIp+dmacromapos[hfilterpos1[1]],crblock1,tgiu1[1]); 

		dmaGet(InputV0[0],VIp+dmacromapos[hfilterpos0[0]],crblock1,tgiv0[0]);
		dmaGet(InputV0[1],VIp+dmacromapos[hfilterpos0[1]],crblock1,tgiv0[1]);
		dmaGet(InputV1[0],VIp+dmacromapos[hfilterpos1[0]],crblock1,tgiv1[0]);	
		dmaGet(InputV1[1],VIp+dmacromapos[hfilterpos1[1]],crblock1,tgiv1[1]);

		for (h=0; h < iargs->dstH>>1; h++) //we assume that output is always h/2
		{
			/* Luma lines are processed at full width without small-chroma handling. */
			sc.width=iargs->dstW;
			sc.smallcroma=0;
			sc.smallcromaline0=0;
			sc.smallcromaline1=0;

			/* ---- first Y line ---- */
			sc.wHfilter=weightHfilter[2*h];
			dmaWaitTag(tgiy0[selY0In]);
			dmaWaitTag(tgiy1[selY1In]);
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY1In];
			sc.Output=Ytemp0;
			
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}	

			/* Refill the slot just consumed. */
			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+2]],iargs->srcW,tgiy0[selY0In]); 
			if (!noscale) { //if we are scaling we also need the second line
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+2]],iargs->srcW,tgiy1[selY1In]); 
			}
			selY0In=selY0In^1;
			selY1In=selY1In^1;

			/* ---- second Y line ---- */
			sc.wHfilter=weightHfilter[2*h+1];
			dmaWaitTag(tgiy0[selY0In]);
			dmaWaitTag(tgiy1[selY1In]);	/* selY1In always equals selY0In */
			sc.source00=InputY0[selY0In];
			sc.source01=InputY1[selY1In];
			sc.Output=Ytemp1;
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}

			dmaGet(InputY0[selY0In],YIp+dmapos[hfilterpos0[2*h+3]],iargs->srcW,tgiy0[selY0In]); 
			if(!noscale) { //if we are scaling we also need the second line
				dmaGet(InputY1[selY1In],YIp+dmapos[hfilterpos1[2*h+3]],iargs->srcW,tgiy1[selY1In]); 
			}
			selY0In=selY0In^1;
			selY1In=selY1In^1;

			/* ---- chroma (these settings apply to both U and V) ---- */
			if (srcsmallcroma)
			{	
				sc.smallcroma=1;
				if ((hfilterpos0[h]&1)==1) {
					sc.smallcromaline0=1;	
				} else {
					sc.smallcromaline0=0;
				}
				if ((hfilterpos1[h]&1)==1){
					sc.smallcromaline1=1;
				} else {
					sc.smallcromaline1=0;
				} 	
				if (((hfilterpos0[h]&1)==0)&&((hfilterpos1[h]&1)==0))
				{
					sc.smallcroma=0; //both lines are 128 bit aligned; only when doing extreme downscaling can this happen
				}
			}
			sc.width=iargs->dstW>>1;
			sc.wHfilter=weightHfilter[h];
			
			dmaWaitTag(tgiu0[selCrIn]);
			dmaWaitTag(tgiu1[selCrIn]);
			sc.Output=Utemp;
			sc.source00=InputU0[selCrIn];
			sc.source01=InputU1[selCrIn];
		
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}

			dmaWaitTag(tgiv0[selCrIn]);
			dmaWaitTag(tgiv1[selCrIn]);
			sc.Output=Vtemp;
			sc.source00=InputV0[selCrIn];
			sc.source01=InputV1[selCrIn];
			
			if (noscale) {
				unpack(&sc);
			} else {
				scale(&sc);	
			}

			/*
			 * Refill the chroma slots just consumed.  Each buffer is fetched
			 * on its own tag (V on tgiv*, U on tgiu*) so that the waits above
			 * cover the transfer into the buffer that is about to be read.
			 */
			dmaGet(InputV0[selCrIn],VIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiv0[selCrIn]);
			dmaGet(InputU0[selCrIn],UIp+dmacromapos[hfilterpos0[h+2]],crblock1,tgiu0[selCrIn]); 

			if(!noscale) {	//if we are scaling we also need the second line
				dmaGet(InputV1[selCrIn],VIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiv1[selCrIn]);
				dmaGet(InputU1[selCrIn],UIp+dmacromapos[hfilterpos1[h+2]],crblock1,tgiu1[selCrIn]); 
			} 

			selCrIn=selCrIn^1;

			/* The output slot must be free before it is overwritten. */
			dmaWaitTag(tgo0[LineSelOut]);
			dmaWaitTag(tgo1[LineSelOut]);
							
			yuv420toARGBfloat(Ytemp0,Ytemp1,Utemp,Vtemp,Output0[LineSelOut],Output1[LineSelOut],iargs->dstW,iargs->maxwidth); //colorspace convert results
			
			dmaPut(Output0[LineSelOut],Op,iargs->dstW*4,tgo0[LineSelOut]);
			Op=Op+iargs->maxwidth*4;
			
			dmaPut(Output1[LineSelOut],Op,iargs->dstW*4,tgo1[LineSelOut]);
			Op=Op+iargs->maxwidth*4;
			
			LineSelOut=LineSelOut^1;
		} 
		dmaWaitTag(tgo0[LineSelOut^1]); //wait for last write.
		dmaWaitTag(tgo1[LineSelOut^1]); //wait for last write.

		/* Report completion through the mailbox selected by MessageForm. */
		if (iargs->MessageForm==INTR)
		{
			while (spu_stat_out_intr_mbox() == 0);
			msg=RDY;
			spu_writech(SPU_WrOutIntrMbox, msg);
			waiting=1;
		}

		if (iargs->MessageForm==HARD)
		{
			while (spu_stat_out_mbox() == 0);
			msg=RDY;
			spu_write_out_mbox(msg);
			waiting=1;
		}

		/* Wait for the next command. */
		while (waiting){
			
			while (spu_stat_in_mbox() == 0);
			msg=spu_read_in_mbox();
			
			if (msg == RUN){
				selOut = selOut ^ 1; // flips the output buffer pointers
				selIn = selIn ^ 1; // flips the input buffer pointers	
				waiting=0;
			}
			else if (msg == STOP)
			{
				waiting=0;
			}
			else if (msg == UPDATE)
			{
				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting necessary data to process the new image	
				first=1; // update filters to reflect the new image!
				// selOut/selIn are left alone; the following RUN flips them.
			}
		}
	}
	
	return 0;
}
Exemple #23
0
/*
 * SPE-side shader dispatcher.
 *
 * Allocates local-store buffers for __NUMBER_OF_SHADERS shader slots, the
 * block data areas and the metadata area, reads the SPE id and a random seed
 * from the inbound mailbox, and then serves commands read from the mailbox
 * until an unknown command arrives.
 *
 * Command encoding (slot = command modulo 100, validated by CHECKBOUNDS):
 *     0 -  99  fetch operation + shader into slot, then run it
 *   100 - 199  read result EA from mailbox, fetch into slot, run with result EA
 *   200 - 299  fetch operation + shader into slot (no run)
 *   300 - 399  run the shader already in slot
 *   400 - 499  read result EA from mailbox, fetch into slot (no run)
 *   500 - 599  run the shader already in slot with the stored result EA
 *        1000  sanity check, replies 123
 *   otherwise  leave the loop
 * Every slot command replies 1 on the outbound mailbox when it is done.
 *
 * id   - unused.
 * argv - effective address handed to GetSPEAddr(); truncated to 32 bits here.
 *
 * Returns 1 once the loop has been left.
 */
int main( unsigned long long id  __attribute__ ((unused)), unsigned long long argv )
{
	unsigned int i, EA = argv;

	/*
	 * Allocate memory
	 */

	// Info
	LS_ShaderInfo shaderinfo;

	// Memory for the shaders, one buffer per slot
	unsigned int *shader[__NUMBER_OF_SHADERS];
	for( i = 0 ; i < __NUMBER_OF_SHADERS ; i++ )
	{
		shader[i] = (unsigned int *)_malloc_align( __SHADER_SIZE, 7 );
	}

	// Memory for the blocks
	char *blocks[(__NUMBER_OF_BLOCKS_IN_MEM) * 2];
	for( i = 0 ; i < __NUMBER_OF_BLOCKS_IN_MEM * 2 ; i++ )
	{
		blocks[i] = (char *)_malloc_align( __BLOCK_DATA_SIZE, 7 );
		shaderinfo.LS_blockDataArea[i] = blocks[i];
	}

	// Memory for metadata
	char *meta = (char *)_malloc_align( __META_DATA_SIZE, 7 );
	shaderinfo.LS_shaderMemory = meta;

	// Per-slot state: the operation descriptor and the result address
	Operation_t myop[__NUMBER_OF_SHADERS];
	unsigned int PPE_addr[__NUMBER_OF_SHADERS] __attribute__((aligned(16)));
	unsigned int EA_result[__NUMBER_OF_SHADERS];

	// Getting SPE id
	SPE_id   = mb_getmbox( );

	// SPU program fragment prototypes: without and with a result address
	void (*run)( unsigned int SPE_id, LS_ShaderInfo *info, Operation_t *myop, Functions_t *funcs ) = NULL;
	void (*runr)( unsigned int SPE_id, unsigned int EA_result, LS_ShaderInfo *info, Operation_t *myop, Functions_t *funcs ) = NULL;

	// Getting the random seed
	unsigned int seed = mb_getmbox( );
	// Init random
	mc_rand_ks_init( seed );

	// Common functions handed to every shader
	Functions_t funcs;
	funcs.printuint      = PrintUInt;
	funcs.printint       = PrintInt;
	funcs.printe         = PrintE;
	funcs.printchar      = PrintChar;
	funcs.printfloat     = PrintFloat;
	funcs.printaddr      = PrintAddr;
	funcs.printstr       = PrintString;
	funcs.printfloat3    = PrintFloat3;
	funcs.printfloat3v   = PrintFloat3v;
	funcs.printfloatv    = PrintFloatv;
	funcs.printfloatrow  = PrintFloatRow;

	funcs.rand_0_to_1_f  = mc_rand_ks_0_to_1_f4;

	unsigned int running = 1, task = 0;

	// Slot index decoded from the current command
	unsigned int idt;

	// Command loop
	while( running )
	{
		task = mb_getmbox( );

		Printf1( "[SPE(%u)]Got state %u\n", SPE_id, task );

		// Transfer operation and run shader with no return value
		// 0 -> 99
		if( task < 100 )
		{
			idt = task;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Get shader for slot %u\n", SPE_id, idt );

			GetSPEAddr( EA, PPE_addr );
			GetOperation( PPE_addr[idt], &myop[idt] );
			GetShader( (unsigned int)myop[idt].EA_shader, myop[idt].shaderSize, shader[idt] );
			Printf1( "SPE[%u]: Shader recieved for slot\n", SPE_id );
			// Run the slot that was just loaded (was hard-coded to slot 0)
			run = (void *)shader[idt];

			Printf1( "SPE[%u]: --- Running shader! ----------\n", SPE_id );
			run( SPE_id, &shaderinfo, &myop[idt], &funcs );
			Printf1( "SPE[%u]: --- End of shader! -----------\n", SPE_id );

			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Transfer operation and run shader WITH return value
		// 100 -> 199
		else if( task < 200 )
		{
			idt = task - 100;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Get shader with return value for slot %u\n", SPE_id, idt );

			// The result address follows the command in the mailbox
			EA_result[idt] = mb_getmbox( );
			GetSPEAddr( EA, PPE_addr );
			GetOperation( PPE_addr[idt], &myop[idt] );
			GetShader( (unsigned int)myop[idt].EA_shader, myop[idt].shaderSize, shader[idt] );
			runr = (void *)shader[idt];

			Printf1( "SPE[%u]: --- Running shader! ----------\n", SPE_id );
			runr( SPE_id, EA_result[idt], &shaderinfo, &myop[idt], &funcs );

			Printf1( "SPE[%u]: --- End of shader! -----------\n", SPE_id );

			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Transfer operation and shader WITHOUT return value
		// 200 -> 299
		else if( task < 300 )
		{
			idt = task - 200;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Get shader for slot %u\n", SPE_id, idt );
			GetSPEAddr( EA, PPE_addr );
			GetOperation( PPE_addr[idt], &myop[idt] );
			GetShader( (unsigned int)myop[idt].EA_shader, myop[idt].shaderSize, shader[idt] );
			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Run shader with no return value
		// 300 -> 399
		else if( task < 400 )
		{
			idt = task - 300;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Run shader from slot %u\n", SPE_id, idt );
			run = (void *)shader[idt];
			run( SPE_id, &shaderinfo, &myop[idt], &funcs );
			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Transfer operation and shader WITH return value
		// 400 -> 499
		else if( task < 500 )
		{
			idt = task - 400;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Get shader with return value for slot %u\n", SPE_id, idt );
			// Remember the result address for a later 500-599 run command
			EA_result[idt] = mb_getmbox( );
			GetSPEAddr( EA, PPE_addr );
			GetOperation( PPE_addr[idt], &myop[idt] );
			GetShader( (unsigned int)myop[idt].EA_shader, myop[idt].shaderSize, shader[idt] );
			Printf1( "SPE[%u]: Done getting operation %u\n", SPE_id, idt );
			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Run shader WITH return value
		// 500 -> 599
		else if( task < 600 )
		{
			idt = task - 500;

			CHECKBOUNDS( idt );

			Printf1( "SPE[%u]: Run shader with return value from slot %u\n", SPE_id, idt );
			// Use the shader and operation of the requested slot, matching
			// the result address (was hard-coded to slot 0)
			runr = (void *)shader[idt];
			runr( SPE_id, EA_result[idt], &shaderinfo, &myop[idt], &funcs );
			spu_writech( SPU_WrOutMbox, 1 );
		}

		// Sanity check!!!
		else if( task == 1000 )
		{
			spu_writech( SPU_WrOutMbox, 123 );
		}

		// DEFAULT
		else
		{
			Printf1( "[SPE(%u)]No such instruction, quitting\n", SPE_id );
			running = 0;
		}
	}

	return 1;
}
/*
 * Report an exit status and stop the SPU.
 *
 * status - value written to the outbound mailbox channel before stopping.
 *
 * NOTE(review): 0x101 is presumably the stop code the host side interprets as
 * "thread group exit"; its meaning is not defined in this file - confirm
 * against the host runtime.
 */
void spu_thread_group_exit(int status)
{
	/* Publish the status first so it is in the mailbox when the stop is seen. */
	spu_writech(SPU_WrOutMbox,status);
	/* The stop code is passed as a literal constant. */
	spu_stop(0x101);
}
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) 
{
	int tgi0[2];

	int tgo0[2];

	int tgio0[2];

	tgi0[0]=1;
	tgi0[1]=2;

	
	tgio0[0]=11;
	tgio0[1]=12;

	tgo0[0]=13;
	tgo0[1]=14;
/*	tgo1[0]=15;
	tgo1[1]=16;*/	

	
	int selOut = 0;
	int selIn = 0;
	int msg=RUN;
	int waiting=0;
	int tag = 31;
	struct img_args *iargs;
	iargs =(struct img_args*)memalign(128,sizeof(*iargs));
	dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag);
	
	printf("spu_blit_yuv422_to_argb: SRC width %d,DST width %d\n",iargs->src_w,iargs->drw_w);
	printf("spu_blit_yuv422_to_argb: SRC height %d,DST height %d\n",iargs->src_h,iargs->drw_h);
	
	while (spu_stat_in_mbox() == 0);
		msg=spu_read_in_mbox();
//	first=0;

	vector unsigned char *InOutbuffer[2];

	vector unsigned char *Inbuffer[2];

	vector unsigned char *Outbuffer[2];

	int Outwidth=(4*iargs->drw_w+3)&~3;
	int Inwidth=(2*iargs->src_w+7)&~7;

	Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
	Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);

	if (iargs->BLEND)
	{
		InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
		InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
	}

	Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
	Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);

	unsigned long long Inp,Outp,InOutp;
	
	int i=0;
//	int update=1;


	while (msg!=STOP)
	{
		selOut = 0;
		selIn = 0;

		Inp=iargs->Inp0[0];
		InOutp=iargs->Outp0[0];
		Outp=iargs->Outp0[0];

		dmaGet(Inbuffer[0],Inp,Inwidth,tgi0[0]);
		Inp=Inp+iargs->Istride[0]*2;

		dmaGet(Inbuffer[1],Inp,Inwidth,tgi0[1]);
		Inp=Inp+iargs->Istride[0]*2;

// 		if (iargs->BLEND)
// 		{
// 			dmaGet(InOutbuffer[0],InOutp,Outwidth,tgio0[0]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 			dmaGet(InOutbuffer[1],InOutp,Outwidth,tgio0[1]);
// 			InOutp=InOutp+iargs->Ostride[0]*4;
// 		}



		selIn=0;
		selOut=0;

		for (i=0;i < iargs->drw_h ;i++) {
			dmaWaitTag(tgi0[selIn]);
			
// 			if (iargs->BLEND)
// 				dmaWaitTag(tgio0[selIn]); 
			dmaWaitTag(tgo0[selOut]);
			if (iargs->SourceFormat==YUY2||iargs->SourceFormat==YUYV422)
			{
				yuv422_to_argb(Inbuffer[selIn],Outbuffer[selOut],iargs->drw_w);
			//	printf("spe_blitter: YUV422->ARGB\n");
			}
			//yuv420_to_yuv2(Yinbuffer[selIn],Uinbuffer[selIn],Vinbuffer[selIn],Outbuffer[selOut],iargs->Istride[0]);
			
		//	if (iargs->BLEND)
			//	blend(InOutbuffer[selIn],OutBuffer[selOut],iargs->ALPHA,iargs->SourceFormat);
			
			dmaPut(Outbuffer[selOut],Outp,Outwidth,tgo0[selOut]);
		
// 			if (iargs->BLEND){
// 				dmaGet(InOutbuffer[selIn],InOutp,Outwidth,tgio0[selIn]);
// 				InOutp=InOutp+iargs->Ostride[0];
// 					
// 			}

			dmaGet(Inbuffer[selIn],Inp,Inwidth,tgi0[selIn]);
			
			Inp=Inp+iargs->Istride[0]*2;
			Outp=Outp+iargs->Ostride[0]*4;
			selIn=selIn^1;
			selOut=selOut^1;
		}
	

		while (spu_stat_out_intr_mbox() == 0);
		msg=RDY;
		spu_writech(SPU_WrOutIntrMbox, msg);
		waiting=1;
		
		while (waiting){
			
			while (spu_stat_in_mbox() == 0);
			msg=spu_read_in_mbox();
			
			if (msg == RUN){
				waiting=0;
			}
			else if (msg == STOP)
			{
				waiting=0;
			}
			else if (msg == UPDATE)
			{
				tag=30;
 				dmaGetnWait(iargs,(unsigned int)argp,(int)envp,tag); //getting neccesary data to process the new image	
//  			//	update=1; // update filters to reflect the new image!
// 				Outwidth=(iargs->drw_w+3)&~3;
// 				Inwidth=(iargs->src_w+7)&~7;
// 				free(Inbuffer[0]);
// 				free(Inbuffer[1]);
// 	
// 				free(Outbuffer[0]);
// 				free(Outbuffer[1]);
// 				
// 				Inbuffer[0]=(vector unsigned char*)memalign(128,Inwidth);
// 				Inbuffer[1]=(vector unsigned char*)memalign(128,Inwidth);
// 			
// 				if (iargs->BLEND)
// 				{
// 					free(InOutbuffer[0]);
// 					free(InOutbuffer[1]);	
// 					InOutbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 					InOutbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
// 				}
// 	
// 				Outbuffer[0]=(vector unsigned char*)memalign(128,Outwidth);
// 				Outbuffer[1]=(vector unsigned char*)memalign(128,Outwidth);
			}
		}
	}
		 

	return 0;
}