コード例 #1
0
ファイル: spu.c プロジェクト: LaitaStefan/labs-2014
void process_image_simple(struct image* img){
	unsigned char *input, *output, *temp;
	unsigned int addr1, addr2, i, j, k, r, g, b;
	int block_nr = img->block_nr;
	vector unsigned char *v1, *v2, *v3, *v4, *v5 ;

	input = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);
	output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4);
	temp = malloc_align(NUM_CHANNELS * img->width, 4);

	v1 = (vector unsigned char *) &input[0];
	v2 = (vector unsigned char *) &input[1 * img->width * NUM_CHANNELS];
	v3 = (vector unsigned char *) &input[2 * img->width * NUM_CHANNELS];
	v4 = (vector unsigned char *) &input[3 * img->width * NUM_CHANNELS];
	v5 = (vector unsigned char *) temp;

	addr2 = (unsigned int)img->dst; //start of image
	addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * 
		img->height / NUM_IMAGES_HEIGHT; //start line of spu block
	addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS *
		img->width / NUM_IMAGES_WIDTH;

	for (i=0; i<img->height / SCALE_FACTOR; i++){
		//get 4 lines
		addr1 = ((unsigned int)img->src) + i * img->width * NUM_CHANNELS * SCALE_FACTOR;
		mfc_get(input, addr1, SCALE_FACTOR * img->width * NUM_CHANNELS, MY_TAG, 0, 0);
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();

		//compute the scaled line
		for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){
			v5[j] = spu_avg(spu_avg(v1[j], v2[j]), spu_avg(v3[j], v4[j]));
		}
		for (j=0; j < img->width; j+=SCALE_FACTOR){
			r = g = b = 0;
			for (k = j; k < j + SCALE_FACTOR; k++) {
				r += temp[k * NUM_CHANNELS + 0];
				g += temp[k * NUM_CHANNELS + 1];
				b += temp[k * NUM_CHANNELS + 2];
			}
			r /= SCALE_FACTOR;
			b /= SCALE_FACTOR;
			g /= SCALE_FACTOR;

			output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b;
		}

		//put the scaled line back
		mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //line inside spu block
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();
	}

	free_align(temp);
	free_align(input);
	free_align(output);
}
コード例 #2
0
ファイル: simple_dma_spu.c プロジェクト: chmoder/PS3
/*
 * SPU entry point: fetch a control block and the data array it points to
 * from main memory, then check that the array is a Fibonacci sequence.
 *
 * argp is the main-memory address of the control block.
 * Returns 0 on success, 1 if no DMA tag could be reserved or the sequence
 * check fails.
 */
int main(unsigned long long speid __attribute__ ((unused)), 
	 unsigned long long argp, 
	 unsigned long long envp __attribute__ ((unused))) 
{
  unsigned int tag_id;
  int idx;

  /* Every DMA below is issued under this one reserved tag. */
  tag_id = mfc_tag_reserve();
  if (tag_id == MFC_TAG_INVALID) {
    printf("ERROR: unable to reserve a tag\n");
    return 1;
  }

  /* mfc_get arguments, in order:
   *   1. local store address that receives the data
   *   2. main memory address to read from
   *   3. number of bytes to transfer
   *   4. tag to associate with this DMA (0..31)
   *   5-6. only meaningful with a custom cache replacement policy; 0 here
   */
  mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0);

  /* Select our tag (1 shifted left by the tag number), then block until
   * every DMA carrying that tag has completed. */
  mfc_write_tag_mask(1<<tag_id);
  mfc_read_tag_status_all();

  /* Fetch the data array whose address arrived in the control block. */
  mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0);

  printf("Address received through control block = 0x%llx\n", cb.addr);

  /* No new tag mask is written here; this wait relies on the mask set above. */
  mfc_read_tag_status_all();

  /* Each entry from index 2 onwards must equal the sum of the two before it. */
  idx = 2;
  while (idx < DATA_BUFFER_ENTRIES) {
    if (data[idx] != data[idx-1] + data[idx-2]) {
      printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n",
	     idx, data[idx-1] + data[idx-2], data[idx]);
      return (1);
    }
    idx++;
  }

  return 0;
}
コード例 #3
0
ファイル: spe-fractal.c プロジェクト: twoscomplement/cell
/*
 * SPU entry point for the fractal renderer.
 *
 * argv holds the address that the PPE provided as the 4th argument to
 * spe_context_run(); the spe_args structure is fetched from there by DMA.
 * The fractal is then rendered eight times with different offsets, any
 * partially filled point buffer is flushed to the PPE, and timing
 * statistics are printed.  Always returns 0.
 */
int main(uint64_t speid, uint64_t argv, uint64_t envp)
{
    struct spe_args args __attribute__((aligned(SPE_ALIGN)));

    /* Fetch the argument block on tag 0 and wait for it to arrive. */
    mfc_get(&args, argv, sizeof(args), 0, 0, 0);

    mfc_write_tag_mask(1 << 0);
    mfc_read_tag_status_all();

    cmap_calls = 0;
    dma_puts = 0;
    /* Start the decrementer at its maximum so elapsed ticks can be derived below. */
    spu_write_decrementer(-1);

    // Run multiple renders with offsets.  Should be factored into render_fractal()
    render_fractal(&args.fractal, args.thread_idx, args.n_threads, 0.);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 7 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 3 / 4);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 5 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 2);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 3 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 4);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 8);

    // Send remaining points
    if(fill%2048) {
        // select the last buffer used
        int f = fill / 2048;
        mfc_put(&points[f*2048], (uint)args.fractal.pointbuf[f], 16384, 0, 0, 0);
        // Block for completion
        mfc_write_tag_mask(1<<0);
        mfc_read_tag_status_all();
        // Send a message with top bit set to indicate final item.
        // The shift is done on an unsigned 1: shifting a signed 1 into the
        // sign bit is undefined behaviour.
        spu_write_out_intr_mbox((1u<<31)|f);
        // Send another message indicating count
        spu_write_out_intr_mbox(fill%2048);
        ++dma_puts;
    }

    // Report some stats
    // The decrementer counts down from the value written above, so the
    // elapsed tick count is the distance from that start value.
    uint ticks = -1 - spu_read_decrementer();
    printf("cmap calls %d ticks %u calls/tick %f\n",
           cmap_calls, ticks, (double)cmap_calls/ticks );
    printf("dma puts %d\n", dma_puts);

    return 0;
}
コード例 #4
0
ファイル: spe_md5.c プロジェクト: jamella/hashclash
/*
 * SPU entry point: fetch the program data block from program_data_ea,
 * then either process existing collision trails (pd.collisiondata > 0) or
 * generate new trails, and finally write the program data back.
 *
 * All DMA uses a single tag derived from the low five bits of spe_id.
 * Always returns 0.
 */
int main2mod(unsigned long long spe_id, unsigned long long program_data_ea, unsigned long long env) 
{
	unsigned tagid = spe_id&31;
	/* tagid may be 31, so the mask must be built from an unsigned 1:
	 * shifting a signed 1 into the sign bit is undefined behaviour. */
	const unsigned tagmask = 1u << tagid;
	uint32 i,j;

	// get program data
	mfc_get(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0);
	mfc_write_tag_mask(tagmask);
	mfc_read_tag_status_all();

	// precompute partial working states based on ihv & partial msg block
	pre_compute(pd.ihv1, pd.ihv2, pd.m1, pd.m2);

	if (pd.collisiondata > 0)
	{
		/* j is the number of buffer vectors in use; bufferptr marks their end. */
		j = pd.collisiondata*8;
		vec_uint32* bufferptr = &buffer[j];

		// get the trail buffer, 128 vectors per DMA command
		for (i = 0; i < j; i += 128)
			mfc_get(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
		mfc_write_tag_mask(tagmask);
		mfc_read_tag_status_all();

		// process collision trails
		
		reduce_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);
		reduce_trails2mod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);
		find_collmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);

		// store the trail buffer
		for (i = 0; i < j; i += 128)
			mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
		mfc_write_tag_mask(tagmask);
		mfc_read_tag_status_all();
	} else {
		// fill the trail buffer in steps and do intermediate DMA transfers
		vec_uint32* bufferptr = &buffer[0];
		for (i = 0; i < BUFFERSIZE; i += 256)
		{
			bufferptr = generate_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, bufferptr, &buffer[i+256]);
			/* Each 256-vector step is written out as two 128-vector puts;
			 * they are only waited for at the end of the function. */
			mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
			mfc_put(&buffer[i+128], &pd.buffer[i+128], sizeof(vec_uint32)*128, tagid, 0, 0);
		}
	}
	// transfer the current program data back
	mfc_put(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0);

	// wait for dma transfers to complete
	mfc_write_tag_mask(tagmask);
	mfc_read_tag_status_all();
	return 0;
}
コード例 #5
0
ファイル: spu.c プロジェクト: kraused/cell-coding
/*
 * Triad kernel: a = b + scalar * c, processed in chunks of SIZE floats.
 *
 * Each chunk of args.b and args.c is fetched by DMA into ls1 and ls2,
 * combined with spu_madd into ls3, and written back to args.a.  A final,
 * shorter chunk handles what is left after the full-size chunks.
 */
void triad()
{
	const vector float factor = spu_splats(args.scalar);
	int start, vec, bytes;

	/* Full-size chunks. */
	bytes = SIZE * sizeof(float);
	start = 0;
	while ((start + SIZE) < args.N) {
		mfc_get((void *)&ls1[0], (unsigned int )&args.b[start], bytes, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[start], bytes, TAG, 0, 0);
		/* Waits for everything outstanding on TAG, which includes the
		 * put issued at the end of the previous iteration. */
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();

		for (vec = 0; vec < (SIZE / 4); ++vec)
			ls3[vec] = spu_madd(factor, ls2[vec], ls1[vec]);

		mfc_put((void *)&ls3[0], (unsigned int )&args.a[start], bytes, TAG, 0, 0);

		start += SIZE;
	}

	/* Drain the last put of the loop above. */
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	if (unlikely(start < args.N)) {
		/*
		 * At most SIZE elements remain, so a single transfer covers them.
		 * The byte count is rounded DOWN to a multiple of 128.
		 * NOTE(review): the original comment asked for a multiple of 16,
		 * but the mask is ~127 -- confirm which is intended.
		 */
		bytes = ((args.N - start) * sizeof(float)) & (~127);

		mfc_get((void *)&ls1[0], (unsigned int )&args.b[start], bytes, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[start], bytes, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();

		/* The vector count comes from the element count, not from bytes. */
		for (vec = 0; vec < ((args.N - start) / 4); ++vec)
			ls3[vec] = spu_madd(factor, ls2[vec], ls1[vec]);

		mfc_put((void *)&ls3[0], (unsigned int )&args.a[start], bytes, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();
	}

	/*
	 * Elements beyond the rounded-down byte count of the final chunk are
	 * not written back by this function.
	 */
}
コード例 #6
0
ファイル: spu_main.c プロジェクト: davidoguns/raytracer
/*
 * Fetch the program info structure from main memory address ea into *info
 * and block until the transfer has completed.  Also records info->speId in
 * the global speid.
 */
void load_program_info(unsigned long long ea, spe_program_info_t *info)
{
	/* Split the 64-bit effective address into the halves spu_mfcdma64 takes. */
	const unsigned int ea_hi = mfc_ea2h(ea);
	const unsigned int ea_lo = mfc_ea2l(ea);

	/* spu_mfcdma64(ls_addr, ea_h, ea_l, size, tag_id, cmd) */
	spu_mfcdma64(info, ea_hi, ea_lo,
		sizeof(spe_program_info_t),
		SPUDMA_PROGRAMINFO,
		MFC_GET_CMD);

	/* Select the tag used above and wait for that transfer to finish. */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_PROGRAMINFO);
	mfc_read_tag_status_all();

	/* Keep a global copy of the SPE id for debugging. */
	speid = info->speId;

#if defined(_DEBUG) && _DEBUG > 1
	printf("Program info:\n\tSpe ID:       %d\n\tNum Pixels:   %d\n\tSpp:          %d\n\tNum Spes      %d\n\tDepth:        %d\n",
		info->speId,
		info->numPixels,
		info->samplesPerPixel,
		info->numSpes,
		info->depth);
#endif
}
コード例 #7
0
/*
 * Return entry n of the prime table, going through a local cache.
 *
 * On a cache hit the value is read straight from primeCacheData.  On a
 * miss, CACHE_PRIME_SIZE vectors starting at the 4-element-aligned index
 * containing n are fetched from setup.vPrimes by DMA, and primeCacheStart
 * is moved to that index.
 *
 * NOTE(review): the hit test uses a strict '>' against primeCacheStart, so
 * n == primeCacheStart is always treated as a miss -- confirm intended.
 * NOTE(review): the result of mfc_tag_reserve() is not checked.
 */
int cacheGetPrime(int n)
{
    const int hit = (n < primeCacheStart + primeCacheSize) && (n > primeCacheStart);

    if (!hit)
    {
        /* Miss: fetch a fresh window. */
        uint32_t    tag, size;
        unsigned long long  EA;

        tag = mfc_tag_reserve();
        size = CACHE_PRIME_SIZE*16;
        EA = setup.vPrimes + (n - n%4) * 4;

        mfc_get(&primeCacheData, EA, size, tag, 0, 0);
        mfc_write_tag_mask(1 << tag);
        mfc_read_tag_status_all();
        mfc_tag_release(tag);

        primeCacheStart = n - (n % 4);
    }

    /* Each cache vector holds four entries: pick the vector, then the lane. */
    return spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4);
}
コード例 #8
0
ファイル: trianglebuffer.c プロジェクト: ralferoo/spugl
/*
 * Flush the triangle currently being built, if anything was written.
 *
 * When endTriangle differs from _currentTriangle, this stores the "next
 * triangle" offset into the current triangle, DMAs the local triangle
 * buffer out to _currentTriangleBufferEA, writes the new end offset (two
 * bytes) to the address held in _currentTriangleCacheEndTriangleEAH/EAL,
 * clears _currentTriangle, and waits for the tag 0 DMAs to complete.
 * Does nothing when endTriangle == _currentTriangle.
 */
void writeTriangleBuffer(Triangle* endTriangle)
{
	if (endTriangle != _currentTriangle) {
		// bytes from the start of the local buffer up to endTriangle, rounded up to a multiple of 128
		int length = ( ((char*)endTriangle) - _currentTriangleBuffer + 127) & ~127;
		// offset of endTriangle: distance from the current triangle plus that triangle's own offset
		unsigned short endTriangleBase = (((char*)endTriangle) - ((char*)_currentTriangle)) + _currentTriangleOffset;
		vec_ushort8 v_new_end = spu_promote(endTriangleBase, 1);

		// calculate genuine next pointer ( rewind==0 -> next, rewind!=0 -> 0 )
		// spu_andc(a, b) is a & ~b, so a set rewind mask clears the offset
		unsigned short next_pointer = spu_extract( spu_andc( v_new_end, _currentTriangleRewind ), 1 );
		_currentTriangle->next_triangle = next_pointer;
		
//		printf("current=0x%x, endTriBase=0x%x, next_pointer=0x%x\n", _currentTriangleOffset, endTriangleBase, next_pointer);

		// DMA the triangle data out
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(_currentTriangleBufferEA), mfc_ea2l(_currentTriangleBufferEA), length, 0, MFC_PUT_CMD);

		// update the information in the cache line
		_currentTriangleRewind = spu_splats(next_pointer);		// re-use this variable as we don't need it anymore
		// pick the source address inside the splatted vector whose low 4 bits match the
		// destination's low 4 bits (presumably required for a 2-byte DMA -- confirm)
		char* dstart = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);
		// NOTE(review): PUTB is the barrier form of PUT; presumably used so this write is
		// ordered after the triangle data put above -- confirm
		spu_mfcdma64(dstart, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL, sizeof(short), 0, MFC_PUTB_CMD);

//		printf("writing from %x to %x:%x\n", dstart, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL);

		// finally invalidate the triangle info
		_currentTriangle = NULL;

		// and make sure the DMA completed (both transfers above use tag 0)
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}
}
コード例 #9
0
  void compute()
  {
    // Compute my portion to compute
    int my_rows = rows / nspe + (rank < rows % nspe);
    int offset  = rank * (rows / nspe) + std::min(rank, rows % nspe);

#if DEBUG
    printf("Compute (%d/%d %d, %d) %d/%d\n", my_rows, rows, offset,
	   cols, rank, nspe);
#endif
    int tag = 23;

    uint64_t pin0 = in0 + offset * cols * sizeof(float);
    uint64_t pin1 = in1 + offset * cols * sizeof(float);
    uint64_t pin2 = in2 + offset * cols * sizeof(float);
    uint64_t pout = out + offset * cols * sizeof(float);

    float  buf[4*cols];
    float* buf0 = buf + 0*cols;
    float* buf1 = buf + 1*cols;
    float* buf2 = buf + 2*cols;
    float* buf3 = buf + 3*cols;

    for (int r=0; r<my_rows; ++r)
    {
      mfc_get(buf0, pin0, cols*sizeof(float), tag, 0, 0);
      mfc_get(buf1, pin1, cols*sizeof(float), tag, 0, 0);
      mfc_get(buf2, pin2, cols*sizeof(float), tag, 0, 0);

      pin0 += cols * sizeof(float);
      pin1 += cols * sizeof(float);
      pin2 += cols * sizeof(float);

      // Wait for DMAs to complete
      mfc_write_tag_mask(1<<tag);
      mfc_read_tag_status_all();

      for (int c=0; c<cols; ++c)
	buf3[c] = buf0[c] * buf1[c] + buf2[c];

      mfc_put(buf3, pout, cols*sizeof(float), tag, 0, 0);
      pout += cols * sizeof(float);
    }

    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();
  }
コード例 #10
0
ファイル: spu.c プロジェクト: LaitaStefan/labs-2014
/*
 * SPU entry point: repeatedly receive image descriptor addresses through
 * the inbound mailbox, process each image with the routine selected by
 * envp, and signal DONE through the outbound interrupt mailbox.
 *
 * argp carries the number of SPUs and envp the processing mode.
 * Returns 0 as soon as a zero address is read from the mailbox.
 */
int main(uint64_t speid, uint64_t argp, uint64_t envp){
	struct image my_image __attribute__ ((aligned(16)));
	unsigned int data[NUM_STREAMS];
	unsigned int num_spus = (unsigned int)argp;
	unsigned int num_images, idx;
	int mode = (int)envp;

	speid = speid; //get rid of warning

	while(1){
		/* Collect this SPU's share of descriptor addresses
		 * (assumes NUM_STREAMS is a multiple of num_spus). */
		num_images = 0;
		for (idx = 0; idx < NUM_STREAMS / num_spus; idx++){
			while (spu_stat_in_mbox() == 0) {
				/* spin until a mailbox entry is available */
			}
			data[idx] = spu_read_in_mbox();
			if (data[idx] == 0)
				return 0;
			num_images++;
		}

		/* Fetch each descriptor and run the selected processing routine. */
		for (idx = 0; idx < num_images; idx++){
			mfc_get(&my_image, data[idx], sizeof(struct image), MY_TAG, 0, 0);
			mfc_write_tag_mask(1 << MY_TAG);
			mfc_read_tag_status_all();
			switch(mode){
				case MODE_2LINES:
					process_image_2lines(&my_image);
					break;
				case MODE_DOUBLE:
					process_image_double(&my_image);
					break;
				case MODE_DMALIST:
					process_image_dmalist(&my_image);
					break;
				case MODE_SIMPLE:
				default:
					/* unknown modes fall back to the simple routine */
					process_image_simple(&my_image);
					break;
			}
		}

		/* Report completion of this batch. */
		data[0] = DONE;
		spu_write_out_intr_mbox(data[0]);
	}

	return 0;
}
コード例 #11
0
/*
 * Reset every group to its initial state: zero the key vectors and insert
 * positions (and the length counters when GET_CACHE_STATS is defined),
 * then fill every group_values entry with the data fetched from
 * myCellOGRCoreArgs.upchoose.
 *
 * NOTE(review): no tag mask is written before mfc_read_tag_status_all();
 * this relies on a mask covering DMA_ID having been set elsewhere --
 * confirm.
 */
static void cleargroups(void)
{
  unsigned grp, entry;

  grp = 0;
  while (grp < GROUPS_COUNT)
  {
    group_keysvectors[grp] = spu_splats((u16) 0);
    group_insertpos[grp]   = spu_splats((u32) 0);
#ifdef GET_CACHE_STATS
    group_length[grp]      = 0;
#endif
    grp++;
  }

  /* Fetch the data for the first entry, then replicate it into all others. */
  mfc_get(group_values[0][0], myCellOGRCoreArgs.upchoose, GROUP_ELEMENTS * 2, DMA_ID, 0, 0);
  mfc_read_tag_status_all();
  for (entry = 1; entry < GROUPS_COUNT * GROUPS_LENGTH; entry++)
    memcpy(group_values[0][entry], group_values[0][0], GROUP_ELEMENTS * 2);
}
コード例 #12
0
/*
 * Fetch spu_arguments from the address in argp, derive this SPU's channel
 * range from its spu_id, preset the size field of every sample DMA list
 * entry, and print a short summary.
 */
static void init(unsigned long long argp) {
  /* Blocking fetch of the argument structure on tag 0. */
  mfc_get(&spu_arguments, (unsigned) argp, sizeof(spu_arguments), 0, 0, 0);
  mfc_write_tag_mask(1 << 0);
  mfc_read_tag_status_all();

  /* Channels are split evenly: SPU k handles [k*C/S, (k+1)*C/S). */
  first_channel =  spu_arguments.spu_id      * NR_CHANNELS / NR_SPUS;
  last_channel  = (spu_arguments.spu_id + 1) * NR_CHANNELS / NR_SPUS;

  /* Every list entry transfers one samples[0][0] element. */
  int station = 0;
  while (station < NR_STATIONS) {
    samples_dma_list[station].size = sizeof(samples[0][0]);
    station++;
  }

  /* Only SPU 0 prints the buffer layout. */
  if (spu_arguments.spu_id == 0) {
    printf("SPU sample dma size = %ld bytes\n", sizeof(samples[0][0]));
    printf("SPU in buffers = %ld KB @ %p, out buffers = %ld B @ %p\n", sizeof(samples) / 1024, samples, sizeof(visibilities), visibilities);
  }

  printf("I am spu %d, calculating channels %3d - %3d\n", spu_arguments.spu_id, first_channel, last_channel);
}
コード例 #13
0
/*
 * Load the convolution kernel into local store and, if requested,
 * transform it to the frequency domain.
 *
 * fc        parameter block; supplies the element count, the kernel's
 *           main-memory address and the transform_kernel flag
 * p_kernel  local store destination for the kernel data
 * obj       FFT object passed to cml_ccfft1d_ip_f
 * buf       scratch buffer passed to cml_ccfft1d_ip_f
 */
void initialize(
  Fastconv_params* fc,
  void*            p_kernel, 
  fft1d_f*         obj,
  void*            buf)
{
  /* Two floats per element. */
  unsigned int size = fc->elements*2*sizeof(float);

  // The kernel matches the input and output size
  mfc_get(p_kernel, fc->ea_kernel, size, 31, 0, 0);
  /* Tag 31 needs an unsigned shift: 1<<31 on a signed int is undefined. */
  mfc_write_tag_mask(1u<<31);
  mfc_read_tag_status_all();

  if (fc->transform_kernel)
  {
    // Perform the forward FFT on the kernel, in place.  This only need 
    // be done once -- subsequent calls will utilize the same kernel.
    // NOTE(review): the FFT runs on the global coeff, not on p_kernel;
    // presumably callers pass coeff as p_kernel -- confirm.
    cml_ccfft1d_ip_f(obj, (float*)coeff, CML_FFT_FWD, buf);
  }
}
コード例 #14
0
ファイル: spu.c プロジェクト: kraused/cell-coding
/*
 * SPU entry point: fetch the argument block from argp, then serve
 * commands arriving in the inbound mailbox until an EXIT command is read.
 * After each non-exit command a DONE message is written to the outbound
 * mailbox, including after an unrecognised command.  Returns 0.
 */
int main(ull id, ull argp, ull envp)
{
	unsigned int request;

	/* Blocking fetch of the argument block. */
	mfc_get(&args, argp, sizeof(args), TAG, 0, 0);
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	for (;;) {
		request = spu_read_in_mbox();

		if (unlikely(SPU2_MSG_PPU_TO_SPU_EXIT == request))
			break;

		/* Dispatch to the kernel matching the request. */
		if (SPU2_MSG_PPU_TO_SPU_DO_COPY == request) {
			copy();
		} else if (SPU2_MSG_PPU_TO_SPU_DO_SCALE == request) {
			scale();
		} else if (SPU2_MSG_PPU_TO_SPU_DO_ADD == request) {
			add();
		} else if (SPU2_MSG_PPU_TO_SPU_DO_TRIAD == request) {
			triad();
		} else {
			fprintf(stderr, " [SPU]: Invalid command received in mailbox\n");
		}

		spu_write_out_mbox(SPU2_MSG_SPU_TO_PPU_DONE);
	}

	return 0;
}
コード例 #15
0
ファイル: gc_logging.c プロジェクト: GREO/GNU-Radio
/*
 * Append one entry to the log in main memory.
 *
 * The entry is stamped with the next sequence number and the current
 * decrementer value, copied into one of two local staging slots, and
 * written by DMA to slot log_idx of the log at log_base_ea.  The staging
 * slot alternates on every call; before a slot is reused, any DMA still
 * outstanding on its tag is waited for.  Does nothing when log_base_ea
 * is 0.
 */
void
_gc_log_write(gc_log_entry_t entry)
{
  if (log_base_ea == 0)
    return;

  entry.seqno = log_seqno++;
  entry.timestamp = spu_read_decrementer();

  /* Bit identifying the staging slot used by this call. */
  const int slot_bit = 1 << tmp_buffer_idx;

  /* The slot has been used before: make sure its previous put has finished. */
  if (tmp_buffer_busy & slot_bit){
    mfc_write_tag_mask(1 << (log_tags + tmp_buffer_idx));
    mfc_read_tag_status_all();
  }

  /* The DMA reads from the staging copy, not from the by-value parameter. */
  tmp_buffer[tmp_buffer_idx] = entry;

  mfc_put(&tmp_buffer[tmp_buffer_idx],
	  log_base_ea + log_idx * sizeof(entry), sizeof(entry),
	  log_tags + tmp_buffer_idx, 0, 0);

  tmp_buffer_busy |= slot_bit;
  tmp_buffer_idx ^= 0x1;                    /* switch to the other slot */
  log_idx = (log_idx + 1) & log_idx_mask;   /* advance, wrapping via the mask */
}
コード例 #16
0
/*
 * Block until all DMA transfers issued under the tag numbered 'buffer'
 * have completed.  Compiles to nothing when DO_DMA is 0.
 */
static inline void wait_for_dma_samples(int buffer) {
#if DO_DMA
  const int tag_bit = 1 << buffer;

  mfc_write_tag_mask(tag_bit);
  mfc_read_tag_status_all();
#endif
}
コード例 #17
0
/*
 * Move total_words 32-bit words between local store (ls) and main memory
 * (ea) in pieces of at most 4096 words (16 KB), waiting for each piece to
 * complete before issuing the next.  to_ppu selects the direction: zero
 * reads from main memory (mfc_get), non-zero writes to it (mfc_put).
 *
 * Addresses advance only by the amount actually transferred, so no pointer
 * is ever formed beyond the end of the local array.
 */
static void dma_words_blocking(int *ls, unsigned int ea, int total_words,
                               unsigned int tag, int to_ppu)
{
  const int dt_unit = 4096;    // in words, or 4 bytes

  while (total_words > 0) {
    int nwords = (total_words < dt_unit) ? total_words : dt_unit;

    if (to_ppu)
      mfc_put(ls, ea, 4*nwords, tag, 0, 0);
    else
      mfc_get(ls, ea, 4*nwords, tag, 0, 0);
    mfc_write_tag_mask(1u << tag);
    mfc_read_tag_status_all();

    ls += nwords;
    ea += 4*nwords;
    total_words -= nwords;
  }
}

/*
 * SPU entry point: fetch the control block from argp, load MatrixSPE and
 * TransposeSPE from the addresses it names, accumulate the 16x16 product
 * into MultSPE, and write MultSPE back to cb.result.  Returns 0.
 */
int main(uint64_t speid,uint64_t argp, uint64_t envp){
  int i,j,k;

  speid=speid;envp=envp; //avoid warnings

  // DMA in control block and wait for completion
  mfc_get(&cb,argp,sizeof(cb),0,0, 0);
  mfc_write_tag_mask(1 << 0);
  mfc_read_tag_status_all();

  // DMA in MatrixSPE and TransposeSPE (tag 0), waiting after every piece
  dma_words_blocking((int *) &MatrixSPE[0][0],
                     (unsigned int) (int *) cb.data,
                     sizeof(MatrixSPE) >> 2, 0, 0);
  dma_words_blocking((int *) &TransposeSPE[0][0],
                     (unsigned int) (int *) cb.data1,
                     sizeof(TransposeSPE) >> 2, 0, 0);

  /* Compute the 16x16 output block from the 16 rows and 16 columns passed
   * from the PPU; each output element is a dot product over 1024 terms.
   * Results are accumulated into MultSPE. */
  for(i=0; i<16; i++) {
    for(j=0;j<16;j++) {
      for(k=0;k<1024;k++) {
        MultSPE[i][j]+=MatrixSPE[i][k]*TransposeSPE[j][k];
      }
    }
  }

  /* Send result to PPU (tag 2) */
  dma_words_blocking((int *) &MultSPE[0][0],
                     (unsigned int) (int *) cb.result,
                     sizeof(MultSPE) >> 2, 2, 1);

  return 0;
}
コード例 #18
0
ファイル: dma_example_spu.c プロジェクト: chmoder/PS3
/*
 * SPU entry point for the DMA-list example.
 *
 * Fetches the control block from argp, then processes this SPE's data
 * NUM_LIST_ELEMENTS chunks at a time: a DMA list gathers the chunks into
 * local_buffer_in, process_data_simd produces local_buffer_out, and a
 * second DMA list scatters the result back to main memory.
 *
 * Returns 0 on success, 1 if no DMA tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  unsigned long long in_addr, out_addr;
  unsigned int i, num_chunks;
  mfc_list_element_t* dma_list_in;
  unsigned int tmp_addr;

#ifdef USE_TIMER
  uint64_t start, time_working;
    
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* First, we reserve a MFC tag for use */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* Place the input DMA list in the last bytes of the input buffer.
   * The address is computed in BYTES: the buffer address is converted to
   * an integer before the sizes are added.  Doing the arithmetic on the
   * pointer itself would scale both byte counts by the pointed-to element
   * size and move the list outside the buffer. */
  tmp_addr = (unsigned int) local_buffer_in
           + (unsigned int) (sizeof(float) * CHUNK_SIZE * NUM_LIST_ELEMENTS)
           - (unsigned int) (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS);
  dma_list_in = (mfc_list_element_t*) (tmp_addr);

  /* issue DMA transfer to get the control block information from 
   * system memory */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);

  /* wait for the DMA get to complete */ 
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* calculate the number of blocks (chunks) that this spe is assigned 
   * to process */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /*
   * This is the main loop.  It walks through the num_chunks chunks of data
   * NUM_LIST_ELEMENTS at a time.  Each list element moves CHUNK_SIZE floats.
   * Data is moved into local store, processed, and written back to system
   * memory, NUM_LIST_ELEMENTS chunks per loop iteration.
   */
  for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS)
  {
    /* set the in_addr and out_addr variables, we will use these for
     * issuing DMA get and put commands */
    in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float));
    out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float));

    /* fill the dma list with the appropriate lower 32bit effective address and size for
     * each dma list element. This dma list is used to gather the input data 
     * from system memory */
    fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); 

    /* issue a DMA get list command to gather the NUM_LIST_ELEMENTS chunks of data from system memory.
     * The data will be gathered into local buffer local_buffer_in */
    mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0);

    /* wait for the DMA get list command to complete */
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* invoke process_data to work on the data that's just been moved into local store*/
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS);

    /* fill the dma list with the appropriate lower 32 bit ea and size for each
     * dma list element. This dma list is used to scatter the output data to system memory  */
    fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); 

    /* issue the DMA put list command to scatter the result from local memory to 
    * different places in system memory */
    mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), 
        tag, 0, 0);

    /* wait for the DMA put list to complete */
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

  }

#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
コード例 #19
0
/*
 * Block until all DMA transfers for two buffers have completed.  The tag
 * of each buffer is its index offset by NR_SAMPLE_BUFFERS.  Compiles to
 * nothing when DO_DMA is 0.
 */
static inline void wait_for_dma_visibilities2(int buffer1, int buffer2) {
#if DO_DMA
  const int first_bit  = 1 << (buffer1 + NR_SAMPLE_BUFFERS);
  const int second_bit = 1 << (buffer2 + NR_SAMPLE_BUFFERS);

  mfc_write_tag_mask(first_bit | second_bit);
  mfc_read_tag_status_all();
#endif
}
コード例 #20
0
ファイル: trianglebuffer.c プロジェクト: ralferoo/spugl
/*
 * Return a local buffer into which the next triangle for 'context' can be
 * written, or NULL if none can be handed out right now.
 *
 * If a triangle is already allocated for the same context, it is returned
 * unchanged.  Otherwise the context's renderable cache line is read and
 * checked to see whether advancing the write pointer would overwrite data
 * still referenced by a chunk.  On success the module-level
 * _currentTriangle* variables are set up for a later write-out.
 *
 * Returns NULL when the context has no cache line (renderableCacheLine is
 * 0) or when the space check fails.
 */
Triangle* getTriangleBuffer(Context* context)
{
	// if we've already allocated a triangle buffer (and we're in the same context)
	if (context == _currentTriangleContext && _currentTriangle)
		return _currentTriangle;

	// trash the default values
	_currentTriangleContext	= context;
	_currentTriangle	= NULL;

	// read the current renderable cache line to ensure there is room for the triangle data
	// in the cache line buffer; we do this by comparing against all 16 cache line blocks
	// to make sure that extending the write pointer wouldn't clobber the data

	unsigned long long cache_ea = context->renderableCacheLine;
	if (cache_ea == 0)
		return NULL;
	// over-allocate by 127 bytes so a 128-byte aligned window can be carved out of the stack buffer
	char cachebuffer[128+127];
	RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 );

	// printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea));

	// fetch the 128-byte cache line with GETLLAR and wait for the atomic command status
	// NOTE(review): no conditional store follows in this function; the reservation
	// appears to be used only to obtain a consistent copy of the line -- confirm
	spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// extendvalid = ( read<=write && test<end ) || ( read>write && test<read )
	// extendvalid = ( read>write && read>test ) || ( read<=write && end>test )
	// simplifies to	extendvalid = selb(end, read, read>write) > test
	// or			extendvalid = selb(end>test, read>test, read>write)
	// rewind = next >= end
	// rewindvalid = read != 0
	// valid = extendvalid && (!rewind || rewindvalid)
	// 	 = extendvalid && (!rewind || !rewindinvalid)
	// 	 = extendvalid && !(rewind && rewindinvalid)
	// invalid = ! (extendvalid && !(rewind && rewindinvalid))
	//         = (!extendvalid || (rewind && rewindinvalid))

	// write pointer splatted across all lanes; read pointers come as two vectors of 8 chunks each
	vec_ushort8 v_writeptr		= spu_splats( cache->endTriangle );
	vec_ushort8 v_readptr0		= cache->chunkTriangle[0];
	vec_ushort8 v_readptr1		= cache->chunkTriangle[1];
	vec_ushort8 v_testptr		= spu_add(v_writeptr,   TRIANGLE_MAX_SIZE);
	vec_ushort8 v_nextptr		= spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE);
	vec_ushort8 v_endptr		= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);

	vec_ushort8 v_zero		= spu_splats( (unsigned short) 0 );
	// shuffle pattern taking the odd-numbered byte of each of the 16 halfwords in two vectors
	vec_uchar16 v_merger		= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

	vec_ushort8 v_max0_test		= spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) );
	vec_ushort8 v_max1_test		= spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) );
	vec_ushort8 v_extend0_valid	= spu_cmpgt( v_max0_test, v_testptr );
	vec_ushort8 v_extend1_valid	= spu_cmpgt( v_max1_test, v_testptr );
	vec_ushort8 v_rewind0_invalid	= spu_cmpeq( v_readptr0, v_zero );
	vec_ushort8 v_rewind1_invalid	= spu_cmpeq( v_readptr1, v_zero );
	vec_ushort8 v_rewind8		= spu_cmpgt( v_nextptr, v_endptr );

	// narrow the two 8-lane halfword results into one 16-lane byte vector (one byte per chunk)
	vec_uchar16 v_extend_valid	= (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger );
	vec_uchar16 v_rewind_invalid	= (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger );
	vec_uchar16 v_rewind		= (vec_uchar16) v_rewind8;

	// invalid = (rewind && rewindinvalid) || !extendvalid   (spu_orc(a, b) is a | ~b)
	vec_uchar16 v_valid_rhs		= spu_and( v_rewind_invalid, v_rewind );
	vec_uchar16 v_invalid		= spu_orc( v_valid_rhs, v_extend_valid );

	// check to see if the chunk is being processed
	// chunks whose chunkNext equals CHUNKNEXT_FREE_BLOCK are masked out of the invalid bits
	vec_uint4 v_free = spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );
	vec_uint4   v_invalid_bits	= spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free );

	// if any of the bits are invalid, then no can do
	if ( spu_extract(v_invalid_bits, 0) ) {
		return NULL;
	}

	// fetch in the data before this triangle in the cache buffer
	// the local buffer mirrors a 128-byte aligned region; 'extra' is the write offset within its first line
	unsigned int offset = cache->endTriangle;
	_currentTriangleBufferExtra = offset & 127;
	unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & ~127);
	if (_currentTriangleBufferExtra) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), 128, 0, MFC_GET_CMD);

		// ensure DMA did actually complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// final bit of initialisation
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
	_currentTriangleOffset = offset;
	_currentTriangleRewind = v_rewind8;
	// effective address of the endTriangle field inside the cache line
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache));
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea); 
	_currentTriangleBufferEA = trianglebuffer_ea; 

	// printf("Allocated new triangle buffer: %x\n", offset);

	// and return the buffer ready to go
	return _currentTriangle;
}
コード例 #21
0
ファイル: dma_example_spu.c プロジェクト: chmoder/PS3
void process_data_simd (float* buf_in, float* buf_out, unsigned int size)
{
  unsigned int i;
  vector float *vbuf_in, *vbuf_out;
  vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f};
  vbuf_in = (vector float*) buf_in;
  vbuf_out = (vector float*) buf_out;

  for (i = 0; i < (size / 4); i++)
  {
   vbuf_out[i] = spu_add (vbuf_in[i], v1); 
  }
}

/*
 * SPU entry point for the single-buffered DMA example.
 *
 * Fetches the control block from argp, then handles this SPE's data one
 * chunk of CHUNK_SIZE floats at a time: get the chunk, run
 * process_data_simd on it, put the result back, waiting for each transfer
 * before continuing.
 *
 * Returns 0 on success, 1 if no DMA tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  unsigned long long in_addr, out_addr;
  int chunk, num_chunks;

#ifdef USE_TIMER
  uint64_t start, time_working;
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* Reserve the single MFC tag used for every transfer below. */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* Fetch the control block from system memory and wait for it. */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* Number of whole chunks assigned to this SPE. */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /* Main loop: fetch one chunk, process it, write it back. */
  chunk = 0;
  while (chunk < num_chunks)
  {
    /* System memory addresses of this chunk's input and output. */
    in_addr = control_block.in_addr + (chunk* CHUNK_SIZE * sizeof(float));
    out_addr = control_block.out_addr + (chunk * CHUNK_SIZE * sizeof(float));

    /* Get the input chunk and wait for it to arrive. */
    mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* Work on the data that has just been moved into local store. */
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE);

    /* Put the result back to system memory and wait for completion. */
    mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), 
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    chunk++;
  }
#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
コード例 #22
0
ファイル: cacheline.c プロジェクト: ralferoo/spugl
/*
 * Walks the list of renderable cache lines whose first effective address is
 * stored at (eah_render_tasks:eal_render_tasks), claims a chunk of tiles from
 * each line and renders the triangles queued for it.
 *
 * For every cache line the code:
 *   1. loads the 128-byte line with MFC_GETLLAR_CMD and picks a chunk that is
 *      neither done nor busy, preferring chunks shorter than 17 tiles;
 *   2. marks that chunk busy (optionally reserving a free chunk slot) and
 *      writes the line back with MFC_PUTLLC_CMD, starting the iteration over
 *      when the conditional store reports failure;
 *   3. fetches triangles until findFirstTriangleTile() yields a tile >= 0 or
 *      endTriangle is reached;
 *   4. splits the chunk around that tile, again under GETLLAR/PUTLLC;
 *   5. calls processTriangleChunks() for the remaining triangles and finally
 *      flushTileBuffers().
 *
 * The function returns as soon as spu_stat_in_mbox() is non-zero at the top
 * of an iteration.
 *
 * NOTE(review): this listing stops before the closing braces of the outer
 * while loop and of the function; anything after that point is not covered
 * by this description.
 */
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    // NOTE(review): only 12 of the 16 elements are listed, so the last four are
    // zero-initialised; this constant is not referenced in the visible code.
    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    // NOTE(review): not referenced in the visible code.
    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    // local-store landing area for triangle data fetched by DMA
    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    // over-allocate by 127 bytes so a 128-byte aligned window always fits
    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    // fetch the effective address of the first cache line (tag 0) and wait for it
    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        // NOTE(review): SHUF0, SHUF1 and MERGE are not defined in the visible code;
        // presumably shuffle patterns declared elsewhere in the file.
        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        // a next-start of 0 is replaced by 4096 before the length is computed
        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        // "small" means a length below 17
        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        // use the short candidates when there are any, otherwise fall back to all candidates
        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        // move the 16 flag bits to the top of the word so the leading-zero count is the chunk index;
        // a count of 32 means that no bit was set
        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
            // also the target of the goto issued from the chunk-splitting loop further down
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        // conditional store failed: redo the whole selection on the same cache line
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            // (the transfer starts at the 128-byte boundary at or below the triangle and
            // its length is rounded up to a multiple of 128)
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // we're gonna use freeChunk2 for the "in front" block, as we've not
                    // used freeChunk, let's use it as it's more likely to have a free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep whole block, update info and mark busy
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
                        // NOTE(review): the printf, debug_render_tasks and sleep calls here sit
                        // outside the (empty) #ifdef INFO below, so they run in every build;
                        // confirm whether that is intended.
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#ifdef INFO
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            // ok is 0 for the first processTriangleChunks call and 1 for all later ones
            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on cache line slots
                // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
                // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
                // in this case, we process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0
コード例 #23
0
ファイル: merge_spu.c プロジェクト: pstrinkle/misc-umbc
/*
 * Block until every DMA transfer issued under the tag group `tag` has
 * completed.
 *
 * tag: MFC tag group id (valid tags run from 0 to 31).
 *
 * The mask is built from an unsigned 1: with a signed int, `1 << 31` shifts
 * into the sign bit, which is undefined behaviour in C, and 31 is a legal tag.
 */
void waitfor_matrix_io ( int tag ) {
	mfc_write_tag_mask(1u << tag);
	mfc_read_tag_status_all();   // Wait for the data array DMA to complete.
}
コード例 #24
0
ファイル: multiply_spu.c プロジェクト: pstrinkle/misc-umbc
/*
 * SPU entry point for the tiled matrix multiply.
 *
 * Fetches the argument block from main memory at argp, then for every
 * stsize x stsize tile in the row band [args.i_initial, args.i_final)
 * accumulates CM0 += AM0 * BM0 over all bk tiles and writes the C tile back.
 * The whole computation is repeated `loops` times.
 *
 * speid, envp: unused.
 * argp:        effective address of the argument block (128 bytes are read).
 * Returns 0.
 *
 * Fixes relative to the previous version:
 *  - the innermost multiply loop used the same variable `k` as the outer
 *    repeat loop, so the repeat counter was overwritten with stsize on every
 *    tile; the outer loop then ran only once, or never terminated when
 *    loops > stsize + 1.  The repeat loop now has its own counter and the
 *    multiply loop declares its own k.
 *  - the tag mask for tag 31 is built with 1u << 31; 1 << 31 on a signed int
 *    is undefined behaviour.
 *  - i_initial / i_final were assigned but never read; the row loop now uses
 *    them instead of re-reading args on every iteration.
 */
int main(unsigned long long speid, addr64 argp, addr64 envp) 
{
	int bi, bj, bk ;
	int i_initial, i_final ;
	int pass ;

	/* Neither the SPE id nor the environment pointer is needed here. */
	(void) speid ;
	(void) envp ;

	/* Here is the actual DMA call */
	/* the first parameter is the address in local store to place the data */
	/* the second parameter holds the main memory address                  */
	/* the third parameter holds the number of bytes to DMA                */
	/* the fourth parameter identifies a "tag" to associate with this DMA  */
	/* (this should be a number between 0 and 31, inclusive)               */
	/* the last two parameters are only useful if you've implemented your  */
	/* own cache replacement management policy.  Otherwise set them to 0.  */
	mfc_get((void*)&args, argp.ull, 128, 31, 0, 0);

	/* Now, we set the "tag bit" into the correct channel on the hardware  */
	/* this is always 1 left-shifted by the tag specified with the DMA     */
	/* for whose completion you wish to wait.                              */
	mfc_write_tag_mask(1u<<31);

	/* Wait for the data array DMA to complete. */
	mfc_read_tag_status_all();

	pA_matrix = args.Amat ;
	pB_matrix = args.Bmat ;
	pC_matrix = args.Cmat ;
	i_initial = args.i_initial ;
	i_final   = args.i_final ;


	for( pass=0; pass<loops; pass++ ) {
		for(bi=i_initial; bi<i_final; bi+=stsize) {
			for(bj=0; bj<tsize; bj+=stsize) {
				/* C tile travels on tag 30, A and B tiles on tag 31 */
				get_Cmatrix_segment ( 30, bi, bj );
				waitfor_matrix_io  ( 30 ) ;
				for(bk=0; bk<tsize; bk+=stsize) {
			
					get_Amatrix_segments ( 31, bi, bk );
					get_Bmatrix_segments ( 31, bk, bj );
					waitfor_matrix_io  ( 31 ) ;
			
					/* k is local to this block so it cannot disturb `pass` */
					{ int i, j, k;
					for (i=0;i<stsize;i++) {
						for (j=0;j<stsize;j++) {
							for (k=0;k<stsize;k++) {
								CM0[i][j] += AM0[i][k] * BM0[k][j] ;
							}
						}
					}
					}
			
				}
				put_Cmatrix_segment ( 30, bi, bj );
				waitfor_matrix_io  ( 30 ) ;
			}
		}
	}



# ifdef NEVER
	{ int i, j;
	for (i=0;i<32;i++) {
		for (j=0;j<10;j++) {
			printf(" %7.2f",iM0[i][j]);
		}
		printf("\n");
	}
		printf("\n\n");
	}
# endif

# ifdef NEVER
	{ int i, j;
	for (i=0;i<tsize;i++) {
		for (j=0;j<10;j++) {
			printf(" %7.2f",fM0[i][j]);
		}
		printf("\n");
	}
		printf("\n\n");
	}
# endif


	return 0;
}
コード例 #25
0
/*
 * SPU entry point for the OGR core wrapper.
 *
 * Pulls a CellOGRCoreArgs block from main memory (low 32 bits of argp),
 * validates its two signature words, runs ogr_cycle_256_test() on the
 * embedded state, stamps the reply signatures and pushes the block back.
 *
 * speid, envp: unused.
 * Returns 0 on success or one of the RETVAL_ERR_* codes when a signature
 * word is wrong before or after the core ran.
 */
int main(unsigned long long speid, addr64 argp, addr64 envp)
{
  // Structure layout is shared with assembly code, so pin the offsets
  STATIC_ASSERT(sizeof(struct OgrLevel) == 6*16+16+16);
  STATIC_ASSERT(sizeof(struct OgrState) == 2*16 + 8*16*29);
  STATIC_ASSERT(sizeof(CellOGRCoreArgs) == 16 + 2*16 + 8*16*29 + 16 + 16 + 16);
  STATIC_ASSERT(offsetof(CellOGRCoreArgs, state       ) == 16);
  STATIC_ASSERT(offsetof(CellOGRCoreArgs, state.Levels) == 16 + 32);
  STATIC_ASSERT(sizeof(u16) == 2); /* DMA fetches of pchoose */

  (void) speid;
  (void) envp;

  // Every transfer in this program uses the same tag, so set the mask once
  mfc_write_tag_mask(1<<DMA_ID);

  // Bring the argument block into local store and wait for it
  mfc_get(&myCellOGRCoreArgs, argp.a32[1], sizeof(CellOGRCoreArgs), DMA_ID, 0, 0);
  mfc_read_tag_status_all();

  s32 retval;

  if (myCellOGRCoreArgs.sign1 != SIGN_PPU_TO_SPU_1)
  {
    /* incoming block is corrupt: first signature */
    retval = RETVAL_ERR_BAD_SIGN1;
  }
  else if (myCellOGRCoreArgs.sign2 != SIGN_PPU_TO_SPU_2)
  {
    /* incoming block is corrupt: second signature */
    retval = RETVAL_ERR_BAD_SIGN2;
  }
  else
  {
    // Gather what the core needs
    struct OgrState* state = &myCellOGRCoreArgs.state;
    int* pnodes   = &myCellOGRCoreArgs.pnodes;
    u32  upchoose = myCellOGRCoreArgs.upchoose;

    // Remembered across calls; the groups are cleared whenever maxdepth changes
    static int last_maxdepth;
    if (state->maxdepth != last_maxdepth)
    {
      last_maxdepth = state->maxdepth;
      cleargroups();
    }

    /* the core will not handle nodes == 0, so skip it in that case */
    if (*pnodes)
    {
      myCellOGRCoreArgs.ret_depth = ogr_cycle_256_test(state, pnodes, upchoose);
    }

    // The signatures must have survived the core run
    if (myCellOGRCoreArgs.sign1 != SIGN_PPU_TO_SPU_1)
    {
      retval = RETVAL_ERR_TRASHED_SIGN1;
    }
    else if (myCellOGRCoreArgs.sign2 != SIGN_PPU_TO_SPU_2)
    {
      retval = RETVAL_ERR_TRASHED_SIGN2;
    }
    else
    {
      retval = 0;
    }

    update_groups_stats();
  }

  // Stamp the reply signatures and write the block back, on every path
  myCellOGRCoreArgs.sign1 = SIGN_SPU_TO_PPU_1;
  myCellOGRCoreArgs.sign2 = SIGN_SPU_TO_PPU_2;
  mfc_put(&myCellOGRCoreArgs, argp.a32[1], sizeof(CellOGRCoreArgs), DMA_ID, 0, 0);
  mfc_read_tag_status_all();

  return retval; /* no status codes in ogr-ng, core info returned in ret_depth */
}
コード例 #26
0
ファイル: spu.c プロジェクト: LaitaStefan/labs-2014
void process_image_2lines(struct image* img){
	unsigned char *input, *output, *output2, *temp;
	unsigned int addr1, addr2, i, j, k, r1, g1, b1, r2, g2, b2;
	
	int block_nr = img->block_nr;
	
	vector unsigned char *v1_1, *v1_2, *v1_3, *v1_4, *v1_5;
	vector unsigned char *v2_1, *v2_2, *v2_3, *v2_4, *v2_5;

	// optimization
	unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width;
	unsigned int num_channels_X_img_width_X_SCALE_FACTOR = num_channels_X_img_width * SCALE_FACTOR;
	
	input  = malloc_align(2 * num_channels_X_img_width_X_SCALE_FACTOR, 4);
	
	output  = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4);
	output2 = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4);
	
	temp = malloc_align(2 * NUM_CHANNELS * img->width, 4);

	// first line
	v1_1 = (vector unsigned char *) &input[0];
	v1_2 = (vector unsigned char *) &input[1 * num_channels_X_img_width];
	v1_3 = (vector unsigned char *) &input[2 * num_channels_X_img_width];
	v1_4 = (vector unsigned char *) &input[3 * num_channels_X_img_width];
	v1_5 = (vector unsigned char *) temp;
	
	// second line
	v2_1 = (vector unsigned char *) &input[4 * num_channels_X_img_width];
	v2_2 = (vector unsigned char *) &input[5 * num_channels_X_img_width];
	v2_3 = (vector unsigned char *) &input[6 * num_channels_X_img_width];
	v2_4 = (vector unsigned char *) &input[7 * num_channels_X_img_width];
	v2_5 = (vector unsigned char *) &temp[num_channels_X_img_width];

	addr2 = (unsigned int)img->dst; //start of image
	addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * 
		img->height / NUM_IMAGES_HEIGHT; //start line of spu block
	addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS *
		img->width / NUM_IMAGES_WIDTH;

	for (i = 0; i<img->height / SCALE_FACTOR / 2; i++){
		// get 8 lines
		addr1 = ((unsigned int)img->src) + 2 * i * num_channels_X_img_width_X_SCALE_FACTOR;
		mfc_get(input, addr1, 2 * num_channels_X_img_width * SCALE_FACTOR, MY_TAG, 0, 0);
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();

		// compute the 2 scaled line
		for (j = 0; j < num_channels_X_img_width / 16; j++){
			v1_5[j] = spu_avg(spu_avg(v1_1[j], v1_2[j]), spu_avg(v1_3[j], v1_4[j]));
			v2_5[j] = spu_avg(spu_avg(v2_1[j], v2_2[j]), spu_avg(v2_3[j], v2_4[j]));
		}

		for (j = 0; j < img->width; j += SCALE_FACTOR){
			r1 = g1 = b1 = 0;
			r2 = b2 = g2 = 0;
			for (k = j; k < j + SCALE_FACTOR; k++) {
				unsigned int k_X_NUM_CHANNELS = k * NUM_CHANNELS;
				r1 += temp[k_X_NUM_CHANNELS + 0];
				g1 += temp[k_X_NUM_CHANNELS + 1];
				b1 += temp[k_X_NUM_CHANNELS + 2];

				r2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 0];
				g2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 1];
				b2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 2];
			}
			r1 /= SCALE_FACTOR;
			b1 /= SCALE_FACTOR;
			g1 /= SCALE_FACTOR;
			
			r2 /= SCALE_FACTOR;
			b2 /= SCALE_FACTOR;
			g2 /= SCALE_FACTOR;
			
			output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r1;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g1;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b1;
			
			output2[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r2;	
			output2[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g2;
			output2[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b2;
		}

		//put the scaled line back
		mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //line inside spu block
		
		// trimite si al 2-lea set
		mfc_put(output2, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //line inside spu block
		
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();
	}

	free_align(temp);
	free_align(input);
	free_align(output);
	free_align(output2);
}
コード例 #27
0
ファイル: spu.c プロジェクト: LaitaStefan/labs-2014
/*
 * Down-scales one image by SCALE_FACTOR using two input buffers, so that the
 * DMA fetch of the next group of source lines is issued before the current
 * group is processed.
 *
 * Input transfers use tag 0 for buffer 0 and tag 1 for buffer 1; output
 * transfers use MY_TAG.  For every group of SCALE_FACTOR source lines the
 * lines are averaged vertically with spu_avg, then groups of SCALE_FACTOR
 * pixels are averaged horizontally, and the resulting line is DMAed to the
 * destination position derived from img->dst and img->block_nr.
 *
 * img: image descriptor; width, height, src, dst and block_nr are read.
 *
 * If a working buffer cannot be allocated the function releases what it
 * obtained and returns without issuing any DMA.
 *
 * The per-line processing used to be written out twice (inside the loop and
 * again after it for the last buffer); it now lives in a single loop that
 * issues the same sequence of DMA commands.
 *
 * NOTE(review): the vertical averaging reads four input lines per output
 * line, so it presumably relies on SCALE_FACTOR == 4.  If MY_TAG is 0 or 1
 * the wait after mfc_put also waits for a prefetch - confirm its value.
 */
void process_image_double(struct image* img){
	unsigned char *input[2], *output, *temp;
	unsigned int addr1, addr2, i, j, k, r, g, b;
	unsigned int out_idx;
	int block_nr = img->block_nr;
	vector unsigned char *v1[2], *v2[2], *v3[2], *v4[2], *v5;

	int buf, nxt_buf; //index of the buffer (0/1)
	int have_next;    //non-zero while another group of lines is being prefetched

	// byte size of one source line
	unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width;

	input[0] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);
	input[1] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);

	output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4);
	temp = malloc_align(NUM_CHANNELS * img->width, 4);

	// never DMA into or compute on a buffer that could not be allocated
	if (!input[0] || !input[1] || !output || !temp)
		goto cleanup;

	v1[0] = (vector unsigned char *) &input[0][0];
	v2[0] = (vector unsigned char *) &input[0][1 * num_channels_X_img_width];
	v3[0] = (vector unsigned char *) &input[0][2 * num_channels_X_img_width];
	v4[0] = (vector unsigned char *) &input[0][3 * num_channels_X_img_width];
	v5 = (vector unsigned char *) temp;

	v1[1] = (vector unsigned char *) &input[1][0];
	v2[1] = (vector unsigned char *) &input[1][1 * num_channels_X_img_width];
	v3[1] = (vector unsigned char *) &input[1][2 * num_channels_X_img_width];
	v4[1] = (vector unsigned char *) &input[1][3 * num_channels_X_img_width];


	addr2 = (unsigned int)img->dst; //start of image
	addr2 += (block_nr / NUM_IMAGES_HEIGHT) * num_channels_X_img_width * 
		img->height / NUM_IMAGES_HEIGHT; //start line of spu block
	addr2 += (block_nr % NUM_IMAGES_WIDTH) * num_channels_X_img_width / NUM_IMAGES_WIDTH;

	addr1 = ((unsigned int)img->src);

	buf = 0; // first data transfer
	mfc_getb(input[buf], addr1, SCALE_FACTOR * num_channels_X_img_width, 0, 0, 0);

	// i is the index of the group that gets prefetched in this iteration;
	// the group already in flight (index i - 1) is the one processed
	for (i = 1; ; i++){
		nxt_buf = buf ^ 1;
		have_next = i < img->height / SCALE_FACTOR;

		if (have_next) {
			// start fetching the next 4 lines into the other buffer
			// (mfc_getb: get with barrier)
			addr1 = ((unsigned int)img->src) + i * num_channels_X_img_width * SCALE_FACTOR;
			mfc_getb(input[nxt_buf], addr1, SCALE_FACTOR * num_channels_X_img_width, nxt_buf, 0, 0);
		}

		// wait for the buffer that is about to be processed
		mfc_write_tag_mask(1 << buf);
		mfc_read_tag_status_all();

		// vertical average of the 4 lines in the current buffer
		for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){
			v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j]));
		}
		
		// horizontal average over groups of SCALE_FACTOR pixels
		for (j = 0; j < img->width; j+=SCALE_FACTOR){
			r = g = b = 0;
			for (k = j; k < j + SCALE_FACTOR; k++) {
				r += temp[k * NUM_CHANNELS + 0];
				g += temp[k * NUM_CHANNELS + 1];
				b += temp[k * NUM_CHANNELS + 2];
			}
			r /= SCALE_FACTOR;
			b /= SCALE_FACTOR;
			g /= SCALE_FACTOR;

			out_idx = j / SCALE_FACTOR * NUM_CHANNELS;
			output[out_idx + 0] = (unsigned char) r;
			output[out_idx + 1] = (unsigned char) g;
			output[out_idx + 2] = (unsigned char) b;
		}

		// send the scaled line to main memory and wait, since output is reused
		mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //line inside spu block
		
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();

		if (!have_next)
			break; // the buffer just processed was the last one

		buf = nxt_buf; //prepare next iteration
	}

cleanup:
	// guard each release: after a failed allocation some pointers are NULL
	if (temp)
		free_align(temp);
	if (input[0])
		free_align(input[0]);
	if (input[1])
		free_align(input[1]);
	if (output)
		free_align(output);
}
コード例 #28
0
ファイル: spu_threads.c プロジェクト: pstrinkle/misc-umbc
int main (unsigned long long spe_id, 
          unsigned long long argp, 
          unsigned long long envp)
{
   unsigned int id;
   int i, j, bufindex;
   vector float temp[4];

   /* this is a set of 2 16K buffers */
   vector float buf[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128)));
   vector float out[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128)));

   vector unsigned char maskLeft = (vector unsigned char){0x00, 0x01, 0x02, 0x03,
                                                          0x10, 0x11, 0x12, 0x13,
                                                          0x04, 0x05, 0x06, 0x07,
                                                          0x14, 0x15, 0x16, 0x17};

   vector unsigned char maskRight = (vector unsigned char){0x08, 0x09, 0x0a, 0x0b,
                                                           0x18, 0x19, 0x1a, 0x1b,
                                                           0x0c, 0x0d, 0x0e, 0x0f,
                                                           0x1c, 0x1d, 0x1e, 0x1f};

   transpose_package_t package;

   /* location markers */
   unsigned long long dataaddr = 0;
   int rowid, blockid, blockaddr, blockstart, row;
   int opporowid, oppoblockaddr;

   /* read in package */
   mfc_get(&package, argp, sizeof(transpose_package_t), TAG, 0, 0);
   mfc_write_tag_mask(1<<TAG);
   mfc_read_tag_status_all();

   id = package.id;

   blockstart = id * (N / THREADCNT / BLOCK) * BLOCK * sizeof(float);

   /* For each Row set (64 rows in a row set)
    *     for each block
    *         for each row in a block
    *              read
    */
   for (rowid = 0; rowid < N; rowid += BLOCK)
   {
      /* read in prebuf */
      blockid = 0;

      blockaddr = blockstart + (blockid * sizeof(buf[0][0]));

      /* each rowset is 64 rows */
      for (row = rowid; row < rowid + BLOCK; row++)
      {
	 dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr;

	 mfc_get(
	    buf[blockid & 1][row % BLOCK],
	    dataaddr,
	    sizeof(buf[0][0]),
	    0,
	    0,
	    0);
      }

      /* each spu must walk 8 blocks per rowset */
      for (blockid = 1; blockid < (N / THREADCNT / BLOCK); blockid++)
      {
	 blockaddr = blockstart + (blockid * sizeof(buf[0][0]));

	 /* each rowset is 64 rows */
	 for (row = rowid; row < rowid + BLOCK; row++)
	 {
	    dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr;

	    mfc_get(
	       buf[blockid & 1][row % BLOCK],
	       dataaddr,
	       sizeof(buf[0][0]),
	       blockid & 1,
	       0,
	       0);
	 }

	 mfc_write_tag_mask(1 << (1 - (blockid & 1)));
	 mfc_read_tag_status_all();

	 bufindex = (blockid & 1) ? 0 : 1;

	 /* transpose the previous block */
         for (i = 0; i < BLOCK; i+= 4)
         {
            for (j = 0; j < BLOCK / 4; j++)
            {
               /* first phase */
               temp[0] = spu_shuffle(
                  buf[bufindex][i][j],
                  buf[bufindex][i+2][j],
                  maskLeft);
               temp[1] = spu_shuffle(
                  buf[bufindex][i][j],
                  buf[bufindex][i+2][j],
                  maskRight);
               temp[2] = spu_shuffle(
                  buf[bufindex][i+1][j],
                  buf[bufindex][i+3][j],
                  maskLeft);
               temp[3] = spu_shuffle(
                  buf[bufindex][i+1][j],
                  buf[bufindex][i+3][j],
                  maskRight);

               /* second phase */
               out[bufindex][j*4][i/4] =
                  spu_shuffle(temp[0], temp[2], maskLeft);
               out[bufindex][(j*4)+1][i/4] =
                  spu_shuffle(temp[0], temp[2], maskRight);
               out[bufindex][(j*4)+2][i/4] =
                  spu_shuffle(temp[1], temp[3], maskLeft);
               out[bufindex][(j*4)+3][i/4] =
                  spu_shuffle(temp[1], temp[3], maskRight);
            }
         }

	 /* calculating opposite location! */
	 oppoblockaddr = rowid * sizeof(float);

	 blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0]));
	 opporowid = blockaddr / sizeof(float);
	 
	 /* write the block back out -> to the opposite location! */
	 for (row = opporowid; row < opporowid + BLOCK; row++)
	 {
	    dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr;
	    
	    mfc_put(
	       out[1 - (blockid & 1)][row % BLOCK],
	       dataaddr,
	       sizeof(buf[0][0]),
	       1 - (blockid & 1),
	       0,
	       0);
	 }
      }

      /* handle final block in row */

      mfc_write_tag_mask(2);
      mfc_read_tag_status_all();

      /* process remaining block */
      bufindex = (blockid == 1) ? 0 : 1;

      /* transpose the previous block */
      /* i indexes the row */
      for (i = 0; i < BLOCK; i+=4)
      {
         /* j indexes the column */
         for (j = 0; j < BLOCK / 4; j++)
         {
            /* first phase */
            temp[0] = spu_shuffle(
               buf[bufindex][i][j],
               buf[bufindex][i+2][j],
               maskLeft);
            temp[1] = spu_shuffle(
               buf[bufindex][i][j],
               buf[bufindex][i+2][j],
               maskRight);
            temp[2] = spu_shuffle(
               buf[bufindex][i+1][j],
               buf[bufindex][i+3][j],
               maskLeft);
            temp[3] = spu_shuffle(
               buf[bufindex][i+1][j],
               buf[bufindex][i+3][j],
               maskRight);

            /* second phase */
            out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft);
            out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight);
            out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft);
            out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight);
         }
      }

      /* calculating opposite for the previous block */
      blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0]));

      oppoblockaddr = rowid * sizeof(float);
      opporowid = blockaddr / sizeof(float);

      /* write the block back out -> to the opposite location! */
      for (row = opporowid; row < opporowid + BLOCK; row++)
      {
         dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr;

         mfc_put(
            out[bufindex][row % BLOCK],
            dataaddr,
            sizeof(buf[0][0]),
            1,
            0,
            0);
      }

      mfc_read_tag_status_all();
   }

   return 0;
}
コード例 #29
0
ファイル: dataflow.c プロジェクト: robertfoss/edan25
/*
 * SPU worker loop.
 *
 * Allocates four local bitset buffers (in, out, use, def) of
 * param.bitset_size bytes, reserves four MFC tags, then repeatedly:
 *   1. reads an index from the inbound mailbox (UINT_MAX means "exit"),
 *   2. DMAs the four bitsets at param.bs_*_addr + offset into local store,
 *   3. calls bitset_megaop() on them,
 *   4. DMAs `in` back to param.bs_in_addr + offset,
 *   5. writes the index to the outbound interrupt mailbox.
 *
 * NOTE(review): on the exit path the function returns without releasing the
 * four buffers or the reserved tags.
 */
void work(param_t param)
{
printf("SPU[%u] work()\n", param.proc);
	unsigned int inbox, offset;
    /* local-store copies of the four bitsets */
    unsigned int *in = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *out = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *use = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *def = malloc_align(param.bitset_size, ALIGN_EXP);
    if(in == NULL || out == NULL || use == NULL || def == NULL) {
	    printf("malloc_align() failed\n");
	    exit(1);
    }
    unsigned tag_1, tag_2, tag_3, tag_4;
    /* NOTE(review): tag_id is not used anywhere in the visible code */
    unsigned int tag_id;   
    /* Reserve one MFC tag per bitset buffer.
     * NOTE(review): a failed reservation is only reported; execution
     * continues with MFC_TAG_INVALID as the tag. */
    if ((tag_1 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_1\n"); 
    }
    if ((tag_2 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_2\n"); 
    }
    if ((tag_3 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_3\n"); 
    }
    if ((tag_4 = mfc_tag_reserve()) == MFC_TAG_INVALID) 
    {
        printf("ERROR: unable to reserve a tag_4\n");
    } 

	while(1) {
		/* read the next work index from the inbound mailbox */
		inbox = spu_read_in_mbox();

        /* UINT_MAX is the exit signal */
        if(inbox == UINT_MAX)
        {
            printf("SPU[%u] received exit signal.. exiting.\n", param.proc);
            return;
        }
		
		/* offset added to each bitset base address for this index */
		offset = param.bitset_subsets*inbox;

		/* fetch the four bitsets, one tag each, then wait for all of them.
		 * NOTE(review): the addresses are cast to unsigned int, so only the
		 * low 32 bits are used -- confirm the address width. */
		mfc_get(in,  (unsigned int) (param.bs_in_addr  + offset), param.bitset_size, tag_1, 0, 0);
		mfc_get(out, (unsigned int) (param.bs_out_addr + offset), param.bitset_size, tag_2, 0, 0);
		mfc_get(use, (unsigned int) (param.bs_use_addr + offset), param.bitset_size, tag_3, 0, 0);
		mfc_get(def, (unsigned int) (param.bs_def_addr + offset), param.bitset_size, tag_4, 0, 0);
		mfc_write_tag_mask(1 << tag_1 | 1 << tag_2 | 1 << tag_3 | 1 << tag_4);
		mfc_read_tag_status_all();

/* D() is defined outside this listing -- presumably a debug-only wrapper.
 * The wrapped statements print the addresses and the set bits among the
 * first 100 positions of use, def, out and in as just read. */
D(printf("SPU[%d] index: %u  bitset_subsets: %u  offset: %u\n", param.proc, inbox, param.bitset_subsets, offset);
printf("SPU[%d]\t&use: %p\n\t&def: %p\n\t&out: %p\n\t&in:  %p\n", param.proc, (void*)param.bs_use_addr, (void*)param.bs_def_addr, (void*)param.bs_out_addr, (void*)param.bs_in_addr);
void *tmp_ptr = (void*) (param.bs_use_addr  + offset);
printf("SPU[%d] read\t\t&%p = use(%p)={", param.proc, (void*)use, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(use, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_def_addr  + offset);
printf("SPU[%d] read\t\t&%p = def(%p)={", param.proc, (void*)def, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(def, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_out_addr  + offset);
printf("SPU[%d] read\t\t&%p = out(%p)={", param.proc, (void*)out, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(out, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");
tmp_ptr = (void*) (param.bs_in_addr  + offset);
printf("SPU[%d] read\t\t&%p = in (%p)={", param.proc, (void*)in, tmp_ptr);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(in, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n"));
		/* bitset_megaop() is defined elsewhere; presumably it updates `in`,
		 * since `in` is the only buffer written back below -- confirm */
		bitset_megaop(param, in, out, use, def);		

/* debug dump of the set bits among the first 100 positions of `in` */
D(printf("SPU[%d] calculated\tin={", param.proc);
	for (int i = 0; i < 100; ++i){
	if ( bitset_get_bit(in, i) ) {
			printf("%d ", i);
		}
	}
printf("}\n");)

		/* write `in` back to where it was read from and wait for completion */
		mfc_put(in, (unsigned int)  (param.bs_in_addr  +  offset), param.bitset_size, tag_1, 0, 0);
		mfc_write_tag_mask(1 << tag_1);
		mfc_read_tag_status_all();

		/* report the finished index through the outbound interrupt mailbox */
		spu_write_out_intr_mbox(inbox);
	}
コード例 #30
0
ファイル: spu_main.c プロジェクト: davidoguns/raytracer
/*
 * Loads a scene and the data it references into local store using DMA;
 * blocks until every transfer has completed.
 *
 * ea    - effective address of the scene_t in main memory
 * scene - local-store destination for the scene_t
 *
 * After the scene_t itself arrives, its object and light arrays are fetched
 * from scene->objects_ea / scene->lights_ea into freshly allocated buffers,
 * and each polygon object's vertex array is fetched from its vertex_ea.
 * scene->objects, scene->lights and every polygon's poly_obj.vertex are
 * repointed at the local copies.  The buffers are not freed here.
 *
 * NOTE(review): the _malloc_align() results are not checked, and each array
 * is moved with a single DMA command; the MFC limits one command to 16 KB,
 * so large arrays may need splitting -- confirm against the scene sizes.
 */
void load_scene(unsigned long long ea, scene_t *scene)
{
	unsigned int i;
	object3d_t *objects;
	pointlight_t *lights;
	point_t *v;

#if defined(_DEBUG) && _DEBUG > 2
	/* log the destination (scene), not the address of the parameter */
	printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for SCENE\n",
		(int)sizeof(scene_t),
		(unsigned long)scene,
		(unsigned long)mfc_ea2h(ea),
		(unsigned long)mfc_ea2l(ea));
#endif
	/* DMA request for scene */
	spu_mfcdma64(scene,
		mfc_ea2h(ea),
		mfc_ea2l(ea),
		sizeof(scene_t),
		SPUDMA_SCENE,
		MFC_GET_CMD);

	/* the scene header must be complete before its counts and addresses
	 * can be read */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_SCENE);
	mfc_read_tag_status_all();

	/* copy over objects */
	objects = _malloc_align(sizeof(object3d_t) * scene->nObjects, 4);
#if defined(_DEBUG) && _DEBUG > 2
	printf("Transferring %d bytes to LSaddr(%8lX) from EAadd(%8lX:%8lX) for OBJECTS\n",
		(int)(sizeof(object3d_t) * scene->nObjects),
		(unsigned long)objects,
		(unsigned long)mfc_ea2h(scene->objects_ea),
		(unsigned long)mfc_ea2l(scene->objects_ea));
#endif
	/* initiate DMA */
	spu_mfcdma64(objects,
		mfc_ea2h(scene->objects_ea),
		mfc_ea2l(scene->objects_ea),
		sizeof(object3d_t) * scene->nObjects,
		SPUDMA_OBJECTS,
		MFC_GET_CMD);

	/* copy over lights; this transfer overlaps with the objects transfer */
	lights = _malloc_align(sizeof(pointlight_t) * scene->nLights, 4);
#if defined(_DEBUG) && _DEBUG > 2
	printf("Transferring %d bytes to LSaddr(%8X) from EAadd(%8lX:%8lX) for LIGHTS\n",
		(int)(sizeof(pointlight_t) * scene->nLights),
		(unsigned int)(unsigned long)lights,
		(unsigned long)mfc_ea2h(scene->lights_ea),
		(unsigned long)mfc_ea2l(scene->lights_ea));
#endif
	/* initiate DMA for lights */
	spu_mfcdma64(lights,
		mfc_ea2h(scene->lights_ea),
		mfc_ea2l(scene->lights_ea),
		sizeof(pointlight_t) * scene->nLights,
		SPUDMA_LIGHTS,
		MFC_GET_CMD);

	/* wait for objects to complete: the loop below reads them */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_OBJECTS);
	mfc_read_tag_status_all();
	/* assign local store pointer to objects */
	scene->objects = objects;

	/* iterate each object locally, fetching vertex data for polygons */
	for (i = 0; i < scene->nObjects; ++i)
	{
		if (objects[i].geometryType != GEOMETRY_POLYGON)
		{
			continue;
		}

		/* allocate memory for the vertices */
		v = _malloc_align(sizeof(point_t)
			* objects[i].poly_obj.nVerticies, 4);
		/* initiate DMA to get the vertices; all polygons share one tag */
		spu_mfcdma64(v,
			mfc_ea2h(objects[i].poly_obj.vertex_ea),
			mfc_ea2l(objects[i].poly_obj.vertex_ea),
			sizeof(point_t)
			* objects[i].poly_obj.nVerticies,
			SPUDMA_VERTEXES,
			MFC_GET_CMD);
		/* the pointer is stored before the transfer finishes; the vertex
		 * data itself is only valid after the wait below */
		objects[i].poly_obj.vertex = v;
	}

	/* wait for all DMA to finish (vertexes, lights) */
	spu_writech(MFC_WrTagMask, 1 << SPUDMA_LIGHTS |
				1 << SPUDMA_VERTEXES );
	mfc_read_tag_status_all();
	/* assign local store lights pointer */
	scene->lights = lights;
}