Пример #1
0
/*
 * Pulls ROUND_UP_16(sd.size) bytes from the local store of the previous SPU
 * (in SPU-number order, wrapping from SPU 0 to the last one) into the block
 * that sd.ad.sol points to, and blocks until that DMA has completed.
 *
 * Works on the file-level `sd` descriptor and DMA `tag`; sd.spe_ls_ea[] is
 * presumably the table of effective addresses at which each SPE's local store
 * is mapped -- confirm against the PPE side that fills it in.
 */
void as_mbx_copy_prev ()
{
  // Add num_thr before subtracting 1 so that SPU 0 wraps to the last SPU.
  // (0 - 1) % n is negative for signed operands, and for unsigned operands
  // it only equals n - 1 when n is a power of two.
  uint32_t prev = (sd.num + sd.num_thr - 1) % sd.num_thr;

  // NOTE(review): the source offset below is the address of the `sol` field
  // itself, not the value stored in it.  If sd.ad.sol is a pointer to a
  // separately allocated buffer, this reads the other SPU's descriptor rather
  // than its solution data -- confirm which one is intended.
  mfc_getb (sd.ad.sol,		// store here (load address)
	    (uint32_t) sd.spe_ls_ea[prev] +
	      (int) &sd.ad.sol, // same address but in other SPU
	    ROUND_UP_16(sd.size), // number of bytes to copy
	    tag, 0, 0);
  waittag (tag);
}
Пример #2
0
/**
 * Fetches x data from main memory.
 *
 * Starts three DMA reads (conc, wind and diff for slot i, each at byte offset
 * ea_off from that array's ea_base) into the shuffle[0..2] staging buffers,
 * then, one stream at a time, waits for the transfer and passes the staging
 * buffer to copy_as_vector() together with the slot's own data buffer.
 *
 * @param i       slot index into conc[], wind[] and diff[]; also selects the
 *                DMA tags (i+9, i+10, i+11)
 * @param ea_off  byte offset added to each stream's effective base address
 */
void fetch_x_buffer(uint32_t i, uint64_t ea_off)
{
    /* One tag per stream so each transfer can be waited on individually. */
    const uint32_t conc_tag = i + 9;
    const uint32_t wind_tag = conc_tag + 1;
    const uint32_t diff_tag = conc_tag + 2;

    /* Queue all three reads before waiting on any of them. */
    mfc_getb(shuffle[0].data,
             conc[i].ea_base + ea_off,
             conc[i].length * sizeof(real_t),
             conc_tag, 0, 0);
    mfc_getb(shuffle[1].data,
             wind[i].ea_base + ea_off,
             wind[i].length * sizeof(real_t),
             wind_tag, 0, 0);
    mfc_getb(shuffle[2].data,
             diff[i].ea_base + ea_off,
             diff[i].length * sizeof(real_t),
             diff_tag, 0, 0);

    /* Hand each staging buffer on as soon as its own transfer is complete. */
    wait_for_dma(conc_tag);
    copy_as_vector(shuffle[0].data, (vector real_t*)conc[i].data, conc[i].length);

    wait_for_dma(wind_tag);
    copy_as_vector(shuffle[1].data, (vector real_t*)wind[i].data, wind[i].length);

    wait_for_dma(diff_tag);
    copy_as_vector(shuffle[2].data, (vector real_t*)diff[i].data, diff[i].length);
}
Пример #3
0
/*
 * SPU program entry point.
 *
 * Steps, in order:
 *   1. reserve a DMA tag (stored in the file-level `tag`);
 *   2. DMA the descriptor block at effective address `argp` into `sd`;
 *   3. remember the fetched sd.ad.sol in sd.value, allocate a local block of
 *      ROUND_UP_128(sd.ad.size_in_bytes) bytes and point sd.ad.sol at it;
 *   4. DMA the vector at sd.value into that block in chunks of at most
 *      SPU_MAX_DMA bytes;
 *   5. seed the random generator, run Solve() and call as_send().
 *
 * speid  identifier passed by the loader; only used in diagnostics here.
 * argp   effective address of the descriptor block to load into `sd`.
 *
 * Returns 0 on completion.  Calls exit(1) if the local block cannot be
 * allocated; calls as_exitf() if no DMA tag can be reserved (presumably that
 * helper does not return -- confirm).
 */
int main (uint64_t speid, uint64_t argp)
{
  DPRINTF ("+(spu)main (%lld, %lld)\n", speid, argp);

  // -- reserve DMA tag ID for this SPU ---------------------------------------
  if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID)
    as_exitf ("ERROR - can't reserve a tag\n");
  DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", speid, tag);

  // -- get CBE and problem information from system memory. -------------------
  // Arguments are cast so that each one matches its conversion specifier
  // (sizeof yields a size_t, which is not what %d expects).
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %d, %d, 0, 0)\n", speid,
	   (unsigned) &sd, argp, (int) sizeof(sd), (int) tag);
  mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0);
  DPRINTF (" [%lld] waittag (%d)\n", speid, (int) tag);
  waittag (tag);

  sd.sd_ea = argp;		// save PPE address of sd block
  sd.value = sd.ad.sol;		// save PPE address of solution vector
  sd.size = ROUND_UP_128 (sd.ad.size_in_bytes);
  sd.ad.sol = memalign (16, sd.size); // allocate LS block
  if (sd.ad.sol == NULL) {
    fprintf (stderr,
	     "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num);
    exit(1);
  }


#if defined(DEBUG) && (DEBUG & 16)
  // Pointers are cast to unsigned to match %x, as the DPRINTF calls do.
  printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n",
	  sd.num, (unsigned) &sd, (unsigned) sd.value, (unsigned) sd.ad.sol);
#endif
  // -- *TBD* -- does sd.value need to be remapped (EA?)
  // -- get value vector from system memory into new LS block -----------------
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %d, 0, 0)\n", speid,
	   (unsigned) sd.ad.sol, (unsigned) sd.value,
	   (int) sd.size, (int) tag);
  

  // -- fix pb with DMA limitation (max = 16 KB) ------------------------------
  // The transfer is split into chunks of at most SPU_MAX_DMA bytes; each chunk
  // is waited for before the next one is issued.
  {
    int nbytes = sd.size;
    char *addr_ls = (char *) sd.ad.sol;
    char *addr_ea = (char *) sd.value;

    do {
      int sz = (nbytes < SPU_MAX_DMA)? nbytes: SPU_MAX_DMA;

      mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0);
      waittag (tag);

      nbytes -= sz;
      addr_ls += sz;
      addr_ea += sz;
    } while (nbytes > 0);
  }

#if defined(DEBUG) && (DEBUG & 8)
  printf (" [%lld] as_init dump:", speid);
  printf ("   sd.num = %d", sd.num);
  printf ("   sd.ctx = %d", (int) sd.ctx);
  printf ("   sd.thr = %d\n", (int) sd.thr);
#endif

#if defined(DEBUG) && (DEBUG & 2)
  if (sd.ad.do_not_init) {
    printf ("(SPU %d: received data (do_not_init=1):\n", sd.num);
    Ad_Display(sd.ad.sol, &sd.ad, NULL);
    printf(")\n");
  }
#endif
  
  // Mix the SPU number into the seed so each SPU gets a different one.
  Randomize_Seed (sd.ad.seed ^ sd.num);

  // -- call the benchmark-specific solver
  Solve (&sd.ad);
  
  // -- put the solution back on main memory for the PPE to read
  as_send ();

  //  printf ("SPU main returning\n");
  return 0;
}
Пример #4
0
void process_image_double(struct image* img){
	unsigned char *input[2], *output, *temp;
	unsigned int addr1, addr2, i, j, k, r, g, b;
	int block_nr = img->block_nr;
	vector unsigned char *v1[2], *v2[2], *v3[2], *v4[2], *v5;

	int buf, nxt_buf; //index of the buffer (0/1)

	input[0] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);
	input[1] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);

	output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4);
	temp = malloc_align(NUM_CHANNELS * img->width, 4);

	//optimization
	unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width;

	v1[0] = (vector unsigned char *) &input[0][0];
	v2[0] = (vector unsigned char *) &input[0][1 * num_channels_X_img_width];
	v3[0] = (vector unsigned char *) &input[0][2 * num_channels_X_img_width];
	v4[0] = (vector unsigned char *) &input[0][3 * num_channels_X_img_width];
	v5 = (vector unsigned char *) temp;

	v1[1] = (vector unsigned char *) &input[1][0];
	v2[1] = (vector unsigned char *) &input[1][1 * num_channels_X_img_width];
	v3[1] = (vector unsigned char *) &input[1][2 * num_channels_X_img_width];
	v4[1] = (vector unsigned char *) &input[1][3 * num_channels_X_img_width];


	addr2 = (unsigned int)img->dst; //start of image
	addr2 += (block_nr / NUM_IMAGES_HEIGHT) * num_channels_X_img_width * 
		img->height / NUM_IMAGES_HEIGHT; //start line of spu block
	addr2 += (block_nr % NUM_IMAGES_WIDTH) * num_channels_X_img_width / NUM_IMAGES_WIDTH;

	addr1 = ((unsigned int)img->src);

	buf = 0; // first data transfer
	mfc_getb(input[buf], addr1, SCALE_FACTOR * num_channels_X_img_width, 0, 0, 0);

	for (i = 1; i<img->height / SCALE_FACTOR; i++){
		// get 4 lines
		nxt_buf = buf ^ 1; //ask for next data buffer from PPU
		
		//mfg_get with barrier
		addr1 = ((unsigned int)img->src) + i * num_channels_X_img_width * SCALE_FACTOR;
		mfc_getb(input[nxt_buf], addr1, SCALE_FACTOR * num_channels_X_img_width, nxt_buf, 0, 0);

		mfc_write_tag_mask(1 << buf);
		mfc_read_tag_status_all();

		// process current buffer
		for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){
			v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j]));
		}
		
		for (j = 0; j < img->width; j+=SCALE_FACTOR){
			r = g = b = 0;
			for (k = j; k < j + SCALE_FACTOR; k++) {
				r += temp[k * NUM_CHANNELS + 0];
				g += temp[k * NUM_CHANNELS + 1];
				b += temp[k * NUM_CHANNELS + 2];
			}
			r /= SCALE_FACTOR;
			b /= SCALE_FACTOR;
			g /= SCALE_FACTOR;

			output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b;
		}

		// sent precedent buffer to PPU
		mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //line inside spu block
		
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();

		buf = nxt_buf; //prepare next iteration
	}

	mfc_write_tag_mask(1 << buf);
	mfc_read_tag_status_all();

	// process last buffer
	for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){
		v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j]));
	}
	
	for (j=0; j < img->width; j+=SCALE_FACTOR){
		r = g = b = 0;
		for (k = j; k < j + SCALE_FACTOR; k++) {
			r += temp[k * NUM_CHANNELS + 0];
			g += temp[k * NUM_CHANNELS + 1];
			b += temp[k * NUM_CHANNELS + 2];
		}
		r /= SCALE_FACTOR;
		b /= SCALE_FACTOR;
		g /= SCALE_FACTOR;

		output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r;
		output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g;
		output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b;
	}

	// send last buffer to PPU
	mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
	addr2 += img->width * NUM_CHANNELS;

	mfc_write_tag_mask(1 << MY_TAG);
	mfc_read_tag_status_all();

	free_align(temp);
	free_align(input[0]);
	free_align(input[1]);
	free_align(output);
}