示例#1
0
文件: spu.c 项目: spabreu/Adaptive
// Push the results back to main memory: first the solution vector held in
// sd.ad.sol (written to the address saved in sd.value), then the sd block
// itself (written to sd.sd_ea).
void as_send ()
{
  // -- send value vector back to system memory -------------------------------
  // A single DMA is limited to SPU_MAX_DMA bytes (16 KB), so the vector is
  // written out in pieces, waiting for each piece before issuing the next.
  {
    int remaining = sd.size;
    char *ls_ptr = (char *) sd.ad.sol;   // source, local store
    char *ea_ptr = (char *) sd.value;    // destination, main memory

    do {
      int chunk = SPU_MAX_DMA;

      if (remaining < SPU_MAX_DMA)
        chunk = remaining;

      mfc_putb (ls_ptr, (unsigned int) ea_ptr, chunk, tag, 0, 0);
      waittag (tag);

      ls_ptr += chunk;
      ea_ptr += chunk;
      remaining -= chunk;
    } while (remaining != 0);
  }

  // -- send sd block back (inc. ad) ------------------------------------------
  // While the block is in flight sd.ad.sol must carry the main-memory address
  // (sd.value) rather than the local-store one, so swap it in temporarily.
  void *ls_sol = sd.ad.sol;

  sd.ad.sol = sd.value;
  mfc_putb ((void *) &sd, sd.sd_ea, sizeof (sd), (int) tag, 0, 0);
  waittag (tag);
  sd.ad.sol = ls_sol;           // back to the local-store address
}
/* SPU entry point: reserve a DMA tag, fetch the parameter block from the
 * address in argp, then run process_frame() once per frame.
 *
 * speid - used only in the debug traces
 * argp  - effective address of the parameter block
 * envp  - passed to mfc_get() as the transfer size of that block
 *
 * Returns 0 on success, -1 if no DMA tag could be reserved.
 */
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp)
{
	/* Destination of the mfc_get() below; declared 16-byte aligned. */
	ppu_data_t ppu_data __attribute__ ((aligned(16)));
	int frame;

	tag_id = mfc_tag_reserve();
	if (tag_id == MFC_TAG_INVALID) {
		printf("SPU: ERROR can't allocate tag ID\n");
		return -1;
	}

	dprintf("SPU: am intrat in spu %llx %lu %llx\n",
			speid, sizeof(ppu_data_t), envp);

	/* Fetch by DMA the structure holding the pointers, the number of
	 * frames and the spe_id. */
	mfc_get((void *)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0);
	waittag(tag_id);

	dprintf("SPU: speid:%llx got struct\n", speid);
	dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n",
			speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image,
			ppu_data.num_frames);

	/* Mark speid as used in case the dprintf() calls above compile away. */
	(void)speid;

	/* Frame processing goes here */
	frame = 0;
	while (frame < ppu_data.num_frames) {
		process_frame(ppu_data, frame);
		++frame;
	}

	return 0;
}
示例#3
0
文件: spu.c 项目: spabreu/Adaptive
// DMA ROUND_UP_16(sd.size) bytes from the local store of the previous SPU in
// the ring (index (sd.num - 1) modulo sd.num_thr) into this SPU's sd.ad.sol,
// and wait for the transfer to complete.
void as_mbx_copy_prev ()
{
  // Add num_thr before subtracting so that SPU 0 wraps to num_thr - 1.
  // A bare (sd.num - 1) % sd.num_thr gives -1 with signed operands, or
  // (2^32 - 1) % num_thr with unsigned ones; neither is the ring predecessor
  // in general, and the former indexes far outside spe_ls_ea[].
  uint32_t prev = (sd.num + sd.num_thr - 1) % sd.num_thr;

  // NOTE(review): &sd.ad.sol is the address of the pointer field inside sd,
  // not the address of the buffer it points to. If the intent is "the same
  // solution buffer in the other SPU", this probably wants the value of
  // sd.ad.sol instead -- confirm before relying on this function.
  mfc_getb (sd.ad.sol,		// store here (load address)
	    (uint32_t) sd.spe_ls_ea[prev] +
	      (int) &sd.ad.sol, // same address but in other SPU
	    ROUND_UP_16(sd.size), // number of bytes to copy
	    tag, 0, 0);
  waittag (tag);
}
示例#4
0
文件: spu.c 项目: spabreu/Adaptive
// SPU entry point: fetch the sd block and the value vector from main memory,
// run the solver on them and send the result back with as_send().
//
//   speid - only used to label the debug traces
//   argp  - effective address of the sd block in main memory
//
// Returns 0 on the normal path; calls exit(1) if the local-store buffer for
// the value vector cannot be allocated.
int main (uint64_t speid, uint64_t argp)
{
  // Every variadic argument to the trace calls below is cast to the exact
  // type its conversion specifier expects; a mismatch is undefined behaviour.
  DPRINTF ("+(spu)main (%lld, %lld)\n", (long long) speid, (long long) argp);

  // -- reserve DMA tag ID for this SPU ---------------------------------------
  if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID)
    as_exitf ("ERROR - can't reserve a tag\n");
  DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", (long long) speid, (int) tag);

  // -- get CBE and problem information from system memory. -------------------
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %d, %lu, 0, 0)\n",
	   (long long) speid, (unsigned) &sd, (unsigned long long) argp,
	   (int) sizeof(sd), (unsigned long) tag);
  mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0);
  DPRINTF (" [%lld] waittag (%d)\n", (long long) speid, (int) tag);
  waittag (tag);

  sd.sd_ea = argp;		// save PPE address of sd block
  sd.value = sd.ad.sol;		// save PPE address of solution vector
  sd.size = ROUND_UP_128 (sd.ad.size_in_bytes);
  sd.ad.sol = memalign (16, sd.size); // allocate LS block
  if (sd.ad.sol == NULL) {
    fprintf (stderr,
	     "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num);
    exit(1);
  }


#if defined(DEBUG) && (DEBUG & 16)
  printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n",
	  sd.num, (unsigned) &sd, (unsigned) sd.value, (unsigned) sd.ad.sol);
#endif
  // -- *TBD* -- does sd.value need to be remapped (EA?)
  // -- get value vector from system memory into new LS block -----------------
  DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %lu, 0, 0)\n", (long long) speid,
	   (unsigned) sd.ad.sol, (unsigned) sd.value,
	   (int) sd.size, (unsigned long) tag);


  // -- fix pb with DMA limitation (max = 16 KB) ------------------------------
  // The vector is fetched in pieces of at most SPU_MAX_DMA bytes, waiting for
  // each piece before issuing the next.
  {
    int nbytes = sd.size;
    char *addr_ls = (char *) sd.ad.sol;
    char *addr_ea = (char *) sd.value;

    do {
      int sz = (nbytes < SPU_MAX_DMA)? nbytes: SPU_MAX_DMA;

      mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0);
      waittag (tag);

      nbytes -= sz;
      addr_ls += sz;
      addr_ea += sz;
    } while (nbytes > 0);
  }

#if defined(DEBUG) && (DEBUG & 8)
  printf (" [%lld] as_init dump:", (long long) speid);
  printf ("   sd.num = %d", sd.num);
  printf ("   sd.ctx = %d", (int) sd.ctx);
  printf ("   sd.thr = %d\n", (int) sd.thr);
#endif

#if defined(DEBUG) && (DEBUG & 2)
  if (sd.ad.do_not_init) {
    printf ("(SPU %d: received data (do_not_init=1):\n", sd.num);
    Ad_Display(sd.ad.sol, &sd.ad, NULL);
    printf(")\n");
  }
#endif

  // Seed is mixed with sd.num before being handed to the generator.
  Randomize_Seed (sd.ad.seed ^ sd.num);

  // -- call the benchmark-specific solver
  Solve (&sd.ad);

  // -- put the solution back on main memory for the PPE to read
  as_send ();

  //  printf ("SPU main returning\n");
  return 0;
}
/* Stores the currently processed line in big_image from PPU.
 *
 * img_scaled_line - the scaled line in local store; source of the DMA puts
 * ppu_data        - parameter block; only spe_id is read here
 * big_image       - descriptor of the destination image (width, height and
 *                   data are read)
 * chunk_no        - index of the line within this SPE's thumbnail
 *
 * All puts are issued on tag_id and the function waits for them before
 * returning.
 */
static void store_line(struct image *img_scaled_line, ppu_data_t ppu_data,
		struct image *big_image, unsigned int chunk_no) {
	/* The line must be transferred directly to big_image on PPU
	 * This is done by several DMAs, because the result will not be
	 * aligned to 16B. 624 / 4 cannot be divided by 16, therefore,
	 * there will be several DMA transfers to accomplish this.
	 *
	 * The first thumbnail will be split into DMA transfers of size
	 * 16*x and 4 Bytes
	 * The second one: 4 Bytes then 8 Bytes then 16 * (x - 1) Bytes
	 *		then 8 Bytes
	 * The third one: 8 Bytes then 16 * (x - 1) Bytes then 8 Bytes
	 *		then 4 Bytes
	 * The fourth one: 4 Bytes then 16 * x Bytes
	 * Where 16 * x + 4 represents the size of the line
	 */
	uint32_t tile_row_sz = big_image->width * (big_image->height / 4) * 3;
	uint32_t base_dma_addr = (uint32_t)big_image->data +
			(ppu_data.spe_id / 4) * tile_row_sz +
			big_image->width * 3 * chunk_no +
			(ppu_data.spe_id % 4) * img_scaled_line->width * 3;
	uint32_t base_local_addr = (uint32_t)img_scaled_line->data;
	/* Zero-initialised: not every case below sets all four sizes, yet all
	 * four are read by the trace that follows the switch.
	 */
	unsigned int first_size = 0;
	unsigned int second_size = 0;
	unsigned int third_size = 0;
	unsigned int fourth_size = 0;
	int two_transfers = 0;	/* 1 if  the first and fourth thumbnails
							 * 0 for the second and third thumbnails
							 */
	switch (ppu_data.spe_id % 4) {
		case 0:
			/* First thumbnail */
			first_size = (img_scaled_line->width * 3) / 16 * 16;
			fourth_size = (img_scaled_line->width * 3) % 16;
			two_transfers = 1;
			break;
		case 1:
			/* Second thumbnail */
			first_size = 4;
			second_size = 8;
			third_size = ((img_scaled_line->width * 3) / 16 - 1) * 16;
			fourth_size = 8;
			break;
		case 2:
			/* Third thumbnail */
			first_size = 8;
			second_size = ((img_scaled_line->width * 3) / 16 - 1) * 16;
			third_size = 8;
			fourth_size = 4;
			break;
		case 3:
			/* Fourth thumbnail */
			first_size = (img_scaled_line->width * 3) % 16;
			fourth_size = (img_scaled_line->width * 3)/ 16 * 16;
			two_transfers = 1;
			break;
		default:
			break;
	}

	/* The chunk index is the chunk_no parameter; there is no local i here. */
	dprintf("SPU[%d]-chk[%d] base_dma_addr=%p base_local_addr=%p first_size=%u"\
			" second_size=%u third_size=%u fourth_size=%u two_tr=%d\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr, first_size,
			second_size, third_size, fourth_size, two_transfers);
	 /* Transfer first block of the line */
	mfc_put((void *)(base_local_addr), base_dma_addr,
			(uint32_t)first_size, tag_id, 0, 0);

#ifdef DEBUG
	waittag(tag_id);
#endif

	base_dma_addr += first_size;
	base_local_addr += first_size;

	dprintf("SPU[%d]-chk[%d] sent second_dma_addr=%p second_local_addr = %p\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr);

	/* Send the second and the third blocks of the line only if the current
	 * thumbnail is in the middle of the resulting image
	 */
	if (two_transfers == 0) {
		mfc_put((void *)(base_local_addr), base_dma_addr,
				(uint32_t)second_size, tag_id, 0, 0);

		base_dma_addr += second_size;
		base_local_addr += second_size;
		mfc_put((void *)(base_local_addr), base_dma_addr,
				(uint32_t)third_size, tag_id, 0, 0);

		base_dma_addr += third_size;
		base_local_addr += third_size;
	}

	/* Send the last block of the line */
	mfc_put((void *)(base_local_addr), base_dma_addr,
			(uint32_t)fourth_size, tag_id, 0, 0);

	/* Wait for every put issued above on tag_id. */
	waittag(tag_id);

	dprintf("SPU[%d]-chk[%d] sent fourth_dma_addr=%p fourth_local_addr = %p\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr);
}
/* Does the actual processing of the frame.
 *
 * Fetches the input and big_image descriptors named by ppu_data, then walks
 * the input four lines at a time: each chunk is fetched by DMA, averaged by
 * compute_lines_average() and compute_columns_average() into one scaled line,
 * and that line is written out with store_line().
 *
 * ppu_data - parameter block; spe_id, input and big_image are read
 */
static void do_work(ppu_data_t ppu_data) {
	struct image input;
	struct image big_image;

	dprintf("SPU[%d] ppu_data.input:%p ppu_big_img:%p sizeof(struct image):%lu\n",
		ppu_data.spe_id, (void *)ppu_data.input,
		(void *)ppu_data.big_image, sizeof(struct image));

	/* Get input image and big_image details */
	mfc_get((void *)(&input), (uint32_t)(ppu_data.input),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);
	mfc_get((void *)(&big_image), (uint32_t)(ppu_data.big_image),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);

	/* Both descriptor fetches share tag_id, so one wait covers them. */
	waittag(tag_id);
	dprintf("SPU[%d] got structs\n"\
			"input.width=%u\tinput.height=%u\n"\
			"big_image.width=%u\tbig_image.height=%u\n"\
			"input.data=%p\tbig_image.data=%p\n",
			ppu_data.spe_id, input.width, input.height, big_image.width,
			big_image.height, (void *)input.data, (void *)big_image.data);

	struct image img_chunk;
	unsigned int buf_line_sz = input.width * NUM_CHANNELS;
	int transfer_sz = 4 * buf_line_sz;

	img_chunk.width = input.width;
	img_chunk.height = 4;
	alloc_image(&img_chunk);

	struct image img_scaled_line;
	img_scaled_line.width = input.width / SCALE_FACTOR;
	img_scaled_line.height = 1;

	/* Hack for memory align of local image data to have the same 4 bits in its
	 * address as the remote corresponding address in PPU
	 */
	int left_padding = (ppu_data.spe_id % 4) * 4;

	/* The buffer has to hold one whole scaled line, because store_line()
	 * DMAs img_scaled_line.width * 3 bytes out of it. The previous fixed
	 * size of NUM_CHANNELS * 3 bytes was overrun for any real line width.
	 */
	unsigned int scaled_line_sz = img_scaled_line.width * NUM_CHANNELS;
	unsigned char *addr_to_free = malloc_align(scaled_line_sz * sizeof(char) +
												left_padding, 4);
	if (addr_to_free == NULL) {
		printf("SPU: ERROR can't allocate scaled line buffer\n");
		free_image(&img_chunk);
		return;
	}

	img_scaled_line.data = addr_to_free + left_padding;

	unsigned int i;
	/* Process 4 lines from the initial image at a time */
	for (i = 0; i < input.height / img_chunk.height; ++i) {

		/* Get the image chunk from PPU through DMA transfer */
		dprintf("SPU[%d] getting image_chunk %d of size %d\n",
				ppu_data.spe_id, i, transfer_sz);

		dprintf("SPU[%d] input.data=%p img_chunk.data=%p "\
				"start_addr=%p\n", ppu_data.spe_id, (void *)input.data,
				(void *)img_chunk.data, (void *)((uint32_t)(input.data) + i * transfer_sz));

		mfc_get((void *)(img_chunk.data), (uint32_t)(input.data) + i * transfer_sz,
				(uint32_t)(transfer_sz), tag_id, 0, 0);

		waittag(tag_id);
		dprintf("SPU[%d] got image_chunk %d\n", ppu_data.spe_id, i);

		compute_lines_average(&img_chunk, buf_line_sz);

		/* Make average for column. avg = (c0.r + c1.r) / 2 etc*/
		compute_columns_average(&img_chunk, &img_scaled_line);

		store_line(&img_scaled_line, ppu_data, &big_image, i);
	}

	free_image(&img_chunk);
	/* Free the pointer malloc_align() returned, not the padded data pointer. */
	free_align(addr_to_free);
}