void as_send () { void *t; // -- send value vector back to system memory ------------------------------- // -- fix pb with DMA limitation (max = 16 KB) ------------------------------ { int nbytes = sd.size; char *addr_ls = (char *) sd.ad.sol; char *addr_ea = (char *) sd.value; do { int sz = (nbytes < SPU_MAX_DMA)? nbytes: SPU_MAX_DMA; mfc_putb (addr_ls, (unsigned int) addr_ea, sz, tag, 0, 0); waittag (tag); nbytes -= sz; addr_ls += sz; addr_ea += sz; } while (nbytes); } t = sd.ad.sol; // restore PPE address for solution vector sd.ad.sol = sd.value; // -- send sd block back (inc. ad) ------------------------------------------ mfc_putb ((void *) &sd, sd.sd_ea, sizeof (sd), (int) tag, 0, 0); waittag (tag); sd.ad.sol = t; // put it back. }
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) { int i = 0; ppu_data_t ppu_data __attribute__ ((aligned(16))); tag_id = mfc_tag_reserve(); if (tag_id == MFC_TAG_INVALID){ printf("SPU: ERROR can't allocate tag ID\n"); return -1; } /* Obtin prin DMA structura cu pointeri, nr de frame-uri si spe_id */ dprintf("SPU: am intrat in spu %llx %lu %llx\n", speid, sizeof(ppu_data_t), envp); mfc_get((void*)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0); waittag(tag_id); dprintf("SPU: speid:%llx got struct\n", speid); dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n", speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image, ppu_data.num_frames); speid = speid; /* Frame processing goes here */ for (i = 0; i < ppu_data.num_frames; ++i) { process_frame(ppu_data, i); } return 0; }
void as_mbx_copy_prev () { uint32_t prev = (sd.num - 1) % sd.num_thr; mfc_getb (sd.ad.sol, // store here (load address) (uint32_t) sd.spe_ls_ea[prev] + (int) &sd.ad.sol, // same address but in other SPU ROUND_UP_16(sd.size), // number of bytes to copy tag, 0, 0); waittag (tag); }
int main (uint64_t speid, uint64_t argp) { DPRINTF ("+(spu)main (%lld, %lld)\n", speid, argp); // -- reserve DMA tag ID for this SPU --------------------------------------- if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID) as_exitf ("ERROR - can't reserve a tag\n"); DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", speid, tag); // -- get CBE and problem information from system memory. ------------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %d, %lu, 0, 0)\n", speid, (unsigned) &sd, argp, sizeof(sd), (int) tag); mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0); DPRINTF (" [%lld] waittag (%d)\n", speid, (int) tag); waittag (tag); sd.sd_ea = argp; // save PPE address of sd block sd.value = sd.ad.sol; // save PPE address of solution vector sd.size = ROUND_UP_128 (sd.ad.size_in_bytes); sd.ad.sol = memalign (16, sd.size); // allocate LS block if (sd.ad.sol == NULL) { fprintf (stderr, "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num); exit(1); } #if defined(DEBUG) && (DEBUG & 16) printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n", sd.num, &sd, sd.value, sd.ad.sol); #endif // -- *TBD* -- does sd.value need to be remapped (EA?) // -- get value vector from system memory into new LS block ----------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %lu, 0, 0)\n", speid, (unsigned) sd.ad.sol, (unsigned) sd.value, sd.size, tag); // -- fix pb with DMA limitation (max = 16 KB) ------------------------------ { int nbytes = sd.size; char *addr_ls = (char *) sd.ad.sol; char *addr_ea = (char *) sd.value; do { int sz = (nbytes < SPU_MAX_DMA)? 
nbytes: SPU_MAX_DMA; mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0); waittag (tag); nbytes -= sz; addr_ls += sz; addr_ea += sz; } while (nbytes > 0); } #if defined(DEBUG) && (DEBUG & 8) printf (" [%lld] as_init dump:", speid); printf (" sd.num = %d", sd.num); printf (" sd.ctx = %d", (int) sd.ctx); printf (" sd.thr = %d\n", (int) sd.thr); #endif #if defined(DEBUG) && (DEBUG & 2) if (sd.ad.do_not_init) { printf ("(SPU %d: received data (do_not_init=1):\n", sd.num); Ad_Display(sd.ad.sol, &sd.ad, NULL); printf(")\n"); } #endif Randomize_Seed (sd.ad.seed ^ sd.num); // -- call the benchmark-specific solver Solve (&sd.ad); // -- put the solution back on main memory for the PPE to read as_send (); // printf ("SPU main returning\n"); return 0; }
/*
 * Stores the currently processed line in big_image on the PPU.
 *
 * img_scaled_line - scaled line held locally; width * 3 bytes are sent
 * ppu_data        - supplies spe_id, which selects the thumbnail position
 * big_image       - description (width, height, data) of the target image
 * chunk_no        - index of the line inside the thumbnail
 *
 * Returns once the DMA transfers issued on tag_id have completed.
 */
static void store_line(struct image *img_scaled_line, ppu_data_t ppu_data,
		struct image *big_image, unsigned int chunk_no)
{
	/* The line must be transferred directly to big_image on PPU.
	 * This is done by several DMAs, because the result will not be
	 * aligned to 16B. 624 / 4 cannot be divided by 16, therefore,
	 * there will be several DMA transfers to accomplish this.
	 *
	 * With 16 * x + 4 being the size of the line:
	 * The first thumbnail is split into transfers of 16 * x Bytes
	 *	and 4 Bytes
	 * The second one: 4 Bytes then 8 Bytes then 16 * (x - 1) Bytes
	 *	then 8 Bytes
	 * The third one: 8 Bytes then 16 * (x - 1) Bytes then 8 Bytes
	 *	then 4 Bytes
	 * The fourth one: 4 Bytes then 16 * x Bytes
	 */
	uint32_t tile_row_sz = big_image->width * (big_image->height / 4) * 3;
	uint32_t base_dma_addr = (uint32_t)big_image->data +
		(ppu_data.spe_id / 4) * tile_row_sz +
		big_image->width * 3 * chunk_no +
		(ppu_data.spe_id % 4) * img_scaled_line->width * 3;
	uint32_t base_local_addr = (uint32_t)img_scaled_line->data;

	/* All sizes start at 0 so that the trace below and the default
	 * branch of the switch never read an indeterminate value */
	unsigned int first_size = 0;
	unsigned int second_size = 0;
	unsigned int third_size = 0;
	unsigned int fourth_size = 0;
	int two_transfers = 0;	/* 1 for the first and fourth thumbnails
				 * 0 for the second and third thumbnails */

	switch (ppu_data.spe_id % 4) {
	case 0: /* First thumbnail */
		first_size = (img_scaled_line->width * 3) / 16 * 16;
		fourth_size = (img_scaled_line->width * 3) % 16;
		two_transfers = 1;
		break;
	case 1: /* Second thumbnail */
		first_size = 4;
		second_size = 8;
		third_size = ((img_scaled_line->width * 3) / 16 - 1) * 16;
		fourth_size = 8;
		break;
	case 2: /* Third thumbnail */
		first_size = 8;
		second_size = ((img_scaled_line->width * 3) / 16 - 1) * 16;
		third_size = 8;
		fourth_size = 4;
		break;
	case 3: /* Fourth thumbnail */
		first_size = (img_scaled_line->width * 3) % 16;
		fourth_size = (img_scaled_line->width * 3) / 16 * 16;
		two_transfers = 1;
		break;
	default:
		break;
	}

	/* The chunk index is the chunk_no parameter; there is no local i */
	dprintf("SPU[%d]-chk[%d] base_dma_addr=%p base_local_addr=%p first_size=%u"
			" second_size=%u third_size=%u fourth_size=%u two_tr=%d\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr, first_size, second_size,
			third_size, fourth_size, two_transfers);

	/* Transfer first block of the line */
	mfc_put((void *)(base_local_addr), base_dma_addr, (uint32_t)first_size,
			tag_id, 0, 0);
#ifdef DEBUG
	waittag(tag_id);
#endif
	base_dma_addr += first_size;
	base_local_addr += first_size;
	dprintf("SPU[%d]-chk[%d] sent second_dma_addr=%p second_local_addr = %p\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr);

	/* Send the second and the third blocks of the line only if the current
	 * thumbnail is in the middle of the resulting image */
	if (two_transfers == 0) {
		mfc_put((void *)(base_local_addr), base_dma_addr,
				(uint32_t)second_size, tag_id, 0, 0);
		base_dma_addr += second_size;
		base_local_addr += second_size;

		mfc_put((void *)(base_local_addr), base_dma_addr,
				(uint32_t)third_size, tag_id, 0, 0);
		base_dma_addr += third_size;
		base_local_addr += third_size;
	}

	/* Send the last block of the line, then wait for everything queued
	 * on tag_id */
	mfc_put((void *)(base_local_addr), base_dma_addr, (uint32_t)fourth_size,
			tag_id, 0, 0);
	waittag(tag_id);
	dprintf("SPU[%d]-chk[%d] sent fourth_dma_addr=%p second_local_addr = %p\n",
			ppu_data.spe_id, (int)chunk_no, (void *)base_dma_addr,
			(void *)base_local_addr);
}
/*
 * Does the actual processing of the frame.
 *
 * Fetches the descriptions of the input image and of big_image from the
 * addresses in ppu_data, then walks the input four lines at a time: each
 * chunk is fetched by DMA, averaged over lines and columns into one scaled
 * line, and that line is written out with store_line().
 *
 * Returns early, after printing an error, if the scaled-line buffer cannot
 * be allocated.
 */
static void do_work(ppu_data_t ppu_data)
{
	/* DMA targets: forced to 16-byte alignment, as done for ppu_data
	 * in main */
	struct image input __attribute__ ((aligned(16)));
	struct image big_image __attribute__ ((aligned(16)));

	dprintf("SPU[%d] ppu_data.input:%p ppu_big_img:%p sizeof(struct image):%lu\n",
			ppu_data.spe_id, (void *)ppu_data.input,
			(void *)ppu_data.big_image, sizeof(struct image));

	/* Get input image and big_image details */
	mfc_get((void *)(&input), (uint32_t)(ppu_data.input),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);
	mfc_get((void *)(&big_image), (uint32_t)(ppu_data.big_image),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);
	waittag(tag_id);
	dprintf("SPU[%d] got structs\n"
			"input.width=%u\tinput.height=%u\n"
			"big_image.width=%u\tbig_image.height=%u\n"
			"input.data=%p\tbig_image.data=%p\n",
			ppu_data.spe_id, input.width, input.height,
			big_image.width, big_image.height,
			(void *)input.data, (void *)big_image.data);

	struct image img_chunk;
	unsigned int buf_line_sz = input.width * NUM_CHANNELS;
	int transfer_sz = 4 * buf_line_sz;
	img_chunk.width = input.width;
	img_chunk.height = 4;
	alloc_image(&img_chunk);

	struct image img_scaled_line;
	img_scaled_line.width = input.width / SCALE_FACTOR;
	img_scaled_line.height = 1;

	/* Hack for memory align of local image data to have the same 4 bits in its
	 * address as the remote corresponding address in PPU */
	int left_padding = (ppu_data.spe_id % 4) * 4;

	/* The buffer has to hold one complete scaled line, because
	 * store_line() sends img_scaled_line.width * 3 bytes starting at
	 * img_scaled_line.data */
	unsigned char *addr_to_free = malloc_align(
			img_scaled_line.width * NUM_CHANNELS * sizeof(unsigned char) +
			left_padding, 4);
	if (addr_to_free == NULL) {
		printf("SPU[%d]: ERROR can't allocate scaled line buffer\n",
				ppu_data.spe_id);
		free_image(&img_chunk);
		return;
	}
	img_scaled_line.data = addr_to_free + left_padding;

	unsigned int i;
	/* Process 4 lines from the initial image at a time */
	for (i = 0; i < input.height / img_chunk.height; ++i) {
		/* Get the image chunk from PPU through DMA transfer */
		dprintf("SPU[%d] getting image_chunk %d of size %d\n",
				ppu_data.spe_id, i, transfer_sz);
		dprintf("SPU[%d] input.data=%p img_chunk.data=%p "
				"start_addr=%p\n", ppu_data.spe_id,
				(void *)input.data, (void *)img_chunk.data,
				(void *)((uint32_t)(input.data) + i * transfer_sz));
		mfc_get((void *)(img_chunk.data),
				(uint32_t)(input.data) + i * transfer_sz,
				(uint32_t)(transfer_sz), tag_id, 0, 0);
		waittag(tag_id);
		dprintf("SPU[%d] got image_chunk %d\n", ppu_data.spe_id, i);

		compute_lines_average(&img_chunk, buf_line_sz);
		/* Make average for column. avg = (c0.r + c1.r) / 2 etc*/
		compute_columns_average(&img_chunk, &img_scaled_line);
		store_line(&img_scaled_line, ppu_data, &big_image, i);
	}

	free_image(&img_chunk);
	/* Release through the pointer returned by malloc_align, not the
	 * padded data pointer */
	free_align(addr_to_free);
}