void as_mbx_copy_prev () { uint32_t prev = (sd.num - 1) % sd.num_thr; mfc_getb (sd.ad.sol, // store here (load address) (uint32_t) sd.spe_ls_ea[prev] + (int) &sd.ad.sol, // same address but in other SPU ROUND_UP_16(sd.size), // number of bytes to copy tag, 0, 0); waittag (tag); }
/** * Fetches x data from main memory */ void fetch_x_buffer(uint32_t i, uint64_t ea_off) { const uint32_t rtag1 = i+9; const uint32_t rtag2 = i+10; const uint32_t rtag3 = i+11; mfc_getb(shuffle[0].data, conc[i].ea_base+ea_off, conc[i].length*sizeof(real_t), rtag1, 0, 0); mfc_getb(shuffle[1].data, wind[i].ea_base+ea_off, wind[i].length*sizeof(real_t), rtag2, 0, 0); mfc_getb(shuffle[2].data, diff[i].ea_base+ea_off, diff[i].length*sizeof(real_t), rtag3, 0, 0); wait_for_dma(rtag1); copy_as_vector(shuffle[0].data, (vector real_t*)conc[i].data, conc[i].length); wait_for_dma(rtag2); copy_as_vector(shuffle[1].data, (vector real_t*)wind[i].data, wind[i].length); wait_for_dma(rtag3); copy_as_vector(shuffle[2].data, (vector real_t*)diff[i].data, diff[i].length); }
int main (uint64_t speid, uint64_t argp) { DPRINTF ("+(spu)main (%lld, %lld)\n", speid, argp); // -- reserve DMA tag ID for this SPU --------------------------------------- if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID) as_exitf ("ERROR - can't reserve a tag\n"); DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", speid, tag); // -- get CBE and problem information from system memory. ------------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %d, %lu, 0, 0)\n", speid, (unsigned) &sd, argp, sizeof(sd), (int) tag); mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0); DPRINTF (" [%lld] waittag (%d)\n", speid, (int) tag); waittag (tag); sd.sd_ea = argp; // save PPE address of sd block sd.value = sd.ad.sol; // save PPE address of solution vector sd.size = ROUND_UP_128 (sd.ad.size_in_bytes); sd.ad.sol = memalign (16, sd.size); // allocate LS block if (sd.ad.sol == NULL) { fprintf (stderr, "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num); exit(1); } #if defined(DEBUG) && (DEBUG & 16) printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n", sd.num, &sd, sd.value, sd.ad.sol); #endif // -- *TBD* -- does sd.value need to be remapped (EA?) // -- get value vector from system memory into new LS block ----------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %lu, 0, 0)\n", speid, (unsigned) sd.ad.sol, (unsigned) sd.value, sd.size, tag); // -- fix pb with DMA limitation (max = 16 KB) ------------------------------ { int nbytes = sd.size; char *addr_ls = (char *) sd.ad.sol; char *addr_ea = (char *) sd.value; do { int sz = (nbytes < SPU_MAX_DMA)? nbytes: SPU_MAX_DMA; mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0); waittag (tag); nbytes -= sz; addr_ls += sz; addr_ea += sz; } while (nbytes > 0); } #if defined(DEBUG) && (DEBUG & 8) printf (" [%lld] as_init dump:", speid); printf (" sd.num = %d", sd.num); printf (" sd.ctx = %d", (int) sd.ctx); printf (" sd.thr = %d\n", (int) sd.thr); #endif #if defined(DEBUG) && (DEBUG & 2) if (sd.ad.do_not_init) { printf ("(SPU %d: received data (do_not_init=1):\n", sd.num); Ad_Display(sd.ad.sol, &sd.ad, NULL); printf(")\n"); } #endif Randomize_Seed (sd.ad.seed ^ sd.num); // -- call the benchmark-specific solver Solve (&sd.ad); // -- put the solution back on main memory for the PPE to read as_send (); // printf ("SPU main returning\n"); return 0; }
void process_image_double(struct image* img){ unsigned char *input[2], *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1[2], *v2[2], *v3[2], *v4[2], *v5; int buf, nxt_buf; //index of the buffer (0/1) input[0] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); input[1] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); //optimization unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width; v1[0] = (vector unsigned char *) &input[0][0]; v2[0] = (vector unsigned char *) &input[0][1 * num_channels_X_img_width]; v3[0] = (vector unsigned char *) &input[0][2 * num_channels_X_img_width]; v4[0] = (vector unsigned char *) &input[0][3 * num_channels_X_img_width]; v5 = (vector unsigned char *) temp; v1[1] = (vector unsigned char *) &input[1][0]; v2[1] = (vector unsigned char *) &input[1][1 * num_channels_X_img_width]; v3[1] = (vector unsigned char *) &input[1][2 * num_channels_X_img_width]; v4[1] = (vector unsigned char *) &input[1][3 * num_channels_X_img_width]; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * num_channels_X_img_width * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * num_channels_X_img_width / NUM_IMAGES_WIDTH; addr1 = ((unsigned int)img->src); buf = 0; // first data transfer mfc_getb(input[buf], addr1, SCALE_FACTOR * num_channels_X_img_width, 0, 0, 0); for (i = 1; i<img->height / SCALE_FACTOR; i++){ // get 4 lines nxt_buf = buf ^ 1; //ask for next data buffer from PPU //mfg_get with barrier addr1 = ((unsigned int)img->src) + i * num_channels_X_img_width * SCALE_FACTOR; mfc_getb(input[nxt_buf], addr1, SCALE_FACTOR * num_channels_X_img_width, nxt_buf, 0, 0); mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process current buffer for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j = 0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // sent precedent buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); buf = nxt_buf; //prepare next iteration } mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process last buffer for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // send last buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); free_align(temp); free_align(input[0]); free_align(input[1]); free_align(output); }