int cacheGetPrime(int n) { if ((n < primeCacheStart + primeCacheSize) && (n > primeCacheStart)) { int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; } // Haal op. uint32_t tag, size; tag = mfc_tag_reserve(); size = CACHE_PRIME_SIZE*16; unsigned long long EA = setup.vPrimes + (n - n%4) * 4; mfc_get(&primeCacheData, EA, size, tag, 0, 0); mfc_write_tag_mask(1 << tag); mfc_read_tag_status_all(); mfc_tag_release(tag); primeCacheStart = n - (n % 4); int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; }
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) { int i = 0; ppu_data_t ppu_data __attribute__ ((aligned(16))); tag_id = mfc_tag_reserve(); if (tag_id == MFC_TAG_INVALID){ printf("SPU: ERROR can't allocate tag ID\n"); return -1; } /* Obtin prin DMA structura cu pointeri, nr de frame-uri si spe_id */ dprintf("SPU: am intrat in spu %llx %lu %llx\n", speid, sizeof(ppu_data_t), envp); mfc_get((void*)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0); waittag(tag_id); dprintf("SPU: speid:%llx got struct\n", speid); dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n", speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image, ppu_data.num_frames); speid = speid; /* Frame processing goes here */ for (i = 0; i < ppu_data.num_frames; ++i) { process_frame(ppu_data, i); } return 0; }
/*
 * Reserve one MFC tag for each of the NUM_MEMCPY_TAGS slots of memcpy_tag.
 *
 * NOTE(review): the reserved tags are not compared with MFC_TAG_INVALID.
 */
void init_memcpy()
{
    int slot = 0;

    while (slot < NUM_MEMCPY_TAGS) {
        memcpy_tag[slot] = mfc_tag_reserve();
        slot++;
    }
}
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { int i; unsigned int tag_id; /* Reserve a tag for application usage */ if ((tag_id = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag\n"); return 1; } /* Here is the actual DMA call */ /* the first parameter is the address in local store to place the data */ /* the second parameter holds the main memory address */ /* the third parameter holds the number of bytes to DMA */ /* the fourth parameter identifies a "tag" to associate with this DMA */ /* (this should be a number between 0 and 31, inclusive) */ /* the last two parameters are only useful if you've implemented your */ /* own cache replacement management policy. Otherwise set them to 0. */ mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0); /* Now, we set the "tag bit" into the correct channel on the hardware */ /* this is always 1 left-shifted by the tag specified with the DMA */ /* for whose completion you wish to wait. */ mfc_write_tag_mask(1<<tag_id); /* Now, issue the read and wait to guarantee DMA completion before we */ /* continue. */ mfc_read_tag_status_all(); /* DMA the data from system memory to our local store buffer. */ mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0); printf("Address received through control block = 0x%llx\n", cb.addr); /* Wait for the data array DMA to complete. */ mfc_read_tag_status_all(); /* Verify that the data array contains a valid fibonacci sequence. */ for (i=2; i<DATA_BUFFER_ENTRIES; i++) { if (data[i] != data[i-1] + data[i-2]) { printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n", i, data[i-1] + data[i-2], data[i]); return (1); } } return 0; }
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm) { init_spu_abs(); init_memcpy(); uint tag_id = mfc_tag_reserve(); jobtag = mfc_tag_reserve(); memcpy_tag_ppe[0] = mfc_tag_reserve(); memcpy_tag_ppe[1] = mfc_tag_reserve(); // Transfer arg spu_mfcdma32(&arg, (unsigned int)parm, (unsigned int)sizeof(kdbuild_arg_t), tag_id, MFC_GET_CMD); DmaWait(tag_id); nsamplepoints = arg.nsamplepoints; nsplitaxises = arg.nsplitaxises; curleaf = arg.curleaf; curjob = 0; total_leaf_size = 0; MakeNodes(); DmaWaitAll(); MakeLeaves(); DmaWaitAll(); spu_mfcdma32(&numleafpolys, (unsigned int)arg.numleafpolys, sizeof(int), tag_id, MFC_PUT_CMD); DmaWait(tag_id); return 0; }
int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long parm) { int i, j; int left, cnt; float time; unsigned int tag_id; vector float dt_v, dt_inv_mass_v; // Reserve a tag ID tag_id = mfc_tag_reserve(); spu_writech(MFC_WrTagMask, -1); // Input parameter parm is a pointer to the particle parameter context. // Fetch the context, waiting for it to complete. spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(parm_context), tag_id, MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); dt_v = spu_splats(ctx.dt); // For each step in time for (time=0; time<END_OF_TIME; time += ctx.dt) { // For each block of particles for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) { // Determine the number of particles in this block. left = ctx.particles - i; cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; // Fetch the data - position, velocity and inverse_mass. Wait for the DMA to complete // before performing computation. spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_GETB_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_GET_CMD); spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i), cnt * sizeof(float), tag_id, MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); // Compute the step in time for the block of particles for (j=0; j<cnt; j++) { pos[j] = spu_madd(vel[j], dt_v, pos[j]); dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j])); vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]); } // Put the position and velocity data back into system memory spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * sizeof(vector float), tag_id, MFC_PUT_CMD); } } // Wait for final DMAs to complete before terminating SPU thread. (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); return (0); }
void pull(int side){ int avail_in = num_free_in_buffer(side); int avail_mm = mcb[am].data_size[side] - md[am].num_pulled[side]; int num_pull = avail_in < avail_mm ? avail_in : avail_mm; num_pull = num_pull < MAX_DMA_SIZE ? num_pull : MAX_DMA_SIZE; int head = spu_extract(md[am].idx[side][HEAD],0); int avail_from_head = mcb[am].buffer_size[side] - head; int first_pull = num_pull < avail_from_head ? num_pull : avail_from_head; if(!first_pull) return; // pull #first_pull unsigned int to_ea = (unsigned int) &md[am].buffer[side][head]; int tag = mfc_tag_reserve(); if(tag == MFC_TAG_INVALID){ return; } else { md[am].held_tag[side] = tag; } mfc_get((void*)to_ea, mcb[am].block_addr[side], first_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += first_pull * sizeof(vector signed int); if(first_pull < num_pull){ to_ea = (unsigned int) &md[am].buffer[side][0]; int second_pull = num_pull - first_pull; mfc_get((void*)to_ea, mcb[am].block_addr[side], second_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += second_pull * sizeof(vector signed int); } md[am].num_waiting[side] = num_pull; }
int main (uint64_t speid, uint64_t argp) { DPRINTF ("+(spu)main (%lld, %lld)\n", speid, argp); // -- reserve DMA tag ID for this SPU --------------------------------------- if ((tag = mfc_tag_reserve()) == MFC_TAG_INVALID) as_exitf ("ERROR - can't reserve a tag\n"); DPRINTF (" [%lld] mfc_tag_reserve() = %d\n", speid, tag); // -- get CBE and problem information from system memory. ------------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%llx, %d, %lu, 0, 0)\n", speid, (unsigned) &sd, argp, sizeof(sd), (int) tag); mfc_getb (&sd, argp, sizeof(sd), tag, 0, 0); DPRINTF (" [%lld] waittag (%d)\n", speid, (int) tag); waittag (tag); sd.sd_ea = argp; // save PPE address of sd block sd.value = sd.ad.sol; // save PPE address of solution vector sd.size = ROUND_UP_128 (sd.ad.size_in_bytes); sd.ad.sol = memalign (16, sd.size); // allocate LS block if (sd.ad.sol == NULL) { fprintf (stderr, "%s:%d: malloc failed in SPU %d\n", __FILE__, __LINE__, sd.num); exit(1); } #if defined(DEBUG) && (DEBUG & 16) printf ("spe%d: &sd=0x%x, sd.value=0x%x, sd.ad.sol=0x%x\n", sd.num, &sd, sd.value, sd.ad.sol); #endif // -- *TBD* -- does sd.value need to be remapped (EA?) // -- get value vector from system memory into new LS block ----------------- DPRINTF (" [%lld] mfc_get (0x%x, 0x%x, %d, %lu, 0, 0)\n", speid, (unsigned) sd.ad.sol, (unsigned) sd.value, sd.size, tag); // -- fix pb with DMA limitation (max = 16 KB) ------------------------------ { int nbytes = sd.size; char *addr_ls = (char *) sd.ad.sol; char *addr_ea = (char *) sd.value; do { int sz = (nbytes < SPU_MAX_DMA)? 
nbytes: SPU_MAX_DMA; mfc_getb (addr_ls, (uint32_t) addr_ea, sz, tag, 0, 0); waittag (tag); nbytes -= sz; addr_ls += sz; addr_ea += sz; } while (nbytes > 0); } #if defined(DEBUG) && (DEBUG & 8) printf (" [%lld] as_init dump:", speid); printf (" sd.num = %d", sd.num); printf (" sd.ctx = %d", (int) sd.ctx); printf (" sd.thr = %d\n", (int) sd.thr); #endif #if defined(DEBUG) && (DEBUG & 2) if (sd.ad.do_not_init) { printf ("(SPU %d: received data (do_not_init=1):\n", sd.num); Ad_Display(sd.ad.sol, &sd.ad, NULL); printf(")\n"); } #endif Randomize_Seed (sd.ad.seed ^ sd.num); // -- call the benchmark-specific solver Solve (&sd.ad); // -- put the solution back on main memory for the PPE to read as_send (); // printf ("SPU main returning\n"); return 0; }
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; unsigned int i, num_chunks; mfc_list_element_t* dma_list_in; unsigned int tmp_addr; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* calculate the address of the local buffer where we can point the * dma_list_in pointer to */ tmp_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS)); dma_list_in = (mfc_list_element_t*) (tmp_addr); /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks of data * NUM_LIST_ELEMENTS at a time. Each list element is going to move CHUNK_SIZE * of data into system memory. Data is moved into local store, processed, and * written back to system memory NUM_LIST_ELEMENT chunks per loop iteration. 
*/ for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float)); /* fill the dma list with the appropriate lower 32bit effective address and size for * each dma list element. This dma list is used to gather the input data * from system memory */ fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); /* issue a DMA get list command to gather the NUM_LIST_ELEMENT chunks of data from system memory. * The data will be gathered into local buffer local_buffer_in */ mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA get list command to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS); /* fill the dma list with the appropriate lower 32 bit ea and size for each * dma list element. This dma list is used to scatter the output data to system memory */ fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); /* issue the DMA put list command to scatter the result from local memory to * different places in system memory */ mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA put list to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
void process_data_simd (float* buf_in, float* buf_out, unsigned int size) { unsigned int i; vector float *vbuf_in, *vbuf_out; vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f}; vbuf_in = (vector float*) buf_in; vbuf_out = (vector float*) buf_out; for (i = 0; i < (size / 4); i++) { vbuf_out[i] = spu_add (vbuf_in[i], v1); } } int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; int i, num_chunks; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks * and fetches one 'chunk' of data at a time, process it, and write * it back to system memory until done. 
*/ for (i = 0; i < num_chunks; i++) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i* CHUNK_SIZE * sizeof(float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof(float)); /* issue a DMA get command to fetch the chunk of data from system memory */ mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into * local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE); /* issue the DMA put command to transfer result from local memory to * system memory */ mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA put to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
/*
 * Worker loop of one SPU.
 *
 * param - by-value copy of the job parameters (bitset size, per-index
 *         stride, and the main-memory addresses of the in/out/use/def
 *         bitset arrays).
 *
 * The function allocates four local bitset buffers and reserves four DMA
 * tags, then serves requests from the inbound mailbox until it reads
 * UINT_MAX.  For every other value it fetches the four bitsets at that
 * index, runs bitset_megaop(), writes the `in` bitset back and echoes the
 * index on the interrupting outbound mailbox.
 *
 * The process exits with status 1 when an allocation or a tag reservation
 * fails: continuing with MFC_TAG_INVALID would feed an out-of-range value
 * into the `1 << tag` masks and the DMA commands below.
 */
void work(param_t param)
{
    printf("SPU[%u] work()\n", param.proc);

    unsigned int inbox, offset;

    unsigned int *in  = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *out = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *use = malloc_align(param.bitset_size, ALIGN_EXP);
    unsigned int *def = malloc_align(param.bitset_size, ALIGN_EXP);

    if(in == NULL || out == NULL || use == NULL || def == NULL) {
        printf("malloc_align() failed\n");
        exit(1);
    }

    unsigned tag_1, tag_2, tag_3, tag_4;

    /* Reserve one tag per bitset so the four fetches can be tracked. */
    if ((tag_1 = mfc_tag_reserve()) == MFC_TAG_INVALID) {
        printf("ERROR: unable to reserve a tag_1\n");
        exit(1);
    }
    if ((tag_2 = mfc_tag_reserve()) == MFC_TAG_INVALID) {
        printf("ERROR: unable to reserve a tag_2\n");
        exit(1);
    }
    if ((tag_3 = mfc_tag_reserve()) == MFC_TAG_INVALID) {
        printf("ERROR: unable to reserve a tag_3\n");
        exit(1);
    }
    if ((tag_4 = mfc_tag_reserve()) == MFC_TAG_INVALID) {
        printf("ERROR: unable to reserve a tag_4\n");
        exit(1);
    }

    while(1) {
        /* Blocks until the next index (or the exit marker) arrives. */
        inbox = spu_read_in_mbox();

        if(inbox == UINT_MAX) {
            printf("SPU[%u] received exit signal.. exiting.\n", param.proc);
            return;
        }

        offset = param.bitset_subsets*inbox;

        /* Fetch the four bitsets for this index and wait for all of them. */
        mfc_get(in,  (unsigned int) (param.bs_in_addr  + offset), param.bitset_size, tag_1, 0, 0);
        mfc_get(out, (unsigned int) (param.bs_out_addr + offset), param.bitset_size, tag_2, 0, 0);
        mfc_get(use, (unsigned int) (param.bs_use_addr + offset), param.bitset_size, tag_3, 0, 0);
        mfc_get(def, (unsigned int) (param.bs_def_addr + offset), param.bitset_size, tag_4, 0, 0);

        mfc_write_tag_mask(1 << tag_1 | 1 << tag_2 | 1 << tag_3 | 1 << tag_4);
        mfc_read_tag_status_all();

        /* Debug dump of the first 100 bits of each fetched set. */
        D(printf("SPU[%d] index: %u bitset_subsets: %u offset: %u\n", param.proc, inbox, param.bitset_subsets, offset);
          printf("SPU[%d]\t&use: %p\n\t&def: %p\n\t&out: %p\n\t&in: %p\n", param.proc, (void*)param.bs_use_addr, (void*)param.bs_def_addr, (void*)param.bs_out_addr, (void*)param.bs_in_addr);
          void *tmp_ptr = (void*) (param.bs_use_addr + offset);
          printf("SPU[%d] read\t\t&%p = use(%p)={", param.proc, (void*)use, tmp_ptr);
          for (int i = 0; i < 100; ++i){
              if ( bitset_get_bit(use, i) ) {
                  printf("%d ", i);
              }
          }
          printf("}\n");
          tmp_ptr = (void*) (param.bs_def_addr + offset);
          printf("SPU[%d] read\t\t&%p = def(%p)={", param.proc, (void*)def, tmp_ptr);
          for (int i = 0; i < 100; ++i){
              if ( bitset_get_bit(def, i) ) {
                  printf("%d ", i);
              }
          }
          printf("}\n");
          tmp_ptr = (void*) (param.bs_out_addr + offset);
          printf("SPU[%d] read\t\t&%p = out(%p)={", param.proc, (void*)out, tmp_ptr);
          for (int i = 0; i < 100; ++i){
              if ( bitset_get_bit(out, i) ) {
                  printf("%d ", i);
              }
          }
          printf("}\n");
          tmp_ptr = (void*) (param.bs_in_addr + offset);
          printf("SPU[%d] read\t\t&%p = in (%p)={", param.proc, (void*)in, tmp_ptr);
          for (int i = 0; i < 100; ++i){
              if ( bitset_get_bit(in, i) ) {
                  printf("%d ", i);
              }
          }
          printf("}\n"));

        bitset_megaop(param, in, out, use, def);

        D(printf("SPU[%d] calculated\tin={", param.proc);
          for (int i = 0; i < 100; ++i){
              if ( bitset_get_bit(in, i) ) {
                  printf("%d ", i);
              }
          }
          printf("}\n");)

        /* Write the `in` set back and wait before reporting completion. */
        mfc_put(in, (unsigned int) (param.bs_in_addr + offset), param.bitset_size, tag_1, 0, 0);
        mfc_write_tag_mask(1 << tag_1);
        mfc_read_tag_status_all();

        spu_write_out_intr_mbox(inbox);
    }
}
void MakeNodes() { uint put_tag[2]; put_tag[0] = mfc_tag_reserve(); put_tag[1] = mfc_tag_reserve(); ushort b = 0; kdbuffer_t l_kdb ALIGNED(16); kdbuffer_t r_kdb ALIGNED(16); kdnode_t node ALIGNED(16); kdbuffer_t kdb ALIGNED(16); DoubleBufInit(&aabb_db, 0, 0, sizeof(aabb_t), NUM_AABBS, aabbbuffer[0], aabbbuffer[1]); // printf("Empty? %i\n", BufferEmpty(&arg.kdbuffer[b])); while(! BufferEmpty(&arg.kdbuffer[b]) ) { kdbuffer_t *pkdb = (kdbuffer_t*)arg.kdbuffer[b].buffer; int size = BufferNumElements(&arg.kdbuffer[b]); int i; BufferClear(&arg.aabb_buffer[1-b]); BufferClear(&arg.kdbuffer[1-b]); // printf("size %i\n", size); for(i=0; i < size; i++) { l_kdb.node = arg.curnode++; r_kdb.node = arg.curnode++; memcpy_ls(&kdb, &pkdb[i], sizeof(kdbuffer_t)); node.split = kdb.plane; node.axis = kdb.axis; node.left = l_kdb.node; node.right = r_kdb.node; memcpy_ea(&arg.nodes[ kdb.node ], &node, sizeof(kdnode_t)); KDBufferAllocate(&l_kdb, kdb.left_size, &arg.aabb_buffer[1-b]); if(curjob < arg.njobs) KDBufferAllocate(&r_kdb, kdb.right_size, &arg.job_aabb_buffer[curjob]); else KDBufferAllocate(&r_kdb, kdb.right_size, &arg.aabb_buffer[1-b]); KDPartitionAll(&kdb, &l_kdb, &r_kdb); if(l_kdb.depth == arg.maxdepth || l_kdb.size <= arg.maxleafsize) { total_leaf_size += l_kdb.count; l_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, l_kdb.aabb, l_kdb.count); BufferCopyToLS(&arg.leafbuffer, &l_kdb, 1); } else { BufferCopyToLS(&arg.kdbuffer[1-b], &l_kdb, 1); } if(r_kdb.depth == arg.maxdepth || r_kdb.size <= arg.maxleafsize) { total_leaf_size += r_kdb.count; if(curjob < arg.njobs) { r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.job_leaf_aabb_buffer[curjob], r_kdb.aabb, r_kdb.count); BufferCopyToLS(&arg.job_leafbuffer[curjob], &r_kdb, 1); spu_mfcdma32(&arg.job_leafbuffer[curjob], (uint)arg.pjob_leafbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD); DmaWait(jobtag); } else { r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyToLS(&arg.leafbuffer, 
&r_kdb, 1); } } else { if(curjob < arg.njobs) { BufferCopyToLS(&arg.job_kdbuffer[curjob], &r_kdb, 1); spu_mfcdma32(&arg.job_kdbuffer[curjob], (uint)arg.pjob_kdbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD); DmaWait(jobtag); } else BufferCopyToLS(&arg.kdbuffer[1-b], &r_kdb, 1); } /* if(curjob < njobs) KDBufferAllocate(&r_kdb, kdb[i].right_size, &jobs[curjob]->aabb_buffer[0]); else KDBufferAllocate(&r_kdb, kdb[i].right_size, &aabb_buffer[1-b]); KDPartition(&kdb[i], &l_kdb, &r_kdb); if(l_kdb.depth == maxdepth || l_kdb.size <= maxleafsize) { l_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, l_kdb.aabb, l_kdb.count); BufferCopyTo(&leafbuffer, &l_kdb, 1); } else BufferCopyTo(&kdbuffer[1-b], &l_kdb, 1); if(r_kdb.depth == maxdepth || r_kdb.size <= maxleafsize) { if(curjob < njobs) { r_kdb.aabb = (aabb_t*)BufferCopyTo(&jobs[curjob]->leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyTo(&jobs[curjob]->leafbuffer, &r_kdb, 1); } else { r_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyTo(&leafbuffer, &r_kdb, 1); } } else { if(curjob < njobs) BufferCopyTo(&jobs[curjob]->kdbuffer[0], &r_kdb, 1); else BufferCopyTo(&kdbuffer[1-b], &r_kdb, 1); } */ if(curjob < arg.njobs) { // Start other job ppe_post_sema(arg.sema[curjob]); curjob++; } } b = 1 - b; } while( curjob < arg.njobs) { ppe_post_sema(arg.sema[curjob]); curjob++; } // Transfer back spu_mfcdma32(&arg.curnode, (unsigned int)arg.pcurnode, (unsigned int)sizeof(int), put_tag[0], MFC_PUT_CMD); spu_mfcdma32(&total_leaf_size, (unsigned int)arg.ptotal_leaf_size, (unsigned int)sizeof(int), put_tag[1], MFC_PUT_CMD); DmaWait(put_tag[0]); DmaWait(put_tag[1]); spu_mfcdma32(&arg.leafbuffer, (unsigned int)arg.pleafbuffer, (unsigned int)sizeof(buffer_t), put_tag[0], MFC_PUT_CMD); spu_mfcdma32(&arg.leaf_aabb_buffer, (unsigned int)arg.pleaf_aabb_buffer, (unsigned int)sizeof(buffer_t), put_tag[1], MFC_PUT_CMD); DmaWaitAll(); mfc_tag_release(put_tag[0]); mfc_tag_release(put_tag[1]); }
void process_buffer(int buffer, int cnt, vector float dt_v) { int i; volatile vector float *p_inv_mass_v; vector float force_v, inv_mass_v; vector float pos0, pos1, pos2, pos3; vector float vel0, vel1, vel2, vel3; vector float dt_inv_mass_v, dt_inv_mass_v_0, dt_inv_mass_v_1, dt_inv_mass_v_2, dt_inv_mass_v_3; vector unsigned char splat_word_0 = (vector unsigned char){0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; vector unsigned char splat_word_1 = (vector unsigned char){4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7}; vector unsigned char splat_word_2 = (vector unsigned char){8, 9,10,11, 8, 9,10,11, 8, 9,10,11, 8, 9,10,11}; vector unsigned char splat_word_3 = (vector unsigned char){12,13,14,15,12,13,14,15,12,13,14,15,12,13,14,15}; p_inv_mass_v = (volatile vector float *)&inv_mass[buffer][0]; force_v = ctx.force_v; // Compute the step in time for the block of particles, four // particle at a time. for (i=0; i<cnt; i+=4) { inv_mass_v = *p_inv_mass_v++; pos0 = pos[buffer][i+0]; pos1 = pos[buffer][i+1]; pos2 = pos[buffer][i+2]; pos3 = pos[buffer][i+3]; vel0 = vel[buffer][i+0]; vel1 = vel[buffer][i+1]; vel2 = vel[buffer][i+2]; vel3 = vel[buffer][i+3]; dt_inv_mass_v = spu_mul(dt_v, inv_mass_v); pos0 = spu_madd(vel0, dt_v, pos0); pos1 = spu_madd(vel1, dt_v, pos1); pos2 = spu_madd(vel2, dt_v, pos2); pos3 = spu_madd(vel3, dt_v, pos3); dt_inv_mass_v_0 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_0); dt_inv_mass_v_1 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_1); dt_inv_mass_v_2 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_2); dt_inv_mass_v_3 = spu_shuffle(dt_inv_mass_v, dt_inv_mass_v, splat_word_3); vel0 = spu_madd(dt_inv_mass_v_0, force_v, vel0); vel1 = spu_madd(dt_inv_mass_v_1, force_v, vel1); vel2 = spu_madd(dt_inv_mass_v_2, force_v, vel2); vel3 = spu_madd(dt_inv_mass_v_3, force_v, vel3); pos[buffer][i+0] = pos0; pos[buffer][i+1] = pos1; pos[buffer][i+2] = pos2; pos[buffer][i+3] = pos3; vel[buffer][i+0] = vel0; vel[buffer][i+1] = vel1; 
vel[buffer][i+2] = vel2; vel[buffer][i+3] = vel3; } } int main(unsigned long long spu_id __attribute__ ((unused)), unsigned long long argv) { int buffer, next_buffer; int cnt, next_cnt, left; float time, dt; vector float dt_v; volatile vector float *ctx_pos_v, *ctx_vel_v; volatile vector float *next_ctx_pos_v, *next_ctx_vel_v; volatile float *ctx_inv_mass, *next_ctx_inv_mass; unsigned int tags[2]; // Reserve a pair of DMA tag IDs tags[0] = mfc_tag_reserve(); tags[1] = mfc_tag_reserve(); // Input parameter argv is a pointer to the particle context. // Fetch the parameter context, waiting for it to complete. spu_writech(MFC_WrTagMask, 1 << tags[0]); spu_mfcdma32((void *)(&ctx), (unsigned int)argv, sizeof(parm_context), tags[0], MFC_GET_CMD); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); dt = ctx.dt; dt_v = spu_splats(dt); // For each step in time for (time=0; time<END_OF_TIME; time += dt) { // For each double buffered block of particles left = ctx.particles; cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; ctx_pos_v = ctx.pos_v; ctx_vel_v = ctx.vel_v; ctx_inv_mass = ctx.inv_mass; // Prefetch first buffer of input data. buffer = 0; spu_mfcdma32((void *)(pos), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[0], MFC_GETB_CMD); spu_mfcdma32((void *)(vel), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[0], MFC_GET_CMD); spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx_inv_mass), cnt * sizeof(float), tags[0], MFC_GET_CMD); while (cnt < left) { left -= cnt; next_ctx_pos_v = ctx_pos_v + cnt; next_ctx_vel_v = ctx_vel_v + cnt; next_ctx_inv_mass = ctx_inv_mass + cnt; next_cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK; // Prefetch next buffer so the data is available for computation on next loop iteration. // The first DMA is barriered so that we don't GET data before the previous iteration's // data is PUT. 
next_buffer = buffer^1; spu_mfcdma32((void *)(&pos[next_buffer][0]), (unsigned int)(next_ctx_pos_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GETB_CMD); spu_mfcdma32((void *)(&vel[next_buffer][0]), (unsigned int)(next_ctx_vel_v), next_cnt * sizeof(vector float), tags[next_buffer], MFC_GET_CMD); spu_mfcdma32((void *)(&inv_mass[next_buffer][0]), (unsigned int)(next_ctx_inv_mass), next_cnt * sizeof(float), tags[next_buffer], MFC_GET_CMD); // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); ctx_pos_v = next_ctx_pos_v; ctx_vel_v = next_ctx_vel_v; ctx_inv_mass = next_ctx_inv_mass; buffer = next_buffer; cnt = next_cnt; } // Wait for previously prefetched data spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); process_buffer(buffer, cnt, dt_v); // Put the buffer's position and velocity data back into system memory spu_mfcdma32((void *)(&pos[buffer][0]), (unsigned int)(ctx_pos_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); spu_mfcdma32((void *)(&vel[buffer][0]), (unsigned int)(ctx_vel_v), cnt * sizeof(vector float), tags[buffer], MFC_PUT_CMD); // Wait for DMAs to complete before starting the next step in time. spu_writech(MFC_WrTagMask, 1 << tags[buffer]); (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); } return (0); }
int main( unsigned long long spe_id, unsigned long long ppu_vector_a, unsigned long long ppu_vector_b) { int i, iter, buf_idx, vec_idx; unsigned long long ppu_vector_bases[2] _ALIG(128); vector float * pchunk_a, * pchunk_b; vector float g_vec = {0,0,0,0}; ppu_vector_bases[0] = ppu_vector_a; ppu_vector_bases[1] = ppu_vector_b; const unsigned int spu_num = spu_read_in_mbox(); unsigned long long get_edge_bytes = spu_num * SUBVEC_SZ_BYTES; float buffers[NBUFFERS * BUF_SZ_FLOATS] _ALIG(128); int buffer_tags[NBUFFERS][2] _ALIG(128); //int buffer_tags[NBUFFERS]; for (iter = 0; iter < NBUFFERS; ++iter) { buffer_tags[iter][0] = mfc_tag_reserve(); buffer_tags[iter][1] = mfc_tag_reserve(); } // first mfc_get for all for (buf_idx = 0; buf_idx < NBUFFERS; ++buf_idx) { for (vec_idx = 0; vec_idx < 2; ++vec_idx) { mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx), ppu_vector_bases[vec_idx] + get_edge_bytes, CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0); } } get_edge_bytes += CHUNK_SZ_BYTES; //printf("subvec_sz-chunks: %d\n", SUBVEC_SZ_CHUNKS); //printf("%d==%d\n", MAXITER*NBUFFERS*CHUNK_SZ_FLOATS, SUBVEC_SZ_FLOATS); int chunksleft = SUBVEC_SZ_CHUNKS; while(chunksleft!=0) { for (buf_idx = 0; chunksleft !=0 && buf_idx < NBUFFERS; ++buf_idx) { const int tag_mask = (1 << buffer_tags[buf_idx][0]) | (1 << buffer_tags[buf_idx][1]); mfc_write_tag_mask(tag_mask); mfc_read_tag_status_all(); pchunk_a = buf_ptr_vecfloat(buffers, buf_idx, 0); pchunk_b = buf_ptr_vecfloat(buffers, buf_idx, 1); for (i = 0; i < CHUNK_SZ_FLOATVECS; ++i) { g_vec = spu_madd(pchunk_a[i], pchunk_b[i], g_vec); } // move this mfc_get to end of loop, check get_edge_bytes variable dynamics if (likely(iter != MAXITER - 1)) { for (vec_idx = 0; vec_idx < 2; ++vec_idx) { mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx), ppu_vector_bases[vec_idx] + get_edge_bytes, CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0); } } get_edge_bytes += CHUNK_SZ_BYTES; --chunksleft; } } for (iter = 0; iter < NBUFFERS; ++iter) { 
mfc_tag_release(buffer_tags[iter][0]); mfc_tag_release(buffer_tags[iter][1]); } float_uint_t retval; retval.f = spu_extract(g_vec, 0) + spu_extract(g_vec, 1) + spu_extract(g_vec, 2) + spu_extract(g_vec, 3); //printf("retval: %f\n", retval.f); spu_write_out_mbox(retval.i); return 0; }
void setup_spu(unsigned int spu_ctrlblock_addr){ ctrl_dma_tag = mfc_tag_reserve(); // Get SPU control block mfc_get(&spu_ctrlblock, spu_ctrlblock_addr, sizeof(spu_ctrlblock), ctrl_dma_tag, 0,0); mfc_write_tag_mask(1<<ctrl_dma_tag); mfc_read_tag_status_all(); mcb = (merger_ctrlblock_t*)memalign(128,spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t) ); md = (merger_data_t*)malloc(spu_ctrlblock.num_mergers * sizeof(merger_data_t)); // Set addresses int i; for(i = 0; i < spu_ctrlblock.num_mergers; i++){ // Set head/tail vector addresses mcb[i].idx_addr[LEFT] = (unsigned int) &md[i].idx[LEFT][HEAD]; mcb[i].idx_addr[RIGHT] = (unsigned int) &md[i].idx[RIGHT][HEAD]; mcb[i].idx_addr[OUT] = (unsigned int) &md[i].idx[PARENT][TAIL]; } // Send merger control blocks mfc_put(mcb, spu_ctrlblock.ctrlblocks_addr, spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t), ctrl_dma_tag, 0,0); mfc_read_tag_status_all(); // Mail PPU telling it we've set the addresses spu_write_out_mbox(1); // Wait for go-ahead mail spu_read_in_mbox(); // Get merger blocks mfc_get(mcb, spu_ctrlblock.ctrlblocks_addr, spu_ctrlblock.num_mergers * sizeof(merger_ctrlblock_t), ctrl_dma_tag, 0,0); mfc_read_tag_status_all(); int buffer_idx = 0; for(i = 0; i < spu_ctrlblock.num_mergers; i++){ // Add start address of buffer array to all block addresses if(mcb[i].id != 0) mcb[i].block_addr[OUT] += (unsigned int) &buffer[0]; if(!mcb[i].leaf_node){ mcb[i].block_addr[LEFT] += (unsigned int) &buffer[0]; mcb[i].block_addr[RIGHT] += (unsigned int) &buffer[0]; } // Setup merger data md[i].held_tag[LEFT] = 32; md[i].held_tag[RIGHT] = 32; md[i].held_tag[OUT] = 32; md[i].num_pulled[LEFT] = 0; md[i].num_pulled[RIGHT] = 0; md[i].mm_depleted[LEFT] = 0; md[i].mm_depleted[RIGHT] = 0; md[i].depleted[LEFT] = 0; md[i].depleted[RIGHT] = 0; md[i].done = 0; md[i].consumed[LEFT] = 0; md[i].consumed[RIGHT] = 0; md[i].idx[LEFT][HEAD] = spu_splats(0); md[i].idx[LEFT][TAIL] = spu_splats(0); md[i].idx[RIGHT][HEAD] = spu_splats(0); 
md[i].idx[RIGHT][TAIL] = spu_splats(0); md[i].idx[OUT][HEAD] = spu_splats(0); md[i].idx[OUT][TAIL] = spu_splats(0); md[i].idx[PARENT][HEAD] = spu_splats(0); md[i].idx[PARENT][TAIL] = spu_splats(0); md[i].buffer[LEFT] = &buffer[buffer_idx]; buffer_idx += mcb[i].buffer_size[LEFT]; md[i].buffer[RIGHT] = &buffer[buffer_idx]; buffer_idx += mcb[i].buffer_size[RIGHT]; md[i].buffer[OUT] = &buffer[buffer_idx]; buffer_idx += mcb[i].buffer_size[OUT]; } // Setup internal nodes for(i = 0; i < spu_ctrlblock.num_mergers; i++){ if(mcb[i].local[OUT] < 255){ int parent_idx = mcb[i].local[OUT]; int side = (mcb[i].id+1)&1; md[i].buffer[OUT] = md[parent_idx].buffer[side]; mcb[i].buffer_size[OUT] = mcb[parent_idx].buffer_size[side]; } } }
void push(){ int avail_out = num_in_buffer(OUT); if(!avail_out) return; int avail_parent = num_free_in_buffer(PARENT); if(mcb[am].id == 0) avail_parent = mcb[am].data_size[LEFT] + mcb[am].data_size[RIGHT]; int num_send = avail_out < avail_parent ? avail_out : avail_parent; num_send = num_send < MAX_DMA_SIZE ? num_send : MAX_DMA_SIZE; if(!num_send) return; int tag = mfc_tag_reserve(); if(tag == MFC_TAG_INVALID){ return; } else md[am].held_tag[OUT] = tag; // send num_send vectors, in up to three DMA-put's while(num_send > 0){ int parent_head = spu_extract(md[am].idx[PARENT][HEAD],0); int free_from_head = mcb[am].buffer_size[PARENT] - parent_head; int tail = spu_extract(md[am].idx[OUT][TAIL],0); int avail_from_tail = mcb[am].buffer_size[OUT] - tail; int part_send = num_send < free_from_head ? num_send : free_from_head; part_send = part_send < avail_from_tail ? part_send : avail_from_tail; unsigned int to = mcb[am].block_addr[OUT] + parent_head*sizeof(vector signed int); mfc_put(&md[am].buffer[OUT][tail], to, part_send * sizeof(vector signed int), md[am].held_tag[OUT], 0,0); md[am].idx[PARENT][HEAD] = spu_add(md[am].idx[PARENT][HEAD], part_send); parent_head = spu_extract(md[am].idx[PARENT][HEAD],0); if(parent_head == mcb[am].buffer_size[PARENT]) md[am].idx[PARENT][HEAD] = spu_splats(0); md[am].idx[OUT][TAIL] = spu_add(md[am].idx[OUT][TAIL], part_send); tail = spu_extract(md[am].idx[OUT][TAIL],0); if(tail == mcb[am].buffer_size[OUT]) md[am].idx[OUT][TAIL] = spu_splats(0); num_send -= part_send; } // Inner nodes updates parent in buffer head idx if(mcb[am].id) mfc_putf(&md[am].idx[PARENT][HEAD], mcb[am].idx_addr[OUT], sizeof(vector signed int), md[am].held_tag[OUT], 0,0); }