int cacheGetPrime(int n) { if ((n < primeCacheStart + primeCacheSize) && (n > primeCacheStart)) { int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; } // Haal op. uint32_t tag, size; tag = mfc_tag_reserve(); size = CACHE_PRIME_SIZE*16; unsigned long long EA = setup.vPrimes + (n - n%4) * 4; mfc_get(&primeCacheData, EA, size, tag, 0, 0); mfc_write_tag_mask(1 << tag); mfc_read_tag_status_all(); mfc_tag_release(tag); primeCacheStart = n - (n % 4); int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; }
void check_pull_dma(int side){ // Check left if(md[am].held_tag[side] < 32){ mfc_write_tag_mask( 1 << md[am].held_tag[side] ); int status = mfc_read_tag_status_immediate(); if(status){ // Update idx md[am].idx[side][HEAD] = spu_add(md[am].idx[side][HEAD], md[am].num_waiting[side]); vector signed int buffer_size = spu_splats(mcb[am].buffer_size[side] -1); vector unsigned int cmp_v = spu_cmpgt(md[am].idx[side][HEAD], buffer_size); vector signed int zeros = {0,0,0,0}; buffer_size = spu_add(buffer_size,1); zeros = spu_sel(zeros,buffer_size,cmp_v); md[am].idx[side][HEAD] = spu_sub(md[am].idx[side][HEAD],zeros); md[am].num_pulled[side] += md[am].num_waiting[side]; md[am].num_waiting[side] = 0; if(md[am].num_pulled[side] == mcb[am].data_size[side]){ md[am].mm_depleted[side] = 1; } // Release tag mfc_tag_release( md[am].held_tag[side] ); md[am].held_tag[side] = 32; } } }
void check_push_dma(){ if(md[am].held_tag[OUT] < 32){ mfc_write_tag_mask( 1 << md[am].held_tag[OUT] ); int status = mfc_read_tag_status_immediate(); if(status){ // Release tag mfc_tag_release( md[am].held_tag[OUT] ); md[am].held_tag[OUT] = 32; if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT] && md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT]){ if(num_in_buffer(OUT) == 0){ md[am].done = 1; --num_active_mergers; } } } } }
/*
 * Release the MFC tag held in `tag` (declared outside this function) and
 * terminate the program with the given exit status.  Does not return.
 */
void as_exit(int status)
{
    mfc_tag_release(tag);
    exit(status);
}
void MakeNodes() { uint put_tag[2]; put_tag[0] = mfc_tag_reserve(); put_tag[1] = mfc_tag_reserve(); ushort b = 0; kdbuffer_t l_kdb ALIGNED(16); kdbuffer_t r_kdb ALIGNED(16); kdnode_t node ALIGNED(16); kdbuffer_t kdb ALIGNED(16); DoubleBufInit(&aabb_db, 0, 0, sizeof(aabb_t), NUM_AABBS, aabbbuffer[0], aabbbuffer[1]); // printf("Empty? %i\n", BufferEmpty(&arg.kdbuffer[b])); while(! BufferEmpty(&arg.kdbuffer[b]) ) { kdbuffer_t *pkdb = (kdbuffer_t*)arg.kdbuffer[b].buffer; int size = BufferNumElements(&arg.kdbuffer[b]); int i; BufferClear(&arg.aabb_buffer[1-b]); BufferClear(&arg.kdbuffer[1-b]); // printf("size %i\n", size); for(i=0; i < size; i++) { l_kdb.node = arg.curnode++; r_kdb.node = arg.curnode++; memcpy_ls(&kdb, &pkdb[i], sizeof(kdbuffer_t)); node.split = kdb.plane; node.axis = kdb.axis; node.left = l_kdb.node; node.right = r_kdb.node; memcpy_ea(&arg.nodes[ kdb.node ], &node, sizeof(kdnode_t)); KDBufferAllocate(&l_kdb, kdb.left_size, &arg.aabb_buffer[1-b]); if(curjob < arg.njobs) KDBufferAllocate(&r_kdb, kdb.right_size, &arg.job_aabb_buffer[curjob]); else KDBufferAllocate(&r_kdb, kdb.right_size, &arg.aabb_buffer[1-b]); KDPartitionAll(&kdb, &l_kdb, &r_kdb); if(l_kdb.depth == arg.maxdepth || l_kdb.size <= arg.maxleafsize) { total_leaf_size += l_kdb.count; l_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, l_kdb.aabb, l_kdb.count); BufferCopyToLS(&arg.leafbuffer, &l_kdb, 1); } else { BufferCopyToLS(&arg.kdbuffer[1-b], &l_kdb, 1); } if(r_kdb.depth == arg.maxdepth || r_kdb.size <= arg.maxleafsize) { total_leaf_size += r_kdb.count; if(curjob < arg.njobs) { r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.job_leaf_aabb_buffer[curjob], r_kdb.aabb, r_kdb.count); BufferCopyToLS(&arg.job_leafbuffer[curjob], &r_kdb, 1); spu_mfcdma32(&arg.job_leafbuffer[curjob], (uint)arg.pjob_leafbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD); DmaWait(jobtag); } else { r_kdb.aabb = (aabb_t*)BufferCopyTo(&arg.leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyToLS(&arg.leafbuffer, 
&r_kdb, 1); } } else { if(curjob < arg.njobs) { BufferCopyToLS(&arg.job_kdbuffer[curjob], &r_kdb, 1); spu_mfcdma32(&arg.job_kdbuffer[curjob], (uint)arg.pjob_kdbuffer[curjob], sizeof(buffer_t), jobtag, MFC_PUT_CMD); DmaWait(jobtag); } else BufferCopyToLS(&arg.kdbuffer[1-b], &r_kdb, 1); } /* if(curjob < njobs) KDBufferAllocate(&r_kdb, kdb[i].right_size, &jobs[curjob]->aabb_buffer[0]); else KDBufferAllocate(&r_kdb, kdb[i].right_size, &aabb_buffer[1-b]); KDPartition(&kdb[i], &l_kdb, &r_kdb); if(l_kdb.depth == maxdepth || l_kdb.size <= maxleafsize) { l_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, l_kdb.aabb, l_kdb.count); BufferCopyTo(&leafbuffer, &l_kdb, 1); } else BufferCopyTo(&kdbuffer[1-b], &l_kdb, 1); if(r_kdb.depth == maxdepth || r_kdb.size <= maxleafsize) { if(curjob < njobs) { r_kdb.aabb = (aabb_t*)BufferCopyTo(&jobs[curjob]->leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyTo(&jobs[curjob]->leafbuffer, &r_kdb, 1); } else { r_kdb.aabb = (aabb_t*)BufferCopyTo(&leaf_aabb_buffer, r_kdb.aabb, r_kdb.count); BufferCopyTo(&leafbuffer, &r_kdb, 1); } } else { if(curjob < njobs) BufferCopyTo(&jobs[curjob]->kdbuffer[0], &r_kdb, 1); else BufferCopyTo(&kdbuffer[1-b], &r_kdb, 1); } */ if(curjob < arg.njobs) { // Start other job ppe_post_sema(arg.sema[curjob]); curjob++; } } b = 1 - b; } while( curjob < arg.njobs) { ppe_post_sema(arg.sema[curjob]); curjob++; } // Transfer back spu_mfcdma32(&arg.curnode, (unsigned int)arg.pcurnode, (unsigned int)sizeof(int), put_tag[0], MFC_PUT_CMD); spu_mfcdma32(&total_leaf_size, (unsigned int)arg.ptotal_leaf_size, (unsigned int)sizeof(int), put_tag[1], MFC_PUT_CMD); DmaWait(put_tag[0]); DmaWait(put_tag[1]); spu_mfcdma32(&arg.leafbuffer, (unsigned int)arg.pleafbuffer, (unsigned int)sizeof(buffer_t), put_tag[0], MFC_PUT_CMD); spu_mfcdma32(&arg.leaf_aabb_buffer, (unsigned int)arg.pleaf_aabb_buffer, (unsigned int)sizeof(buffer_t), put_tag[1], MFC_PUT_CMD); DmaWaitAll(); mfc_tag_release(put_tag[0]); mfc_tag_release(put_tag[1]); }
int main( unsigned long long spe_id, unsigned long long ppu_vector_a, unsigned long long ppu_vector_b) { int i, iter, buf_idx, vec_idx; unsigned long long ppu_vector_bases[2] _ALIG(128); vector float * pchunk_a, * pchunk_b; vector float g_vec = {0,0,0,0}; ppu_vector_bases[0] = ppu_vector_a; ppu_vector_bases[1] = ppu_vector_b; const unsigned int spu_num = spu_read_in_mbox(); unsigned long long get_edge_bytes = spu_num * SUBVEC_SZ_BYTES; float buffers[NBUFFERS * BUF_SZ_FLOATS] _ALIG(128); int buffer_tags[NBUFFERS][2] _ALIG(128); //int buffer_tags[NBUFFERS]; for (iter = 0; iter < NBUFFERS; ++iter) { buffer_tags[iter][0] = mfc_tag_reserve(); buffer_tags[iter][1] = mfc_tag_reserve(); } // first mfc_get for all for (buf_idx = 0; buf_idx < NBUFFERS; ++buf_idx) { for (vec_idx = 0; vec_idx < 2; ++vec_idx) { mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx), ppu_vector_bases[vec_idx] + get_edge_bytes, CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0); } } get_edge_bytes += CHUNK_SZ_BYTES; //printf("subvec_sz-chunks: %d\n", SUBVEC_SZ_CHUNKS); //printf("%d==%d\n", MAXITER*NBUFFERS*CHUNK_SZ_FLOATS, SUBVEC_SZ_FLOATS); int chunksleft = SUBVEC_SZ_CHUNKS; while(chunksleft!=0) { for (buf_idx = 0; chunksleft !=0 && buf_idx < NBUFFERS; ++buf_idx) { const int tag_mask = (1 << buffer_tags[buf_idx][0]) | (1 << buffer_tags[buf_idx][1]); mfc_write_tag_mask(tag_mask); mfc_read_tag_status_all(); pchunk_a = buf_ptr_vecfloat(buffers, buf_idx, 0); pchunk_b = buf_ptr_vecfloat(buffers, buf_idx, 1); for (i = 0; i < CHUNK_SZ_FLOATVECS; ++i) { g_vec = spu_madd(pchunk_a[i], pchunk_b[i], g_vec); } // move this mfc_get to end of loop, check get_edge_bytes variable dynamics if (likely(iter != MAXITER - 1)) { for (vec_idx = 0; vec_idx < 2; ++vec_idx) { mfc_get(buf_ptr_float(buffers, buf_idx, vec_idx), ppu_vector_bases[vec_idx] + get_edge_bytes, CHUNK_SZ_BYTES, buffer_tags[buf_idx][vec_idx], 0, 0); } } get_edge_bytes += CHUNK_SZ_BYTES; --chunksleft; } } for (iter = 0; iter < NBUFFERS; ++iter) { 
mfc_tag_release(buffer_tags[iter][0]); mfc_tag_release(buffer_tags[iter][1]); } float_uint_t retval; retval.f = spu_extract(g_vec, 0) + spu_extract(g_vec, 1) + spu_extract(g_vec, 2) + spu_extract(g_vec, 3); //printf("retval: %f\n", retval.f); spu_write_out_mbox(retval.i); return 0; }