void process_image_simple(struct image* img){ unsigned char *input, *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1, *v2, *v3, *v4, *v5 ; input = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); v1 = (vector unsigned char *) &input[0]; v2 = (vector unsigned char *) &input[1 * img->width * NUM_CHANNELS]; v3 = (vector unsigned char *) &input[2 * img->width * NUM_CHANNELS]; v4 = (vector unsigned char *) &input[3 * img->width * NUM_CHANNELS]; v5 = (vector unsigned char *) temp; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS * img->width / NUM_IMAGES_WIDTH; for (i=0; i<img->height / SCALE_FACTOR; i++){ //get 4 lines addr1 = ((unsigned int)img->src) + i * img->width * NUM_CHANNELS * SCALE_FACTOR; mfc_get(input, addr1, SCALE_FACTOR * img->width * NUM_CHANNELS, MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); //compute the scaled line for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[j], v2[j]), spu_avg(v3[j], v4[j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } //put the scaled line back mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block 
mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); } free_align(temp); free_align(input); free_align(output); }
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { int i; unsigned int tag_id; /* Reserve a tag for application usage */ if ((tag_id = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag\n"); return 1; } /* Here is the actual DMA call */ /* the first parameter is the address in local store to place the data */ /* the second parameter holds the main memory address */ /* the third parameter holds the number of bytes to DMA */ /* the fourth parameter identifies a "tag" to associate with this DMA */ /* (this should be a number between 0 and 31, inclusive) */ /* the last two parameters are only useful if you've implemented your */ /* own cache replacement management policy. Otherwise set them to 0. */ mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0); /* Now, we set the "tag bit" into the correct channel on the hardware */ /* this is always 1 left-shifted by the tag specified with the DMA */ /* for whose completion you wish to wait. */ mfc_write_tag_mask(1<<tag_id); /* Now, issue the read and wait to guarantee DMA completion before we */ /* continue. */ mfc_read_tag_status_all(); /* DMA the data from system memory to our local store buffer. */ mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0); printf("Address received through control block = 0x%llx\n", cb.addr); /* Wait for the data array DMA to complete. */ mfc_read_tag_status_all(); /* Verify that the data array contains a valid fibonacci sequence. */ for (i=2; i<DATA_BUFFER_ENTRIES; i++) { if (data[i] != data[i-1] + data[i-2]) { printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n", i, data[i-1] + data[i-2], data[i]); return (1); } } return 0; }
/* * The argv argument will be populated with the address that the PPE provided, * from the 4th argument to spe_context_run() */ int main(uint64_t speid, uint64_t argv, uint64_t envp) { struct spe_args args __attribute__((aligned(SPE_ALIGN))); mfc_get(&args, argv, sizeof(args), 0, 0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); cmap_calls = 0; dma_puts = 0; spu_write_decrementer(-1); // Run multiple renders with offsets. Should be factored into render_fractal() render_fractal(&args.fractal, args.thread_idx, args.n_threads, 0.); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 7 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 5 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 2); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 8); // Send remaining points if(fill%2048) { // select the last buffer used int f = fill / 2048; mfc_put(&points[f*2048], (uint)args.fractal.pointbuf[f], 16384, 0, 0, 0); // Block for completion mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // Send a message with top bit set to indicate final item spu_write_out_intr_mbox((1<<31)|f); // Send another message indicating count spu_write_out_intr_mbox(fill%2048); ++dma_puts; } // Report some stats uint ticks = -1 - spu_read_decrementer(); printf("cmap calls %d ticks %u calls/tick %f\n", cmap_calls, ticks, (double)cmap_calls/ticks ); printf("dma puts %d\n", dma_puts); return 0; }
int main2mod(unsigned long long spe_id, unsigned long long program_data_ea, unsigned long long env) { unsigned tagid = spe_id&31; uint32 i,j; // get program data mfc_get(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); // precompute partial working states based on ihv & partial msg block pre_compute(pd.ihv1, pd.ihv2, pd.m1, pd.m2); if (pd.collisiondata > 0) { j = pd.collisiondata*8; vec_uint32* bufferptr = &buffer[j]; // get the trail buffer for (i = 0; i < j; i += 128) mfc_get(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); // process collision trails reduce_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); reduce_trails2mod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); find_collmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); // store the trail buffer for (i = 0; i < j; i += 128) mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); } else { // fill the trail buffer in steps and do intermediate DMA transfers vec_uint32* bufferptr = &buffer[0]; for (i = 0; i < BUFFERSIZE; i += 256) { bufferptr = generate_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, bufferptr, &buffer[i+256]); mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_put(&buffer[i+128], &pd.buffer[i+128], sizeof(vec_uint32)*128, tagid, 0, 0); } } // transfer the current program data back mfc_put(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0); // wait for dma transfers to complete mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); return 0; }
void triad() { int i, j, n; vector float s = spu_splats(args.scalar); n = SIZE * sizeof(float); for (i = 0; (i + SIZE) < args.N; i += SIZE) { mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0); mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); for (j = 0; j < (SIZE / 4); ++j) ls3[j] = spu_madd(s, ls2[j], ls1[j]); mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0); } mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); if (unlikely(i < args.N)) { /* * args.N - i will be smaller than SIZE at this point so * it is safe to do a DMA transfer. * We need to make sure that size is a multiple of 16. */ n = ((args.N - i) * sizeof(float)) & (~127); mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0); mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); /* n must be divisible by 4. */ for (j = 0; j < ((args.N - i) / 4); ++j) ls3[j] = spu_madd(s, ls2[j], ls1[j]); mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); } /* * At this point it may be that i is still smaller than args.N if the length * was not divisible by the number of SPUs times 16. */ }
/*
 * Loads the program info structure from effective address `ea` into `info`.
 * Blocks until the DMA has completed, then copies info->speId into the
 * global `speid` (kept for debugging).
 */
void load_program_info(unsigned long long ea, spe_program_info_t *info)
{
    const unsigned int ea_hi = mfc_ea2h(ea);
    const unsigned int ea_lo = mfc_ea2l(ea);

    /* spu_mfcdma64(ls_addr, ea_h, ea_l, size, tag_id, cmd); */
    spu_mfcdma64(info, ea_hi, ea_lo, sizeof(spe_program_info_t),
                 SPUDMA_PROGRAMINFO, MFC_GET_CMD);

    /* select the tag used above and wait for it */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_PROGRAMINFO);
    mfc_read_tag_status_all();

    /* assign to global for debugging purposes */
    speid = info->speId;

#if defined(_DEBUG) && _DEBUG > 1
    printf("Program info:\n\tSpe ID: %d\n\tNum Pixels: %d\n\tSpp: %d\n\tNum Spes %d\n\tDepth: %d\n",
           info->speId, info->numPixels, info->samplesPerPixel, info->numSpes, info->depth);
#endif
}
int cacheGetPrime(int n) { if ((n < primeCacheStart + primeCacheSize) && (n > primeCacheStart)) { int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; } // Haal op. uint32_t tag, size; tag = mfc_tag_reserve(); size = CACHE_PRIME_SIZE*16; unsigned long long EA = setup.vPrimes + (n - n%4) * 4; mfc_get(&primeCacheData, EA, size, tag, 0, 0); mfc_write_tag_mask(1 << tag); mfc_read_tag_status_all(); mfc_tag_release(tag); primeCacheStart = n - (n % 4); int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; }
/*
 * Flushes the triangle data between _currentTriangle and endTriangle to main
 * memory and publishes the new end offset into the renderable cache line.
 * Does nothing when endTriangle equals _currentTriangle. On return the
 * current triangle is invalidated and both DMA transfers have completed.
 */
void writeTriangleBuffer(Triangle* endTriangle)
{
    if (endTriangle == _currentTriangle)
        return;

    /* bytes from the buffer start up to endTriangle, rounded up to 128 */
    int length = ( ((char*)endTriangle) - _currentTriangleBuffer + 127) & ~127;

    /* offset of the new end, relative to the same base as _currentTriangleOffset */
    unsigned short endTriangleBase = (((char*)endTriangle) - ((char*)_currentTriangle)) + _currentTriangleOffset;
    vec_ushort8 v_new_end = spu_promote(endTriangleBase, 1);

    // calculate genuine next pointer ( rewind==0 -> next, rewind!=0 -> 0 )
    unsigned short next_pointer = spu_extract( spu_andc( v_new_end, _currentTriangleRewind ), 1 );
    _currentTriangle->next_triangle = next_pointer;

    // printf("current=0x%x, endTriBase=0x%x, next_pointer=0x%x\n", _currentTriangleOffset, endTriangleBase, next_pointer);

    // DMA the triangle data out
    spu_mfcdma64(_currentTriangleBuffer,
                 mfc_ea2h(_currentTriangleBufferEA),
                 mfc_ea2l(_currentTriangleBufferEA),
                 length, 0, MFC_PUT_CMD);

    // update the information in the cache line; the rewind vector is no
    // longer needed, so it is reused as the DMA source for the new end value
    _currentTriangleRewind = spu_splats(next_pointer);
    char* dstart = ((char*)&_currentTriangleRewind) + (_currentTriangleCacheEndTriangleEAL & 15);
    spu_mfcdma64(dstart,
                 _currentTriangleCacheEndTriangleEAH,
                 _currentTriangleCacheEndTriangleEAL,
                 sizeof(short), 0, MFC_PUTB_CMD);

    // printf("writing from %x to %x:%x\n", dstart, _currentTriangleCacheEndTriangleEAH, _currentTriangleCacheEndTriangleEAL);

    // finally invalidate the triangle info
    _currentTriangle = NULL;

    // and make sure the DMA completed (both transfers use tag 0)
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();
}
void compute() { // Compute my portion to compute int my_rows = rows / nspe + (rank < rows % nspe); int offset = rank * (rows / nspe) + std::min(rank, rows % nspe); #if DEBUG printf("Compute (%d/%d %d, %d) %d/%d\n", my_rows, rows, offset, cols, rank, nspe); #endif int tag = 23; uint64_t pin0 = in0 + offset * cols * sizeof(float); uint64_t pin1 = in1 + offset * cols * sizeof(float); uint64_t pin2 = in2 + offset * cols * sizeof(float); uint64_t pout = out + offset * cols * sizeof(float); float buf[4*cols]; float* buf0 = buf + 0*cols; float* buf1 = buf + 1*cols; float* buf2 = buf + 2*cols; float* buf3 = buf + 3*cols; for (int r=0; r<my_rows; ++r) { mfc_get(buf0, pin0, cols*sizeof(float), tag, 0, 0); mfc_get(buf1, pin1, cols*sizeof(float), tag, 0, 0); mfc_get(buf2, pin2, cols*sizeof(float), tag, 0, 0); pin0 += cols * sizeof(float); pin1 += cols * sizeof(float); pin2 += cols * sizeof(float); // Wait for DMAs to complete mfc_write_tag_mask(1<<tag); mfc_read_tag_status_all(); for (int c=0; c<cols; ++c) buf3[c] = buf0[c] * buf1[c] + buf2[c]; mfc_put(buf3, pout, cols*sizeof(float), tag, 0, 0); pout += cols * sizeof(float); } mfc_write_tag_mask(1<<tag); mfc_read_tag_status_all(); }
int main(uint64_t speid, uint64_t argp, uint64_t envp){ unsigned int data[NUM_STREAMS]; unsigned int num_spus = (unsigned int)argp, i, num_images; struct image my_image __attribute__ ((aligned(16))); int mode = (int)envp; speid = speid; //get rid of warning while(1){ num_images = 0; for (i = 0; i < NUM_STREAMS / num_spus; i++){ //assume NUM_STREAMS is a multiple of num_spus while(spu_stat_in_mbox() == 0); data[i] = spu_read_in_mbox(); if (!data[i]) return 0; num_images++; } for (i = 0; i < num_images; i++){ mfc_get(&my_image, data[i], sizeof(struct image), MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); switch(mode){ default: case MODE_SIMPLE: process_image_simple(&my_image); break; case MODE_2LINES: process_image_2lines(&my_image); break; case MODE_DOUBLE: process_image_double(&my_image); break; case MODE_DMALIST: process_image_dmalist(&my_image); break; } } data[0] = DONE; spu_write_out_intr_mbox(data[0]); } return 0; }
static void cleargroups(void) { unsigned i; for (i = 0; i < GROUPS_COUNT; i++) { group_keysvectors[i] = spu_splats((u16) 0); group_insertpos[i] = spu_splats((u32) 0); #ifdef GET_CACHE_STATS group_length[i] = 0; #endif } /* All vectors now points to group0, so fill all entries with true data for group 0 */ mfc_get(group_values[0][0], myCellOGRCoreArgs.upchoose, GROUP_ELEMENTS * 2, DMA_ID, 0, 0); mfc_read_tag_status_all(); for (i = 1; i < GROUPS_COUNT * GROUPS_LENGTH; i++) memcpy(group_values[0][i], group_values[0][0], GROUP_ELEMENTS * 2); }
static void init(unsigned long long argp) { mfc_get(&spu_arguments, (unsigned) argp, sizeof(spu_arguments), 0, 0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); first_channel = spu_arguments.spu_id * NR_CHANNELS / NR_SPUS; last_channel = (spu_arguments.spu_id + 1) * NR_CHANNELS / NR_SPUS; for(int i=0; i<NR_STATIONS; i++) { samples_dma_list[i].size = sizeof(samples[0][0]); } if(spu_arguments.spu_id == 0) { printf("SPU sample dma size = %ld bytes\n", sizeof(samples[0][0])); printf("SPU in buffers = %ld KB @ %p, out buffers = %ld B @ %p\n", sizeof(samples) / 1024, samples, sizeof(visibilities), visibilities); } printf("I am spu %d, calculating channels %3d - %3d\n", spu_arguments.spu_id, first_channel, last_channel); }
/*
 * Loads the convolution kernel from main memory (fc->ea_kernel) into
 * p_kernel and, when fc->transform_kernel is set, runs a forward in-place
 * FFT so later calls can reuse the transformed kernel.
 *
 * fc       - parameters: element count, kernel address, transform flag
 * p_kernel - local-store destination for the kernel
 * obj      - FFT object handed to cml_ccfft1d_ip_f
 * buf      - scratch buffer handed to cml_ccfft1d_ip_f
 *
 * NOTE(review): the FFT is applied to the global `coeff`, not to p_kernel;
 * presumably callers pass coeff as p_kernel -- confirm.
 */
void initialize(
  Fastconv_params* fc,
  void* p_kernel,
  fft1d_f* obj,
  void* buf)
{
  enum { KERNEL_DMA_TAG = 31 };

  // The kernel matches the input and output size: complex floats
  unsigned int size = fc->elements*2*sizeof(float);

  mfc_get(p_kernel, fc->ea_kernel, size, KERNEL_DMA_TAG, 0, 0);
  // Unsigned literal: shifting a signed 1 by 31 overflows int.
  mfc_write_tag_mask(1u << KERNEL_DMA_TAG);
  mfc_read_tag_status_all();

  if (fc->transform_kernel)
  {
    // Perform the forward FFT on the kernel, in place.  This only need
    // be done once -- subsequent calls will utilize the same kernel.
    cml_ccfft1d_ip_f(obj, (float*)coeff, CML_FFT_FWD, buf);
  }
}
/*
 * SPU entry point: fetch the argument block from argp, then serve commands
 * from the inbound mailbox until the exit command arrives. Every command
 * other than exit, including an unrecognised one, is answered with a DONE
 * message on the outbound mailbox. Always returns 0.
 */
int main(ull id, ull argp, ull envp)
{
    unsigned int cmd;

    mfc_get(&args, argp, sizeof(args), TAG, 0, 0);
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();

    for (;;) {
        cmd = spu_read_in_mbox();

        if (unlikely(SPU2_MSG_PPU_TO_SPU_EXIT == cmd)) {
            break;
        }

        if (cmd == SPU2_MSG_PPU_TO_SPU_DO_COPY) {
            copy();
        } else if (cmd == SPU2_MSG_PPU_TO_SPU_DO_SCALE) {
            scale();
        } else if (cmd == SPU2_MSG_PPU_TO_SPU_DO_ADD) {
            add();
        } else if (cmd == SPU2_MSG_PPU_TO_SPU_DO_TRIAD) {
            triad();
        } else {
            fprintf(stderr, " [SPU]: Invalid command received in mailbox\n");
        }

        spu_write_out_mbox(SPU2_MSG_SPU_TO_PPU_DONE);
    }

    return 0;
}
/*
 * Appends one entry to the log in main memory. Does nothing when
 * log_base_ea is 0.
 *
 * The entry is stamped with the next sequence number and the current
 * decrementer value, copied into one of two staging slots, and DMA'd to
 * slot log_idx of the log. The staging slots alternate on every call, and a
 * slot is only reused after the put previously issued from it has finished.
 * log_idx advances and is wrapped with log_idx_mask.
 */
void _gc_log_write(gc_log_entry_t entry)
{
    if (log_base_ea == 0) {
        return;
    }

    entry.seqno = log_seqno++;
    entry.timestamp = spu_read_decrementer();

    /* wait for the earlier put from this staging slot, if there was one */
    if (tmp_buffer_busy & (1 << tmp_buffer_idx)) {
        mfc_write_tag_mask(1 << (log_tags + tmp_buffer_idx));
        mfc_read_tag_status_all();
    }

    /* stage a local copy; the by-value parameter goes away on return */
    tmp_buffer[tmp_buffer_idx] = entry;

    mfc_put(&tmp_buffer[tmp_buffer_idx],
            log_base_ea + log_idx * sizeof(entry),
            sizeof(entry),
            log_tags + tmp_buffer_idx, 0, 0);

    tmp_buffer_busy |= (1 << tmp_buffer_idx);

    /* flip to the other staging slot and advance the log position */
    tmp_buffer_idx ^= 0x1;
    log_idx = (log_idx + 1) & log_idx_mask;
}
/*
 * Blocks until all DMA issued with tag `buffer` has completed.
 * Compiles to nothing when DO_DMA is 0.
 */
static inline void wait_for_dma_samples(int buffer)
{
#if DO_DMA
    /* the tag number equals the sample buffer index */
    mfc_write_tag_mask(1 << buffer);
    mfc_read_tag_status_all();
#endif
}
int main(uint64_t speid,uint64_t argp, uint64_t envp){ int i,j,k; speid=speid;envp=envp; //avoid warnings //============================================================================ // This part is used to Data input using DMA to get it from PPE // DMA in control block and wait for completion mfc_get(&cb,argp,sizeof(cb),0,0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); // DMA in MatrixSPE and wait for completion int* spuptr =(int *) &MatrixSPE[0][0]; // dst, start addr int* ppuptr = (int *) cb.data; // src, start addr int totalwords = sizeof(MatrixSPE) >> 2; const int dt_unit = 4096; // in words, or 4 bytes int* spulast = spuptr + totalwords; while ( spuptr < spulast ) { int nwords = ( spuptr + dt_unit > spulast ) ? (spulast - spuptr) : dt_unit; mfc_get(spuptr,(unsigned int) ppuptr, 4*nwords, 0,0,0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); spuptr += dt_unit; ppuptr += dt_unit; } // DMA in TransposeSPE and wait for completion spuptr =(int *) &TransposeSPE[0][0]; // dst, start addr ppuptr = (int *) cb.data1; // src, start addr totalwords = sizeof(TransposeSPE) >> 2; spulast = spuptr + totalwords; while ( spuptr < spulast ) { int nwords = ( spuptr + dt_unit > spulast ) ? (spulast - spuptr) : dt_unit; mfc_get(spuptr,(unsigned int) ppuptr, 4*nwords, 0,0,0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); spuptr += dt_unit; ppuptr += dt_unit; } //============================================================================ /* Do computing of 16*16 output matrix on each SPE.. based on 16rows and 16columns passed from PPU*/ for(i=0; i<16; i++) for(j=0;j<16;j++) for(k=0;k<1024;k++) MultSPE[i][j]+=MatrixSPE[i][k]*TransposeSPE[j][k]; //============================================================================ /* Send result to PPU */ spuptr = (int *)&MultSPE[0][0]; ppuptr = (int *)cb.result; totalwords = sizeof(MultSPE) >> 2; spulast = spuptr + totalwords; while ( spuptr < spulast ) { int nwords = ( spuptr + dt_unit > spulast ) ? 
(spulast - spuptr) : dt_unit; mfc_put(spuptr,(unsigned int) ppuptr, 4*nwords,2,0,0); mfc_write_tag_mask(1 << 2); mfc_read_tag_status_all(); spuptr += dt_unit; ppuptr += dt_unit; } //exit(0); }
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; unsigned int i, num_chunks; mfc_list_element_t* dma_list_in; unsigned int tmp_addr; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* calculate the address of the local buffer where we can point the * dma_list_in pointer to */ tmp_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS)); dma_list_in = (mfc_list_element_t*) (tmp_addr); /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks of data * NUM_LIST_ELEMENTS at a time. Each list element is going to move CHUNK_SIZE * of data into system memory. Data is moved into local store, processed, and * written back to system memory NUM_LIST_ELEMENT chunks per loop iteration. 
*/ for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float)); /* fill the dma list with the appropriate lower 32bit effective address and size for * each dma list element. This dma list is used to gather the input data * from system memory */ fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); /* issue a DMA get list command to gather the NUM_LIST_ELEMENT chunks of data from system memory. * The data will be gathered into local buffer local_buffer_in */ mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA get list command to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS); /* fill the dma list with the appropriate lower 32 bit ea and size for each * dma list element. This dma list is used to scatter the output data to system memory */ fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); /* issue the DMA put list command to scatter the result from local memory to * different places in system memory */ mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA put list to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
/*
 * Blocks until the DMA transfers of both visibility buffers have completed.
 * Visibility tags are the buffer indices offset by NR_SAMPLE_BUFFERS.
 * Compiles to nothing when DO_DMA is 0.
 */
static inline void wait_for_dma_visibilities2(int buffer1, int buffer2)
{
#if DO_DMA
    mfc_write_tag_mask((1 << (buffer1 + NR_SAMPLE_BUFFERS)) |
                       (1 << (buffer2 + NR_SAMPLE_BUFFERS)));
    mfc_read_tag_status_all();
#endif
}
/*
 * Returns a local-store buffer into which triangle data for `context` can be
 * written, or NULL when none can be handed out.
 *
 * If a buffer is already allocated for the same context it is returned
 * unchanged. Otherwise the context's renderable cache line is read and the
 * current write position (endTriangle) is tested against all 16 chunk read
 * pointers; NULL is returned when the context has no cache line or when any
 * chunk that is not marked CHUNKNEXT_FREE_BLOCK fails the test.
 *
 * On success the module-level _currentTriangle* state is initialised for a
 * later write-back and the new _currentTriangle is returned.
 */
Triangle* getTriangleBuffer(Context* context)
{
	// if we've already allocated a triangle buffer (and we're in the same context)
	if (context == _currentTriangleContext && _currentTriangle)
		return _currentTriangle;

	// trash the default values
	_currentTriangleContext	= context;
	_currentTriangle	= NULL;

	// read the current renderable cache line to ensure there is room for the triangle data
	// in the cache line buffer; we do this by comparing against all 16 cache line blocks
	// to make sure that extending the write pointer wouldn't clobber the data

	unsigned long long cache_ea = context->renderableCacheLine;
	if (cache_ea == 0)
		return NULL;

	// over-allocate by 127 bytes so a 128-byte aligned address exists inside the array
	char cachebuffer[128+127];
	RenderableCacheLine* cache = (RenderableCacheLine*) ( ((unsigned int)cachebuffer+127) & ~127 );

	// printf("GTB: reading to %x from %x:%x\n", cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea));

	// fetch the 128-byte line with MFC_GETLLAR_CMD, then read the atomic status channel
	spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
	spu_readch(MFC_RdAtomicStat);

	// extendvalid = ( read<=write && test<end ) || ( read>write && test<read )
	// extendvalid = ( read>write && read>test ) || ( read<=write && end>test )
	// simplifies to	extendvalid = selb(end, read, read>write) > test
	// or			extendvalid = selb(end>test, read>test, read>write)
	// rewind = next >= end
	// rewindvalid = read != 0
	// valid = extendvalid && (!rewind || rewindvalid)
	// 	 = extendvalid && (!rewind || !rewindinvalid)
	// 	 = extendvalid && !(rewind && rewindinvalid)
	// invalid = ! (extendvalid && !(rewind && rewindinvalid))
	//         = (!extendvalid || (rewind && rewindinvalid))

	// write pointer splatted to all lanes; read pointers for chunks 0-7 and 8-15
	vec_ushort8 v_writeptr		= spu_splats( cache->endTriangle );
	vec_ushort8 v_readptr0		= cache->chunkTriangle[0];
	vec_ushort8 v_readptr1		= cache->chunkTriangle[1];
	vec_ushort8 v_testptr		= spu_add(v_writeptr,   TRIANGLE_MAX_SIZE);
	vec_ushort8 v_nextptr		= spu_add(v_writeptr, 2*TRIANGLE_MAX_SIZE);
	vec_ushort8 v_endptr		= spu_splats( (unsigned short)TRIANGLE_BUFFER_SIZE);

	vec_ushort8 v_zero		= spu_splats( (unsigned short) 0 );
	// shuffle pattern taking the odd bytes of two halfword vectors, i.e. one byte per lane
	vec_uchar16 v_merger		= (vec_uchar16) { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };

	// per chunk: the limit is the read pointer if it is ahead of the write pointer, else the buffer end
	vec_ushort8 v_max0_test		= spu_sel( v_endptr, v_readptr0, spu_cmpgt( v_readptr0, v_writeptr ) );
	vec_ushort8 v_max1_test		= spu_sel( v_endptr, v_readptr1, spu_cmpgt( v_readptr1, v_writeptr ) );

	vec_ushort8 v_extend0_valid	= spu_cmpgt( v_max0_test, v_testptr );
	vec_ushort8 v_extend1_valid	= spu_cmpgt( v_max1_test, v_testptr );

	vec_ushort8 v_rewind0_invalid	= spu_cmpeq( v_readptr0, v_zero );
	vec_ushort8 v_rewind1_invalid	= spu_cmpeq( v_readptr1, v_zero );

	vec_ushort8 v_rewind8		= spu_cmpgt( v_nextptr, v_endptr );

	// narrow the two 8-lane halfword masks into single 16-lane byte masks
	vec_uchar16 v_extend_valid	= (vec_uchar16) spu_shuffle( v_extend0_valid, v_extend1_valid, v_merger );
	vec_uchar16 v_rewind_invalid	= (vec_uchar16) spu_shuffle( v_rewind0_invalid, v_rewind1_invalid, v_merger );
	vec_uchar16 v_rewind		= (vec_uchar16) v_rewind8;

	// invalid = (rewind && rewindinvalid) || !extendvalid, as derived above
	vec_uchar16 v_valid_rhs		= spu_and( v_rewind_invalid, v_rewind );
	vec_uchar16 v_invalid		= spu_orc( v_valid_rhs, v_extend_valid );

	// check to see if the chunk is being processed
	vec_uint4 v_free = spu_gather(
		spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );
	// chunks marked free are excluded from the invalid set
	vec_uint4   v_invalid_bits	= spu_andc( spu_gather( v_invalid ), (vec_uint4) v_free );

	// if any of the bits are invalid, then no can do
	if ( spu_extract(v_invalid_bits, 0) ) {
		return NULL;
	}

	// fetch in the data before this triangle in the cache buffer
	unsigned int offset = cache->endTriangle;
	// number of bytes between the preceding 128-byte boundary and the write position
	_currentTriangleBufferExtra = offset & 127;
	unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (offset & ~127);
	if (_currentTriangleBufferExtra) {
		spu_mfcdma64(_currentTriangleBuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), 128, 0, MFC_GET_CMD);

		// ensure DMA did actually complete
		mfc_write_tag_mask(1<<0);
		mfc_read_tag_status_all();
	}

	// final bit of initialisation
	_currentTriangle = (Triangle*) (_currentTriangleBuffer+_currentTriangleBufferExtra);
	_currentTriangleOffset = offset;
	_currentTriangleRewind = v_rewind8;
	// effective address of the endTriangle field inside the cache line
	_currentTriangleCacheEndTriangleEAL = mfc_ea2l(cache_ea) + (((char*)&cache->endTriangle) - ((char*)cache));
	_currentTriangleCacheEndTriangleEAH = mfc_ea2h(cache_ea);
	_currentTriangleBufferEA = trianglebuffer_ea;

	// printf("Allocated new triangle buffer: %x\n", offset);

	// and return the buffer ready to go
	return _currentTriangle;
}
void process_data_simd (float* buf_in, float* buf_out, unsigned int size) { unsigned int i; vector float *vbuf_in, *vbuf_out; vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f}; vbuf_in = (vector float*) buf_in; vbuf_out = (vector float*) buf_out; for (i = 0; i < (size / 4); i++) { vbuf_out[i] = spu_add (vbuf_in[i], v1); } } int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; int i, num_chunks; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks * and fetches one 'chunk' of data at a time, process it, and write * it back to system memory until done. 
*/ for (i = 0; i < num_chunks; i++) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i* CHUNK_SIZE * sizeof(float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof(float)); /* issue a DMA get command to fetch the chunk of data from system memory */ mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into * local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE); /* issue the DMA put command to transfer result from local memory to * system memory */ mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA put to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks) { const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) { // merge lo bytes from unsigned shorts (array) 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) { // get busy flag with ones in unused bytes 0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0 }; const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0); char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ]; char sync_buffer[128+127]; void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 ); RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer; unsigned long long cache_ea; spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); while (cache_ea) { // terminate immediately if possible if (spu_stat_in_mbox()) return; // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); unsigned int endTriangle = cache->endTriangle; vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle); // first look for short chunks vec_uchar16 next = cache->chunkNext; vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 start0 = cache->chunkStart[0]; vec_ushort8 start1 = cache->chunkStart[1]; vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 starteq0 = spu_cmpeq( nstart0, 
spu_splats((unsigned short)0) ); vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) ); vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0); vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1); vec_ushort8 len0 = spu_sub( end0, start0); vec_ushort8 len1 = spu_sub( end1, start1); vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0); vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1); vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE ); vec_uint4 smallChunkGather = spu_gather(small); // check to see if chunk is already at the last triangle vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle( (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]), (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]), SHUFFLE_MERGE_BYTES) ); // check if the chunk is free vec_uint4 freeChunkGather = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); // check to see if the chunk is being processed vec_uint4 busyChunkGather = spu_gather( spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK), spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) ); // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0 // note that if freeChunkGather is true then busyChunkGather must also be true // done=false, free=false, busy=false -> can process // free=false, busy=false -> can be merged // decide which chunk to process vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather ); vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather ); vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) ); vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask ); /* if (!spu_extract(shortSelMask, 0)) printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x 
sel=%04x\n", spu_extract(mayProcessGather, 0) & 0xffff, spu_extract(smallChunkGather, 0), spu_extract(mayProcessShortGather, 0), spu_extract(shortSelMask, 0) & 0xffff, spu_extract(mayProcessSelection, 0) & 0xffff ); */ vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16); unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0); unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0); // if there's nothing to process, try the next cache line in the rendering tasks list if (!spu_extract(mayProcessBits, 0)) { trynextcacheline: cache_ea = cache->next; // sleep(); continue; } unsigned int chunkStart = cache->chunkStartArray [chunkToProcess]; unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess]; unsigned int chunkNext = cache->chunkNextArray [chunkToProcess] & CHUNKNEXT_MASK; unsigned int chunkEnd = (cache->chunkStartArray [chunkNext]-1) & (NUMBER_OF_TILES-1); unsigned int chunkLength = 1 + chunkEnd-chunkStart; // only need an extra block if the block is especially long if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) { freeChunk = 32; } // mark this block as busy cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT; // if there's at least one free chunk, claim it if (freeChunk != 32) { cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED; cache->chunkTriangleArray[freeChunk] = chunkTriangle; } // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) continue; #ifdef INFO printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle, freeChunk!=32 ? 
freeChunk : -1 ); // debug_render_tasks(cache); #endif Triangle* triangle; int firstTile; do { // read the triangle data for the current triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // get the triangle deltas firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd); if (firstTile>=0) break; // no match, try next triangle chunkTriangle = triangle->next_triangle; } while (chunkTriangle != endTriangle); // if we actually have something to process... if (firstTile>=0) { // the "normal" splitting will now become: // chunkStart .. (firstTile-1) -> triangle->next_triangle // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY) // (firstTile+NUM) .. 
chunkEnd -> chunkTriangle (FREE) int tailChunk; int thisChunk; int nextBlockStart; int thisBlockStart; int realBlockStart; do { retry: // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // calculate start of next block nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK; if (nextBlockStart > chunkEnd) nextBlockStart = chunkEnd+1; // calculate start of block to mark as busy thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK; if (thisBlockStart < chunkStart) thisBlockStart = chunkStart; realBlockStart = thisBlockStart; #ifdef INFO printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart); #endif // allocate some more free chunks vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq( spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16); unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); if (freeChunk == 32) { // if we didn't have one before, try again freeChunk = freeChunk2; // and try to get the second one freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) ); freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); } else { // speculatively clear the free chunk just in case we don't need it cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK; } #ifdef INFO printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n", _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart ); #endif // mark region after as available for processing if required if (nextBlockStart < chunkEnd) { if (freeChunk==32) { // if no free chunk, relinquish entire block and write back cache->chunkNextArray[chunkToProcess] = chunkNext; spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); // if writeback failed, we *might* have a free block, 
retry if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) goto retry; // otherwise give up and try the next cache line goto trynextcacheline; } cache->chunkStartArray[freeChunk] = nextBlockStart; cache->chunkNextArray[freeChunk] = chunkNext; cache->chunkTriangleArray[freeChunk] = chunkTriangle; cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT; tailChunk = freeChunk; #ifdef INFO printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess); debug_render_tasks(cache); #endif } else { // we're gonna use freeChunk2 for the "in front" block, as we've not // used freeChunk, let's use it as it's more likely to have a free chunk freeChunk2 = freeChunk; tailChunk = chunkNext; } // mark region before as available if required and possible thisChunk = chunkToProcess; if (thisBlockStart > chunkStart) { if (freeChunk2 != 32) { // mark this region as busy cache->chunkStartArray[freeChunk2]=thisBlockStart; cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT; cache->chunkTriangleArray[freeChunk2]=chunkTriangle; // mark region before as available for processing cache->chunkNextArray[chunkToProcess]=freeChunk2; cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle; thisChunk = freeChunk2; #ifdef INFO printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #endif } else { // need to keep whole block, update info and mark bust cache->chunkTriangleArray[chunkToProcess]=chunkTriangle; cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT; realBlockStart = chunkStart; printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #ifdef INFO #endif sleep(); } } // merge chunks merge_cache_blocks(cache); // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, 
MFC_PUTLLC_CMD); } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS); // finally after the write succeeded, update the variables chunkNext = tailChunk; chunkToProcess = thisChunk; chunkStart = firstTile; //thisBlockStart; chunkLength = nextBlockStart - firstTile; chunkEnd = chunkStart + chunkLength - 1; freeChunk = 32; // now we can process the block up to endTriangle initTileBuffers(thisBlockStart, chunkEnd); int ok=0; while (chunkTriangle != endTriangle) { #ifdef INFO printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n", _SPUID, chunkToProcess, chunkStart, chunkLength, chunkTriangle, firstTile, thisBlockStart); #endif // and actually process that triangle on these chunks processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok); ok=1; #ifdef PAUSE sleep(); #endif // and advance to the next-triangle chunkTriangle = triangle->next_triangle; // this should only ever happen if we're running really low on cache line slots // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles. 
// in this case, we process from thisBlockStart only (because we know that from // chunkStart to there has no result) and then we only process one triangle if (chunkStart != realBlockStart) { /* printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, " "firstTile=%d chunk=%d\n", _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess); debug_render_tasks(cache); */ // abort the while loop break; } // read the next triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // until chunkTriangle == endTriangle // flush any output buffers flushTileBuffers(thisBlockStart, chunkEnd); } // firstTile>=0
/*
 * Block until every outstanding DMA transfer in tag group `tag` has
 * completed.
 *
 * tag: MFC tag group id; valid ids are 0..31.
 */
void waitfor_matrix_io ( int tag )
{
    /* Shift an unsigned 1: with a signed int, tag 31 would shift into the
     * sign bit, which is undefined behavior. */
    mfc_write_tag_mask(1u << tag);
    mfc_read_tag_status_all();   /* wait for the data array DMA to complete */
}
/*
 * SPU entry point for the blocked matrix multiply.
 *
 * argp holds the effective address of the argument block, which is DMA'd
 * into `args` (128 bytes).  For each of `loops` repetitions, every
 * stsize x stsize tile of C in rows [args.i_initial, args.i_final) is
 * fetched, accumulated with the products of the matching A and B tiles, and
 * written back.
 *
 * speid and envp are unused.  Always returns 0.
 */
int main(unsigned long long speid, addr64 argp, addr64 envp)
{
    int bi, bj, bk ;
    int rep ;

    (void) speid ;
    (void) envp ;

    /* Here is the actual DMA call:                                        */
    /*  - the first parameter is the local store address for the data      */
    /*  - the second parameter holds the main memory address               */
    /*  - the third parameter holds the number of bytes to DMA             */
    /*  - the fourth parameter is the "tag" associated with this DMA       */
    /*    (a number between 0 and 31, inclusive)                           */
    /*  - the last two parameters are only useful if you've implemented    */
    /*    your own cache replacement management policy; otherwise use 0.   */
    mfc_get((void*)&args, argp.ull, 128, 31, 0, 0);

    /* Select the tag group to wait for: 1 left-shifted by the DMA's tag.  */
    /* The literal is unsigned because 1 << 31 overflows a signed int.     */
    mfc_write_tag_mask(1u << 31);

    /* Wait for the argument block DMA to complete. */
    mfc_read_tag_status_all();

    pA_matrix = args.Amat ;
    pB_matrix = args.Bmat ;
    pC_matrix = args.Cmat ;

    /* `rep` counts repetitions of the whole multiply.  It must be distinct */
    /* from the index of the innermost product loop below: sharing one      */
    /* variable lets the inner loop overwrite the repetition counter.       */
    for( rep=0; rep<loops; rep++ ) {
        for(bi=args.i_initial; bi<(int)args.i_final; bi+=stsize) {
            for(bj=0; bj<tsize; bj+=stsize) {

                /* Fetch the C tile that will accumulate the products (tag 30). */
                get_Cmatrix_segment ( 30, bi, bj );
                waitfor_matrix_io ( 30 ) ;

                for(bk=0; bk<tsize; bk+=stsize) {

                    /* Fetch the A and B tiles for this step (tag 31). */
                    get_Amatrix_segments ( 31, bi, bk );
                    get_Bmatrix_segments ( 31, bk, bj );
                    waitfor_matrix_io ( 31 ) ;

                    {
                        int i, j, kk;

                        /* CM0 += AM0 * BM0 on the local tiles. */
                        for (i=0;i<stsize;i++) {
                            for (j=0;j<stsize;j++) {
                                for (kk=0;kk<stsize;kk++) {
                                    CM0[i][j] += AM0[i][kk] * BM0[kk][j] ;
                                }
                            }
                        }
                    }
                }

                /* Write the finished C tile back and wait before reusing it. */
                put_Cmatrix_segment ( 30, bi, bj );
                waitfor_matrix_io ( 30 ) ;
            }
        }
    }

# ifdef NEVER
    {
        int i, j;
        for (i=0;i<32;i++) {
            for (j=0;j<10;j++) {
                printf(" %7.2f",iM0[i][j]);
            }
            printf("\n");
        }
        printf("\n\n");
    }
# endif

# ifdef NEVER
    {
        int i, j;
        for (i=0;i<tsize;i++) {
            for (j=0;j<10;j++) {
                printf(" %7.2f",fM0[i][j]);
            }
            printf("\n");
        }
        printf("\n\n");
    }
# endif

    return 0;
}
/*
 * SPU entry point for the OGR core.
 *
 * Fetches a CellOGRCoreArgs block from main memory (address in
 * argp.a32[1]), validates its two signature words, runs the core when the
 * node count is non-zero, then stamps the reply signatures and writes the
 * block back to the same address.
 *
 * Returns 0 on success, RETVAL_ERR_BAD_SIGN1/2 when the incoming signatures
 * are wrong, or RETVAL_ERR_TRASHED_SIGN1/2 when they were overwritten while
 * the core ran.  Core results are reported through ret_depth in the
 * argument block, not through the return value.
 */
int main(unsigned long long speid, addr64 argp, addr64 envp)
{
  /* Structure sizes and offsets must match what the assembly core expects. */
  STATIC_ASSERT(sizeof(struct OgrLevel) == 6*16+16+16);
  STATIC_ASSERT(sizeof(struct OgrState) == 2*16 + 8*16*29);
  STATIC_ASSERT(sizeof(CellOGRCoreArgs) == 16 + 2*16 + 8*16*29 + 16 + 16 + 16);
  STATIC_ASSERT(offsetof(CellOGRCoreArgs, state       ) == 16);
  STATIC_ASSERT(offsetof(CellOGRCoreArgs, state.Levels) == 16 + 32);
  STATIC_ASSERT(sizeof(u16) == 2); /* DMA fetches of pchoose */

  (void) speid;
  (void) envp;

  /* A single DMA tag is used for the whole program, so select it once. */
  mfc_write_tag_mask(1<<DMA_ID);

  /* Pull the argument block in from main memory and wait for it. */
  mfc_get(&myCellOGRCoreArgs, argp.a32[1], sizeof(CellOGRCoreArgs), DMA_ID, 0, 0);
  mfc_read_tag_status_all();

  s32 retval;

  if (myCellOGRCoreArgs.sign1 != SIGN_PPU_TO_SPU_1)
  {
    /* incoming arguments are corrupted: skip the core entirely */
    retval = RETVAL_ERR_BAD_SIGN1;
  }
  else if (myCellOGRCoreArgs.sign2 != SIGN_PPU_TO_SPU_2)
  {
    retval = RETVAL_ERR_BAD_SIGN2;
  }
  else
  {
    /* maxdepth seen on the previous run; keeps its value between calls */
    static int cached_maxdepth;

    struct OgrState* state    = &myCellOGRCoreArgs.state;
    int*             pnodes   = &myCellOGRCoreArgs.pnodes;
    u32              upchoose = myCellOGRCoreArgs.upchoose;

    /* A different maxdepth invalidates the cached groups. */
    if (state->maxdepth != cached_maxdepth)
    {
      cached_maxdepth = state->maxdepth;
      cleargroups();
    }

    /* The core will not handle nodes == 0, so only call it with work to do. */
    if (*pnodes)
      myCellOGRCoreArgs.ret_depth = ogr_cycle_256_test(state, pnodes, upchoose);

    /* Re-check the signatures: a mismatch now means the core trashed memory. */
    retval = 0;
    if (myCellOGRCoreArgs.sign1 != SIGN_PPU_TO_SPU_1)
      retval = RETVAL_ERR_TRASHED_SIGN1;
    else if (myCellOGRCoreArgs.sign2 != SIGN_PPU_TO_SPU_2)
      retval = RETVAL_ERR_TRASHED_SIGN2;

    update_groups_stats();
  }

  /* Stamp the reply signatures and publish the block back to main memory,
   * on the error paths as well. */
  myCellOGRCoreArgs.sign1 = SIGN_SPU_TO_PPU_1;
  myCellOGRCoreArgs.sign2 = SIGN_SPU_TO_PPU_2;
  mfc_put(&myCellOGRCoreArgs, argp.a32[1], sizeof(CellOGRCoreArgs), DMA_ID, 0, 0);
  mfc_read_tag_status_all();

  /* no status codes in ogr-ng, core info returned in ret_depth */
  return retval;
}
void process_image_2lines(struct image* img){ unsigned char *input, *output, *output2, *temp; unsigned int addr1, addr2, i, j, k, r1, g1, b1, r2, g2, b2; int block_nr = img->block_nr; vector unsigned char *v1_1, *v1_2, *v1_3, *v1_4, *v1_5; vector unsigned char *v2_1, *v2_2, *v2_3, *v2_4, *v2_5; // optimization unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width; unsigned int num_channels_X_img_width_X_SCALE_FACTOR = num_channels_X_img_width * SCALE_FACTOR; input = malloc_align(2 * num_channels_X_img_width_X_SCALE_FACTOR, 4); output = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4); output2 = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4); temp = malloc_align(2 * NUM_CHANNELS * img->width, 4); // first line v1_1 = (vector unsigned char *) &input[0]; v1_2 = (vector unsigned char *) &input[1 * num_channels_X_img_width]; v1_3 = (vector unsigned char *) &input[2 * num_channels_X_img_width]; v1_4 = (vector unsigned char *) &input[3 * num_channels_X_img_width]; v1_5 = (vector unsigned char *) temp; // second line v2_1 = (vector unsigned char *) &input[4 * num_channels_X_img_width]; v2_2 = (vector unsigned char *) &input[5 * num_channels_X_img_width]; v2_3 = (vector unsigned char *) &input[6 * num_channels_X_img_width]; v2_4 = (vector unsigned char *) &input[7 * num_channels_X_img_width]; v2_5 = (vector unsigned char *) &temp[num_channels_X_img_width]; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS * img->width / NUM_IMAGES_WIDTH; for (i = 0; i<img->height / SCALE_FACTOR / 2; i++){ // get 8 lines addr1 = ((unsigned int)img->src) + 2 * i * num_channels_X_img_width_X_SCALE_FACTOR; mfc_get(input, addr1, 2 * num_channels_X_img_width * SCALE_FACTOR, MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); // compute the 2 scaled line for (j = 0; 
j < num_channels_X_img_width / 16; j++){ v1_5[j] = spu_avg(spu_avg(v1_1[j], v1_2[j]), spu_avg(v1_3[j], v1_4[j])); v2_5[j] = spu_avg(spu_avg(v2_1[j], v2_2[j]), spu_avg(v2_3[j], v2_4[j])); } for (j = 0; j < img->width; j += SCALE_FACTOR){ r1 = g1 = b1 = 0; r2 = b2 = g2 = 0; for (k = j; k < j + SCALE_FACTOR; k++) { unsigned int k_X_NUM_CHANNELS = k * NUM_CHANNELS; r1 += temp[k_X_NUM_CHANNELS + 0]; g1 += temp[k_X_NUM_CHANNELS + 1]; b1 += temp[k_X_NUM_CHANNELS + 2]; r2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 0]; g2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 1]; b2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 2]; } r1 /= SCALE_FACTOR; b1 /= SCALE_FACTOR; g1 /= SCALE_FACTOR; r2 /= SCALE_FACTOR; b2 /= SCALE_FACTOR; g2 /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r1; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g1; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b1; output2[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r2; output2[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g2; output2[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b2; } //put the scaled line back mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block // trimite si al 2-lea set mfc_put(output2, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); } free_align(temp); free_align(input); free_align(output); free_align(output2); }
void process_image_double(struct image* img){ unsigned char *input[2], *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1[2], *v2[2], *v3[2], *v4[2], *v5; int buf, nxt_buf; //index of the buffer (0/1) input[0] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); input[1] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); //optimization unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width; v1[0] = (vector unsigned char *) &input[0][0]; v2[0] = (vector unsigned char *) &input[0][1 * num_channels_X_img_width]; v3[0] = (vector unsigned char *) &input[0][2 * num_channels_X_img_width]; v4[0] = (vector unsigned char *) &input[0][3 * num_channels_X_img_width]; v5 = (vector unsigned char *) temp; v1[1] = (vector unsigned char *) &input[1][0]; v2[1] = (vector unsigned char *) &input[1][1 * num_channels_X_img_width]; v3[1] = (vector unsigned char *) &input[1][2 * num_channels_X_img_width]; v4[1] = (vector unsigned char *) &input[1][3 * num_channels_X_img_width]; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * num_channels_X_img_width * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * num_channels_X_img_width / NUM_IMAGES_WIDTH; addr1 = ((unsigned int)img->src); buf = 0; // first data transfer mfc_getb(input[buf], addr1, SCALE_FACTOR * num_channels_X_img_width, 0, 0, 0); for (i = 1; i<img->height / SCALE_FACTOR; i++){ // get 4 lines nxt_buf = buf ^ 1; //ask for next data buffer from PPU //mfg_get with barrier addr1 = ((unsigned int)img->src) + i * num_channels_X_img_width * SCALE_FACTOR; mfc_getb(input[nxt_buf], addr1, SCALE_FACTOR * num_channels_X_img_width, nxt_buf, 0, 0); mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process current buffer for (j = 0; j < 
img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j = 0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // sent precedent buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); buf = nxt_buf; //prepare next iteration } mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process last buffer for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // send last buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); free_align(temp); free_align(input[0]); free_align(input[1]); free_align(output); }
int main (unsigned long long spe_id, unsigned long long argp, unsigned long long envp) { unsigned int id; int i, j, bufindex; vector float temp[4]; /* this is a set of 2 16K buffers */ vector float buf[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128))); vector float out[2][BLOCK][BLOCK/4] __attribute__ ((aligned(128))); vector unsigned char maskLeft = (vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17}; vector unsigned char maskRight = (vector unsigned char){0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f}; transpose_package_t package; /* location markers */ unsigned long long dataaddr = 0; int rowid, blockid, blockaddr, blockstart, row; int opporowid, oppoblockaddr; /* read in package */ mfc_get(&package, argp, sizeof(transpose_package_t), TAG, 0, 0); mfc_write_tag_mask(1<<TAG); mfc_read_tag_status_all(); id = package.id; blockstart = id * (N / THREADCNT / BLOCK) * BLOCK * sizeof(float); /* For each Row set (64 rows in a row set) * for each block * for each row in a block * read */ for (rowid = 0; rowid < N; rowid += BLOCK) { /* read in prebuf */ blockid = 0; blockaddr = blockstart + (blockid * sizeof(buf[0][0])); /* each rowset is 64 rows */ for (row = rowid; row < rowid + BLOCK; row++) { dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr; mfc_get( buf[blockid & 1][row % BLOCK], dataaddr, sizeof(buf[0][0]), 0, 0, 0); } /* each spu must walk 8 blocks per rowset */ for (blockid = 1; blockid < (N / THREADCNT / BLOCK); blockid++) { blockaddr = blockstart + (blockid * sizeof(buf[0][0])); /* each rowset is 64 rows */ for (row = rowid; row < rowid + BLOCK; row++) { dataaddr = package.srcbuf + (row * N * sizeof(float)) + blockaddr; mfc_get( buf[blockid & 1][row % BLOCK], dataaddr, sizeof(buf[0][0]), blockid & 1, 0, 0); } mfc_write_tag_mask(1 << (1 - (blockid & 1))); mfc_read_tag_status_all(); bufindex = (blockid & 1) ? 
0 : 1; /* transpose the previous block */ for (i = 0; i < BLOCK; i+= 4) { for (j = 0; j < BLOCK / 4; j++) { /* first phase */ temp[0] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskLeft); temp[1] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskRight); temp[2] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskLeft); temp[3] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskRight); /* second phase */ out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft); out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight); out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft); out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight); } } /* calculating opposite location! */ oppoblockaddr = rowid * sizeof(float); blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0])); opporowid = blockaddr / sizeof(float); /* write the block back out -> to the opposite location! */ for (row = opporowid; row < opporowid + BLOCK; row++) { dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr; mfc_put( out[1 - (blockid & 1)][row % BLOCK], dataaddr, sizeof(buf[0][0]), 1 - (blockid & 1), 0, 0); } } /* handle final block in row */ mfc_write_tag_mask(2); mfc_read_tag_status_all(); /* process remaining block */ bufindex = (blockid == 1) ? 
0 : 1; /* transpose the previous block */ /* i indexes the row */ for (i = 0; i < BLOCK; i+=4) { /* j indexes the column */ for (j = 0; j < BLOCK / 4; j++) { /* first phase */ temp[0] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskLeft); temp[1] = spu_shuffle( buf[bufindex][i][j], buf[bufindex][i+2][j], maskRight); temp[2] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskLeft); temp[3] = spu_shuffle( buf[bufindex][i+1][j], buf[bufindex][i+3][j], maskRight); /* second phase */ out[bufindex][j*4][i/4] = spu_shuffle(temp[0], temp[2], maskLeft); out[bufindex][(j*4)+1][i/4] = spu_shuffle(temp[0], temp[2], maskRight); out[bufindex][(j*4)+2][i/4] = spu_shuffle(temp[1], temp[3], maskLeft); out[bufindex][(j*4)+3][i/4] = spu_shuffle(temp[1], temp[3], maskRight); } } /* calculating opposite for the previous block */ blockaddr = blockstart + ((blockid - 1) * sizeof(buf[0][0])); oppoblockaddr = rowid * sizeof(float); opporowid = blockaddr / sizeof(float); /* write the block back out -> to the opposite location! */ for (row = opporowid; row < opporowid + BLOCK; row++) { dataaddr = package.destbuf + (row * N * sizeof(float)) + oppoblockaddr; mfc_put( out[bufindex][row % BLOCK], dataaddr, sizeof(buf[0][0]), 1, 0, 0); } mfc_read_tag_status_all(); } return 0; }
void work(param_t param) { printf("SPU[%u] work()\n", param.proc); unsigned int inbox, offset; unsigned int *in = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *out = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *use = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *def = malloc_align(param.bitset_size, ALIGN_EXP); if(in == NULL || out == NULL || use == NULL || def == NULL) { printf("malloc_align() failed\n"); exit(1); } unsigned tag_1, tag_2, tag_3, tag_4; unsigned int tag_id; /* Reserve a tag for application usage */ if ((tag_1 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_1\n"); } if ((tag_2 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_2\n"); } if ((tag_3 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_3\n"); } if ((tag_4 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_4\n"); } while(1) { inbox = spu_read_in_mbox(); if(inbox == UINT_MAX) { printf("SPU[%u] received exit signal.. 
exiting.\n", param.proc); return; } offset = param.bitset_subsets*inbox; mfc_get(in, (unsigned int) (param.bs_in_addr + offset), param.bitset_size, tag_1, 0, 0); mfc_get(out, (unsigned int) (param.bs_out_addr + offset), param.bitset_size, tag_2, 0, 0); mfc_get(use, (unsigned int) (param.bs_use_addr + offset), param.bitset_size, tag_3, 0, 0); mfc_get(def, (unsigned int) (param.bs_def_addr + offset), param.bitset_size, tag_4, 0, 0); mfc_write_tag_mask(1 << tag_1 | 1 << tag_2 | 1 << tag_3 | 1 << tag_4); mfc_read_tag_status_all(); D(printf("SPU[%d] index: %u bitset_subsets: %u offset: %u\n", param.proc, inbox, param.bitset_subsets, offset); printf("SPU[%d]\t&use: %p\n\t&def: %p\n\t&out: %p\n\t&in: %p\n", param.proc, (void*)param.bs_use_addr, (void*)param.bs_def_addr, (void*)param.bs_out_addr, (void*)param.bs_in_addr); void *tmp_ptr = (void*) (param.bs_use_addr + offset); printf("SPU[%d] read\t\t&%p = use(%p)={", param.proc, (void*)use, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(use, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_def_addr + offset); printf("SPU[%d] read\t\t&%p = def(%p)={", param.proc, (void*)def, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(def, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_out_addr + offset); printf("SPU[%d] read\t\t&%p = out(%p)={", param.proc, (void*)out, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(out, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_in_addr + offset); printf("SPU[%d] read\t\t&%p = in (%p)={", param.proc, (void*)in, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(in, i) ) { printf("%d ", i); } } printf("}\n")); bitset_megaop(param, in, out, use, def); D(printf("SPU[%d] calculated\tin={", param.proc); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(in, i) ) { printf("%d ", i); } } printf("}\n");) mfc_put(in, (unsigned int) (param.bs_in_addr + offset), param.bitset_size, tag_1, 
0, 0); mfc_write_tag_mask(1 << tag_1); mfc_read_tag_status_all(); spu_write_out_intr_mbox(inbox); }
/*
 * Loads the scene using DMA - blocks until done.
 *
 * Fetches the scene_t at effective address `ea` into `*scene`, then allocates
 * local buffers for its objects, its lights and the vertex list of every
 * polygon object, and fetches each of them as well.  On return
 * scene->objects, scene->lights and each polygon's poly_obj.vertex hold the
 * local buffer addresses instead of effective addresses.
 *
 * If a buffer cannot be allocated no DMA is issued for it: the pointer is
 * left null and the matching count (nObjects, nLights or nVerticies) is set
 * to zero, so later code iterating by count never touches the null pointer.
 *
 * The allocated buffers are not freed here.
 *
 * NOTE(review): every array is fetched with one DMA command; confirm that the
 * sizes involved satisfy the MFC per-command size and alignment rules.
 */
void load_scene(unsigned long long ea, scene_t *scene)
{
    unsigned int i;
    object3d_t *objects;
    pointlight_t *lights;
    point_t *v;

#if defined(_DEBUG) && _DEBUG > 2
    printf("Transferring %lu bytes to LSaddr(%p) from EAadd(%8X:%8X) for SCENE\n",
           (unsigned long)sizeof(scene_t), (void *)scene,
           (unsigned int)mfc_ea2h(ea), (unsigned int)mfc_ea2l(ea));
#endif

    /* DMA request for scene */
    spu_mfcdma64(scene, mfc_ea2h(ea), mfc_ea2l(ea), sizeof(scene_t),
                 SPUDMA_SCENE, MFC_GET_CMD);

    /* the counts and addresses used below come from the scene, so wait */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_SCENE);
    mfc_read_tag_status_all();

    /* copy over objects */
    objects = _malloc_align(sizeof(object3d_t) * scene->nObjects, 4);
    if (!objects) {
        scene->nObjects = 0;
    } else {
#if defined(_DEBUG) && _DEBUG > 2
        printf("Transferring %lu bytes to LSaddr(%p) from EAadd(%8X:%8X) for OBJECTS\n",
               (unsigned long)(sizeof(object3d_t) * scene->nObjects), (void *)objects,
               (unsigned int)mfc_ea2h(scene->objects_ea),
               (unsigned int)mfc_ea2l(scene->objects_ea));
#endif
        /* initiate DMA */
        spu_mfcdma64(objects, mfc_ea2h(scene->objects_ea), mfc_ea2l(scene->objects_ea),
                     sizeof(object3d_t) * scene->nObjects, SPUDMA_OBJECTS, MFC_GET_CMD);
    }

    /* copy over lights */
    lights = _malloc_align(sizeof(pointlight_t) * scene->nLights, 4);
    if (!lights) {
        scene->nLights = 0;
    } else {
#if defined(_DEBUG) && _DEBUG > 2
        printf("Transferring %lu bytes to LSaddr(%p) from EAadd(%8X:%8X) for LIGHTS\n",
               (unsigned long)(sizeof(pointlight_t) * scene->nLights), (void *)lights,
               (unsigned int)mfc_ea2h(scene->lights_ea),
               (unsigned int)mfc_ea2l(scene->lights_ea));
#endif
        /* initiate DMA for lights */
        spu_mfcdma64(lights, mfc_ea2h(scene->lights_ea), mfc_ea2l(scene->lights_ea),
                     sizeof(pointlight_t) * scene->nLights, SPUDMA_LIGHTS, MFC_GET_CMD);
    }

    /* wait for objects to complete - the loop below reads them */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_OBJECTS);
    mfc_read_tag_status_all();

    /* assign local store pointer to objects */
    scene->objects = objects;

    /* iterate each object locally */
    for (i = 0; i < scene->nObjects; ++i) {
        if (objects[i].geometryType != GEOMETRY_POLYGON) {
            continue;
        }

        /* allocate memory for vertex */
        v = _malloc_align(sizeof(point_t) * objects[i].poly_obj.nVerticies, 4);
        if (!v) {
            objects[i].poly_obj.nVerticies = 0;
        } else {
            /* initiate DMA to get verticies */
            spu_mfcdma64(v, mfc_ea2h(objects[i].poly_obj.vertex_ea),
                         mfc_ea2l(objects[i].poly_obj.vertex_ea),
                         sizeof(point_t) * objects[i].poly_obj.nVerticies,
                         SPUDMA_VERTEXES, MFC_GET_CMD);
        }

        /* Only the buffer address is stored here; this function does not read
         * the vertex data, and it waits for the transfer before returning. */
        objects[i].poly_obj.vertex = v;
    }

    /* wait for all DMA to finish (vertexes, lights) */
    spu_writech(MFC_WrTagMask, 1 << SPUDMA_LIGHTS | 1 << SPUDMA_VERTEXES);
    mfc_read_tag_status_all();

    /* assign local store lights pointer */
    scene->lights = lights;
}