int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { int i; unsigned int tag_id; /* Reserve a tag for application usage */ if ((tag_id = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag\n"); return 1; } /* Here is the actual DMA call */ /* the first parameter is the address in local store to place the data */ /* the second parameter holds the main memory address */ /* the third parameter holds the number of bytes to DMA */ /* the fourth parameter identifies a "tag" to associate with this DMA */ /* (this should be a number between 0 and 31, inclusive) */ /* the last two parameters are only useful if you've implemented your */ /* own cache replacement management policy. Otherwise set them to 0. */ mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0); /* Now, we set the "tag bit" into the correct channel on the hardware */ /* this is always 1 left-shifted by the tag specified with the DMA */ /* for whose completion you wish to wait. */ mfc_write_tag_mask(1<<tag_id); /* Now, issue the read and wait to guarantee DMA completion before we */ /* continue. */ mfc_read_tag_status_all(); /* DMA the data from system memory to our local store buffer. */ mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0); printf("Address received through control block = 0x%llx\n", cb.addr); /* Wait for the data array DMA to complete. */ mfc_read_tag_status_all(); /* Verify that the data array contains a valid fibonacci sequence. */ for (i=2; i<DATA_BUFFER_ENTRIES; i++) { if (data[i] != data[i-1] + data[i-2]) { printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n", i, data[i-1] + data[i-2], data[i]); return (1); } } return 0; }
int main2mod(unsigned long long spe_id, unsigned long long program_data_ea, unsigned long long env) { unsigned tagid = spe_id&31; uint32 i,j; // get program data mfc_get(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); // precompute partial working states based on ihv & partial msg block pre_compute(pd.ihv1, pd.ihv2, pd.m1, pd.m2); if (pd.collisiondata > 0) { j = pd.collisiondata*8; vec_uint32* bufferptr = &buffer[j]; // get the trail buffer for (i = 0; i < j; i += 128) mfc_get(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); // process collision trails reduce_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); reduce_trails2mod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); find_collmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr); // store the trail buffer for (i = 0; i < j; i += 128) mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); } else { // fill the trail buffer in steps and do intermediate DMA transfers vec_uint32* bufferptr = &buffer[0]; for (i = 0; i < BUFFERSIZE; i += 256) { bufferptr = generate_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, bufferptr, &buffer[i+256]); mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0); mfc_put(&buffer[i+128], &pd.buffer[i+128], sizeof(vec_uint32)*128, tagid, 0, 0); } } // transfer the current program data back mfc_put(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0); // wait for dma transfers to complete mfc_write_tag_mask(1<<tagid); mfc_read_tag_status_all(); return 0; }
/*
 * Issue the DMA gets needed to copy `size` bytes from main-memory address
 * `from` to local address `to`, using pieces of at most 16384 bytes.
 *
 * All requests are queued on `tag`; this function does not wait for them.
 * Nothing is issued when size is zero or negative.
 */
void getlarge( void* to, unsigned long from, int size, int tag )
{
    unsigned long dst = (unsigned long)to;
    unsigned long src = (unsigned long)from;
    int remaining;

    /* Full 16384-byte pieces first. */
    for (remaining = size; remaining >= 16384; remaining -= 16384) {
        mfc_get((void*)dst, (unsigned long)src, 16384, tag, 0, 0);
        dst += 16384;
        src += 16384;
    }

    /* Then whatever is left over, if anything. */
    if (remaining > 0) {
        mfc_get((void*)dst, (unsigned long)src, remaining, tag, 0, 0);
    }
}
void triad() { int i, j, n; vector float s = spu_splats(args.scalar); n = SIZE * sizeof(float); for (i = 0; (i + SIZE) < args.N; i += SIZE) { mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0); mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); for (j = 0; j < (SIZE / 4); ++j) ls3[j] = spu_madd(s, ls2[j], ls1[j]); mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0); } mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); if (unlikely(i < args.N)) { /* * args.N - i will be smaller than SIZE at this point so * it is safe to do a DMA transfer. * We need to make sure that size is a multiple of 16. */ n = ((args.N - i) * sizeof(float)) & (~127); mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0); mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); /* n must be divisible by 4. */ for (j = 0; j < ((args.N - i) / 4); ++j) ls3[j] = spu_madd(s, ls2[j], ls1[j]); mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0); mfc_write_tag_mask(1 << TAG); mfc_read_tag_status_all(); } /* * At this point it may be that i is still smaller than args.N if the length * was not divisible by the number of SPUs times 16. */ }
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp) { int i = 0; ppu_data_t ppu_data __attribute__ ((aligned(16))); tag_id = mfc_tag_reserve(); if (tag_id == MFC_TAG_INVALID){ printf("SPU: ERROR can't allocate tag ID\n"); return -1; } /* Obtin prin DMA structura cu pointeri, nr de frame-uri si spe_id */ dprintf("SPU: am intrat in spu %llx %lu %llx\n", speid, sizeof(ppu_data_t), envp); mfc_get((void*)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0); waittag(tag_id); dprintf("SPU: speid:%llx got struct\n", speid); dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n", speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image, ppu_data.num_frames); speid = speid; /* Frame processing goes here */ for (i = 0; i < ppu_data.num_frames; ++i) { process_frame(ppu_data, i); } return 0; }
/*
 * SPU entry point: upper-cases 16 bytes of text in main memory.
 *
 * ea     - main-memory address of the 16 bytes to convert (read, then
 *          overwritten with the converted text)
 * outptr - main-memory address of a 32-bit word that is set to 1 once the
 *          converted text has been written back
 * arg3, arg4 - unused
 *
 * Only bytes in the range 'a'..'z' are changed. A plain "greater than
 * 'a'-1" test would also match every byte above 'z' ('{', '~', bytes with
 * the top bit set, ...) and corrupt them, so the mask is built from both a
 * lower and an upper bound.
 */
int main(uint64_t ea, uint64_t outptr, uint64_t arg3, uint64_t arg4)
{
	/* memory-aligned buffer (vectors always are properly aligned) */
	volatile vec_uchar16 v;

	/* fetch the 16 bytes using dma */
	mfc_get(&v, ea, 16, TAG, 0, 0);
	wait_for_completion();

	/* work on a plain copy of the received bytes */
	vec_uchar16 text = v;

	/* all-ones in every byte that is >= 'a' ... */
	vec_uchar16 ge_a = spu_cmpgt(text, spu_splats((unsigned char)('a'-1)));
	/* ... and in every byte that is <= 'z' (unsigned byte compare) */
	vec_uchar16 le_z = spu_cmpgt(spu_splats((unsigned char)('z'+1)), text);

	/* 0x20 for lower-case letters, 0 for every other byte */
	vec_uchar16 sub = spu_splats((unsigned char)0x20) & ge_a & le_z;

	/* convert all small characters to capitals */
	v = text - sub;

	/* send the updated vector to ppe */
	mfc_put(&v, ea, 16, TAG, 0, 0);
	wait_for_completion();

	/* send a message to inform the ppe program that the work is done */
	uint32_t ok __attribute__((aligned(16))) = 1;
	mfc_put(&ok, outptr, 4, TAG, 0, 0);
	wait_for_completion();

	/* properly exit the thread */
	spu_thread_exit(0);
	return 0;
}
void process_image_simple(struct image* img){ unsigned char *input, *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1, *v2, *v3, *v4, *v5 ; input = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); v1 = (vector unsigned char *) &input[0]; v2 = (vector unsigned char *) &input[1 * img->width * NUM_CHANNELS]; v3 = (vector unsigned char *) &input[2 * img->width * NUM_CHANNELS]; v4 = (vector unsigned char *) &input[3 * img->width * NUM_CHANNELS]; v5 = (vector unsigned char *) temp; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS * img->width / NUM_IMAGES_WIDTH; for (i=0; i<img->height / SCALE_FACTOR; i++){ //get 4 lines addr1 = ((unsigned int)img->src) + i * img->width * NUM_CHANNELS * SCALE_FACTOR; mfc_get(input, addr1, SCALE_FACTOR * img->width * NUM_CHANNELS, MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); //compute the scaled line for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[j], v2[j]), spu_avg(v3[j], v4[j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } //put the scaled line back mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block 
mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); } free_align(temp); free_align(input); free_align(output); }
int cacheGetPrime(int n) { if ((n < primeCacheStart + primeCacheSize) && (n > primeCacheStart)) { int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; } // Haal op. uint32_t tag, size; tag = mfc_tag_reserve(); size = CACHE_PRIME_SIZE*16; unsigned long long EA = setup.vPrimes + (n - n%4) * 4; mfc_get(&primeCacheData, EA, size, tag, 0, 0); mfc_write_tag_mask(1 << tag); mfc_read_tag_status_all(); mfc_tag_release(tag); primeCacheStart = n - (n % 4); int r = spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4); return r; }
/*
 * Start the first particle preload.
 *
 * Obtains a tag from get_tagid() for tag_preload, resets tag_store to 0 and
 * queues a DMA get of `size` bytes from main-memory address `ea` into the
 * local buffer `ls`. The transfer is not waited for here.
 */
void first_preload_particle(volatile void *ls,
                            unsigned long long ea,
                            unsigned long size)
{
    tag_preload = get_tagid();
    tag_store = 0;

    mfc_get(ls, ea, size, tag_preload, 0, 0);
}
/*
 * SPU entry point of a DMA/copy benchmark.
 *
 * Reads three words from the inbound mailbox (repeat count, transfer size
 * in bytes, and a function selector that is not used here), fetches that
 * many bytes from program_data_ea into a local buffer and then copies the
 * buffer byte by byte `rep` times.
 *
 * The transfer size comes from outside and is used both as the DMA length
 * into array[MAX] and as the copy bound into arr[MAX], so a size larger
 * than MAX is rejected instead of overrunning the two stack buffers.
 *
 * spe_id and env are unused.
 * Returns 0 on success, 1 when the requested size does not fit.
 */
int main(unsigned long long spe_id, unsigned long long program_data_ea,unsigned long long env)
{
    char array[MAX] __attribute__((aligned(128)));
    char arr[MAX];
    const unsigned int tag = 1;
    const unsigned long int array_size = 32768;
    unsigned long int rep;
    unsigned long int data_size;
    unsigned int chunk_size;
    unsigned int count, count1;

    spu_write_decrementer(0);

    rep = spu_read_in_mbox();
    data_size = spu_read_in_mbox();
    /* The third word is still read so the mailbox is drained as before. */
    (void)spu_read_in_mbox();

    /* Refuse sizes that would overflow array[] and arr[]. */
    if (data_size > (unsigned long int)MAX) {
        return 1;
    }
    chunk_size = data_size;

    mfc_get(array, (unsigned int)program_data_ea, chunk_size, tag, 0, 0);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_any();

    for (count = 0; count < rep; count++) {
        for (count1 = 0; count1 < chunk_size; count1++) {
            /* count1 < chunk_size <= MAX, so the index stays inside arr. */
            arr[count1%array_size] = array[count1];
        }
    }

    return 0;
}
void compute() { // Compute my portion to compute int my_rows = rows / nspe + (rank < rows % nspe); int offset = rank * (rows / nspe) + std::min(rank, rows % nspe); #if DEBUG printf("Compute (%d/%d %d, %d) %d/%d\n", my_rows, rows, offset, cols, rank, nspe); #endif int tag = 23; uint64_t pin0 = in0 + offset * cols * sizeof(float); uint64_t pin1 = in1 + offset * cols * sizeof(float); uint64_t pin2 = in2 + offset * cols * sizeof(float); uint64_t pout = out + offset * cols * sizeof(float); float buf[4*cols]; float* buf0 = buf + 0*cols; float* buf1 = buf + 1*cols; float* buf2 = buf + 2*cols; float* buf3 = buf + 3*cols; for (int r=0; r<my_rows; ++r) { mfc_get(buf0, pin0, cols*sizeof(float), tag, 0, 0); mfc_get(buf1, pin1, cols*sizeof(float), tag, 0, 0); mfc_get(buf2, pin2, cols*sizeof(float), tag, 0, 0); pin0 += cols * sizeof(float); pin1 += cols * sizeof(float); pin2 += cols * sizeof(float); // Wait for DMAs to complete mfc_write_tag_mask(1<<tag); mfc_read_tag_status_all(); for (int c=0; c<cols; ++c) buf3[c] = buf0[c] * buf1[c] + buf2[c]; mfc_put(buf3, pout, cols*sizeof(float), tag, 0, 0); pout += cols * sizeof(float); } mfc_write_tag_mask(1<<tag); mfc_read_tag_status_all(); }
/*
 * Queue DMA gets for a segment of the B matrix.
 *
 * For the stsize rows starting at row seg_i, stsize floats beginning at
 * column seg_j of *pB_matrix are fetched into consecutive rows of BM0
 * (starting at row 0). All requests use `tag`; nothing is waited for here.
 */
void get_Bmatrix_segments ( int tag, int seg_i, int seg_j )
{
    int src_row, dst_row;

    for (src_row = seg_i, dst_row = 0; src_row < (seg_i+stsize); src_row++, dst_row++) {
        mfc_get((void*)&(BM0[dst_row][0]),
                (unsigned long )&((*pB_matrix)[src_row][seg_j]),
                stsize*sizeof(float), tag, 0, 0);
    }
}
/*
 * Obtain read-only access to `size` bytes located at effective address `ea`.
 *
 * On SPU / libspe2 builds a DMA get into `ls` is queued on `tag` (it is not
 * waited for here) and `ls` is returned. On every other build the data is
 * already directly addressable, so `ea` itself is returned as a pointer and
 * `ls` is left untouched.
 *
 * tid and rid are accepted for interface compatibility and ignored.
 */
void* cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	mfc_get(ls,ea,size,tag,0,0);
	return ls;
#else
	/* Convert through uintptr_t: going through a 32-bit integer would cut
	 * off the upper half of the address on 64-bit hosts. */
	return (void*)(uintptr_t)ea;
#endif
}
void pull(int side){ int avail_in = num_free_in_buffer(side); int avail_mm = mcb[am].data_size[side] - md[am].num_pulled[side]; int num_pull = avail_in < avail_mm ? avail_in : avail_mm; num_pull = num_pull < MAX_DMA_SIZE ? num_pull : MAX_DMA_SIZE; int head = spu_extract(md[am].idx[side][HEAD],0); int avail_from_head = mcb[am].buffer_size[side] - head; int first_pull = num_pull < avail_from_head ? num_pull : avail_from_head; if(!first_pull) return; // pull #first_pull unsigned int to_ea = (unsigned int) &md[am].buffer[side][head]; int tag = mfc_tag_reserve(); if(tag == MFC_TAG_INVALID){ return; } else { md[am].held_tag[side] = tag; } mfc_get((void*)to_ea, mcb[am].block_addr[side], first_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += first_pull * sizeof(vector signed int); if(first_pull < num_pull){ to_ea = (unsigned int) &md[am].buffer[side][0]; int second_pull = num_pull - first_pull; mfc_get((void*)to_ea, mcb[am].block_addr[side], second_pull * sizeof(vector signed int), md[am].held_tag[side], 0,0); mcb[am].block_addr[side] += second_pull * sizeof(vector signed int); } md[am].num_waiting[side] = num_pull; }
/**
 * Get arguments from main memory synchronously.
 *
 * Fetches the spe_argv_t at argvp into `argv`, waits for it, and spreads
 * the values over the transfer descriptors:
 *   - conc/wind/diff/buff[0..2] all get the base address of slot 0
 *     (arg 0, 1, 2 and again 0 respectively) and the length from
 *     arg[5].u32[0];
 *   - the clist/wlist/dlist and shuffle lengths are copied from those.
 *
 * @param argvp  main-memory address of the argument block
 * @param dt     receives arg[3].dbl
 * @param size   receives arg[4].dbl
 * @param block  receives arg[5].u32[1]
 */
void get_transport_argv(uint64_t argvp, real_t *dt, real_t *size, uint32_t *block)
{
    int k;

    mfc_get(&argv, argvp, sizeof(spe_argv_t), GET_ARG_TAG_MASK, 0, 0);
    wait_for_dma(GET_ARG_TAG_MASK);

    /* Slot 0 comes straight from the argument block. */
    conc[0].ea_base = argv.arg[0].u64;
    wind[0].ea_base = argv.arg[1].u64;
    diff[0].ea_base = argv.arg[2].u64;
    buff[0].ea_base = argv.arg[0].u64;

    conc[0].length = argv.arg[5].u32[0];
    wind[0].length = conc[0].length;
    diff[0].length = conc[0].length;
    buff[0].length = conc[0].length;

    /* Slots 1 and 2 mirror slot 0. */
    for (k = 1; k <= 2; k++) {
        conc[k].ea_base = conc[0].ea_base;
        wind[k].ea_base = wind[0].ea_base;
        diff[k].ea_base = diff[0].ea_base;
        buff[k].ea_base = buff[0].ea_base;

        conc[k].length = conc[0].length;
        wind[k].length = wind[0].length;
        diff[k].length = diff[0].length;
        buff[k].length = buff[0].length;
    }

    /* List lengths: entries 0 and 1 follow their own slot, entry 2 follows
     * slot 1. */
    for (k = 0; k <= 2; k++) {
        int src = (k < 2) ? k : 1;

        clist[k].length = conc[src].length;
        wlist[k].length = wind[src].length;
        dlist[k].length = diff[src].length;
    }

    shuffle[0].length = conc[0].length;
    shuffle[1].length = wind[0].length;
    shuffle[2].length = diff[0].length;
    shuffle[3].length = buff[0].length;

    *dt = argv.arg[3].dbl;
    *size = argv.arg[4].dbl;
    *block = argv.arg[5].u32[1];
}
/*
 * SPU entry point of the frame-drawing thread.
 *
 * arg1 is the main-memory address of the shared spustr_t; it is kept in
 * spu_ea and fetched into `spu` once at start-up. After that the thread
 * waits on signal channel 1: every non-zero value is taken as a buffer
 * address, `spu` is refreshed, a frame is drawn into that buffer and a
 * response of 1 is sent. A value of 0 ends the thread.
 *
 * arg2..arg4 are unused.
 */
int main(uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4)
{
	uint32_t buffer_ea;

	/* get data structure */
	spu_ea = arg1;
	mfc_get(&spu, spu_ea, sizeof(spustr_t), TAG, 0, 0);
	wait_for_completion(TAG);

	/* main loop: wait for screen address or 0 to end */
	for (;;) {
		buffer_ea = spu_read_signal1();
		if (buffer_ea == 0)
			break;

		/* refresh the shared structure before drawing */
		mfc_get(&spu, spu_ea, sizeof(spustr_t), TAG, 0, 0);
		wait_for_completion(TAG);

		draw_frame(buffer_ea);
		send_response(1);
		wait_for_completion(TAG);
	}

	/* properly exit the thread */
	spu_thread_exit(0);
	return 0;
}
/* * The argv argument will be populated with the address that the PPE provided, * from the 4th argument to spe_context_run() */ int main(uint64_t speid, uint64_t argv, uint64_t envp) { struct spe_args args __attribute__((aligned(SPE_ALIGN))); mfc_get(&args, argv, sizeof(args), 0, 0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); cmap_calls = 0; dma_puts = 0; spu_write_decrementer(-1); // Run multiple renders with offsets. Should be factored into render_fractal() render_fractal(&args.fractal, args.thread_idx, args.n_threads, 0.); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 7 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 5 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 2); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 8); // Send remaining points if(fill%2048) { // select the last buffer used int f = fill / 2048; mfc_put(&points[f*2048], (uint)args.fractal.pointbuf[f], 16384, 0, 0, 0); // Block for completion mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // Send a message with top bit set to indicate final item spu_write_out_intr_mbox((1<<31)|f); // Send another message indicating count spu_write_out_intr_mbox(fill%2048); ++dma_puts; } // Report some stats uint ticks = -1 - spu_read_decrementer(); printf("cmap calls %d ticks %u calls/tick %f\n", cmap_calls, ticks, (double)cmap_calls/ticks ); printf("dma puts %d\n", dma_puts); return 0; }
/**
 * Get arguments from main memory synchronously.
 *
 * Fetches the spe_argv_t at argvp into `argv`, waits for it, and unpacks
 * it: both conc descriptors get arg[0].u64 as base address and NSPEC as
 * length, TIME and DT come from arg[1] and arg[2].
 *
 * @param argvp  main-memory address of the argument block
 * @param rows   receives arg[3].u32[0]
 */
void get_chemistry_argv(uint64_t argvp, uint32_t* rows)
{
    mfc_get(&argv, argvp, sizeof(spe_argv_t), GET_ARG_TAG_MASK, 0, 0);
    wait_for_dma(GET_ARG_TAG_MASK);

    /* Both concentration descriptors describe the same region. */
    conc[1].ea_base = conc[0].ea_base = argv.arg[0].u64;
    conc[1].length  = conc[0].length  = NSPEC;

    TIME = argv.arg[1].dbl;
    DT = argv.arg[2].dbl;

    *rows = argv.arg[3].u32[0];
}
int main(uint64_t speid, uint64_t argp, uint64_t envp){ unsigned int data[NUM_STREAMS]; unsigned int num_spus = (unsigned int)argp, i, num_images; struct image my_image __attribute__ ((aligned(16))); int mode = (int)envp; speid = speid; //get rid of warning while(1){ num_images = 0; for (i = 0; i < NUM_STREAMS / num_spus; i++){ //assume NUM_STREAMS is a multiple of num_spus while(spu_stat_in_mbox() == 0); data[i] = spu_read_in_mbox(); if (!data[i]) return 0; num_images++; } for (i = 0; i < num_images; i++){ mfc_get(&my_image, data[i], sizeof(struct image), MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); switch(mode){ default: case MODE_SIMPLE: process_image_simple(&my_image); break; case MODE_2LINES: process_image_2lines(&my_image); break; case MODE_DOUBLE: process_image_double(&my_image); break; case MODE_DMALIST: process_image_dmalist(&my_image); break; } } data[0] = DONE; spu_write_out_intr_mbox(data[0]); } return 0; }
static void cleargroups(void) { unsigned i; for (i = 0; i < GROUPS_COUNT; i++) { group_keysvectors[i] = spu_splats((u16) 0); group_insertpos[i] = spu_splats((u32) 0); #ifdef GET_CACHE_STATS group_length[i] = 0; #endif } /* All vectors now points to group0, so fill all entries with true data for group 0 */ mfc_get(group_values[0][0], myCellOGRCoreArgs.upchoose, GROUP_ELEMENTS * 2, DMA_ID, 0, 0); mfc_read_tag_status_all(); for (i = 1; i < GROUPS_COUNT * GROUPS_LENGTH; i++) memcpy(group_values[0][i], group_values[0][0], GROUP_ELEMENTS * 2); }
/**
 * Get arguments from main memory synchronously.
 *
 * Fetches the spe_argv_t at argvp into `argv` on tag 31, waits for it, and
 * unpacks it: both conc descriptors get arg[0].u64 as base address and
 * NSPEC as length, TIME and DT come from arg[1] and arg[2]. The whole call
 * is bracketed by the metrics.comm timer.
 *
 * @param argvp  main-memory address of the argument block
 * @param rows   receives arg[3].u32[0]
 */
void get_chemistry_argv(uint64_t argvp, uint32_t* rows)
{
    timer_start(&metrics.comm);

    mfc_get(&argv, argvp, sizeof(spe_argv_t), 31, 0, 0);
    wait_for_dma(31);

    /* Both concentration descriptors describe the same region. */
    conc[1].ea_base = conc[0].ea_base = argv.arg[0].u64;
    conc[1].length  = conc[0].length  = NSPEC;

    TIME = argv.arg[1].dbl;
    DT = argv.arg[2].dbl;

    *rows = argv.arg[3].u32[0];

    timer_stop(&metrics.comm);
}
static void init(unsigned long long argp) { mfc_get(&spu_arguments, (unsigned) argp, sizeof(spu_arguments), 0, 0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); first_channel = spu_arguments.spu_id * NR_CHANNELS / NR_SPUS; last_channel = (spu_arguments.spu_id + 1) * NR_CHANNELS / NR_SPUS; for(int i=0; i<NR_STATIONS; i++) { samples_dma_list[i].size = sizeof(samples[0][0]); } if(spu_arguments.spu_id == 0) { printf("SPU sample dma size = %ld bytes\n", sizeof(samples[0][0])); printf("SPU in buffers = %ld KB @ %p, out buffers = %ld B @ %p\n", sizeof(samples) / 1024, samples, sizeof(visibilities), visibilities); } printf("I am spu %d, calculating channels %3d - %3d\n", spu_arguments.spu_id, first_channel, last_channel); }
/*
 * Load the convolution kernel and, if requested, transform it.
 *
 * fc        parameters; fc->elements complex floats are fetched from
 *           fc->ea_kernel
 * p_kernel  local buffer that receives the kernel
 * obj       FFT object handed to cml_ccfft1d_ip_f
 * buf       work buffer handed to cml_ccfft1d_ip_f
 *
 * The transfer uses tag 31. The tag mask is built from an unsigned
 * constant, because shifting a signed 1 into bit 31 is undefined behaviour.
 *
 * NOTE(review): the FFT runs on the global `coeff`, not on p_kernel -
 * presumably callers pass coeff as p_kernel; confirm.
 */
void initialize(
  Fastconv_params* fc,
  void* p_kernel,
  fft1d_f* obj,
  void* buf)
{
  unsigned int size = fc->elements*2*sizeof(float);

  // The kernel matches the input and output size
  mfc_get(p_kernel, fc->ea_kernel, size, 31, 0, 0);
  mfc_write_tag_mask(1u<<31);
  mfc_read_tag_status_all();

  if (fc->transform_kernel)
  {
    // Perform the forward FFT on the kernel, in place.  This only need
    // be done once -- subsequent calls will utilize the same kernel.
    cml_ccfft1d_ip_f(obj, (float*)coeff, CML_FFT_FWD, buf);
  }
}
/*
 * Blocking DMA get of `size` bytes from main-memory address `ea` into the
 * local buffer `ls`.
 *
 * Asserts that the local address, the main-memory address and the size are
 * all multiples of 16 bytes. A tag is taken from get_tagid() for the
 * transfer, waited on, and handed back with put_tagid().
 */
void spu_dma_get(volatile void *ls, unsigned long long ea, unsigned long size)
{
    int tagid;

    /* Both addresses must sit on 16B boundaries, and the amount brought in
     * must be a multiple of 16B. */
    assert(((unsigned long)ls & 15) == 0);
    assert((ea & 15) == 0);
    assert((size & 15) == 0);

    tagid = get_tagid();
    assert(tagid >= 0);

    mfc_get(ls, ea, size, tagid, 0, 0);
    wait_tagid(tagid);

    put_tagid(tagid);
}
/*
 * SPU entry point of the command-driven benchmark worker.
 *
 * Fetches `args` from argp, then serves commands read from the inbound
 * mailbox: copy, scale, add and triad run the matching kernel, any other
 * value is reported on stderr. After every command except EXIT a DONE
 * message is written to the outbound mailbox. EXIT ends the program with
 * return value 0.
 */
int main(ull id, ull argp, ull envp)
{
    unsigned int cmd;

    mfc_get(&args, argp, sizeof(args), TAG, 0, 0);
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();

    for (;;) {
        cmd = spu_read_in_mbox();

        if (unlikely(cmd == SPU2_MSG_PPU_TO_SPU_EXIT))
            return 0;

        switch (cmd) {
        case SPU2_MSG_PPU_TO_SPU_DO_COPY:
            copy();
            break;
        case SPU2_MSG_PPU_TO_SPU_DO_SCALE:
            scale();
            break;
        case SPU2_MSG_PPU_TO_SPU_DO_ADD:
            add();
            break;
        case SPU2_MSG_PPU_TO_SPU_DO_TRIAD:
            triad();
            break;
        default:
            fprintf(stderr, " [SPU]: Invalid command received in mailbox\n");
        }

        /* Acknowledge the command, valid or not. */
        spu_write_out_mbox(SPU2_MSG_SPU_TO_PPU_DONE);
    }
}
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; unsigned int i, num_chunks; mfc_list_element_t* dma_list_in; unsigned int tmp_addr; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* calculate the address of the local buffer where we can point the * dma_list_in pointer to */ tmp_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS)); dma_list_in = (mfc_list_element_t*) (tmp_addr); /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks of data * NUM_LIST_ELEMENTS at a time. Each list element is going to move CHUNK_SIZE * of data into system memory. Data is moved into local store, processed, and * written back to system memory NUM_LIST_ELEMENT chunks per loop iteration. 
*/ for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float)); /* fill the dma list with the appropriate lower 32bit effective address and size for * each dma list element. This dma list is used to gather the input data * from system memory */ fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); /* issue a DMA get list command to gather the NUM_LIST_ELEMENT chunks of data from system memory. * The data will be gathered into local buffer local_buffer_in */ mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA get list command to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS); /* fill the dma list with the appropriate lower 32 bit ea and size for each * dma list element. This dma list is used to scatter the output data to system memory */ fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); /* issue the DMA put list command to scatter the result from local memory to * different places in system memory */ mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA put list to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
void process_data_simd (float* buf_in, float* buf_out, unsigned int size) { unsigned int i; vector float *vbuf_in, *vbuf_out; vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f}; vbuf_in = (vector float*) buf_in; vbuf_out = (vector float*) buf_out; for (i = 0; i < (size / 4); i++) { vbuf_out[i] = spu_add (vbuf_in[i], v1); } } int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; int i, num_chunks; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks * and fetches one 'chunk' of data at a time, process it, and write * it back to system memory until done. 
*/ for (i = 0; i < num_chunks; i++) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i* CHUNK_SIZE * sizeof(float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof(float)); /* issue a DMA get command to fetch the chunk of data from system memory */ mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into * local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE); /* issue the DMA put command to transfer result from local memory to * system memory */ mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float), tag, 0, 0); /* wait for the DMA put to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<16 bytes) int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size) { btAssert(size<32); ATTRIBUTE_ALIGNED16(char tmpBuffer[32]); char* mainMem = (char*)ea; char* localStore = (char*)ls; uint32_t i; ///make sure last 4 bits are the same, for cellDmaSmallGet uint32_t last4BitsOffset = ea & 0x0f; char* tmpTarget = tmpBuffer + last4BitsOffset; #if defined (__SPU__) || defined (USE_LIBSPE2) int remainingSize = size; //#define FORCE_cellDmaUnalignedGet 1 #ifdef FORCE_cellDmaUnalignedGet cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0); #else char* remainingTmpTarget = tmpTarget; uint64_t remainingEa = ea; while (remainingSize) { switch (remainingSize) { case 1: case 2: case 4: case 8: case 16: { mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0); remainingSize=0; break; } default: { //spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize); int actualSize = 0; if (remainingSize > 16) actualSize = 16; else if (remainingSize >8) actualSize=8; else if (remainingSize >4) actualSize=4; else if (remainingSize >2) actualSize=2; mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0); remainingSize-=actualSize; remainingTmpTarget+=actualSize; remainingEa += actualSize; } } } #endif//FORCE_cellDmaUnalignedGet #else //copy into final destination #ifdef USE_MEMCPY memcpy(tmpTarget,mainMem,size); #else for ( i=0;i<size;i++) { tmpTarget[i] = mainMem[i]; } #endif //USE_MEMCPY #endif cellDmaWaitTagStatusAll(DMA_MASK(1)); //this is slowish, perhaps memcpy on SPU is smarter? for (i=0; btLikely( i<size );i++) { localStore[i] = tmpTarget[i]; } return 0; }
/*
 * Start the next particle preload.
 *
 * Waits for whatever is still outstanding on tag_preload, then queues a DMA
 * get of `size` bytes from main-memory address `ea` into the local buffer
 * `ls` on that same tag. The new transfer is not waited for here.
 */
void loop_preload_particle(volatile void *ls,
                           unsigned long long ea,
                           unsigned long size)
{
    wait_tagid(tag_preload);

    mfc_get(ls, ea, size, tag_preload, 0, 0);
}
int main(unsigned long long speid, addr64 argp, addr64 envp) { unsigned long long dummy; int l ; int p0, p1 ; int i1, i2, i3 ; int j1, j2, j3 ; dummy = envp.ull ; dummy = speid ; // get arguments mfc_get((void*)&args, argp.ull, 128, 31, 0, 0); waitfor_matrix_io ( 31 ); // printf("SPE(%lld): Data received is: %d %d %d %d\n", speid, (int)args.inA // , (int)args.out, (int)args.i_initial, (int)args.i_final ); // printf("SPE(%lld): Data received is: %x %x %d %d\n", speid, (int)args.inA // , (int)args.out, (int)args.i_initial, (int)args.i_final ); // printf("SPE(%lld): size= %d \n", speid, (int)args.out-(int)args.inA ); // fflush(stdout); if ( args.sortType == 0 ) { for( l=args.i_initial; l<args.i_final; l++ ) { getlarge( (void*)&darrayA0, (unsigned long)(args.inA)+(l*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); array0 = &darrayC0 ; array1 = &darrayA0 ; # ifdef NEVER { int j, k, k2 ; for(k=2; k<=bsize; k<<=1 ) { k2 = k/2 ; array2 = array0 ; array0 = array1 ; array1 = array2 ; for(j=0; j<bsize; j+=k ) { j1=j; j2=j+k2; j3=j; while ( j1<j+k2 && j2<j+k ) { if ( (*array0)[j1].key > (*array0)[j2].key ) { (*array1)[j3] = (*array0)[j1]; j3++; j1++; } else { (*array1)[j3] = (*array0)[j2]; j3++; j2++; } } while ( j1<j+k2 ) { (*array1)[j3] = (*array0)[j1]; j3++; j1++; } while ( j2<j+k ) { (*array1)[j3] = (*array0)[j2]; j3++; j2++; } } } } # else # ifdef NEVER array1 = phase1C(array0,array1,bsize); # else array1 = phase1(array0,array1); # endif # endif putlarge( (void*)&((*array1)[0]), (unsigned long)(args.out)+(l*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); } } else if ( args.sortType == 1 ) { arrayA = (unsigned long)args.out; arrayB = (unsigned long)args.inA; for( p0=1; p0<args.blocks; p0<<=1 ) { arrayC=arrayA; arrayA=arrayB; arrayB=arrayC; for( p1=args.i_initial; p1<args.i_final; p1+=(p0*2) ) { i1 = p1 ; i2 = p1+p0; i3 = p1; getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), 
bsize*sizeof(record), 31 ); array0 = &darrayA0 ; j1=0; getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); array1 = &darrayB0 ; j2=0; waitfor_matrix_io ( 31 ); array2 = &darrayC0 ; j3=0; while ( i1<(p1+p0) && i2<(p1+2*p0) ) { # ifdef NEVER if ( (*array0)[j1].key > (*array1)[j2].key ) { (*array2)[j3] = (*array0)[j1]; j3++; j1++; } else { (*array2)[j3] = (*array1)[j2]; j3++; j2++; } # else # ifdef NEVER phase22C(array0,array1,array2,&j1,&j2,&j3,bsize); # else phase22(array0,array1,array2,&j1,&j2,&j3,bsize); # endif # endif if ( j1>=bsize ) { i1++; if ( i1<(p1+p0) ) { getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j1=0; } } if ( j2>=bsize ) { i2++; if ( i2<(p1+2*p0) ) { getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j2=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } while ( i1<(p1+p0) ) { # ifdef NEVER (*array2)[j3] = (*array0)[j1]; j3++; j1++; # else # ifdef NEVER phase21C(array0,array2,&j1,&j3,bsize); # else phase21(array0,array2,&j1,&j3,bsize); # endif # endif if ( j1>=bsize ) { i1++; if ( i1<(p1+p0) ) { getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j1=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } while ( i2<(p1+2*p0) ) { # ifdef NEVER (*array2)[j3] = (*array1)[j2]; j3++; j2++; # else # ifdef NEVER phase21C(array1,array2,&j2,&j3,bsize); # else phase21(array1,array2,&j2,&j3,bsize); # endif # endif if ( j2>=bsize ) { i2++; if ( i2<(p1+2*p0) ) { getlarge( 
(void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j2=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } }} } else if ( args.sortType == 2 ) { arrayA = (unsigned long)args.inA; arrayB = (unsigned long)args.out; p0 = args.blocks/2 ; p1 = args.i_initial ; i1 = p1; i2 = p1+p0; i3 = p1; getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 ); array0 = &darrayA0 ; j1=0; getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); array1 = &darrayB0 ; j2=0; waitfor_matrix_io ( 31 ); array2 = &darrayC0 ; j3=0; while ( i1<(p1+p0) && i2<(p1+2*p0) ) { # ifdef NEVER if ( (*array0)[j1].key > (*array1)[j2].key ) { (*array2)[j3] = (*array0)[j1]; j3++; j1++; } else { (*array2)[j3] = (*array1)[j2]; j3++; j2++; } # else # ifdef NEVER phase22C(array0,array1,array2,&j1,&j2,&j3,bsize); # else phase22(array0,array1,array2,&j1,&j2,&j3,bsize); # endif # endif if ( j1>=bsize ) { i1++; if ( i1<(p1+p0) ) { getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j1=0; } } if ( j2>=bsize ) { i2++; if ( i2<(p1+2*p0) ) { getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j2=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } while ( i1<(p1+p0) ) { # ifdef NEVER (*array2)[j3] = (*array0)[j1]; j3++; j1++; # else # ifdef NEVER phase21C(array0,array2,&j1,&j3,bsize); # else phase21(array0,array2,&j1,&j3,bsize); # endif # endif if ( j1>=bsize ) { i1++; if ( i1<(p1+p0) ) { getlarge( 
(void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j1=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } while ( i2<(p1+2*p0) ) { # ifdef NEVER (*array2)[j3] = (*array1)[j2]; j3++; j2++; # else # ifdef NEVER phase21C(array1,array2,&j2,&j3,bsize); # else phase21(array1,array2,&j2,&j3,bsize); # endif # endif if ( j2>=bsize ) { i2++; if ( i2<(p1+2*p0) ) { getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j2=0; } } if ( j3>=bsize ) { if ( i3<(p1+2*p0) ) { putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 ); waitfor_matrix_io ( 31 ); j3=0; } i3++; } } } return 0; }