/* * The argv argument will be populated with the address that the PPE provided, * from the 4th argument to spe_context_run() */ int main(uint64_t speid, uint64_t argv, uint64_t envp) { struct spe_args args __attribute__((aligned(SPE_ALIGN))); mfc_get(&args, argv, sizeof(args), 0, 0, 0); mfc_write_tag_mask(1 << 0); mfc_read_tag_status_all(); cmap_calls = 0; dma_puts = 0; spu_write_decrementer(-1); // Run multiple renders with offsets. Should be factored into render_fractal() render_fractal(&args.fractal, args.thread_idx, args.n_threads, 0.); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 7 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 5 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 2); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta * 3 / 8); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 4); render_fractal(&args.fractal, args.thread_idx, args.n_threads, args.fractal.delta / 8); // Send remaining points if(fill%2048) { // select the last buffer used int f = fill / 2048; mfc_put(&points[f*2048], (uint)args.fractal.pointbuf[f], 16384, 0, 0, 0); // Block for completion mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // Send a message with top bit set to indicate final item spu_write_out_intr_mbox((1<<31)|f); // Send another message indicating count spu_write_out_intr_mbox(fill%2048); ++dma_puts; } // Report some stats uint ticks = -1 - spu_read_decrementer(); printf("cmap calls %d ticks %u calls/tick %f\n", cmap_calls, ticks, (double)cmap_calls/ticks ); printf("dma puts %d\n", dma_puts); return 0; }
/*
 * Stamp a log entry and DMA it out to the log area at log_base_ea.
 *
 * Does nothing when log_base_ea is 0. Otherwise the entry gets the next
 * sequence number and the current decrementer value, is copied into one of
 * the two tmp_buffer slots, and is sent with mfc_put on that slot's tag
 * (log_tags + slot index). If the chosen slot has been used for an earlier
 * put, that tag is waited on first so the slot is not overwritten while a
 * transfer may still be in flight.
 *
 * entry is passed by value; the caller's copy is not modified.
 */
void _gc_log_write(gc_log_entry_t entry)
{
	if (log_base_ea == 0) {
		return;
	}

	entry.seqno = log_seqno++;
	entry.timestamp = spu_read_decrementer();

	/* Busy-mask bit and DMA tag belonging to the slot about to be used. */
	const unsigned int slot_bit = 1u << tmp_buffer_idx;
	const unsigned int slot_tag = log_tags + tmp_buffer_idx;

	/* Drain the earlier transfer from this slot, if any, before reusing it. */
	if (tmp_buffer_busy & slot_bit) {
		mfc_write_tag_mask(1 << slot_tag);
		mfc_read_tag_status_all();
	}

	/* Keep a local copy for the DMA engine to read from. */
	tmp_buffer[tmp_buffer_idx] = entry;
	mfc_put(&tmp_buffer[tmp_buffer_idx],
	        log_base_ea + log_idx * sizeof(entry),
	        sizeof(entry),
	        slot_tag, 0, 0);
	tmp_buffer_busy |= slot_bit;

	/* Switch to the other slot and advance the log position, wrapped by the mask. */
	tmp_buffer_idx ^= 0x1;
	log_idx = (log_idx + 1) & log_idx_mask;
}
int main(unsigned long long spe_id, unsigned long long argp, unsigned long long envp) { #ifdef TRACE_TIME spu_write_decrementer(0); #endif setup_spu(argp); //unsigned int prod_ticks = 0; // Merge num_active_mergers = spu_ctrlblock.num_mergers; while(num_active_mergers > 0){ if(md[am].done){ am = (am+1) % spu_ctrlblock.num_mergers; continue; } // Check DMA completion if leaf node or extern parent if(mcb[am].leaf_node){ check_pull_dma(LEFT); check_pull_dma(RIGHT); if(md[am].num_waiting[LEFT] + md[am].num_waiting[RIGHT]){ am = (am+1) % spu_ctrlblock.num_mergers; continue; } } if(mcb[am].local[OUT] == 255){ check_push_dma(); if(md[am].held_tag[OUT] < 32){ am = (am+1) % spu_ctrlblock.num_mergers; continue; } } if(md[am].done){ am = (am+1) % spu_ctrlblock.num_mergers; continue; } // Produce #ifdef TRACE_TIME dec_val = spu_read_decrementer(); #endif if(md[am].depleted[LEFT] && !md[am].depleted[RIGHT]){ cp_buffer(RIGHT); } else if(md[am].depleted[RIGHT] && !md[am].depleted[LEFT]){ cp_buffer(LEFT); } else { merge_buffers(); } #ifdef TRACE_TIME merge_ticks += -(spu_read_decrementer() - dec_val); #endif // Push, if parent not local if(mcb[am].local[OUT] == 255) push(); // Pull if leaf node if(mcb[am].leaf_node){ pull(LEFT); pull(RIGHT); } am = (am+1) % spu_ctrlblock.num_mergers; } #ifdef TRACE_TIME float tot = -spu_read_decrementer() / (79.8*1000); float sort = sort_v_ticks / (79.8*1000); float merge = merge_ticks / (79.8*1000); float merge_loop = merge_loop_ticks / (79.8*1000); printf("SPU%d: Total %fms, merge %fms, inner loop %fms, sort %fms\n", spu_ctrlblock.spu_id,tot, merge, merge_loop, sort); #endif return 0; }
/*
 * Merge step for the currently active merger (global index `am`).
 *
 * Repeatedly takes the current LEFT and RIGHT input vectors, writes one
 * vector to md[am].buffer[OUT] and leaves the other four values in the
 * input buffer of the side that was not consumed, for as long as both
 * inputs hold data and the output has free space. Afterwards the
 * per-side consumed counters, the depleted flags and (for mergers with a
 * local parent) the done flag and num_active_mergers are updated.
 *
 * Takes no arguments and returns nothing; all state is reached through the
 * globals md, mcb, am and num_active_mergers.
 */
void merge_buffers(){
  vector unsigned int cmp_v, cmp_v2;
  // per-lane decrement vectors for avail_v: lane 0 = LEFT, lane 1 = RIGHT, lane 2 = OUT
  const vector signed int one_at_0 = {1,0,0,0};
  const vector signed int one_at_1 = {0,1,0,0};
  const vector signed int one_at_2 = {0,0,1,0};
  const vector signed int ones = {1,1,1,1};
  const vector signed int zeros = {0,0,0,0};
  // every byte index is 31: used below with cmp_v as both operands to
  // replicate one byte of the compare result into the whole vector
  const vector unsigned char cmp_v_shuffle_mask = {31,31,31,31, 31,31,31,31, 31,31,31,31, 31,31,31,31};
  vector unsigned char rev_mask;
  // shuffle patterns that reverse the word order of the first (rev_left,
  // bytes 0-15) or second (rev_right, bytes 16-31) shuffle operand
  const vector unsigned char rev_left = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
  const vector unsigned char rev_right = {28,29,30,31, 24,25,26,27, 20,21,22,23, 16,17,18,19};

  // Head index to advance for produced data: the parent's input-side head
  // when the parent is local (local[OUT] < 255), otherwise our own OUT head.
  // NOTE(review): the data itself always goes to md[am].buffer[OUT];
  // presumably that refers to the parent's buffer in the local case - confirm.
  vector signed int *out_head_idx;
  if(mcb[am].local[OUT] < 255){
    int parent_idx = mcb[am].local[OUT];
    int side = (mcb[am].id+1)&1;
    out_head_idx = (vector signed int*) &md[parent_idx].idx[side][HEAD];
  } else {
    out_head_idx = (vector signed int*) &md[am].idx[OUT][HEAD];
  }

  // The indices are accessed as whole vectors; lane 0 is the value read back below.
  vector signed int *left_tail_idx = (vector signed int*) &md[am].idx[LEFT][TAIL];
  vector signed int *right_tail_idx = (vector signed int*) &md[am].idx[RIGHT][TAIL];

  vector signed int size_v = {mcb[am].buffer_size[LEFT], mcb[am].buffer_size[RIGHT], mcb[am].buffer_size[OUT], 0};
  // lane 3 is a constant 1, so it never blocks the "all lanes > 0" test
  vector signed int avail_v = {num_in_buffer(LEFT), num_in_buffer(RIGHT), num_free_in_buffer(OUT), 1};
  // snapshot of the input counts, used after the loop to compute what was consumed
  vector signed int avail_before = { spu_extract(avail_v, 0), spu_extract(avail_v, 1), 0, 0 };
  vector unsigned int avail = spu_gather( spu_cmpgt(avail_v, zeros) ); // avail = 0x0F if all avail_v > zeros

  vector signed int *left, *right, *out;
  left = (vector signed int*) &md[am].buffer[LEFT][ spu_extract(*left_tail_idx,0) ];
  right = (vector signed int*) &md[am].buffer[RIGHT][ spu_extract(*right_tail_idx,0) ];
  out = (vector signed int*) &md[am].buffer[OUT][ spu_extract(*out_head_idx,0) ];

#ifdef TRACE_TIME
  dec_val2 = spu_read_decrementer();
#endif

  while(spu_extract(avail,0) == 0x0F){
    // cmp left and right to determine who gets eaten
    cmp_v = spu_cmpgt(*left,*right);
    cmp_v = spu_shuffle(cmp_v, cmp_v, cmp_v_shuffle_mask);
    // cmp_v = {FFFF,FFFF,FFFF,FFFF} if left[3] > right[3]

    // out gets the vector with the smaller last element; *left gets the
    // other vector with its word order reversed
    *out = spu_sel(*left,*right,cmp_v);
    rev_mask = spu_sel(rev_right,rev_left,(vector unsigned char)cmp_v);
    *left = spu_shuffle(*left,*right,rev_mask);
    // data to be sorted is now in out and left, left in descending order
    sort_vectors(out,left);

    // update index of the used side
    if( spu_extract(cmp_v,0) ){ // left[3] > right[3]
      // the RIGHT vector was consumed; the leftover values stay in *left
      *right_tail_idx = spu_add(*right_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_1);
      right++;
      // modulus hack
      // NOTE(review): lane 0 of cmp_v2 compares the RIGHT tail against
      // size_v lane 0, i.e. buffer_size[LEFT], not buffer_size[RIGHT] -
      // confirm the sizes are always equal or that this is intended.
      cmp_v2 = spu_cmpeq(*right_tail_idx, size_v);
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){
        *right_tail_idx = zeros;
        right = (vector signed int*) &md[am].buffer[RIGHT][0];
      }
    } else {
      // the LEFT vector was consumed; move the leftover values into the
      // current RIGHT slot before advancing LEFT
      *right = *left;
      *left_tail_idx = spu_add(*left_tail_idx,ones);
      avail_v = spu_sub(avail_v, one_at_0);
      left++;
      // modulus hack
      cmp_v2 = spu_cmpeq(*left_tail_idx, size_v);
      if( __builtin_expect( spu_extract(cmp_v2,0) ,0) ){
        *left_tail_idx = zeros;
        left = (vector signed int*) &md[am].buffer[LEFT][0];
      }
    }

    // update out head idx
    *out_head_idx = spu_add(*out_head_idx,ones);
    avail_v = spu_sub(avail_v, one_at_2);
    out++;
    // modulus hack
    // NOTE(review): as above, lane 0 compares the OUT head against
    // buffer_size[LEFT] rather than buffer_size[OUT] - confirm.
    cmp_v2 = spu_cmpeq(*out_head_idx, size_v);
    if( __builtin_expect(spu_extract(cmp_v2,0),0) ){
      out = (vector signed int*) &md[am].buffer[OUT][0];
      *out_head_idx = zeros;
    }

    // is there data still available?
    avail = spu_gather(spu_cmpgt(avail_v, zeros));
  }

#ifdef TRACE_TIME
  merge_loop_ticks += -(spu_read_decrementer() - dec_val2);
#endif

  // how much got produced?
  // (lanes 0 and 1 of the difference are the vectors consumed from LEFT and RIGHT)
  vector signed int consumed = spu_sub(avail_before, avail_v);
  int consumed_left = spu_extract(consumed, 0);
  int consumed_right = spu_extract(consumed, 1);

  if(consumed_left) update_tail(LEFT);
  if(consumed_right) update_tail(RIGHT);

  md[am].consumed[LEFT] += consumed_left;
  md[am].consumed[RIGHT] += consumed_right;

  // a side is depleted once everything in its data_size has been consumed
  if(md[am].consumed[LEFT] == mcb[am].data_size[LEFT]) md[am].depleted[LEFT] = 1;
  if(md[am].consumed[RIGHT] == mcb[am].data_size[RIGHT]) md[am].depleted[RIGHT] = 1;

  // only mergers with a local parent are retired here
  if(mcb[am].local[OUT] < 255 && md[am].depleted[LEFT] && md[am].depleted[RIGHT]){
    md[am].done = 1;
    --num_active_mergers;
  }
}