/*
 * Samples are non-contiguous in PPU memory, so they are gathered with a
 * single DMA list transfer rather than one plain DMA.
 *
 * For every station, the ea_low field of that station's list element is set
 * from ppu_samples[channel][station][time] (cast to unsigned); one mfc_getl
 * is then issued into samples[buffer], using `buffer` as the tag as well.
 * The element sizes are not written here - presumably they are initialised
 * elsewhere (TODO confirm).  The function does not wait for the transfer.
 *
 * The whole body is compiled only when DO_DMA is non-zero.
 */
static inline void load_samples(int channel, int time, int buffer)
{
#if DO_DMA
    ppu_samples_type *ppu_samples = (ppu_samples_type *) spu_arguments.ppu_samples;
    int stat = 0;

    while (stat < NR_STATIONS) {
        samples_dma_list[stat].ea_low = (unsigned) (*ppu_samples)[channel][stat][time];
        // printf("SPE sample for station %u, time %u = %x %u, size = %u\n", stat, time, samples_dma_list[stat].ea_low, samples_dma_list[stat].ea_low, samples_dma_list[stat].size);
        stat++;
    }

    mfc_getl(samples[buffer], 0, samples_dma_list,
             sizeof(struct spu_dma_list_elt) * NR_STATIONS, buffer, 0, 0);
#endif
}
/** * Fetches chemistry data from main memory */ void fetch_chem_buffer(uint32_t i, uint64_t ea_off) { uint32_t j; /* Build DMA lists */ //timer_start(&metrics.array_copy); clist[i].length = conc[i].length; for(j=0; j<clist[i].length; ++j) { clist[i].data[j].eal = conc[i].ea_base + ea_off + j*NX_ALIGNED_SIZE*NY; clist[i].data[j].size = 16; } //timer_stop(&metrics.array_copy); /* Fetch data */ mfc_getl(conc[i].data, conc[i].ea_base, clist[i].data, clist[i].length*sizeof(mfc_list_element_t), i, 0, 0); }
int main(unsigned long long speid __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { unsigned int tag; unsigned long long in_addr, out_addr; unsigned int i, num_chunks; mfc_list_element_t* dma_list_in; unsigned int tmp_addr; #ifdef USE_TIMER uint64_t start, time_working; spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih); spu_clock_start(); start = spu_clock_read(); #endif /* USE_TIMER */ /* First, we reserve a MFC tag for use */ tag = mfc_tag_reserve(); if (tag == MFC_TAG_INVALID) { printf ("SPU ERROR, unable to reserve tag\n"); return 1; } /* calculate the address of the local buffer where we can point the * dma_list_in pointer to */ tmp_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS)); dma_list_in = (mfc_list_element_t*) (tmp_addr); /* issue DMA transfer to get the control block information from * system memory */ mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0); /* wait for the DMA get to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* calculate the number of blocks (chunks) that this spe is assigned * to process */ num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE; /* * This is the main loop. We basically goes through the num_chunks of data * NUM_LIST_ELEMENTS at a time. Each list element is going to move CHUNK_SIZE * of data into system memory. Data is moved into local store, processed, and * written back to system memory NUM_LIST_ELEMENT chunks per loop iteration. 
*/ for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS) { /* set the in_addr and out_addr variables, we will use these for * issuing DMA get and put commands */ in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float)); out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float)); /* fill the dma list with the appropriate lower 32bit effective address and size for * each dma list element. This dma list is used to gather the input data * from system memory */ fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); /* issue a DMA get list command to gather the NUM_LIST_ELEMENT chunks of data from system memory. * The data will be gathered into local buffer local_buffer_in */ mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA get list command to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); /* invoke process_data to work on the data that's just been moved into local store*/ process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS); /* fill the dma list with the appropriate lower 32 bit ea and size for each * dma list element. This dma list is used to scatter the output data to system memory */ fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); /* issue the DMA put list command to scatter the result from local memory to * different places in system memory */ mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0); /* wait for the DMA put list to complete */ mfc_write_tag_mask (1 << tag); mfc_read_tag_status_all (); } #ifdef USE_TIMER time_working = (spu_clock_read() - start); spu_clock_stop(); printf ("SPE time_working = %lld\n", time_working); #endif /* USE_TIMER */ return 0; }
/** * Fetches z data from main memory */ void fetch_z_buffer(uint32_t i, uint64_t ea_off) { uint32_t j, x; /* Build DMA lists */ //timer_start(&metrics.array_copy); #define UNROLL_ELEMENT \ clist[i].data[j].eal = conc[i].ea_base + ea_off + j*NX_ALIGNED_SIZE*NY; \ wlist[i].data[j].eal = wind[i].ea_base + ea_off + j*NX_ALIGNED_SIZE*NY; \ dlist[i].data[j].eal = diff[i].ea_base + ea_off + j*NX_ALIGNED_SIZE*NY; \ clist[i].data[j].size = 16; \ wlist[i].data[j].size = 16; \ dlist[i].data[j].size = 16 j=0; x = clist[i].length; while(x > 8) { UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; x -= 8; } while(x > 4) { UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; UNROLL_ELEMENT; ++j; x -= 4; } while(x > 0) { UNROLL_ELEMENT; ++j; --x; } #undef UNROLL_ELEMENT //timer_stop(&metrics.array_copy); /* Fetch data */ mfc_getlb(conc[i].data, conc[i].ea_base, clist[i].data, clist[i].length*sizeof(mfc_list_element_t), i, 0, 0); mfc_getl(wind[i].data, wind[i].ea_base, wlist[i].data, wlist[i].length*sizeof(mfc_list_element_t), i, 0, 0); mfc_getl(diff[i].data, diff[i].ea_base, dlist[i].data, dlist[i].length*sizeof(mfc_list_element_t), i, 0, 0); }