static s32 dma_FPGAmem2ARMmem_use_acp(void) { s32 ret = ALT_E_SUCCESS; /* for DMAC PC, not cache-able */ ALT_DMA_PROGRAM_t *program = (ALT_DMA_PROGRAM_t *)AMP_SHARE_DMABUF_START; //u32 dma_phy = AMPPHY_START + 0x200000; //physical address: 0x1e200000 u32 src_phy = FPGA_SDRAM_PHYS_BASE; //source physical address for dma : offset: 0 u32 dst_phy = (SH_MEM_START + 0x200000); // destination physical address for dma: offset 0x200000 //source virtual address for mpu, cache-able, WBWA shareable u8 * src_virt = (u8 *)phy_to_virt(src_phy); // desination virtual address for mpu, cache-able, WBWA shareable u8 * dst_virt = (u8 *)phy_to_virt(dst_phy); int i, size = 0x4000; gDMAtestIrqInfoChan0.src_mpu = src_virt; gDMAtestIrqInfoChan0.dst_mpu = dst_virt; gDMAtestIrqInfoChan0.size = size; dma_use_acp_init_fpga(ALT_DMA_CHANNEL_0); memset(program, 0x0, sizeof(ALT_DMA_PROGRAM_t)); ret = alt_dma_channel_alloc(ALT_DMA_CHANNEL_0); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma_channel_alloc error!\n"); return ret; } /* init the source memory and the destination memory for dma using ACP */ for(i = 0; i < size; i++) { src_virt[i] = i + 1; dst_virt[i] = 0x0; } dma_init_interrupt_channel0(); /* Init the dma engine's micro code PC */ program->program_phy = virt_to_phy((u32)(program->program)); ret = alt_dma_memory_to_memory(ALT_DMA_CHANNEL_0, program, (void *)(dst_phy|(0x1 << 31)), (const void *)(src_phy&(~(0x1 << 30))), size, true, ALT_DMA_EVENT_0, 1); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma memory_to_memory acp failed!\n"); return ret; } return ret; }
static int dma_ARMmem2FPGAmem(void) { s32 ret = ALT_E_SUCCESS; /* for DMAC PC, not cache-able */ ALT_DMA_PROGRAM_t *program = (ALT_DMA_PROGRAM_t *)AMP_SHARE_DMABUF_START; u32 dma_phy = AMPPHY_START + AMPMAP_SIZE - DMABUF_SIZE; u32 src_phy = (dma_phy + 0x100000);//offset: 1MB u32 dst_phy = FPGA_SDRAM_PHYS_BASE + (FPGA_SDRAM_SIZE/2); //non-cacherable, bufferable src u8 * src_virt = (u8 *)phy_to_virt(src_phy); //non-cacherable, bufferable dest u8 * dst_virt = (u8 *)phy_to_virt(dst_phy); int i, size = 0x4000; gDMAtestIrqInfoChan0.src_mpu = src_virt; gDMAtestIrqInfoChan0.dst_mpu = dst_virt; gDMAtestIrqInfoChan0.size = size; memset(program, 0x0, sizeof(ALT_DMA_PROGRAM_t)); ret = alt_dma_channel_alloc(ALT_DMA_CHANNEL_0); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma_channel_alloc error!\n"); return ret; } for(i = 0; i < size; i++) { src_virt[i] = i + 1; dst_virt[i] = 0x0; } dma_init_interrupt_channel0(); program->program_phy = virt_to_phy((u32)(program->program)); ret = alt_dma_memory_to_memory(ALT_DMA_CHANNEL_0, program, (void *)dst_phy, (const void *)src_phy, size, true, ALT_DMA_EVENT_0, 0); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma memory_to_memory failed!\n"); return ret; } return ret; }
void gic_sgi8_handler (void *p_arg) { /* ************************************************************************ ***** we are in SVC mode and interrupt enable in the ISR_Handler **************** ***** then we can service other high priority IRQ when do current ISR_Handler ****** ************************************************************************ */ /********** ISR Top Half begin ************/ /***************************************/ /* place ur emerency TH bussiness here ******/ /********** ISR Top Half begin ************/ gCacheCoherence ++; if(gCacheCoherence == 2) { bmlog("lock_code_data_section_to_L2=0x%08x\n", lock_code_data_section_to_L2); memcpy((void *)AMP_SHARE_DMABUF_START, (void *)SDRAM_PHYS_BASE, 0x20000); /* ******************************************************************************** * lock code & data section to L2 cache function need to be run in non-cache memory region *** ******************************************************************************** */ __asm__ __volatile__("dsb\n"); l2lock = (l2lock_fn)(((u32)lock_code_data_section_to_L2) + (AMP_SHARE_DMABUF_START - AMPMAP_START)); l2lock(SDRAM_PHYS_BASE, 0x20000); ACCESS_ONCE(*(u32 *)(AMP_SHARE_DMABUF_START + 0x100000)) = (('L'<<24) | ('O'<<16) | ('C'<<8)| 'K'); } /********** Set Flags to indecate while{} loop block do the Buttom Half ************/ }
static void ann_write_bmlog(char *path) { char *ptr, board[STRLEN]; if ((ptr = strstr(path, "groups/")) != NULL) ann_get_board(ptr, board, sizeof(board)); if (board[0] != '\0') bmlog(getCurrentUser()->userid, board, 13, 1); }
static s32 check_dma_memory_result(u8 *src, u8 *dst, int size) { while(size && size--) { if(*src != *dst) break; src++; dst++; } if(size) { bmlog("src =0x%p, *src=%u; dst = 0x%p, *dst = %u;size=%d\n", src, *src, dst, *dst, size); return ALT_E_ERROR; } else bmlog("DMA test memory(0x%x) to memory(0x%x) is ok!\n", virt_to_phy((u32)src), virt_to_phy((u32)dst)); return ALT_E_SUCCESS; }
static s32 dma_mem2mem_use_acp_done(void) { s32 ret = ALT_E_SUCCESS; ret = alt_dma_channel_free(ALT_DMA_CHANNEL_0); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma free channel error!\n"); return ret; } memset((void *)&gDMAtestIrqInfoChan0, 0x0, sizeof(gDMAtestIrqInfoChan0)); ACCESS_ONCE(gDMAtestDone) = 0; return ret; }
static int dma_init(void) { s32 ret; ALT_DMA_CFG_t cfg; init_dma_cfg(&cfg); ret = alt_dma_init(&cfg); if(ret != ALT_E_SUCCESS) { bmlog("ERROR: dma_init error!\n"); ret = ALT_E_ERROR; } return ALT_E_SUCCESS; }
int main (void) { u32 count = 0; s32 i, j, k = 0; u32 max, sum; volatile u32 tmp; u32 *p; char *pstr; p = (u32 *)(DRAM_PHYS_START + 0x6000); u32 boot; u32 load_start, load_end; u32 boot_start, boot_end; u32 bm_load_duration_ns, bm_load_duration_ms; /* get the timestamp of the osc timer1 */ boot_end = readl((const volatile void *)SOCFPGA_OSC1TIMER1_ADDRESS + 0x4); InitPeripheral(); cpu_local_irq_disable(); /* asp is cleared by the preloader , and preloader will save * qspi probe, read function in it. Please don't clear * we reuse the preloader qspi driver at here, so we save * the probe and read function pointer into asp. * it at here. */ //memset(asp, 0, sizeof(struct amp_share_param)); asp->bm_magic = ('b' << 8) | 'm'; /**** interrupt initialize*******/ gic_Int_init(); /**** hook ISR callback ******/ gic_sgi_init(); #ifdef NEED_SAVE_RESTORE_GIC_REGS_FROM_BM gic_dist_save(); #endif /**** interrupt ready *****/ cpu_local_irq_enable(); UART_DEBUG("start bm...... %x\r\n", p); UART_DEBUG("++start bm...... 12345\r\n"); bmlog("start bm......%x\r\n", p); UART_DEBUG("start bm...... %x\r\n", 0x55aa); UART_DEBUG("--start bm...... 67890\r\n"); /* auto detect the boot mode */ boot = bm_get_boot_mode(); if((BOOTSEL_MODE_SD_1_8V == boot) || (BOOTSEL_MODE_SD_3_3V == boot)) { if(!bm_sd_load_rbf()) bmlog("load fpga rbf is ok!\n"); } else if((BOOTSET_MODE_QSPI_1_8V == boot) || (BOOTSET_MODE_QSPI_3_3V == boot)) { if(!bm_qspi_load_rbf()) bmlog("load fpga rbf is ok!\n"); } /* osc_timer1 is running at 25MHz, 40ns per cycle */ asp->boot_end_stamp = boot_end; boot_start = asp->boot_start_stamp; bm_load_duration_ns = (boot_start - boot_end) * 40; bm_load_duration_ms = bm_load_duration_ns/1000/1000; bmlog("start[0x%x],end[%x]\n, bm_load_ns = %u(ns), bm_load_ms= %u(ms)\n", boot_start, boot_end, bm_load_duration_ns, bm_load_duration_ms ); load_start = asp->load_bm_start; load_end = asp->load_bm_end; bmlog("real loading bm time duration is %u(ms)(including qspi/sd init)\n", load_end - load_start); if(fpgamgr_program_fpga((const unsigned long *)FPGA_RBF_BASE, FPGA_RBF_SIZE) < 0) UART_DEBUG("config fpga failed!\r\n"); else { UART_DEBUG("config fpga OK!\r\n"); writel(('R'<<16)|('B'<<8)|('F'<<0),&(asp->preloader_wait_bm_load_rbf)); } pstr = (char *)FPGA_SDRAM_PHYS_BASE; sprintf(pstr, "%s\n", "i am from fpga ddr ram"); bmlog("%s", pstr); pstr = (char *)FPGA_SRAM_PHYS_BASE; sprintf(pstr, "%s\n", "i am from fpga sram"); bmlog("%s", pstr); #ifdef LCD1602_DISP IIC_InitIp(); LCD_SetCursor(0); sprintf(cDispBuf[0], "fpga config done!"); IIC_EXfer(LCD_ADDR, cDispBuf[0], strlen(cDispBuf[0])>15?16:strlen(cDispBuf[0])); #endif while(!gCacheCoherence) { /** led blink for hand shake debug with linux **/ k++; if((k&0x3FFFF) == 0) LED27_BLINK();//*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12); } /* ************************************************************** **** cause Linux peer use 2GB user space/2GB kernel space split ******* **** so it is 2048 L1 entry here, if use 3GB user/1 GB space split ******** **** it woule be 1024 here, and p should adjust to 0x7000 yejc ******** ************************************************************** */ for(j = 0; j < 2048-16; j++) //16MB for IO map space PageTable[j+2048] = *p++; /* ************************************************************** **** time to bring bm core to smp cache coherence environmence ******* ************************************************************** */ __asm__ __volatile__("dsb\n" "isb\n" "mrc p15, 0, r1, c1, c0, 0\n" "ldr r2, =0x40180d\n" "orr r1, r1, r2\n" "mcr p15, 0, r1, c1, c0, 0\n" "dsb\n" "isb\n" : : :"memory", "cc"); bmlog("L1 L2 cache enabled, SCU enabled~~~\n"); //asp = 0xFC700000; /* ******************************************************************************** ** from this point on, you can free to invoke the linux kernel space text function from BM, ***** ** only if the function would not cause the shceduler to action, or the CPU of BM would trap ***** ** in the cpu_idle(clone from linux cpu core) process if we can't not obtain the lock/mutex ***** ** /semaphore **** ******************************************************************************** */ //printk = (printk_fn)asp->sta.printk_fn; _raw_spinlock = (raw_spinlock_fn)asp->sta.spinlock_lock_fn; _raw_spinunlock = (raw_spinlock_fn)asp->sta.spinlock_unlock_fn; /* semaphore */ down_trylock = (down_trylock_fn)asp->sta._down_trylock_fn; while(1) { if(sgi15task_pending) { if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 0) { /**************************************************************************************/ /**** place interrupt here test worse case interrupt latency would be more accuracy ***/ /**** casue we not only take care of 10000 interrupts/second from FPGA, but also ***/ /**** suffer more than 6000 extra interrupts/second from IPI and offen L1 data cahce miss **/ /**************************************************************************************/ #ifdef TEST_IL_MORE_ACCURACY if(testIL) { bmlog("\n________________________________________________________________________________\n"); bmlog("Now the BM CPU would suffer more than 16000 interrupts/second(10000i/s from\n"); bmlog("FPGA_IRQ0 req(100us),more than 6000i/s from IPI(Inner Process Interrupt), and\n"); bmlog("process several Gbps data per second, they are all concurrently, very intensive\n"); bmlog("load for BM CPU core!\n"); bmlog("__________________________________________________________________________________\n"); testIL = 0; gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0 gic_Int_clr (GIC_PFGA0); pvt_init(1); fire_fpga_irq(); pvt_start(); gic_Int_en (GIC_PFGA0); } #endif /************************************************************************/ //p = (unsigned int *)0x1E200000; p = (unsigned int *)0xFC800000; for(i = 0; i < (DRAM_BUF_SIZE/4); i++) { if(*p != CPU_DATA_PATERN0) bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p); p++; } //bmlog("check buf from linux finish!\n"); //p = (unsigned int *)0x1E200000; p = (unsigned int *)0xFC800000; memset_int(p, CPU_DATA_PATERN3, DRAM_BUF_SIZE); //now u can mesure blink frequency on hps LED3 and multiply 2 to calculate the interrupte frequency //and then you can evaulate how fast the cpu do 2 times DRAM_SIZE write and 2 times DRAM_SIZE read //you sholud know that cpu spend must time to iter, compare in the read "for loop" and BM do interrupt //handler&cpu mode switch Linux handle system tick&sgi intrrupt and sched task, so the actual memory system //band width is greater than this evaluate value //be aware 65536B is bigger than L1 Dcache but little than L2 Dcache //evaluate data process speed in our case is: //1842Hz * 2toggle * 2w * 2r&check * DRAM_SIZE(65536B) = 965738496B/s = 7.73Gbps //toggle hps led //*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12); LED27_BLINK(); sgi15task_pending = 0; gic_raise_interrupt(CPU0, GIC_SGI13); } else if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 1) { p = (unsigned int *)0xfe700000; for(i = 0; i < (SRAM_BUF_SIZE/4); i++) { if(*p != CPU_DATA_PATERN0) bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p); p++; } //bmlog("check buf from linux finish!\n"); p = (unsigned int *)0xfe700000; memset_int(p, CPU_DATA_PATERN3, SRAM_BUF_SIZE); //now u can mesure blink frequency on hps LED2 and multiply 2 to calculate the interrupte frequency //and then you can evaulate how fast the cpu do 2 times SRAM_SIZE write and 2 times SRAM_SIZE read //you sholud know that cpu spend must time to iter, compare in the read "for loop" and BM do interrupt //handler&cpu mode switch Linux handle system tick&sgi intrrupt and sched task, so the actual memory system //band width is greater than this evaluate value //be aware 32768B is bigger than L1 Dcache but little than L2 Dcache //evaluate data process speed in our case is: //3230Hz * 2toggle * 2w * 2r&check * SRAM_SIZE(32768B) = 846725120B/s = 6.77Gbps //little then DRAM cause much more interrupt overhead //toggle hps led //*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x2<<12); LED28_BLINK(); sgi15task_pending = 0; gic_raise_interrupt(CPU0, GIC_SGI13); } //bmlog("CPU1#%04d:msg from cpu1 call~~~~~~~~~~~~~~~~\n", i); } if(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args == 2) { for(i = 0; i < SPINLOCK_TEST_COUNT; i++) { _raw_spinlock(&asp->rslocks[0]); tmp = asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status; //dummy j++ j++; asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status += 2; if((asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status - tmp) != 2) bmlog("BM:spinlock test failed!\n"); _raw_spinunlock(&asp->rslocks[0]); //dummy operation on j++ simulate the actual scenario to give another cpu chance //to take lock, reduce starvation situation j++; } //bmlog("\nBM spinlock test:%d\n", tmp + 2); bmlog("\n----------------------------\n"); bmlog("BM spinlock test:%d\n", tmp + 2); bmlog("----------------------------\n"); /*************************************************************************************/ /**** place interrupt test here test cpu data process speed would be more accuracy ***/ /*************************************************************************************/ #ifdef TEST_DATA_PROCESS_SPEED_MORE_ACCURACY gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0 gic_Int_clr (GIC_PFGA0); pvt_init(1); fire_fpga_irq(); pvt_start(); gic_Int_en (GIC_PFGA0); #endif /************************************************************************/ while(!ACCESS_ONCE(gINTtestDone)); ACCESS_ONCE(gINTtestDone) = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) { p_pvt[i] = p_pvt[i] - p_pvt[i + 1]; //delta t of pvt, be careful pvt counter overlap! } for(i = 0; i < IL_TEST_COUNT - 1; i++) { if(p_pvt[i] > PVT_100US_CYCLE) p_pvt[i] -= PVT_100US_CYCLE; /* pvt 10ns resolution, 10000 * 10ns = 100us*/ //IL(interrupt latency jitter) else p_pvt[i] = PVT_100US_CYCLE - p_pvt[i]; } max = p_pvt[0]; k = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) if(p_pvt[i] > max) { max = p_pvt[i]; k = i; } max *= 10; bmlog("\n------------------------------------------------------------------------------------------\n"); //bmlog("\ninterrupt latency test method 1(use private timer)\n"); bmlog("interrupt latency test method 1(use private timer)\n"); //bmlog("max interrupt latency jitter: %d ns\n", max); bmlog("max interrupt latency jitter: %d ns\n", max); sum = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) { sum += p_pvt[i]; //be carefule sum overflow } sum /= (IL_TEST_COUNT - 1); sum *= 10; //bmlog("average interrupt latency jitter: %d ns\n", sum); bmlog("average interrupt latency jitter: %d ns\n", sum); //90ns is measure from oscilloscope, see AMP Reference Design for detail bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster)\n", max + 90); if(max>410) bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster),max_index=%d\n", max + 90, k); bmlog("average interrupt latency: %d ns(use private timer@CPU core cluster)\n", sum + 90); bmlog("-----------------------------------------------------------------------------------------------\n"); bmlog("\n-----------------------------------------------------------------------------------------------\n"); //bmlog("\nfpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", gFPGA_IRQ_req, gARM_IRQ_ack, gFPGA_IRQ_req - gARM_IRQ_ack); bmlog("fpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", ACCESS_ONCE(gFPGA_IRQ_req), ACCESS_ONCE(gARM_IRQ_ack), ACCESS_ONCE(gFPGA_IRQ_req) - ACCESS_ONCE(gARM_IRQ_ack)); bmlog("-------------------------------------------------------------------------------------------------\n"); testIL = 1; #ifdef DMA_AND_ACP_TEST dma_init(); //DMAC_regs_dump(0); dma_mem2mem();//new for test //DMAC_regs_dump(0); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); //DMAC_regs_dump(0); bmlog("DMA test done\n"); bmlog("-----------------------------------------------\n\n"); dma_mem2mem_use_acp(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_use_acp_done(); bmlog("DMA test acp done\n"); bmlog("-----------------------------------------------\n\n"); dma_ARMmem2FPGAmem(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); bmlog("dma_ARMmem2FPGAmem test done\n"); bmlog("-----------------------------------------------\n\n"); dma_FPGAmem2ARMmem_use_acp(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); bmlog("dma_FPGAmem2ARMmem_use_acp test done\n"); bmlog("-----------------------------------------------\n\n"); #endif count++; #ifdef LCD1602_DISP LCD_SetCursor(1); memset(cDispBuf[1], ' ', 16); sprintf(cDispBuf[1], "test:%d", count); IIC_EXfer(LCD_ADDR, cDispBuf[1], 16); #endif asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args = -1; //tell linux interrupt latency and dma test done! } //dummy k++ and toggle hps led1, waste cpu time.. //indicate bm activity, u can measure the toggle freqency to evalute how much time spend per loop //loop cycle = 1s /(toggle freqency * 2 * 1048576) //in our case cpu = 800MHz, no interrupt to handle no work cmd, // loop cycle = 1/(42.384*2*1048576) * 10^9 = 11.25ns //if remov the K++, if((k&0xFFFFF) == 0){...} block, we can remove about 7 dissemable instrution~8 clock cycle // 11.25 * (6 +6)(instrution)/(6+6+8) total instrution = 6.75ns k++; if((k&0x1FFFFF) == 0) LED30_BLINK(); } }
void DMAC_regs_dump(int channel) { void *base = (void *)HPS_DMAC_BASE; bmlog("*********************DMAC_regs_dump**********************\n"); bmlog("DS: 0x%08x\n", readl(base)); bmlog("FSM: 0x%08x\n", readl(base + 0x30)); bmlog("FSC: 0x%08x\n", readl(base + 0x34)); bmlog("FTM: 0x%08x\n", readl(base + 0x38)); bmlog("FTC0: 0x%08x\n", readl(base + 0x40)); bmlog("CS0: 0x%08x\n", readl(base + 0x100)); bmlog("CPC0: 0x%08x\n", readl(base + 0x104)); bmlog("SA_0: 0x%08x\n", readl(base + 0x400)); bmlog("DA_0: 0x%08x\n", readl(base + 0x404)); bmlog("CC_0: 0x%08x\n", readl(base + 0x408)); bmlog("LC00: 0x%08x\n", readl(base + 0x40C)); bmlog("LC10: 0x%08x\n", readl(base + 0x410)); bmlog("*********************************************************\n"); }
void gic_sgi0to7_handler (void *p_arg) { bmlog("BM SGI#%d triggered!\n", (u32 *)p_arg); }