int main (void) { u32 count = 0; s32 i, j, k = 0; u32 max, sum; volatile u32 tmp; u32 *p; char *pstr; p = (u32 *)(DRAM_PHYS_START + 0x6000); u32 boot; u32 load_start, load_end; u32 boot_start, boot_end; u32 bm_load_duration_ns, bm_load_duration_ms; /* get the timestamp of the osc timer1 */ boot_end = readl((const volatile void *)SOCFPGA_OSC1TIMER1_ADDRESS + 0x4); InitPeripheral(); cpu_local_irq_disable(); /* asp is cleared by the preloader , and preloader will save * qspi probe, read function in it. Please don't clear * we reuse the preloader qspi driver at here, so we save * the probe and read function pointer into asp. * it at here. */ //memset(asp, 0, sizeof(struct amp_share_param)); asp->bm_magic = ('b' << 8) | 'm'; /**** interrupt initialize*******/ gic_Int_init(); /**** hook ISR callback ******/ gic_sgi_init(); #ifdef NEED_SAVE_RESTORE_GIC_REGS_FROM_BM gic_dist_save(); #endif /**** interrupt ready *****/ cpu_local_irq_enable(); UART_DEBUG("start bm...... %x\r\n", p); UART_DEBUG("++start bm...... 12345\r\n"); bmlog("start bm......%x\r\n", p); UART_DEBUG("start bm...... %x\r\n", 0x55aa); UART_DEBUG("--start bm...... 67890\r\n"); /* auto detect the boot mode */ boot = bm_get_boot_mode(); if((BOOTSEL_MODE_SD_1_8V == boot) || (BOOTSEL_MODE_SD_3_3V == boot)) { if(!bm_sd_load_rbf()) bmlog("load fpga rbf is ok!\n"); } else if((BOOTSET_MODE_QSPI_1_8V == boot) || (BOOTSET_MODE_QSPI_3_3V == boot)) { if(!bm_qspi_load_rbf()) bmlog("load fpga rbf is ok!\n"); } /* osc_timer1 is running at 25MHz, 40ns per cycle */ asp->boot_end_stamp = boot_end; boot_start = asp->boot_start_stamp; bm_load_duration_ns = (boot_start - boot_end) * 40; bm_load_duration_ms = bm_load_duration_ns/1000/1000; bmlog("start[0x%x],end[%x]\n, bm_load_ns = %u(ns), bm_load_ms= %u(ms)\n", boot_start, boot_end, bm_load_duration_ns, bm_load_duration_ms ); load_start = asp->load_bm_start; load_end = asp->load_bm_end; bmlog("real loading bm time duration is %u(ms)(including qspi/sd init)\n", load_end - load_start); if(fpgamgr_program_fpga((const unsigned long *)FPGA_RBF_BASE, FPGA_RBF_SIZE) < 0) UART_DEBUG("config fpga failed!\r\n"); else { UART_DEBUG("config fpga OK!\r\n"); writel(('R'<<16)|('B'<<8)|('F'<<0),&(asp->preloader_wait_bm_load_rbf)); } pstr = (char *)FPGA_SDRAM_PHYS_BASE; sprintf(pstr, "%s\n", "i am from fpga ddr ram"); bmlog("%s", pstr); pstr = (char *)FPGA_SRAM_PHYS_BASE; sprintf(pstr, "%s\n", "i am from fpga sram"); bmlog("%s", pstr); #ifdef LCD1602_DISP IIC_InitIp(); LCD_SetCursor(0); sprintf(cDispBuf[0], "fpga config done!"); IIC_EXfer(LCD_ADDR, cDispBuf[0], strlen(cDispBuf[0])>15?16:strlen(cDispBuf[0])); #endif while(!gCacheCoherence) { /** led blink for hand shake debug with linux **/ k++; if((k&0x3FFFF) == 0) LED27_BLINK();//*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12); } /* ************************************************************** **** cause Linux peer use 2GB user space/2GB kernel space split ******* **** so it is 2048 L1 entry here, if use 3GB user/1 GB space split ******** **** it woule be 1024 here, and p should adjust to 0x7000 yejc ******** ************************************************************** */ for(j = 0; j < 2048-16; j++) //16MB for IO map space PageTable[j+2048] = *p++; /* ************************************************************** **** time to bring bm core to smp cache coherence environmence ******* ************************************************************** */ __asm__ __volatile__("dsb\n" "isb\n" "mrc p15, 0, r1, c1, c0, 0\n" "ldr r2, =0x40180d\n" "orr r1, r1, r2\n" "mcr p15, 0, r1, c1, c0, 0\n" "dsb\n" "isb\n" : : :"memory", "cc"); bmlog("L1 L2 cache enabled, SCU enabled~~~\n"); //asp = 0xFC700000; /* ******************************************************************************** ** from this point on, you can free to invoke the linux kernel space text function from BM, ***** ** only if the function would not cause the shceduler to action, or the CPU of BM would trap ***** ** in the cpu_idle(clone from linux cpu core) process if we can't not obtain the lock/mutex ***** ** /semaphore **** ******************************************************************************** */ //printk = (printk_fn)asp->sta.printk_fn; _raw_spinlock = (raw_spinlock_fn)asp->sta.spinlock_lock_fn; _raw_spinunlock = (raw_spinlock_fn)asp->sta.spinlock_unlock_fn; /* semaphore */ down_trylock = (down_trylock_fn)asp->sta._down_trylock_fn; while(1) { if(sgi15task_pending) { if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 0) { /**************************************************************************************/ /**** place interrupt here test worse case interrupt latency would be more accuracy ***/ /**** casue we not only take care of 10000 interrupts/second from FPGA, but also ***/ /**** suffer more than 6000 extra interrupts/second from IPI and offen L1 data cahce miss **/ /**************************************************************************************/ #ifdef TEST_IL_MORE_ACCURACY if(testIL) { bmlog("\n________________________________________________________________________________\n"); bmlog("Now the BM CPU would suffer more than 16000 interrupts/second(10000i/s from\n"); bmlog("FPGA_IRQ0 req(100us),more than 6000i/s from IPI(Inner Process Interrupt), and\n"); bmlog("process several Gbps data per second, they are all concurrently, very intensive\n"); bmlog("load for BM CPU core!\n"); bmlog("__________________________________________________________________________________\n"); testIL = 0; gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0 gic_Int_clr (GIC_PFGA0); pvt_init(1); fire_fpga_irq(); pvt_start(); gic_Int_en (GIC_PFGA0); } #endif /************************************************************************/ //p = (unsigned int *)0x1E200000; p = (unsigned int *)0xFC800000; for(i = 0; i < (DRAM_BUF_SIZE/4); i++) { if(*p != CPU_DATA_PATERN0) bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p); p++; } //bmlog("check buf from linux finish!\n"); //p = (unsigned int *)0x1E200000; p = (unsigned int *)0xFC800000; memset_int(p, CPU_DATA_PATERN3, DRAM_BUF_SIZE); //now u can mesure blink frequency on hps LED3 and multiply 2 to calculate the interrupte frequency //and then you can evaulate how fast the cpu do 2 times DRAM_SIZE write and 2 times DRAM_SIZE read //you sholud know that cpu spend must time to iter, compare in the read "for loop" and BM do interrupt //handler&cpu mode switch Linux handle system tick&sgi intrrupt and sched task, so the actual memory system //band width is greater than this evaluate value //be aware 65536B is bigger than L1 Dcache but little than L2 Dcache //evaluate data process speed in our case is: //1842Hz * 2toggle * 2w * 2r&check * DRAM_SIZE(65536B) = 965738496B/s = 7.73Gbps //toggle hps led //*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12); LED27_BLINK(); sgi15task_pending = 0; gic_raise_interrupt(CPU0, GIC_SGI13); } else if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 1) { p = (unsigned int *)0xfe700000; for(i = 0; i < (SRAM_BUF_SIZE/4); i++) { if(*p != CPU_DATA_PATERN0) bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p); p++; } //bmlog("check buf from linux finish!\n"); p = (unsigned int *)0xfe700000; memset_int(p, CPU_DATA_PATERN3, SRAM_BUF_SIZE); //now u can mesure blink frequency on hps LED2 and multiply 2 to calculate the interrupte frequency //and then you can evaulate how fast the cpu do 2 times SRAM_SIZE write and 2 times SRAM_SIZE read //you sholud know that cpu spend must time to iter, compare in the read "for loop" and BM do interrupt //handler&cpu mode switch Linux handle system tick&sgi intrrupt and sched task, so the actual memory system //band width is greater than this evaluate value //be aware 32768B is bigger than L1 Dcache but little than L2 Dcache //evaluate data process speed in our case is: //3230Hz * 2toggle * 2w * 2r&check * SRAM_SIZE(32768B) = 846725120B/s = 6.77Gbps //little then DRAM cause much more interrupt overhead //toggle hps led //*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x2<<12); LED28_BLINK(); sgi15task_pending = 0; gic_raise_interrupt(CPU0, GIC_SGI13); } //bmlog("CPU1#%04d:msg from cpu1 call~~~~~~~~~~~~~~~~\n", i); } if(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args == 2) { for(i = 0; i < SPINLOCK_TEST_COUNT; i++) { _raw_spinlock(&asp->rslocks[0]); tmp = asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status; //dummy j++ j++; asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status += 2; if((asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status - tmp) != 2) bmlog("BM:spinlock test failed!\n"); _raw_spinunlock(&asp->rslocks[0]); //dummy operation on j++ simulate the actual scenario to give another cpu chance //to take lock, reduce starvation situation j++; } //bmlog("\nBM spinlock test:%d\n", tmp + 2); bmlog("\n----------------------------\n"); bmlog("BM spinlock test:%d\n", tmp + 2); bmlog("----------------------------\n"); /*************************************************************************************/ /**** place interrupt test here test cpu data process speed would be more accuracy ***/ /*************************************************************************************/ #ifdef TEST_DATA_PROCESS_SPEED_MORE_ACCURACY gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0 gic_Int_clr (GIC_PFGA0); pvt_init(1); fire_fpga_irq(); pvt_start(); gic_Int_en (GIC_PFGA0); #endif /************************************************************************/ while(!ACCESS_ONCE(gINTtestDone)); ACCESS_ONCE(gINTtestDone) = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) { p_pvt[i] = p_pvt[i] - p_pvt[i + 1]; //delta t of pvt, be careful pvt counter overlap! } for(i = 0; i < IL_TEST_COUNT - 1; i++) { if(p_pvt[i] > PVT_100US_CYCLE) p_pvt[i] -= PVT_100US_CYCLE; /* pvt 10ns resolution, 10000 * 10ns = 100us*/ //IL(interrupt latency jitter) else p_pvt[i] = PVT_100US_CYCLE - p_pvt[i]; } max = p_pvt[0]; k = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) if(p_pvt[i] > max) { max = p_pvt[i]; k = i; } max *= 10; bmlog("\n------------------------------------------------------------------------------------------\n"); //bmlog("\ninterrupt latency test method 1(use private timer)\n"); bmlog("interrupt latency test method 1(use private timer)\n"); //bmlog("max interrupt latency jitter: %d ns\n", max); bmlog("max interrupt latency jitter: %d ns\n", max); sum = 0; for(i = 0; i < IL_TEST_COUNT - 1; i++) { sum += p_pvt[i]; //be carefule sum overflow } sum /= (IL_TEST_COUNT - 1); sum *= 10; //bmlog("average interrupt latency jitter: %d ns\n", sum); bmlog("average interrupt latency jitter: %d ns\n", sum); //90ns is measure from oscilloscope, see AMP Reference Design for detail bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster)\n", max + 90); if(max>410) bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster),max_index=%d\n", max + 90, k); bmlog("average interrupt latency: %d ns(use private timer@CPU core cluster)\n", sum + 90); bmlog("-----------------------------------------------------------------------------------------------\n"); bmlog("\n-----------------------------------------------------------------------------------------------\n"); //bmlog("\nfpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", gFPGA_IRQ_req, gARM_IRQ_ack, gFPGA_IRQ_req - gARM_IRQ_ack); bmlog("fpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", ACCESS_ONCE(gFPGA_IRQ_req), ACCESS_ONCE(gARM_IRQ_ack), ACCESS_ONCE(gFPGA_IRQ_req) - ACCESS_ONCE(gARM_IRQ_ack)); bmlog("-------------------------------------------------------------------------------------------------\n"); testIL = 1; #ifdef DMA_AND_ACP_TEST dma_init(); //DMAC_regs_dump(0); dma_mem2mem();//new for test //DMAC_regs_dump(0); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); //DMAC_regs_dump(0); bmlog("DMA test done\n"); bmlog("-----------------------------------------------\n\n"); dma_mem2mem_use_acp(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_use_acp_done(); bmlog("DMA test acp done\n"); bmlog("-----------------------------------------------\n\n"); dma_ARMmem2FPGAmem(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); bmlog("dma_ARMmem2FPGAmem test done\n"); bmlog("-----------------------------------------------\n\n"); dma_FPGAmem2ARMmem_use_acp(); while(!ACCESS_ONCE(gDMAtestDone)); ACCESS_ONCE(gDMAtestDone) = 0; dma_mem2mem_done(); bmlog("dma_FPGAmem2ARMmem_use_acp test done\n"); bmlog("-----------------------------------------------\n\n"); #endif count++; #ifdef LCD1602_DISP LCD_SetCursor(1); memset(cDispBuf[1], ' ', 16); sprintf(cDispBuf[1], "test:%d", count); IIC_EXfer(LCD_ADDR, cDispBuf[1], 16); #endif asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args = -1; //tell linux interrupt latency and dma test done! } //dummy k++ and toggle hps led1, waste cpu time.. //indicate bm activity, u can measure the toggle freqency to evalute how much time spend per loop //loop cycle = 1s /(toggle freqency * 2 * 1048576) //in our case cpu = 800MHz, no interrupt to handle no work cmd, // loop cycle = 1/(42.384*2*1048576) * 10^9 = 11.25ns //if remov the K++, if((k&0xFFFFF) == 0){...} block, we can remove about 7 dissemable instrution~8 clock cycle // 11.25 * (6 +6)(instrution)/(6+6+8) total instrution = 6.75ns k++; if((k&0x1FFFFF) == 0) LED30_BLINK(); } }
int idt_initialize() { memset_int(&idts,0,sizeof(struct idt_struct)*NUM_IDT_DESCR); idt_base.limit = (NUM_IDT_DESCR*sizeof(struct idt_struct))-1; idt_base.base = (unsigned int)&idts; //Remap the irq table. outb(0x20,0x11); outb(0xA0,0x11); outb(0x21,0x20); outb(0xA1,0x28); outb(0x21,0x04); outb(0xA1,0x02); outb(0x21,0x01); outb(0xA1,0x01); outb(0x21,0x00); outb(0xA1,0x00); set_idt(0, (unsigned int) isr0, 0x08, 0x8E); set_idt(1, (unsigned int) isr1, 0x08, 0x8E); set_idt(2, (unsigned int) isr2, 0x08, 0x8E); set_idt(3, (unsigned int) isr3, 0x08, 0x8E); set_idt(4, (unsigned int) isr4, 0x08, 0x8E); set_idt(5, (unsigned int) isr5, 0x08, 0x8E); set_idt(6, (unsigned int) isr6, 0x08, 0x8E); set_idt(7, (unsigned int) isr7, 0x08, 0x8E); set_idt(8, (unsigned int) isr8, 0x08, 0x8E); set_idt(9, (unsigned int) isr9, 0x08, 0x8E); set_idt(10, (unsigned int) isr10, 0x08, 0x8E); set_idt(11, (unsigned int) isr11, 0x08, 0x8E); set_idt(12, (unsigned int) isr12, 0x08, 0x8E); set_idt(13, (unsigned int) isr13, 0x08, 0x8E); set_idt(14, (unsigned int) isr14, 0x08, 0x8E); set_idt(15, (unsigned int) isr15, 0x08, 0x8E); set_idt(16, (unsigned int) isr16, 0x08, 0x8E); set_idt(17, (unsigned int) isr17, 0x08, 0x8E); set_idt(18, (unsigned int) isr18, 0x08, 0x8E); set_idt(19, (unsigned int) isr19, 0x08, 0x8E); set_idt(20, (unsigned int) isr20, 0x08, 0x8E); set_idt(21, (unsigned int) isr21, 0x08, 0x8E); set_idt(22, (unsigned int) isr22, 0x08, 0x8E); set_idt(23, (unsigned int) isr23, 0x08, 0x8E); set_idt(24, (unsigned int) isr24, 0x08, 0x8E); set_idt(25, (unsigned int) isr25, 0x08, 0x8E); set_idt(26, (unsigned int) isr26, 0x08, 0x8E); set_idt(27, (unsigned int) isr27, 0x08, 0x8E); set_idt(28, (unsigned int) isr28, 0x08, 0x8E); set_idt(29, (unsigned int) isr29, 0x08, 0x8E); set_idt(30, (unsigned int) isr30, 0x08, 0x8E); set_idt(31, (unsigned int) isr31, 0x08, 0x8E); set_idt(32, (unsigned int) irq0, 0x08,0x8E); set_idt(33, (unsigned int) irq1, 0x08,0x8E); set_idt(34, (unsigned int) irq2, 0x08,0x8E); set_idt(35, (unsigned int) irq3, 0x08,0x8E); set_idt(36, (unsigned int) irq4, 0x08,0x8E); set_idt(37, (unsigned int) irq5, 0x08,0x8E); set_idt(38, (unsigned int) irq6, 0x08,0x8E); set_idt(39, (unsigned int) irq7, 0x08,0x8E); set_idt(40, (unsigned int) irq8, 0x08,0x8E); set_idt(41, (unsigned int) irq9, 0x08,0x8E); set_idt(42, (unsigned int) irq10, 0x08,0x8E); set_idt(43, (unsigned int) irq11, 0x08,0x8E); set_idt(44, (unsigned int) irq12, 0x08,0x8E); set_idt(45, (unsigned int) irq13, 0x08,0x8E); set_idt(46, (unsigned int) irq14, 0x08,0x8E); set_idt(47, (unsigned int) irq15, 0x08,0x8E); // Load IDT asm volatile ("movl %0,%%eax"::"r"(&idt_base):"%eax"); asm volatile ("lidt (%eax)"); }