/*
 * Bare-metal (BM) entry point of the AMP setup that runs next to a Linux peer.
 *
 * Sequence, as visible in this function:
 *   1. Read an osc timer1 register as the "boot end" timestamp, initialise
 *      peripherals and the GIC, and stamp asp->bm_magic.
 *   2. Depending on the detected boot mode, load the FPGA rbf from SD or QSPI,
 *      then program the FPGA and write the 'RBF' marker into
 *      asp->preloader_wait_bm_load_rbf on success.
 *   3. Write and log a test string through FPGA SDRAM and FPGA SRAM.
 *   4. Spin until gCacheCoherence becomes non-zero, copy L1 page-table entries
 *      from p into PageTable[], and OR 0x40180d into CP15 c1,c0,0.
 *   5. Loop forever serving requests published in
 *      asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args:
 *        0 -> verify and refill the buffer at 0xFC800000 (DRAM_BUF_SIZE),
 *        1 -> verify and refill the buffer at 0xfe700000 (SRAM_BUF_SIZE),
 *        2 -> spinlock test, interrupt-latency statistics and, when
 *             DMA_AND_ACP_TEST is defined, the DMA tests.
 *
 * The function never returns: the final while(1) has no exit.
 */
int  main (void)

{
	u32 count = 0;
	s32 i, j, k = 0;
	u32 max, sum;
	volatile u32 tmp;
	u32 *p;
	char *pstr;
	p = (u32 *)(DRAM_PHYS_START + 0x6000);
	u32  boot;
	u32  load_start, load_end;
	u32  boot_start, boot_end;
	u32 bm_load_duration_ns, bm_load_duration_ms;

	/* take the osc timer1 timestamp (register at offset 0x4) before anything else runs */
	boot_end = readl((const volatile void *)SOCFPGA_OSC1TIMER1_ADDRESS + 0x4);

	InitPeripheral();
	cpu_local_irq_disable();
	
	/*
	 * asp is cleared by the preloader, and the preloader saves its qspi
	 * probe and read function pointers in it. We reuse the preloader qspi
	 * driver here, so asp must NOT be cleared at this point.
	 */
	//memset(asp, 0, sizeof(struct amp_share_param));
	asp->bm_magic = ('b' << 8) | 'm';
	
	/**** initialize the interrupt controller ****/
    gic_Int_init();
	/**** hook the ISR callbacks ****/
	gic_sgi_init();
	
#ifdef NEED_SAVE_RESTORE_GIC_REGS_FROM_BM
	gic_dist_save();
#endif
	/**** interrupts are ready, enable them ****/
	cpu_local_irq_enable();
	/* NOTE(review): p is a pointer but is printed with %x here and in bmlog below */
	UART_DEBUG("start bm...... %x\r\n", p);
	UART_DEBUG("++start bm...... 12345\r\n");
	bmlog("start bm......%x\r\n", p);
	UART_DEBUG("start bm...... %x\r\n", 0x55aa);
	UART_DEBUG("--start bm...... 67890\r\n");

	/* auto-detect the boot mode and load the rbf from the matching medium */
	boot = bm_get_boot_mode();
	if((BOOTSEL_MODE_SD_1_8V == boot) || (BOOTSEL_MODE_SD_3_3V == boot))
	{		
		if(!bm_sd_load_rbf())
			bmlog("load fpga rbf is ok!\n");
	}
	else if((BOOTSET_MODE_QSPI_1_8V == boot) || (BOOTSET_MODE_QSPI_3_3V == boot))
	{
		if(!bm_qspi_load_rbf())
			bmlog("load fpga rbf is ok!\n");
	}
	
	/*
	 * osc_timer1 runs at 25MHz, i.e. 40ns per cycle.
	 * NOTE(review): the subtraction is start - end, so the timer presumably
	 * counts down -- TODO confirm. The nanosecond value is a u32, so it wraps
	 * for durations above roughly 4.29 s.
	 */
	asp->boot_end_stamp = boot_end;
	boot_start =  asp->boot_start_stamp;
	bm_load_duration_ns = (boot_start - boot_end) * 40;
	bm_load_duration_ms = bm_load_duration_ns/1000/1000;
	bmlog("start[0x%x],end[%x]\n, bm_load_ns = %u(ns), bm_load_ms= %u(ms)\n", 
										   boot_start,   
										   boot_end,
										   bm_load_duration_ns,
										   bm_load_duration_ms
										   );
	load_start = asp->load_bm_start;
	load_end   = asp->load_bm_end;
	/* NOTE(review): the message says ms; the unit of load_bm_start/end is not visible here -- verify */
	bmlog("real loading bm time duration is %u(ms)(including qspi/sd init)\n", load_end - load_start);

	/* program the FPGA; a negative return value is treated as failure */
	if(fpgamgr_program_fpga((const unsigned long *)FPGA_RBF_BASE, FPGA_RBF_SIZE) < 0)	
		UART_DEBUG("config fpga failed!\r\n");

	else
	{
		UART_DEBUG("config fpga OK!\r\n");
		/* publish the 'R','B','F' marker through the shared parameter block */
		writel(('R'<<16)|('B'<<8)|('F'<<0),&(asp->preloader_wait_bm_load_rbf));
	}


	/* round-trip a string through FPGA SDRAM, then through FPGA SRAM */
	pstr = (char *)FPGA_SDRAM_PHYS_BASE;
	sprintf(pstr, "%s\n", "i am from fpga ddr ram");
	bmlog("%s", pstr);
	
	pstr = (char *)FPGA_SRAM_PHYS_BASE;
	sprintf(pstr, "%s\n", "i am from fpga sram");
	bmlog("%s", pstr);

#ifdef	LCD1602_DISP
	IIC_InitIp();
	LCD_SetCursor(0);
	sprintf(cDispBuf[0], "fpga config done!");
	/* send at most 16 characters of the message */
	IIC_EXfer(LCD_ADDR, cDispBuf[0], strlen(cDispBuf[0])>15?16:strlen(cDispBuf[0]));
#endif
	
	/* busy-wait until gCacheCoherence becomes non-zero */
	while(!gCacheCoherence)
	{
		/** blink the LED for handshake debugging with linux **/
		k++;
		if((k&0x3FFFF) == 0)
			LED27_BLINK();//*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12);
	}
	/*
	 **************************************************************
	 * The Linux peer uses a 2GB user / 2GB kernel address-space
	 * split, so there are 2048 L1 entries here. With a 3GB user /
	 * 1GB kernel split it would be 1024, and p would have to be
	 * adjusted to 0x7000. (yejc)
	 **************************************************************
	 */
	for(j = 0; j < 2048-16; j++) // the last 16 entries (16MB) are left for the IO map space
				PageTable[j+2048] = *p++;

	/*
	 **************************************************************
	 * Time to bring the BM core into the SMP cache-coherent
	 * environment: OR 0x40180d into CP15 c1,c0,0 (bits 0, 2, 3,
	 * 11, 12 and 22), with dsb/isb barriers before and after.
	 * NOTE(review): r1 and r2 are modified by this asm but are not
	 * listed as clobbers.
	 **************************************************************
	 */
	__asm__ __volatile__("dsb\n"
	"isb\n"
	"mrc    p15, 0, r1, c1, c0, 0\n"
	"ldr	r2, =0x40180d\n"
	"orr	r1, r1, r2\n"
	"mcr    p15, 0, r1, c1, c0, 0\n"
	"dsb\n"
	"isb\n"
	: : :"memory", "cc");


	bmlog("L1 L2 cache enabled, SCU enabled~~~\n");
	//asp = 0xFC700000;
	
	/*
	 ********************************************************************************
	 * From this point on BM is free to invoke Linux kernel-space text functions,
	 * provided the function cannot cause the scheduler to act. Otherwise the BM
	 * CPU would be trapped in the cpu_idle process (cloned from the Linux CPU
	 * core) whenever the lock/mutex/semaphore cannot be obtained.
	 ********************************************************************************
	 */
	//printk = (printk_fn)asp->sta.printk_fn;
	_raw_spinlock = (raw_spinlock_fn)asp->sta.spinlock_lock_fn;
	_raw_spinunlock = (raw_spinlock_fn)asp->sta.spinlock_unlock_fn;
	/* semaphore */
	down_trylock = (down_trylock_fn)asp->sta._down_trylock_fn;

	/* main service loop, never left */
	while(1)
	{
		if(sgi15task_pending)
		{	
			/* command 0: check and refill the buffer at 0xFC800000 */
			if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 0)
			{
			/**************************************************************************************/
			/* Starting the interrupt test here measures worst-case interrupt latency more        */
			/* accurately: besides the 10000 interrupts/second from the FPGA, the core also takes */
			/* more than 6000 extra interrupts/second from IPIs and frequent L1 data cache misses */
			/**************************************************************************************/
#ifdef TEST_IL_MORE_ACCURACY
			if(testIL)
			{
				bmlog("\n________________________________________________________________________________\n");
                bmlog("Now the BM CPU would suffer more than 16000 interrupts/second(10000i/s from\n");
                bmlog("FPGA_IRQ0 req(100us),more than 6000i/s from IPI(Inner Process Interrupt), and\n");
                bmlog("process several Gbps data per second, they are all concurrently, very intensive\n");
                bmlog("load for BM CPU core!\n");
                bmlog("__________________________________________________________________________________\n");
				testIL = 0;
				gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0
				gic_Int_clr (GIC_PFGA0);
				pvt_init(1);
				fire_fpga_irq();
				pvt_start();
				gic_Int_en (GIC_PFGA0);
			}
#endif
			/************************************************************************/
			//p = (unsigned int *)0x1E200000;
			p = (unsigned int *)0xFC800000;
			/* every word is expected to hold CPU_DATA_PATERN0; mismatches are logged */
			for(i = 0; i < (DRAM_BUF_SIZE/4); i++)
			{
				if(*p != CPU_DATA_PATERN0)
					bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p);
				p++;
			}
			//bmlog("check buf from linux finish!\n");
			//p = (unsigned int *)0x1E200000;
			p = (unsigned int *)0xFC800000;
			memset_int(p, CPU_DATA_PATERN3, DRAM_BUF_SIZE);
			// You can now measure the blink frequency of HPS LED3 and multiply it by 2 to get the
			// interrupt frequency, and from that estimate how fast the CPU performs 2 writes and
			// 2 reads of DRAM_SIZE. Note that the CPU spends most of its time iterating and comparing
			// in the read "for" loop, BM handles interrupts and CPU mode switches, and Linux handles the
			// system tick, SGI interrupts and task scheduling, so the actual memory-system bandwidth
			// is greater than this estimate.
			// Be aware that 65536B is bigger than the L1 D-cache but smaller than the L2 cache.
			// The estimated data processing speed in our case is:
			//1842Hz * 2toggle * 2w * 2r&check * DRAM_SIZE(65536B) = 965738496B/s = 7.73Gbps
			

			// toggle the hps led, clear the pending flag and signal CPU0 with SGI13
			//*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x1<<12);
			LED27_BLINK();
			sgi15task_pending = 0;
			gic_raise_interrupt(CPU0, GIC_SGI13);
				}
			/* command 1: same check and refill, on the buffer at 0xfe700000 */
			else if(ACCESS_ONCE(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args) == 1)
				{
			p = (unsigned int *)0xfe700000;
			for(i = 0; i < (SRAM_BUF_SIZE/4); i++)
			{
				if(*p != CPU_DATA_PATERN0)
					bmlog("check buf from linux failed! i=%d, *p=%x\n", i, *p);
				p++;
			}
			//bmlog("check buf from linux finish!\n");
			p = (unsigned int *)0xfe700000;
			memset_int(p, CPU_DATA_PATERN3, SRAM_BUF_SIZE);
			// You can now measure the blink frequency of HPS LED2 and multiply it by 2 to get the
			// interrupt frequency, and from that estimate how fast the CPU performs 2 writes and
			// 2 reads of SRAM_SIZE. Note that the CPU spends most of its time iterating and comparing
			// in the read "for" loop, BM handles interrupts and CPU mode switches, and Linux handles the
			// system tick, SGI interrupts and task scheduling, so the actual memory-system bandwidth
			// is greater than this estimate.
			// Be aware that 32768B is bigger than the L1 D-cache but smaller than the L2 cache.
			// The estimated data processing speed in our case is:
			//3230Hz * 2toggle * 2w * 2r&check * SRAM_SIZE(32768B) = 846725120B/s = 6.77Gbps // a little less than DRAM because of much more interrupt overhead
			

			// toggle the hps led, clear the pending flag and signal CPU0 with SGI13
			//*(volatile u32 *)HPS_GPIO1_BASE_ADDR ^= (0x2<<12);
			LED28_BLINK();
			sgi15task_pending = 0;
			gic_raise_interrupt(CPU0, GIC_SGI13);
				
		}
		//bmlog("CPU1#%04d:msg from cpu1 call~~~~~~~~~~~~~~~~\n", i);
	}
		/* command 2 is polled on every pass, independent of sgi15task_pending */
		if(asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args == 2)
		{
					/* spinlock test: under the lock, bm_cmd_status must grow by exactly 2 */
					for(i = 0; i < SPINLOCK_TEST_COUNT; i++)
        			{
                		_raw_spinlock(&asp->rslocks[0]);
                		tmp = asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status;
                		// dummy j++
                		j++;
                		asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status += 2;
                		if((asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].bm_cmd_status - tmp) != 2)
                        	bmlog("BM:spinlock test failed!\n");
                		_raw_spinunlock(&asp->rslocks[0]);
                		// dummy j++ simulating real work, to give the other cpu a chance
                		// to take the lock and reduce starvation
                		j++;
        			}
					//bmlog("\nBM spinlock test:%d\n", tmp + 2);
					/* tmp holds the status read in the last iteration, before its += 2 */
					/* NOTE(review): tmp is never assigned if SPINLOCK_TEST_COUNT is 0 */
					bmlog("\n----------------------------\n");
					bmlog("BM spinlock test:%d\n", tmp + 2);
					bmlog("----------------------------\n");


			/*************************************************************************************/
			/* Starting the interrupt test here measures cpu data processing speed more accurately */
			/*************************************************************************************/
#ifdef TEST_DATA_PROCESS_SPEED_MORE_ACCURACY
			gic_Int_dis (GIC_PFGA0); //72 FPGA_IRQ0
			gic_Int_clr (GIC_PFGA0);
			pvt_init(1);
			fire_fpga_irq();
			pvt_start();
			gic_Int_en (GIC_PFGA0);
#endif
			/************************************************************************/
			/* spin until gINTtestDone is set, then clear it */
			while(!ACCESS_ONCE(gINTtestDone));
			ACCESS_ONCE(gINTtestDone) = 0;
			/* turn consecutive samples into differences (sample i minus sample i+1) */
			for(i = 0; i < IL_TEST_COUNT - 1; i++)
			{
				p_pvt[i] = p_pvt[i] - p_pvt[i + 1]; // delta t of pvt; beware of pvt counter wrap-around!
			}
			/* replace each delta by its absolute distance from PVT_100US_CYCLE */
			for(i = 0; i < IL_TEST_COUNT - 1; i++)
			{
				if(p_pvt[i] > PVT_100US_CYCLE)
					p_pvt[i] -= PVT_100US_CYCLE; /* pvt 10ns resolution, 10000 * 10ns = 100us*/ // IL (interrupt latency) jitter
				else
					p_pvt[i] = PVT_100US_CYCLE - p_pvt[i];
			}
			/* find the largest jitter and remember its index in k */
			max = p_pvt[0];
			k = 0;
			for(i = 0; i < IL_TEST_COUNT - 1; i++)
				if(p_pvt[i] > max)
				{
					max = p_pvt[i];
					k = i;
				}
			max *= 10; /* counts -> ns, using the 10ns resolution stated above */
			bmlog("\n------------------------------------------------------------------------------------------\n");
			//bmlog("\ninterrupt latency test method 1(use private timer)\n");
			bmlog("interrupt latency test method 1(use private timer)\n");
			//bmlog("max interrupt latency jitter: %d ns\n", max);
			/* NOTE(review): max and sum are u32 but are printed with %d below */
			bmlog("max interrupt latency jitter: %d ns\n", max);
			sum = 0;
			for(i = 0; i < IL_TEST_COUNT - 1; i++)
			{
				sum += p_pvt[i];				// beware: sum can overflow
			}
			sum /= (IL_TEST_COUNT - 1);
			sum *= 10;
			//bmlog("average interrupt latency jitter: %d ns\n", sum);
			bmlog("average interrupt latency jitter: %d ns\n", sum);
			// the 90ns offset was measured with an oscilloscope, see the AMP Reference Design for details
			bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster)\n", max + 90);
			/* above 410 ns of jitter, also report which sample was the worst */
			if(max>410)
			    bmlog("max interrupt latency: %d ns(use private timer@CPU core cluster),max_index=%d\n", max + 90, k); 
			bmlog("average interrupt latency: %d ns(use private timer@CPU core cluster)\n", sum + 90);
			bmlog("-----------------------------------------------------------------------------------------------\n");


			bmlog("\n-----------------------------------------------------------------------------------------------\n");
			//bmlog("\nfpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", gFPGA_IRQ_req, gARM_IRQ_ack, gFPGA_IRQ_req - gARM_IRQ_ack);
			bmlog("fpga send %d times irq req to arm, arm ack %d times, lost irq %d times\n", 
			ACCESS_ONCE(gFPGA_IRQ_req), ACCESS_ONCE(gARM_IRQ_ack), ACCESS_ONCE(gFPGA_IRQ_req) - ACCESS_ONCE(gARM_IRQ_ack));
			bmlog("-------------------------------------------------------------------------------------------------\n");

			/* re-arm the TEST_IL_MORE_ACCURACY block of command 0 */
			testIL = 1;
#ifdef DMA_AND_ACP_TEST
			/* each DMA test: start it, spin on gDMAtestDone, clear the flag, run its *_done() step */
			dma_init();
			//DMAC_regs_dump(0);
			dma_mem2mem();// new, for test
			//DMAC_regs_dump(0);
			while(!ACCESS_ONCE(gDMAtestDone));
			ACCESS_ONCE(gDMAtestDone) = 0;
			dma_mem2mem_done();
			//DMAC_regs_dump(0);
			bmlog("DMA test done\n");
			bmlog("-----------------------------------------------\n\n");

			dma_mem2mem_use_acp();
			while(!ACCESS_ONCE(gDMAtestDone));
			ACCESS_ONCE(gDMAtestDone) = 0;
			dma_mem2mem_use_acp_done();
			bmlog("DMA test acp done\n");
			bmlog("-----------------------------------------------\n\n");

			dma_ARMmem2FPGAmem();
			while(!ACCESS_ONCE(gDMAtestDone));
			ACCESS_ONCE(gDMAtestDone) = 0;
			dma_mem2mem_done();
			bmlog("dma_ARMmem2FPGAmem test done\n");
			bmlog("-----------------------------------------------\n\n");

			dma_FPGAmem2ARMmem_use_acp();
			while(!ACCESS_ONCE(gDMAtestDone));
			ACCESS_ONCE(gDMAtestDone) = 0;
			dma_mem2mem_done();
			bmlog("dma_FPGAmem2ARMmem_use_acp test done\n");
			bmlog("-----------------------------------------------\n\n");
#endif

			count++;
#ifdef	LCD1602_DISP
			/* show the number of completed test rounds on the second LCD row */
			LCD_SetCursor(1);
			memset(cDispBuf[1], ' ', 16);
			sprintf(cDispBuf[1], "test:%d", count);
			IIC_EXfer(LCD_ADDR, cDispBuf[1], 16);
#endif
			
			asp->sra[SGI_LINUX_REQ_BM_CONSUME_BUF].linux_cmd_args = -1; // tell linux the interrupt latency and dma tests are done!
	
		}
		// Dummy k++ and toggle of an hps led, burning cpu time.
		// It indicates bm activity: measure the toggle frequency to estimate the time spent per loop,
		// loop cycle = 1s /(toggle frequency * 2 * 1048576)
		// In our case cpu = 800MHz, with no interrupt to handle and no work cmd:
		// loop cycle = 1/(42.384*2*1048576) * 10^9 = 11.25ns
		// Removing the k++ and the if((k&0xFFFFF) == 0){...} block saves about 7 disassembled instructions (~8 clock cycles):
		// 11.25 * (6 +6)(instructions)/(6+6+8) total instructions = 6.75ns
		// NOTE(review): the formula and the mask quoted above use 1048576 / 0xFFFFF, the code below tests 0x1FFFFF
		k++;
		if((k&0x1FFFFF) == 0)
			LED30_BLINK();
	}

}
Example #2
0
int idt_initialize()
{
  memset_int(&idts,0,sizeof(struct idt_struct)*NUM_IDT_DESCR);

  idt_base.limit = (NUM_IDT_DESCR*sizeof(struct idt_struct))-1;
  idt_base.base  = (unsigned int)&idts;

  //Remap the irq table.
  outb(0x20,0x11);
  outb(0xA0,0x11);
  outb(0x21,0x20);
  outb(0xA1,0x28);
  outb(0x21,0x04);
  outb(0xA1,0x02);
  outb(0x21,0x01);
  outb(0xA1,0x01);
  outb(0x21,0x00);
  outb(0xA1,0x00);

  set_idt(0, (unsigned int) isr0, 0x08, 0x8E);
  set_idt(1, (unsigned int) isr1, 0x08, 0x8E);
  set_idt(2, (unsigned int) isr2, 0x08, 0x8E);
  set_idt(3, (unsigned int) isr3, 0x08, 0x8E);
  set_idt(4, (unsigned int) isr4, 0x08, 0x8E);
  set_idt(5, (unsigned int) isr5, 0x08, 0x8E);
  set_idt(6, (unsigned int) isr6, 0x08, 0x8E);
  set_idt(7, (unsigned int) isr7, 0x08, 0x8E);
  set_idt(8, (unsigned int) isr8, 0x08, 0x8E);
  set_idt(9, (unsigned int) isr9, 0x08, 0x8E);
  set_idt(10, (unsigned int) isr10, 0x08, 0x8E);
  set_idt(11, (unsigned int) isr11, 0x08, 0x8E);
  set_idt(12, (unsigned int) isr12, 0x08, 0x8E);
  set_idt(13, (unsigned int) isr13, 0x08, 0x8E);
  set_idt(14, (unsigned int) isr14, 0x08, 0x8E);
  set_idt(15, (unsigned int) isr15, 0x08, 0x8E);
  set_idt(16, (unsigned int) isr16, 0x08, 0x8E);
  set_idt(17, (unsigned int) isr17, 0x08, 0x8E);
  set_idt(18, (unsigned int) isr18, 0x08, 0x8E);
  set_idt(19, (unsigned int) isr19, 0x08, 0x8E);
  set_idt(20, (unsigned int) isr20, 0x08, 0x8E);
  set_idt(21, (unsigned int) isr21, 0x08, 0x8E);
  set_idt(22, (unsigned int) isr22, 0x08, 0x8E);
  set_idt(23, (unsigned int) isr23, 0x08, 0x8E);
  set_idt(24, (unsigned int) isr24, 0x08, 0x8E);
  set_idt(25, (unsigned int) isr25, 0x08, 0x8E);
  set_idt(26, (unsigned int) isr26, 0x08, 0x8E);
  set_idt(27, (unsigned int) isr27, 0x08, 0x8E);
  set_idt(28, (unsigned int) isr28, 0x08, 0x8E);
  set_idt(29, (unsigned int) isr29, 0x08, 0x8E);
  set_idt(30, (unsigned int) isr30, 0x08, 0x8E);
  set_idt(31, (unsigned int) isr31, 0x08, 0x8E);
  set_idt(32, (unsigned int) irq0, 0x08,0x8E);
  set_idt(33, (unsigned int) irq1, 0x08,0x8E);
  set_idt(34, (unsigned int) irq2, 0x08,0x8E);
  set_idt(35, (unsigned int) irq3, 0x08,0x8E);
  set_idt(36, (unsigned int) irq4, 0x08,0x8E);
  set_idt(37, (unsigned int) irq5, 0x08,0x8E);
  set_idt(38, (unsigned int) irq6, 0x08,0x8E);
  set_idt(39, (unsigned int) irq7, 0x08,0x8E);
  set_idt(40, (unsigned int) irq8, 0x08,0x8E);
  set_idt(41, (unsigned int) irq9, 0x08,0x8E);
  set_idt(42, (unsigned int) irq10, 0x08,0x8E);
  set_idt(43, (unsigned int) irq11, 0x08,0x8E);
  set_idt(44, (unsigned int) irq12, 0x08,0x8E);
  set_idt(45, (unsigned int) irq13, 0x08,0x8E);
  set_idt(46, (unsigned int) irq14, 0x08,0x8E);
  set_idt(47, (unsigned int) irq15, 0x08,0x8E);

  // Load IDT
  asm volatile ("movl %0,%%eax"::"r"(&idt_base):"%eax");
  asm volatile ("lidt (%eax)");
}