/*
 * dramc_ta2 - run the DRAMC test agent 2 once and validate the outcome.
 *
 * start, len : only forwarded to Read_Test() on the DDR3 pre-check path.
 * ext_arg    : an integer flag passed through a pointer-sized argument.
 *              Non-zero: the TESTRPT status bits are checked after the run.
 *              Zero    : the TESTRPT check is skipped and, when ddr_type == 3,
 *                        a CPU read test (Read_Test) is executed first.
 *
 * Returns 0 on success; the non-zero Read_Test() result if the DDR3 pre-check
 * fails; -1 if the TESTRPT check fails or if DRAMC_OFFDLY6/DRAMC_OFFDLY7 do
 * not both equal HW_DQS_GW_COUNTER.
 *
 * NOTE(review): the completion poll below has no timeout and will spin
 * forever if bit 10 of TESTRPT never gets set.
 */
int dramc_ta2(unsigned int start, unsigned int len, void *ext_arg)
{
    int err = 0;
    /* ext_arg is used as a plain integer flag, never dereferenced. */
    int check_result = (int)ext_arg;

    /* cpu read test, for DDR3 only */
    if ((check_result == 0) && (ddr_type == 3)) {
        err = Read_Test(start, len, ext_arg);

        /* Same SPCMD bit-9 pulse as the DQS gating window counter reset below. */
        *(volatile unsigned int *)DRAMC_SPCMD |= (1 << 9);
        *(volatile unsigned int *)DRAMC_SPCMD &= ~(1 << 9);
        DDR_PHY_RESET();

        if (err != 0) {
            return err;
        }
    }

    /* Swap bits 29/31 of DRAMC_LPDDR2 for the duration of the test. */
    *(volatile unsigned int *)DRAMC_LPDDR2 &= ~0x20000000;
    *(volatile unsigned int *)DRAMC_LPDDR2 |= 0x80000000;

    /*
     * Enable test agent 2 write (bit 30) and read (bit 31).
     * Unsigned constants: (1 << 31) on a signed int is undefined behaviour.
     */
    *(volatile unsigned int *)DRAMC_CONF2 |= (1U << 30) | (1U << 31);

    /* Busy-wait for the completion flag (presumably DM_CMP_CPT). */
    while (!((*(volatile unsigned int *)DRAMC_TESTRPT) & (1 << 10))) {
        ;
    }

    /*
     * NoteXXX: Need to wait for at least 400 ns.
     *          According to the simulation result, DLE_CNT_OK/DM_CMP_ERR are
     *          updated with some delay after DM_CMP_CPT is raised, i.e. after
     *          seeing the complete status we must wait a while before reading
     *          DLE_CNT_OK/DM_CMP_ERR in the TESTRPT register.
     */
    delay_a_while(400);

    if (check_result) {
        /*
         * Fail when bit 14 is set, or when bit 18 is clear
         * (presumably DM_CMP_ERR and DLE_CNT_OK respectively - confirm).
         */
        if (*(volatile unsigned int *)DRAMC_TESTRPT & (1 << 14)) {
            err = -1;
        } else if (!(*(volatile unsigned int *)DRAMC_TESTRPT & (1 << 18))) {
            err = -1;
        }
    }

    /* Disable test agent 2 read/write again. */
    *(volatile unsigned int *)DRAMC_CONF2 &= ~((1U << 30) | (1U << 31));

    /* Restore the DRAMC_LPDDR2 bits swapped above. */
    *(volatile unsigned int *)DRAMC_LPDDR2 &= ~0x80000000;
    *(volatile unsigned int *)DRAMC_LPDDR2 |= 0x20000000;

    DDR_PHY_RESET();

    /* Both counters must match the expected value; skipped if already failed. */
    if (!err) {
        if (!((*(volatile unsigned int *)DRAMC_OFFDLY6 == HW_DQS_GW_COUNTER)
              && (*(volatile unsigned int *)DRAMC_OFFDLY7 == HW_DQS_GW_COUNTER))) {
            err = -1;
        }
    }

    /* DQS gating window counter reset */
    *(volatile unsigned int *)DRAMC_SPCMD |= (1 << 9);
    *(volatile unsigned int *)DRAMC_SPCMD &= ~(1 << 9);

    DDR_PHY_RESET();

    /* Same counter comparison repeated after the counter reset and PHY reset. */
    if (!err) {
        if (!((*(volatile unsigned int *)DRAMC_OFFDLY6 == HW_DQS_GW_COUNTER)
              && (*(volatile unsigned int *)DRAMC_OFFDLY7 == HW_DQS_GW_COUNTER))) {
            err = -1;
        }
    }

    return err;
}
/* Description
 *   Software RX DQ/DQS per-bit calibration.
 * Registers
 *   - DQIDLY[1:8] : each register holds 4 settings (4 bits each: 0~15, unit 20ps), one per DQ bit.
 *   - R0DELDLY : 4 settings for rank 0 DQS0~DQS3. 7 bits (0~127) with unit 30ps.
 *   - R1DELDLY : 4 settings for rank 1 DQS0~DQS3. 7 bits (0~127) with unit 30ps.
 *                (Not written by this function; only rank 0 is calibrated here.)
 * Algorithm
 *   - Set DQS/DQ input delay to 0.
 *   - Increase the delay of all DQs from 0 to 15 until every bit has failed.
 *   - Sweep the DQS delay from 0 to 127 to find the pass range (min & max) of each DQ,
 *     keeping the largest pass range per bit.
 *   - For each DQS, take the largest window center among the DQ bits of its byte and
 *     program it as that DQS input delay.
 *   - For each DQ, add the difference between the byte's DQS delay and the bit's own
 *     center to its DQ delay, so that every bit is centered on the chosen DQS delay.
 * Return
 *   Always 0.
 *
 * NOTE(review): both test-agent completion polls spin without a timeout.
 * NOTE(review): the code assumes 8 DQIDLY registers with 4 DQ bits each; dqidly[] is
 *               sized DQ_DATA_WIDTH/DQS_NUMBER, which only matches for 32 bits / 4 DQS.
 */
int do_sw_rx_dq_dqs_calib(void)
{
    unsigned int data;
    unsigned int test_len = 0x100;
    unsigned int i,j;
    unsigned int dqs_input_delay;
    unsigned int cmp_err;
    unsigned int max;

    unsigned int dq_dly_max;
    char dqs_delay[DQS_NUMBER];
    char dq_delay_per_bit[DQ_DATA_WIDTH];
    unsigned int dqidly[DQ_DATA_WIDTH/DQS_NUMBER];
    unsigned int dq_tap;
    unsigned int dq_delay_done[DQ_DATA_WIDTH];
    RXDQS_PERBIT_DLY_T dqs_perbit_dly[DQ_DATA_WIDTH];

    dbg_print("in do_sw_rx_dq_dqs_calib()\n");

#ifndef RELEASE
    dbg_print("*DQIDLY1 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY1));
    dbg_print("*DQIDLY2 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY2));
    dbg_print("*DQIDLY3 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY3));
    dbg_print("*DQIDLY4 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY4));
    dbg_print("*DQIDLY5 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY5));
    dbg_print("*DQIDLY6 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY6));
    dbg_print("*DQIDLY7 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY7));
    dbg_print("*DQIDLY8 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY8));
    dbg_print("*DRAMC_R0DELDLY = 0x%x\n", DRAMC_READ_REG(DRAMC_R0DELDLY));
#endif

    /* 1. set DQS delay to 0 first */
    DRAMC_WRITE_REG(0x0,DRAMC_R0DELDLY);

    /* set DQ delay to 0x0 in all 8 DQIDLY registers */
    for (i = 0; i < 8; i++)
    {
        DRAMC_WRITE_REG(0x0,DRAMC_DQIDLY1+4*i);
    }

    /* clear the per-bit DQ delay bookkeeping */
    for (i = 0; i < DQ_DATA_WIDTH; i++)
    {
        dq_delay_per_bit[i] = 0x0;
        dq_delay_done[i] = 0x0;
    }

    /* delay DQ step by step until all bits have failed */
    for (dq_tap = 0; dq_tap < MAX_RX_DQDLY_TAPS; dq_tap++)
    {
        /* set test pattern length */
        DRAMC_WRITE_REG(0x55000000,0x3C);

        data = DRAMC_READ_REG(0x40);
        DRAMC_WRITE_REG((data & 0xAA000000) | test_len, 0x40);

        /* Test Agent 2 write enabling, Test Agent 2 read enabling.
         * Unsigned constants: (1 << 31) on a signed int is undefined behaviour. */
        DRAMC_WRITE_SET((1U << 30) | (1U << 31),DRAMC_CONF2);

        /* wait for the completion flag */
        while (!(DRAMC_READ_REG(DRAMC_TESTRPT) & (1 << 10)))
        {
            ;
        }

        delay_a_while(400);

        cmp_err = DRAMC_READ_REG(DRAMC_CMP_ERR);
        dbg_print("cmp_err:%x\n",cmp_err);
        DRAMC_WRITE_CLEAR(((1U << 30) | (1U << 31)),DRAMC_CONF2); /* disable test agent2 r/w */

        /* every bit failed: nothing more to learn from larger DQ delays */
        if (cmp_err == 0xFFFFFFFF)
        {
            break;
        }

        /* Bit i compare result
         *   - Compare success & never failed before: record the delay value
         *     (dq_delay_per_bit[i] = delay value).
         *   - Otherwise: latch the bit as done (dq_delay_done[i] = 1) so its
         *     recorded delay is frozen from now on.
         */
        for (i = 0; i < DQ_DATA_WIDTH; i++)
        {
            /* (U32)1, not 0x1: shifting a signed 1 by 31 is undefined behaviour */
            if (!(cmp_err & ((U32)1 << i)) && dq_delay_done[i] == 0)
            {
                dq_delay_per_bit[i] = dq_tap;
            }
            else
            {
                dq_delay_done[i] = 1;
            }
            dbg_print("%d)0x%x \n",i,dq_delay_per_bit[i]);
        }
        dbg_print("\n");

        /* pack 4 per-bit delays (one byte lane each) into one register image */
        for (i = 0; i < DQ_DATA_WIDTH; i+=4)
        {
            dqidly[i/4] = (dq_delay_per_bit[i]) + (dq_delay_per_bit[i+1] << 8) + (dq_delay_per_bit[i+2] << 16) + (dq_delay_per_bit[i+3] << 24);

            dbg_print("dqidly[%d]=0x%x\n",i/4,dqidly[i/4]);
        }

        /* program the new DQ delays; they take effect for the next iteration's test */
        for (i = 0; i < 8; i++)
        {
            DRAMC_WRITE_REG(dqidly[i],DRAMC_DQIDLY1+4*i);
        }
        dbg_print("*DQIDLY1 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY1));
        dbg_print("*DQIDLY2 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY2));
        dbg_print("*DQIDLY3 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY3));
        dbg_print("*DQIDLY4 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY4));
        dbg_print("*DQIDLY5 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY5));
        dbg_print("*DQIDLY6 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY6));
        dbg_print("*DQIDLY7 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY7));
        dbg_print("*DQIDLY8 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY8));
    }
    /*
     * After the loop, dq_delay_per_bit[] holds the DQ input delay recorded for
     * each bit while the DQS input delay was 0 (0 means the bit was never
     * advanced). Unless the loop left through the all-fail break, DQIDLY[1:8]
     * currently hold exactly these values.
     */

    /* 2. initialize parameters */
    for (i = 0; i < DQ_DATA_WIDTH; i++)
    {
        dqs_perbit_dly[i].min_cur = -1;
        dqs_perbit_dly[i].max_cur = -1;
        dqs_perbit_dly[i].min_best = -1;
        dqs_perbit_dly[i].max_best = -1;
        dqs_perbit_dly[i].center = 0;
        dqs_perbit_dly[i].dq_dly_last = dq_delay_per_bit[i];
    }

    /* find the minimum and maximum passing DQS input delay of every bit */
    for (i = 0; i < MAX_RX_DQSDLY_TAPS; i++)
    {
        /* same delay for all four DQS fields of R0DELDLY */
        dqs_input_delay = (i) + (i << 8) + (i << 16) + (i << 24);

        DRAMC_WRITE_REG(dqs_input_delay,DRAMC_R0DELDLY);

        /* set test pattern length
         * NOTE(review): mask is 0xFF000000 here but 0xAA000000 in the DQ loop above. */
        data = DRAMC_READ_REG(0x40);
        DRAMC_WRITE_REG((data & 0xFF000000) | test_len, 0x40);

        /* Test Agent 2 write enabling, Test Agent 2 read enabling */
        DRAMC_WRITE_SET((1U << 30) | (1U << 31),DRAMC_CONF2);

        /* wait for the completion flag */
        while (!(DRAMC_READ_REG(DRAMC_TESTRPT) & (1 << 10)))
        {
            ;
        }

        delay_a_while(400);

        cmp_err = DRAMC_READ_REG(DRAMC_CMP_ERR);
        DRAMC_WRITE_CLEAR(((1U << 30) | (1U << 31)),DRAMC_CONF2); /* disable test agent2 r/w */

        /*
         * - If bit x passes and has no open window, open one:
         *   dqs_perbit_dly[x].min_cur = delay value.
         * - If bit x has an open window and now fails, the window ended at
         *   delay value - 1.
         * - If bit x has an open window and still passes at the last tap,
         *   the window ends at the last tap.
         */
        for (j = 0; j < DQ_DATA_WIDTH; j++)
        {
            if ((dqs_perbit_dly[j].min_cur == -1) && ((cmp_err&((U32)1<<j)) == 0x0))
            {
                /* min pass delay */
                dqs_perbit_dly[j].min_cur = i;
            }
            if ((dqs_perbit_dly[j].min_cur != -1) && (dqs_perbit_dly[j].max_cur == -1) && (((cmp_err&((U32)1<<j)) != 0x0) || (i == (MAX_RX_DQSDLY_TAPS-1))) )
            {
                /* the current pass window is closed here */
                if ((i == (MAX_RX_DQSDLY_TAPS-1)) && ((cmp_err&((U32)1<<j)) == 0x0))
                {
                    dqs_perbit_dly[j].max_cur = MAX_RX_DQSDLY_TAPS-1;
                }
                else
                {
                    dqs_perbit_dly[j].max_cur = i - 1;
                }

                /* there may be more than 1 pass range, keep the widest one
                 * ex: x00xxxxxx00000000000000xx...(get the second one) */
                if ((dqs_perbit_dly[j].max_cur-dqs_perbit_dly[j].min_cur) > (dqs_perbit_dly[j].max_best-dqs_perbit_dly[j].min_best))
                {
                    dqs_perbit_dly[j].max_best = dqs_perbit_dly[j].max_cur;
                    dqs_perbit_dly[j].min_best = dqs_perbit_dly[j].min_cur;
                }
                /* clear to find the next pass range, if any */
                dqs_perbit_dly[j].max_cur = -1;
                dqs_perbit_dly[j].min_cur = -1;
            }
        }
    }

    /* 3. get dqs delay center per bit (stays 0 for bits without a recorded window) */
    for (j = 0; j < DQ_DATA_WIDTH; j++)
    {
        if ((dqs_perbit_dly[j].max_best != -1) && (dqs_perbit_dly[j].min_best != -1))
        {
            dqs_perbit_dly[j].center = (dqs_perbit_dly[j].max_best + dqs_perbit_dly[j].min_best) / 2;
            dbg_print("dqs_perbit_dly[%d].center=0x%x\n",j,dqs_perbit_dly[j].center);
        }
    }

    /* the delay of each DQS is the max center among the DQ bits of its byte */
    for (i = 0; i < DQS_NUMBER; i++)
    {
        max = 0;
        /* find the max of center */
        for (j = 0; j < DQS_BIT_NUMBER; j++)
        {
            if (dqs_perbit_dly[i*DQS_BIT_NUMBER+j].center > max)
            {
                max = dqs_perbit_dly[i*DQS_BIT_NUMBER+j].center;
            }
        }
        /* save dqs delay */
        dqs_delay[i] = max;
        dbg_print("dqs_delay[%d]=0x%x\n",i,max);
    }
    data = ((U32) dqs_delay[0]) + (((U32)dqs_delay[1])<<8) + (((U32)dqs_delay[2])<<16) + (((U32)dqs_delay[3])<<24);
    /* set dqs input delay */
    DRAMC_WRITE_REG(data,DRAMC_R0DELDLY);

    /* delay DQ so that each bit's window center lines up with its DQS delay */
    for (i = 0; i < DQ_DATA_WIDTH; i = i+4)
    {
        /* every 4 DQ bits share one delay register
         * dq_dly_max: taps of dq delay to be added */
        for (j = 0; j < 4; j++)
        {
            dq_dly_max =  dqs_delay[i/DQS_BIT_NUMBER] - dqs_perbit_dly[i+j].center;
            dbg_print("1.bit:%d)dq_per_bit_dly:0x%x,dq_dly:0x%x\n",i+j,dqs_perbit_dly[i+j].dq_dly_last,dq_dly_max);
            data = dqs_perbit_dly[i+j].dq_dly_last + dq_dly_max;
            /* saturate at the largest DQ delay tap */
            data = ((data > (MAX_RX_DQDLY_TAPS-1)) ? (MAX_RX_DQDLY_TAPS-1) : data);
            dqs_perbit_dly[i+j].dq_dly_last = data;

            dbg_print("2.bit:%d)dq_per_bit_dly:0x%x\n",i+j,dqs_perbit_dly[i+j].dq_dly_last);
        }

        data = ((U32) dqs_perbit_dly[i].dq_dly_last) + (((U32)dqs_perbit_dly[i+1].dq_dly_last)<<8) + (((U32)dqs_perbit_dly[i+2].dq_dly_last)<<16) + (((U32)dqs_perbit_dly[i+3].dq_dly_last)<<24);

        /* i advances by 4 bits per register, which is also the byte stride
         * between consecutive DQIDLY registers, so "+i" selects the register */
        DRAMC_WRITE_REG(data,DRAMC_DQIDLY1+i);
    }
    for (j = 0; j < DQ_DATA_WIDTH; j++)
    {
        dbg_print("%d)min:0x%x,max:0x%x\n",j, dqs_perbit_dly[j].min_best, dqs_perbit_dly[j].max_best);
    }
#if defined(DEBUG_DRAMC_CALIB)
    print("*DQIDLY1 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY1));
    print("*DQIDLY2 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY2));
    print("*DQIDLY3 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY3));
    print("*DQIDLY4 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY4));
    print("*DQIDLY5 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY5));
    print("*DQIDLY6 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY6));
    print("*DQIDLY7 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY7));
    print("*DQIDLY8 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY8));
    print("*DRAMC_R0DELDLY = 0x%x\n", DRAMC_READ_REG(DRAMC_R0DELDLY));
#endif
#if defined(DEBUG_DRAMC_CALIB)
    print("*DQIDLY1 = 0x%x\n", DRAMC_READ_REG(DRAMC_DQIDLY1));
    /* calibration finished, dump the result tables */
    print("==================================================================\n");
    print("		RX	DQS perbit delay software calibration \n");
    print("==================================================================\n");        
    print("1.0-31 bit dq delay value\n");
    print("==================================================================\n");        
    print("bit|     0  1  2  3  4  5  6  7  8  9\n");
    print("--------------------------------------");
    for (i = 0; i < DQ_DATA_WIDTH; i++)
    {
        /* start a new row every 10 bits */
        if ((i % 10) == 0)
        {
            print("\n");
            print("%d |    ", i);
        }
        print("%d ", dq_delay_per_bit[i]);
    }
    print("\n--------------------------------------\n\n");
    print("==================================================================\n");
    print("2.dqs window\nx=pass dqs delay value (min~max)center \ny=0-7bit DQ of every group\n");
    print("input delay:DQS0 =%d DQS1 = %d DQS2 =%d DQS3 = %d\n", dqs_delay[0], dqs_delay[1], dqs_delay[2], dqs_delay[3]);
    print("==================================================================\n");
    print("bit	DQS0	 bit      DQS1     bit     DQS2     bit     DQS3\n");
    for (i = 0; i < DQS_BIT_NUMBER; i++)
    {
        print("%d  (%d~%d)%d  %d  (%d~%d)%d  %d  (%d~%d)%d  %d  (%d~%d)%d\n", \
            i,    dqs_perbit_dly[i].min_best, dqs_perbit_dly[i].max_best, dqs_perbit_dly[i].center, \
            i+8,  dqs_perbit_dly[i+8].min_best, dqs_perbit_dly[i+8].max_best, dqs_perbit_dly[i+8].center, \
            i+16, dqs_perbit_dly[i+16].min_best, dqs_perbit_dly[i+16].max_best, dqs_perbit_dly[i+16].center, \
            i+24, dqs_perbit_dly[i+24].min_best, dqs_perbit_dly[i+24].max_best, dqs_perbit_dly[i+24].center);
    }
    print("==================================================================\n");
    print("3.dq delay value last\n");
    print("==================================================================\n");
    print("bit|    0  1  2  3  4  5  6  7  8   9\n");
    print("--------------------------------------");
    for (i = 0; i < DQ_DATA_WIDTH; i++)
    {
        /* start a new row every 10 bits */
        if ((i % 10) == 0)
        {
            print("\n");
            print("%d |    ", i);
        }
        print("%d ", dqs_perbit_dly[i].dq_dly_last);
    }
    print("\n==================================================================\n");

#endif

    return 0;
}