LIS_INT lis_precon_create_ilut_bsr(LIS_SOLVER solver, LIS_PRECON precon)
{
  LIS_INT        err;
  LIS_INT        i,j,k,kk,bnr,bs;
  LIS_INT        n,nr,annz,lfil,len;
  LIS_SCALAR    gamma,t,tol,m;
  LIS_MATRIX    A;
  LIS_MATRIX_ILU  L,U;
  LIS_MATRIX_DIAG  D;

  LIS_SCALAR    tnorm, tolnorm;
  LIS_SCALAR    buf_ns[16],buf_fact[16],*xnrm,*wn,*w;
  LIS_INT        lenu,lenl,col,jpos,jrow,upos,para;
  LIS_INT        *jbuf,*iw;

  LIS_DEBUG_FUNC_IN;


  A      = solver->A;
  n      = A->n;
  nr     = A->nr;
  bnr    = A->bnr;
  bs     = bnr*bnr;
  tol    = solver->params[LIS_PARAMS_DROP-LIS_OPTIONS_LEN];
  m      = solver->params[LIS_PARAMS_RATE-LIS_OPTIONS_LEN];
  gamma  = solver->params[LIS_PARAMS_GAMMA-LIS_OPTIONS_LEN];
  annz   = 10+A->bnnz / A->nr;
  lfil   = (LIS_INT)(((double)A->bnnz/(2.0*nr))*m);

  L      = NULL;
  U      = NULL;


  err = lis_matrix_ilu_create(nr,bnr,&L);
  if( err ) return err;
  err = lis_matrix_ilu_create(nr,bnr,&U);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(L);
  if( err ) return err;
  err = lis_matrix_ilu_setCR(U);
  if( err ) return err;
  err = lis_matrix_diag_duplicateM(A,&D);
  if( err )
  {
    return err;
  }

  w   = (LIS_SCALAR *)lis_malloc(bs*(nr+1)*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
  if( w==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
    return LIS_OUT_OF_MEMORY;
  }
  xnrm = (LIS_SCALAR *)lis_malloc(nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
  if( xnrm==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
    return LIS_OUT_OF_MEMORY;
  }
  wn = (LIS_SCALAR *)lis_malloc(nr*sizeof(LIS_SCALAR),"lis_precon_create_iluc_csr::w");
  if( wn==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_SCALAR));
    return LIS_OUT_OF_MEMORY;
  }

  jbuf   = (LIS_INT *)lis_malloc(n*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
  if( jbuf==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_INT));
    return LIS_OUT_OF_MEMORY;
  }
  iw   = (LIS_INT *)lis_malloc(nr*sizeof(LIS_INT),"lis_precon_create_iluc_csr::iw");
  if( iw==NULL )
  {
    LIS_SETERR_MEM(n*sizeof(LIS_INT));
    return LIS_OUT_OF_MEMORY;
  }


  for(i=0;i<nr;i++) iw[i] = -1;

  for(i=0;i<nr;i++)
  {
    tnorm = 0;
    for(j=A->bptr[i];j<A->bptr[i+1];j++)
    {
      lis_array_nrm2(bs,&A->value[bs*j],&t);
      tnorm = _max(t,tnorm);
    }
    tolnorm = tol * tnorm;

    lenu = 1;
    lenl = 0;
    jbuf[i] = i;
    memset(&w[bs*i],0,bs*sizeof(LIS_SCALAR));
    iw[i] = i;

    for(j=A->bptr[i];j<A->bptr[i+1];j++)
    {
      col = A->bindex[j];
      lis_array_nrm2(bs,&A->value[bs*j],&t);
      if( t<tolnorm && col!=i ) continue;
      if( col < i )
      {
        jbuf[lenl] = col;
        iw[col] = lenl;
        memcpy(&w[bs*lenl],&A->value[bs*j],bs*sizeof(LIS_SCALAR));
        lenl++;
      }
      else if( col == i )
      {
        memcpy(&w[bs*i],&A->value[bs*j],bs*sizeof(LIS_SCALAR));
      }
      else
      {
        jpos = i + lenu;
        jbuf[jpos] = col;
        iw[col] = jpos;
        memcpy(&w[bs*jpos],&A->value[bs*j],bs*sizeof(LIS_SCALAR));
        lenu++;
      }
    }

    j = -1;
    len = 0;

    while( ++j < lenl )
    {
      jrow = jbuf[j];
      jpos = j;
      for(k=j+1;k<lenl;k++)
      {
        if( jbuf[k]<jrow )
        {
          jrow = jbuf[k];
          jpos = k;
        }
      }
      if( jpos!=j )
      {
        col = jbuf[j];
        jbuf[j] = jbuf[jpos];
        jbuf[jpos] = col;
        iw[jrow] = j;
        iw[col] = jpos;
        memcpy(buf_ns,&w[bs*j],bs*sizeof(LIS_SCALAR));
        memcpy(&w[bs*j],&w[bs*jpos],bs*sizeof(LIS_SCALAR));
        memcpy(&w[bs*jpos],buf_ns,bs*sizeof(LIS_SCALAR));
      }
/*      lis_array_matmat(bnr,&D->value[bs*jrow],&w[bs*j],buf_fact,LIS_INS_VALUE);*/
      lis_array_matinv(bnr,&D->value[bs*jrow],&w[bs*j],buf_fact);
      iw[jrow] = -1;

      lis_array_nrm2(bs,buf_fact,&t);
      if( t * xnrm[jrow] <= tolnorm ) continue;

      for(k=0;k<U->nnz[jrow];k++)
      {
        col = U->index[jrow][k];
        lis_array_matmat(bnr,buf_fact,&U->value[jrow][bs*k],buf_ns,LIS_INS_VALUE);
        jpos = iw[col];

        lis_array_nrm2(bs,buf_ns,&t);
        if( t < tolnorm && jpos == -1 )
        {
          continue;
        }

        if( col >= i )
        {
          if( jpos == -1 )
          {
            upos = i + lenu;
            jbuf[upos] = col;
            iw[col] = upos;
            memcpy(&w[bs*upos],buf_ns,bs*sizeof(LIS_SCALAR));
            lenu++;
          }
          else
          {
            for(kk=0;kk<bs;kk++)
            {
              w[bs*jpos+kk] += buf_ns[kk];
            }
          }
        }
        else
        {
          if( jpos == -1 )
          {
            jbuf[lenl] = col;
            iw[col] = lenl;
            memcpy(&w[bs*lenl],buf_ns,bs*sizeof(LIS_SCALAR));
            lenl++;
          }
          else
          {
            for(kk=0;kk<bs;kk++)
            {
              w[bs*jpos+kk] += buf_ns[kk];
            }
          }
        }
      }
      for(kk=0;kk<bs;kk++)
      {
        w[bs*len+kk] = -buf_fact[kk];
      }
      jbuf[len] = jrow;
      len++;
    }
    lenl = len;
    len = _min(lfil,lenl);
    for(j=0;j<lenl;j++)
    {
      lis_array_nrm2(bs,&w[bs*j],&wn[j]);
      iw[j] = j;
    }
    lis_sort_di(0,lenl-1,wn,iw);
    lis_sort_i(0,len-1,iw);
    
    L->nnz[i] = len;
    if( len>0 )
    {
      L->index[i] = (LIS_INT *)malloc(len*sizeof(LIS_INT));
      L->value[i] = (LIS_SCALAR *)malloc(bs*len*sizeof(LIS_SCALAR));
    }
    for(j=0;j<len;j++)
    {
      jpos = iw[j];
      L->index[i][j] = jbuf[jpos];
      memcpy(&L->value[i][bs*j],&w[bs*jpos],bs*sizeof(LIS_SCALAR));
    }
    for(j=0;j<lenl;j++) iw[j] = -1;

    len = _min(lfil,lenu);
    for(j=1;j<lenu;j++)
    {
      jpos = i+j;
      lis_array_nrm2(bs,&w[bs*jpos],&wn[j-1]);
      iw[j-1] = jpos;
    }
    para = lenu - 1;
    lis_sort_di(0,para-1,wn,iw);
    lis_sort_i(0,len-2,iw);
    
    U->nnz[i] = len-1;
    if( len>1 )
    {
      U->index[i] = (LIS_INT *)malloc((len-1)*sizeof(LIS_INT));
      U->value[i] = (LIS_SCALAR *)malloc(bs*(len-1)*sizeof(LIS_SCALAR));
    }
    lis_array_nrm2(bs,&w[bs*i],&t);
    for(j=0;j<len-1;j++)
    {
      jpos = iw[j];
      U->index[i][j] = jbuf[jpos];
      memcpy(&U->value[i][bs*j],&w[bs*jpos],bs*sizeof(LIS_SCALAR));
      t = _max(t,wn[j]);
    }
    for(j=0;j<lenu-1;j++) iw[j] = -1;

    xnrm[i] = t;

    memcpy(&D->value[bs*i],&w[bs*i],bs*sizeof(LIS_SCALAR));

    if( i==nr-1 )
    {
      switch(bnr)
      {
      case 2:
        if( n%2!=0 )
        {
          D->value[4*(nr-1)+3] = 1.0;
        }
        break;
      case 3:
        if( n%3==1 )
        {
          D->value[9*(nr-1)+4] = 1.0;
          D->value[9*(nr-1)+8] = 1.0;
        }
        else if( n%3==2 )
        {
          D->value[9*(nr-1)+8] = 1.0;
        }
        break;
      }
    }
/*    lis_array_invGauss(bnr,&D->value[bs*i]);*/
    lis_array_LUdecomp(bnr,&D->value[bs*i]);

    for(j=0;j<lenu;j++)
    {
      iw[ jbuf[i+j] ] = -1;
    }
  }

  precon->L  = L;
  precon->U  = U;
  precon->WD  = D;

  lis_free2(5,w,iw,xnrm,wn,jbuf);

  LIS_DEBUG_FUNC_OUT;
  return LIS_SUCCESS;
}
LIS_INT lis_ecg(LIS_ESOLVER esolver)
{
  LIS_MATRIX        A;
  LIS_VECTOR        x;
  LIS_SCALAR        evalue;
  LIS_INT               emaxiter;
  LIS_REAL          tol;
  LIS_INT               iter,iter3,nsolver,i,j,output;
  LIS_INT               nprocs,my_rank;
  LIS_REAL          nrm2,resid,resid3;
  LIS_SCALAR        lshift;
  LIS_VECTOR        b,D,r,w,p,Aw,Ax,Ap,ones,Ds;
  LIS_SCALAR        *SA, *SB, *SW, *v3, *SAv3, *SBv3, *z3, *q3, *SBz3, evalue3, ievalue3;
  LIS_SOLVER        solver;
  LIS_PRECON        precon;
  LIS_MATRIX        A0;
  LIS_VECTOR        x0,z,q;
  double	    times,itimes,ptimes,p_c_times,p_i_times;
  LIS_INT           nsol, precon_type;
  char              solvername[128], preconname[128];

  A = esolver->A;
  x = esolver->x;
  
  emaxiter = esolver->options[LIS_EOPTIONS_MAXITER];
  tol = esolver->params[LIS_EPARAMS_RESID - LIS_EOPTIONS_LEN]; 
  output  = esolver->options[LIS_EOPTIONS_OUTPUT];
  lshift = esolver->lshift;

  if( A->my_rank==0 ) printf("local shift = %e\n", lshift);
  if (lshift != 0) lis_matrix_shift_diagonal(A, lshift);

  SA = (LIS_SCALAR *)lis_malloc(3*3*sizeof(LIS_SCALAR), "lis_ecg::SA");
  SB = (LIS_SCALAR *)lis_malloc(3*3*sizeof(LIS_SCALAR), "lis_ecg::SB");
  SW = (LIS_SCALAR *)lis_malloc(3*3*sizeof(LIS_SCALAR), "lis_ecg::SW");
  v3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::v3");
  SAv3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::SAv3");
  SBv3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::SBv3");
  SBz3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::SBz3");
  z3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::z3");
  q3 = (LIS_SCALAR *)lis_malloc(3*sizeof(LIS_SCALAR), "lis_ecg::q3");

  b = esolver->work[0];
  D = esolver->work[1];
  Ds = esolver->work[2];
  r = esolver->work[3];
  w = esolver->work[4];
  p = esolver->work[5];
  Aw = esolver->work[6];
  Ax = esolver->work[7];
  Ap = esolver->work[8];

  lis_vector_set_all(1.0,b);
  lis_vector_nrm2(b, &nrm2);
  lis_vector_scale(1/nrm2, b);
  lis_solver_create(&solver);
  lis_solver_set_option("-i bicg -p ilu",solver);
  lis_solver_set_optionC(solver);
  lis_solver_get_solver(solver, &nsol);
  lis_solver_get_precon(solver, &precon_type);
  lis_get_solvername(nsol, solvername);
  lis_get_preconname(precon_type, preconname);
  printf("solver     : %s %d\n", solvername, nsol);
  printf("precon     : %s %d\n", preconname, precon_type);
  lis_solve(A, b, x, solver);
  lis_vector_copy(b,Ax);

  lis_vector_nrm2(x, &nrm2);
  lis_vector_set_all(0.0,p);
  lis_vector_set_all(0.0,Ap);

  lis_precon_create(solver, &precon);
  solver->precon = precon;

  iter=0;

  while (iter<emaxiter)
    {
      iter = iter + 1;

      lis_vector_dot(x,Ax,&evalue);
      lis_vector_axpyz(-(evalue),x,Ax,r); 
      lis_vector_nrm2(r, &nrm2);
      resid = fabs(nrm2/(evalue));

      if( output )
	{
	  if( output & LIS_EPRINT_MEM ) esolver->residual[iter] = resid;
	  if( output & LIS_EPRINT_OUT && A->my_rank==0 ) printf("iter: %5d  residual = %e\n", iter, resid);
	}

      if (resid<tol) break;  

      lis_psolve(solver, x, w);
      lis_vector_copy(x,Aw);
      lis_vector_nrm2(w, &nrm2);

      lis_vector_dot(w,Aw,&SA[0]);
      lis_vector_dot(x,Aw,&SA[3]);
      lis_vector_dot(p,Aw,&SA[6]);
      SA[1] = SA[3];
      lis_vector_dot(x,Ax,&SA[4]);
      lis_vector_dot(p,Ax,&SA[7]);
      SA[2] = SA[6];
      SA[5] = SA[7];
      lis_vector_dot(p,Ap,&SA[8]);

      lis_vector_dot(w,w,&SB[0]);
      lis_vector_dot(x,w,&SB[3]);
      lis_vector_dot(p,w,&SB[6]);
      SB[1] = SB[3];
      lis_vector_dot(x,x,&SB[4]);
      lis_vector_dot(p,x,&SB[7]);
      SB[2] = SB[6];
      SB[5] = SB[7];
      lis_vector_dot(p,p,&SB[8]);
      
      lis_array_set_all(3, 1.0, v3);

      iter3=0;
      while (iter3<emaxiter)
	{
	  iter3 = iter3 + 1;
	  lis_array_nrm2(3, v3, &nrm2); 
	  lis_array_scale(3, 1/nrm2, v3);
	  lis_array_matvec(3, SB, v3, SBv3, LIS_INS_VALUE);
	  lis_array_invvec(3, SA, SBv3, z3);
	  lis_array_dot2(3, SBv3, z3, &ievalue3);
	  if (ievalue3==0) 
	    {
	      printf("ievalue3 is zero\n");
	      lis_precon_destroy(precon);
	      lis_solver_destroy(solver);
	      esolver->iter       = iter;
	      esolver->resid      = resid;
	      esolver->evalue[0] = evalue;

	      if (lshift != 0) lis_matrix_shift_diagonal(A, -lshift);
	      lis_free(SA);
	      lis_free(SB);
	      lis_free(SW);
	      lis_free(v3);
	      lis_free(SAv3);
	      lis_free(SBv3);
	      lis_free(SBz3);
	      lis_free(z3);
	      lis_free(q3);
	      return LIS_BREAKDOWN;
	    }
	  lis_array_axpyz(3, -ievalue3, SBv3, z3, q3);
	  lis_array_nrm2(3, q3, &resid3); 
	  resid3 = fabs(resid3 / ievalue3);
	  if (resid3<1e-12) break;   
	  lis_array_copy(3,z3,v3);
	}

      evalue3 = 1 / ievalue3;
      
      lis_vector_scale(v3[0],w);  
      lis_vector_axpy(v3[2],p,w);
      lis_vector_xpay(w,v3[1],x);
      lis_vector_copy(w,p);
      
      lis_vector_scale(v3[0],Aw);  
      lis_vector_axpy(v3[2],Ap,Aw);
      lis_vector_xpay(Aw,v3[1],Ax);
      lis_vector_copy(Aw,Ap);
      
      lis_vector_nrm2(x,&nrm2);
      lis_vector_scale(1/nrm2,x);
      lis_vector_scale(1/nrm2,Ax);
      
      lis_vector_nrm2(p,&nrm2);
      lis_vector_scale(1/nrm2,p);
      lis_vector_scale(1/nrm2,Ap);
      
      lis_solver_get_timeex(solver,&times,&itimes,&ptimes,&p_c_times,&p_i_times);
      esolver->ptimes += solver->ptimes;
      esolver->itimes += solver->itimes;
      esolver->p_c_times += solver->p_c_times;
      esolver->p_i_times += solver->p_i_times;

    }

  lis_precon_destroy(precon);
  lis_solver_destroy(solver);

  esolver->iter       = iter;
  esolver->resid      = resid;
  esolver->evalue[0] = evalue;

  if (lshift != 0) lis_matrix_shift_diagonal(A, -lshift);

  lis_free(SA);
  lis_free(SB);
  lis_free(SW);
  lis_free(v3);
  lis_free(SAv3);
  lis_free(SBv3);
  lis_free(SBz3);
  lis_free(z3);
  lis_free(q3);

  if (resid<tol) 
    {
      esolver->retcode = LIS_SUCCESS;
      return LIS_SUCCESS;
    }
  else
    {
      esolver->retcode = LIS_MAXITER;
      return LIS_MAXITER;
    }
}