int lis_precon_create_bjacobi(LIS_SOLVER solver, LIS_PRECON precon)
{
	int			err;
	LIS_MATRIX	A;

	LIS_DEBUG_FUNC_IN;

	A = solver->A;

	err = lis_matrix_convert_self(solver);
	if( err ) return err;

	if( !A->is_block )
	{
		solver->options[LIS_OPTIONS_PRECON] = LIS_PRECON_TYPE_JACOBI;
		precon->precon_type = LIS_PRECON_TYPE_JACOBI;
		err = lis_precon_create_jacobi(solver,precon);
		return err;
	}

	err = lis_matrix_split(A);
	if( err ) return err;
	err = lis_matrix_diag_duplicate(A->D,&precon->WD);
	if( err ) return err;
	lis_matrix_diag_copy(A->D,precon->WD);
	lis_matrix_diag_inverse(precon->WD);


	LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
LIS_INT lis_precon_create_ilut(LIS_SOLVER solver, LIS_PRECON precon)
{
  #ifdef ENABLE_BSR
    LIS_INT        storage,block;
  #endif
  LIS_INT        err;
  LIS_MATRIX    A,B;

  LIS_DEBUG_FUNC_IN;

#ifdef ENABLE_BSR
  storage     = solver->options[LIS_OPTIONS_STORAGE];
  block       = solver->options[LIS_OPTIONS_STORAGE_BLOCK];

  if( solver->A->matrix_type!=LIS_MATRIX_BSR && storage==LIS_MATRIX_BSR )
  {
    err = lis_matrix_convert_self(solver);
    if( err ) return err;
  }
#endif

  switch( solver->A->matrix_type )
  {
  case LIS_MATRIX_CSR:
    err = lis_precon_create_ilut_csr(solver,precon);
    lis_psolve_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolve_ilut_csr;
    lis_psolvet_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolvet_ilut_csr;
    break;
#ifdef ENABLE_BSR
  case LIS_MATRIX_BSR:
    err = lis_precon_create_ilut_bsr(solver,precon);
    lis_psolve_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolve_ilut_bsr;
    lis_psolvet_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolvet_ilut_bsr;
    break;
#endif
  default:
    A = solver->A;
    err = lis_matrix_duplicate(A,&B);
    if( err ) return err;
    lis_matrix_set_type(B,LIS_MATRIX_CSR);
    err = lis_matrix_convert(A,B);
    if( err ) return err;
    solver->A = B;
    err = lis_precon_create_ilut_csr(solver,precon);
    lis_psolve_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolve_ilut_csr;
    lis_psolvet_xxx[LIS_PRECON_TYPE_ILUT]  = lis_psolvet_ilut_csr;
    lis_matrix_destroy(B);
    solver->A = A;
    break;
  }

  LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}
LIS_INT lis_solve_kernel(LIS_MATRIX A, LIS_VECTOR b, LIS_VECTOR x, LIS_SOLVER solver, LIS_PRECON precon)
{
	LIS_INT			nsolver, precon_type, maxiter;
	LIS_INT			err;
	LIS_SCALAR	*residual;
	LIS_VECTOR	xx;

	LIS_INT output;
	LIS_INT scale;
	LIS_INT conv_cond;
	LIS_INT precision,is_use_at,storage,block;
	LIS_INT i,n,np;
	double p_c_times, p_i_times,itimes;
	LIS_SCALAR nrm2,tol,tol_w;
	LIS_VECTOR t;
	LIS_VECTOR bb;
	LIS_MATRIX AA,B;
	LIS_MATRIX At;
	char buf[64];

	LIS_DEBUG_FUNC_IN;

	nsolver     = solver->options[LIS_OPTIONS_SOLVER];
	precon_type = solver->options[LIS_OPTIONS_PRECON];
	maxiter     = solver->options[LIS_OPTIONS_MAXITER];
	output      = solver->options[LIS_OPTIONS_OUTPUT];
	scale       = solver->options[LIS_OPTIONS_SCALE];
	precision   = solver->options[LIS_OPTIONS_PRECISION];
	is_use_at   = solver->options[LIS_OPTIONS_USE_AT];
	storage     = solver->options[LIS_OPTIONS_STORAGE];
	block       = solver->options[LIS_OPTIONS_STORAGE_BLOCK];
	conv_cond   = solver->options[LIS_OPTIONS_CONV_COND];
	tol         = solver->params[LIS_PARAMS_RESID-LIS_OPTIONS_LEN];
	tol_w       = solver->params[LIS_PARAMS_RESID_WEIGHT-LIS_OPTIONS_LEN];
	solver->precision = precision;

	if( nsolver < 1 || nsolver > LIS_SOLVERS_LEN )
	{
		LIS_SETERR2(LIS_ERR_ILL_ARG,"Parameter LIS_OPTIONS_SOLVER is %d (Set between 1 to %d)\n",nsolver, LIS_SOLVERS_LEN);
		return LIS_ERR_ILL_ARG;
	}
	if( precon_type < 0 || precon_type > precon_register_type )
	{
		LIS_SETERR2(LIS_ERR_ILL_ARG,"Parameter LIS_OPTIONS_PRECON is %d (Set between 0 to %d)\n",precon_type, precon_register_type-1);
		return LIS_ERR_ILL_ARG;
	}
	if( maxiter<0 )
	{
		LIS_SETERR1(LIS_ERR_ILL_ARG,"Parameter LIS_OPTIONS_MAXITER(=%d) is less than 0\n",maxiter);
		return LIS_ERR_ILL_ARG;
	}
	#ifdef USE_MPI
	if( precon_type == LIS_PRECON_TYPE_SAAMG  && solver->A->nprocs < 2)
	{
		LIS_SETERR1(LIS_ERR_ILL_ARG,"Parameter A->nprocs (=%d) is less than 2 (Set more than 1 when using parallel version of SAAMG)\n",solver->A->nprocs);
		return LIS_ERR_ILL_ARG;
	}
	#endif
	#ifdef USE_QUAD_PRECISION
		if( precision==LIS_PRECISION_QUAD && lis_solver_execute_quad[nsolver]==NULL )
		{
			LIS_SETERR1(LIS_ERR_NOT_IMPLEMENTED,"Quad precision solver %s is not implemented\n",lis_solvername[nsolver]);
			return LIS_ERR_NOT_IMPLEMENTED;
		}
		else if( precision==LIS_PRECISION_SWITCH && lis_solver_execute_switch[nsolver]==NULL )
		{
			LIS_SETERR1(LIS_ERR_NOT_IMPLEMENTED,"Switch solver %s is not implemented\n",lis_solvername[nsolver]);
			return LIS_ERR_NOT_IMPLEMENTED;
		}
		if( solver->options[LIS_OPTIONS_SWITCH_MAXITER]==-1 )
		{
			solver->options[LIS_OPTIONS_SWITCH_MAXITER] = maxiter;
		}
	#endif

	err = lis_solver_check_params[nsolver](solver);
	if( err )
	{
		solver->retcode = err;
		return err;
	}
	/* end parameter check */

	solver->A        = A;
	solver->b        = b;

	/* create initial vector */
	#ifndef USE_QUAD_PRECISION
		err = lis_vector_duplicate(A,&xx);
	#else
		if( precision==LIS_PRECISION_DOUBLE )
		{
			err = lis_vector_duplicate(A,&xx);
		}
		else
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,A,&xx);
		}
	#endif
	if( err )
	{
		solver->retcode = err;
		return err;
	}
	if( solver->options[LIS_OPTIONS_INITGUESS_ZEROS] )
	{
	  if( output ) lis_printf(A->comm,"initial vector x = 0\n");
		#ifndef USE_QUAD_PRECISION
			lis_vector_set_all(0.0,xx);
		#else
			if( precision==LIS_PRECISION_DOUBLE )
			{
				lis_vector_set_all(0.0,xx);
			}
			else
			{
				lis_vector_set_allex_nm(0.0,xx);
			}
		#endif
	}
	else
	{
	  if( output ) lis_printf(A->comm,"initial vector x = user defined\n"); 
		#ifndef USE_QUAD_PRECISION
			lis_vector_copy(x,xx);
		#else
			if( precision==LIS_PRECISION_DOUBLE )
			{
				lis_vector_copy(x,xx);
			}
			else
			{
				lis_vector_copyex_nm(x,xx);
			}
		#endif
	}

	/* create residual history vector */
	if( solver->residual ) lis_free(solver->residual);
	residual = (LIS_SCALAR *)lis_malloc((maxiter+2)*sizeof(LIS_SCALAR),"lis_solve::residual");
	if( residual==NULL )
	{
		LIS_SETERR_MEM((maxiter+2)*sizeof(LIS_SCALAR));
		lis_vector_destroy(xx);
		solver->retcode = err;
		return err;
	}
	residual[0] = 1.0;


	n       = A->n;
	np      = A->np;
	t       = NULL;
	At      = NULL;


	p_c_times = lis_wtime();
	if( precon_type==LIS_PRECON_TYPE_IS )
	{
		if( solver->d==NULL )
		{
			err = lis_vector_duplicate(A,&solver->d);
			if( err )
			{
				return err;
			}
		}
		if( !A->is_scaled )
		{
			lis_matrix_scaling(A,b,solver->d,LIS_SCALE_JACOBI);
		}
		else if( !b->is_scaled )
		{
			#ifdef _OPENMP
			#pragma omp parallel for
			#endif
			for(i=0;i<n;i++)
			{
				b->value[i] = b->value[i]*solver->d->value[i];
			}
		}
		if( nsolver >= LIS_SOLVER_JACOBI && nsolver <= LIS_SOLVER_SOR )
		{
			solver->options[LIS_OPTIONS_ISLEVEL] = 0;
		}
	}
	else if( nsolver >= LIS_SOLVER_JACOBI && nsolver <= LIS_SOLVER_SOR && precon_type!=LIS_PRECON_TYPE_NONE )
	{
		if( solver->d==NULL )
		{
			err = lis_vector_duplicate(A,&solver->d);
			if( err )
			{
				return err;
			}
		}
		if( !A->is_scaled )
		{
			lis_matrix_scaling(A,b,solver->d,LIS_SCALE_JACOBI);
		}
	}
	else if( scale )
	{
		if( storage==LIS_MATRIX_BSR && scale==LIS_SCALE_JACOBI )
		{
			if( A->matrix_type!=LIS_MATRIX_BSR )
			{
				err = lis_matrix_duplicate(A,&B);
				if( err ) return err;
				lis_matrix_set_blocksize(B,block,block,NULL,NULL);
				lis_matrix_set_type(B,storage);
				err = lis_matrix_convert(A,B);
				if( err ) return err;
				lis_matrix_storage_destroy(A);
				lis_matrix_DLU_destroy(A);
				lis_matrix_diag_destroy(A->WD);
				if( A->l2g_map ) lis_free( A->l2g_map );
				if( A->commtable ) lis_commtable_destroy( A->commtable );
				if( A->ranges ) lis_free( A->ranges );
				err = lis_matrix_copy_struct(B,A);
				if( err ) return err;
				lis_free(B);
			}
			err = lis_matrix_split(A);
			if( err ) return err;
			err = lis_matrix_diag_duplicate(A->D,&solver->WD);
			if( err ) return err;
			lis_matrix_diag_copy(A->D,solver->WD);
			lis_matrix_diag_inverse(solver->WD);
			lis_matrix_bscaling_bsr(A,solver->WD);
			lis_vector_duplicate(A,&t);
			lis_matrix_diag_matvec(solver->WD,b,t);
			lis_vector_copy(t,b);
			lis_vector_destroy(t);
			t = NULL;
		}
		else
		{
			if( solver->d==NULL )
			{
				err = lis_vector_duplicate(A,&solver->d);
				if( err )
				{
					return err;
				}
			}
			if( scale==LIS_SCALE_JACOBI && nsolver==LIS_SOLVER_CG )
			{
				scale = LIS_SCALE_SYMM_DIAG;
			}
			if( !A->is_scaled )
			{
				lis_matrix_scaling(A,b,solver->d,scale);
			}
			else if( !b->is_scaled )
			{
				#ifdef _OPENMP
				#pragma omp parallel for
				#endif
				for(i=0;i<n;i++)
				{
					b->value[i] = b->value[i]*solver->d->value[i];
				}
			}
		}
	}

/*	precon_type = precon->precon_type;*/
	if( precon_type==LIS_PRECON_TYPE_IS )
	{
		if( nsolver < LIS_SOLVER_JACOBI || nsolver > LIS_SOLVER_SOR )
		{
			AA = solver->A;
			bb = solver->b;
		}
		else
		{
			AA = precon->A;
			bb = precon->Pb;
		}
	}
	else
	{
		AA = A;
		bb = b;
	}

	p_c_times = lis_wtime() - p_c_times;
	itimes = lis_wtime();

	/* Matrix Convert */
	solver->A  = AA;
	solver->b  = bb;
	err = lis_matrix_convert_self(solver);
	if( err )
	{
		lis_vector_destroy(xx);
		lis_solver_work_destroy(solver);
		lis_free(residual);
		solver->retcode = err;
		return err;
	}
	block = solver->A->bnr;

	if( A->my_rank==0 )
	{
	  if( output ) printf("precision : %s\n", lis_precisionname[precision]); 
	  if( output ) printf("solver    : %s %d\n", lis_solvername[nsolver],nsolver); 
		switch( precon_type )
		{
		case LIS_PRECON_TYPE_ILU:
			i = solver->options[LIS_OPTIONS_FILL];
			if( A->matrix_type==LIS_MATRIX_BSR || A->matrix_type==LIS_MATRIX_VBR )
			{
			  if( output ) sprintf(buf,"Block %s(%d)",lis_preconname[precon_type],i); 
			}
			else
			{
			  if( output ) sprintf(buf,"%s(%d)",lis_preconname[precon_type],i); 
			}
			break;
		default:
		  if( output ) sprintf(buf,"%s",lis_preconname[precon_type]); 
			break;
		}
		if( solver->options[LIS_OPTIONS_ADDS] && precon_type )
		{
		  if( output ) printf("precon    : %s + additive schwarz\n", buf); 
		}
		else
		{
		  if( output ) printf("precon    : %s\n", buf); 
		}
	}
	switch(conv_cond)
	{
	case LIS_CONV_COND_NRM2_R:
	case LIS_CONV_COND_NRM2_B:
		if( A->my_rank==0 )
		{
		  if( output ) ("CONV_COND : ||r||_2 <= %6.1e*||r_0||_2\n", tol); 
		}
		break;
	case LIS_CONV_COND_NRM1_B:
		lis_vector_nrm1(b,&nrm2);
		nrm2 = nrm2*tol_w + tol;
		if( A->my_rank==0 )
		{
		  if( output ) printf("conv_cond : ||r||_1 <= %6.1e*||b||_1 + %6.1e = %6.1e\n", tol_w,tol,nrm2);
		}
		break;
	}
	if( A->my_rank==0 )
	{
		if( AA->matrix_type==LIS_MATRIX_BSR || AA->matrix_type==LIS_MATRIX_BSC )
		{
		  if( output ) printf("storage   : %s(%d x %d)\n", lis_storagename[AA->matrix_type-1],block,block); 
		}
		else
		{
		  if( output ) printf("storage   : %s\n", lis_storagename[AA->matrix_type-1]); 
		}
	}


	/* create work vector */
	err = lis_solver_malloc_work[nsolver](solver); 
	if( err )
	{
		lis_vector_destroy(xx);
		lis_precon_destroy(precon);
		solver->retcode = err;
		return err;
	}
	if( nsolver==LIS_SOLVER_BICG && is_use_at )
	{
	  if( output ) lis_printf(A->comm,"Use At\n"); 
		lis_matrix_duplicate(AA,&At);
		lis_matrix_set_type(At,LIS_USE_AT_TYPE[AA->matrix_type]);
		lis_matrix_convert(AA,At);
		solver->At = At;
	}

	solver->x        = xx;
	solver->xx       = x;
	solver->precon   = precon;
	solver->residual = residual;

	/* execute solver */
	#ifndef USE_QUAD_PRECISION
		err = lis_solver_execute[nsolver](solver);
	#else
		if( precision==LIS_PRECISION_DOUBLE )
		{
			err = lis_solver_execute[nsolver](solver);
		}
		else if( precision==LIS_PRECISION_QUAD )
		{
			err = lis_solver_execute_quad[nsolver](solver);
		}
		else if( precision==LIS_PRECISION_SWITCH )
		{
			err = lis_solver_execute_switch[nsolver](solver);
		}
	#endif
	solver->retcode = err;

	if( scale==LIS_SCALE_SYMM_DIAG && precon_type!=LIS_PRECON_TYPE_IS)
	{
		#ifdef _OPENMP
		#pragma omp parallel for
		#endif
		for(i=0;i<n;i++)
		{
			x->value[i] = xx->value[i]*solver->d->value[i];
		}
	}
	else
	{
		#ifndef USE_QUAD_PRECISION
			lis_vector_copy(xx,x);
		#else
			if( precision==LIS_PRECISION_DOUBLE )
			{
				lis_vector_copy(xx,x);
			}
			else
			{
				lis_vector_copyex_mn(xx,x);
			}
		#endif
	}
	itimes = lis_wtime() - itimes - solver->ptimes;
	p_i_times = solver->ptimes;
	solver->ptimes = p_c_times + p_i_times;
	solver->p_c_times = p_c_times;
	solver->p_i_times = p_i_times;
	solver->times  = solver->ptimes + itimes;
	solver->itimes = itimes;
	lis_solver_work_destroy(solver);
	lis_vector_duplicate(A,&t);
	xx->precision = LIS_PRECISION_DEFAULT;
	lis_matvec(A,xx,t);
	lis_vector_xpay(b,-1.0,t);
	if( scale==LIS_SCALE_SYMM_DIAG && precon_type!=LIS_PRECON_TYPE_IS)
	{
		#ifdef _OPENMP
		#pragma omp parallel for
		#endif
		for(i=0;i<n;i++)
		{
			t->value[i] = t->value[i]/solver->d->value[i];
		}
	}
	lis_vector_nrm2(t,&nrm2);
	/*
	solver->resid = nrm2;
	*/
	if( A->my_rank==0 )
	{
		if( err )
		{
		  if( output ) printf("lis_solve : %s(code=%d)\n\n",lis_returncode[err],err); 

		}
		else
		{
		  if( output ) printf("lis_solve : normal end\n\n"); 
		}
	}
	if( precision==LIS_PRECISION_DOUBLE )
	{
		solver->iter2 = solver->iter;
	}
	else if( precision==LIS_PRECISION_QUAD )
	{
		solver->iter2 = 0;
	}


	lis_vector_destroy(t);
/*	lis_vector_destroy(d);*/
	lis_vector_destroy(xx);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}