// Back-eliminates A in place, processing rows from the last to the first.
// For each row the leading (first exactly-nonzero) entry is scaled to 1 and
// the entries above it in the same column are cleared by subtracting
// multiples of that row.  Every row operation is mirrored on B.
//
// A - matrix to reduce; modified in place.
// B - companion matrix; if non-empty it must have as many rows as A.
//
// NOTE(review): presumably A is expected to already be in row-echelon form
// (e.g. as left by RowEchelonDecompose) -- confirm against callers.
void ReduceRowEchelon(MatrixTemplate<T>& A,MatrixTemplate<T>& B)
{
  if(!B.isEmpty()) {
    Assert(B.m == A.m);
  }

  for(int row=A.m-1; row>=0; --row) {
    //locate the leading nonzero entry of this row (exact comparison)
    int pivotCol = -1;
    for(int c=0; c<A.n; ++c) {
      if(A(row,c) != 0) {
        pivotCol = c;
        break;
      }
    }
    //an all-zero row has no pivot and is left untouched
    if(pivotCol < 0) continue;

    //scale the row so the pivot becomes one; everything left of the pivot
    //is zero by construction, so only the columns to its right need work
    T invPivot = One/A(row,pivotCol);
    A(row,pivotCol) = 1;
    for(int c=pivotCol+1; c<A.n; ++c) A(row,c) *= invPivot;
    for(int c=0; c<B.n; ++c) B(row,c) *= invPivot;

    //clear the pivot column in every row above this one
    for(int above=0; above<row; ++above) {
      if(A(above,pivotCol) == 0) continue;
      T factor = A(above,pivotCol);
      A(above,pivotCol) = 0;
      for(int c=pivotCol+1; c<A.n; ++c) A(above,c) -= A(row,c)*factor;
      for(int c=0; c<B.n; ++c) B(above,c) -= B(row,c)*factor;
    }
  }
}
inline void Lt1BackSubstitute(const MatrixTemplate<T>& a, const MatrixTemplate<T>& b, MatrixTemplate<T>& x)
{
  if(x.isEmpty()) 
    x.resize(a.n,b.n);
  else Assert(x.m == a.n && x.n == b.n);
  for(int i=0;i<x.n;i++) {
    VectorTemplate<T> xi,bi;
    x.getColRef(i,xi);
    b.getColRef(i,bi);
    Lt1BackSubstitute(a,bi,xi);
  }
}
inline bool LBackSubstitute(const MatrixTemplate<T>& a, const MatrixTemplate<T>& b, MatrixTemplate<T>& x)
{
  if(x.isEmpty()) x.resize(a.n,b.n);
  else Assert(x.m == a.n && x.n == b.n);
  for(int i=0;i<x.n;i++) {
    VectorTemplate<T> xi,bi;
    x.getColRef(i,xi);
    b.getColRef(i,bi);
    if(!LBackSubstitute(a,bi,xi)) return false;
  }
  return true;
}
// Reduces A to row-echelon form in place by elimination with row pivoting
// (the largest-magnitude entry of the current column is chosen as pivot).
// Every row swap and row subtraction is mirrored on B.
//
// A       - matrix to reduce; modified in place.
// B       - companion matrix; if non-empty it must have as many rows as A.
// zeroTol - a column whose largest candidate pivot magnitude passes
//           FuzzyZero(.,zeroTol) is treated as having no pivot and its
//           remaining entries are set to zero.
//
// Returns Max(m-icur,n-jcur).  The column loop has no early exit, so jcur
// has reached n by then and the value is effectively the number of rows of
// A that did not receive a pivot.
int RowEchelonDecompose(MatrixTemplate<T>& A,MatrixTemplate<T>& B,Real zeroTol)
{
  if(!B.isEmpty()) {
    Assert(B.m == A.m);
  }
  int m=A.m;
  int n=A.n;
  int p=B.n;

  //NOTE(review): temp is not referenced directly below; presumably it is
  //scratch storage for SWAP -- keep the name and declaration as they are
  T temp;
  int icur=0;   //next row that is still waiting for a pivot
  int jcur;     //column currently being processed
  for(jcur=0;jcur<n;jcur++) {
    //scan rows icur..m-1 of this column for the largest magnitude
    Real big=Zero;
    int ipivot=-1;
    for(int r=icur;r<m;r++) {
      if(Abs(A(r,jcur)) > big) {
        ipivot = r;
        big = Abs(A(r,jcur));
      }
    }

    if(FuzzyZero(big,zeroTol)) {
      //no usable pivot: flush the (near-)zero entries to exact zero so
      //they cannot cause numerical trouble later, then try the next column
      for(int r=icur;r<m;r++) A(r,jcur)=Zero;
      continue;
    }

    //bring the pivot row up into position icur
    if(ipivot != icur) {
      for(int c=jcur;c<n;c++) SWAP(A(ipivot,c),A(icur,c));
      for(int c=0;c<p;c++) SWAP(B(ipivot,c),B(icur,c));
    }

    //row(r) -= A(r,jcur)/A(icur,jcur) * row(icur) for every row below
    for(int r=icur+1;r<m;r++) {
      T scale = A(r,jcur)/A(icur,jcur);
      for(int c=jcur;c<n;c++) A(r,c) -= A(icur,c)*scale;
      for(int c=0;c<p;c++) B(r,c) -= B(icur,c)*scale;
      //overwrite any rounding residue with an exact zero
      A(r,jcur)=Zero;
    }
    icur++;
  }
  return Max(m-icur,n-jcur);
}