/*************************************************************************
Principal components analysis

Subroutine builds orthogonal basis where first axis corresponds to
direction with maximum variance, second axis maximizes variance in
subspace orthogonal to first axis and so on.

It should be noted that, unlike LDA, PCA does not use class labels.

INPUT PARAMETERS:
    X           -   dataset, array[0..NPoints-1,0..NVars-1].
                    matrix contains ONLY INDEPENDENT VARIABLES.
    NPoints     -   dataset size, NPoints>=0
    NVars       -   number of independent variables, NVars>=1

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -4, if SVD subroutine haven't converged
                    * -1, if wrong parameters has been passed (NPoints<0, NVars<1)
                    *  1, if task is solved
    S2          -   array[0..NVars-1]. variance values corresponding
                    to basis vectors.
    V           -   array[0..NVars-1,0..NVars-1]
                    matrix, whose columns store basis vectors.

  -- ALGLIB --
     Copyright 25.08.2008 by Bochkanov Sergey
*************************************************************************/
void pcabuildbasis(const ap::real_2d_array& x,
     int npoints,
     int nvars,
     int& info,
     ap::real_1d_array& s2,
     ap::real_2d_array& v)
{
    ap::real_2d_array a;
    ap::real_2d_array u;
    ap::real_2d_array vt;
    ap::real_1d_array m;
    ap::real_1d_array t;
    int i;
    int j;
    double mean;
    double variance;
    double skewness;
    double kurtosis;

    //
    // Check input data
    //
    if( npoints<0||nvars<1 )
    {
        info = -1;
        return;
    }
    info = 1;

    //
    // Special case: NPoints=0.
    // Return zero variances and the identity basis.
    //
    if( npoints==0 )
    {
        s2.setbounds(0, nvars-1);
        v.setbounds(0, nvars-1, 0, nvars-1);
        for(i = 0; i <= nvars-1; i++)
        {
            s2(i) = 0;
        }
        for(i = 0; i <= nvars-1; i++)
        {
            for(j = 0; j <= nvars-1; j++)
            {
                if( i==j )
                {
                    v(i,j) = 1;
                }
                else
                {
                    v(i,j) = 0;
                }
            }
        }
        return;
    }

    //
    // Calculate per-column means (only Mean output of CalculateMoments
    // is used; Variance/Skewness/Kurtosis are discarded)
    //
    m.setbounds(0, nvars-1);
    t.setbounds(0, npoints-1);
    for(j = 0; j <= nvars-1; j++)
    {
        ap::vmove(t.getvector(0, npoints-1), x.getcolumn(j, 0, npoints-1));
        calculatemoments(t, npoints, mean, variance, skewness, kurtosis);
        m(j) = mean;
    }

    //
    // Center, apply SVD, prepare output.
    //
    // A is allocated with max(NPoints,NVars) rows; when NPoints<NVars the
    // extra rows are zero-padded so the SVD input has full height.
    //
    a.setbounds(0, ap::maxint(npoints, nvars)-1, 0, nvars-1);
    for(i = 0; i <= npoints-1; i++)
    {
        ap::vmove(&a(i, 0), &x(i, 0), ap::vlen(0,nvars-1));
        ap::vsub(&a(i, 0), &m(0), ap::vlen(0,nvars-1));
    }
    for(i = npoints; i <= nvars-1; i++)
    {
        for(j = 0; j <= nvars-1; j++)
        {
            a(i,j) = 0;
        }
    }
    if( !rmatrixsvd(a, ap::maxint(npoints, nvars), nvars, 0, 1, 2, s2, u, vt) )
    {
        info = -4;
        return;
    }
    //
    // Convert singular values to variances: s2 = sv^2/(NPoints-1).
    // Skipped for NPoints==1 (division by zero; variance undefined).
    //
    if( npoints!=1 )
    {
        for(i = 0; i <= nvars-1; i++)
        {
            s2(i) = ap::sqr(s2(i))/(npoints-1);
        }
    }
    v.setbounds(0, nvars-1, 0, nvars-1);
    copyandtranspose(vt, 0, nvars-1, 0, nvars-1, v, 0, nvars-1, 0, nvars-1);
}
/*************************************************************************
Internal linear regression subroutine.

Solves a weighted linear least squares problem using SVD and fills the
report with training-set errors, covariance of coefficients, and fast
leave-one-out cross-validation estimates.

INPUT PARAMETERS:
    XY      -   array[0..NPoints-1,0..NVars]; first NVars columns are
                independent variables, last column is the dependent one
    S       -   array[0..NPoints-1], per-point standard deviations
                (must be positive)
    NPoints -   number of points, NPoints>=NVars
    NVars   -   number of independent variables, NVars>=1

OUTPUT PARAMETERS:
    Info    -   return code:
                * -255 internal error (unexpected degeneracy)
                *  -4  SVD failed to converge
                *  -2  some S[i]<=0
                *  -1  wrong NPoints/NVars
                *   1  task solved
    LM      -   linear model (coefficients stored in LM.W at offset 4)
    AR      -   regression report (errors, CV errors, covariance)
*************************************************************************/
static void lrinternal(const ap::real_2d_array& xy,
     const ap::real_1d_array& s,
     int npoints,
     int nvars,
     int& info,
     linearmodel& lm,
     lrreport& ar)
{
    ap::real_2d_array a;
    ap::real_2d_array u;
    ap::real_2d_array vt;
    ap::real_2d_array vm;
    ap::real_2d_array xym;
    ap::real_1d_array b;
    ap::real_1d_array sv;
    ap::real_1d_array t;
    ap::real_1d_array svi;
    ap::real_1d_array work;
    int i;
    int j;
    int k;
    int ncv;
    int na;
    int nacv;
    double r;
    double p;
    double epstol;
    lrreport ar2;
    int offs;
    linearmodel tlm;

    // EpsTol scales MachineEpsilon when testing singular values for
    // (near-)degeneracy below.
    epstol = 1000;

    //
    // Check for errors in data
    //
    if( npoints<nvars||nvars<1 )
    {
        info = -1;
        return;
    }
    for(i = 0; i <= npoints-1; i++)
    {
        if( ap::fp_less_eq(s(i),0) )
        {
            info = -2;
            return;
        }
    }
    info = 1;

    //
    // Create design matrix: rows scaled by 1/S[i] (weighted LLS)
    //
    a.setbounds(0, npoints-1, 0, nvars-1);
    b.setbounds(0, npoints-1);
    for(i = 0; i <= npoints-1; i++)
    {
        r = 1/s(i);
        ap::vmove(&a(i, 0), &xy(i, 0), ap::vlen(0,nvars-1), r);
        b(i) = xy(i,nvars)/s(i);
    }

    //
    // Allocate W:
    // W[0]     array size
    // W[1]     version number, 0
    // W[2]     NVars (minus 1, to be compatible with external representation)
    // W[3]     coefficients offset
    //
    lm.w.setbounds(0, 4+nvars-1);
    offs = 4;
    lm.w(0) = 4+nvars;
    lm.w(1) = lrvnum;
    lm.w(2) = nvars-1;
    lm.w(3) = offs;

    //
    // Solve problem using SVD:
    //
    // 0. check for degeneracy (different types)
    // 1. A = U*diag(sv)*V'
    // 2. T = b'*U
    // 3. w = SUM((T[i]/sv[i])*V[..,i])
    // 4. cov(wi,wj) = SUM(Vji*Vjk/sv[i]^2,K=1..M)
    //
    // see $15.4 of "Numerical Recipes in C" for more information
    //
    t.setbounds(0, nvars-1);
    svi.setbounds(0, nvars-1);
    ar.c.setbounds(0, nvars-1, 0, nvars-1);
    vm.setbounds(0, nvars-1, 0, nvars-1);
    if( !rmatrixsvd(a, npoints, nvars, 1, 1, 2, sv, u, vt) )
    {
        info = -4;
        return;
    }
    if( ap::fp_less_eq(sv(0),0) )
    {
        //
        // Degenerate case: zero design matrix.
        // Coefficients and covariance are zero; all errors are computed
        // directly on the zero model.
        //
        for(i = offs; i <= offs+nvars-1; i++)
        {
            lm.w(i) = 0;
        }
        ar.rmserror = lrrmserror(lm, xy, npoints);
        ar.avgerror = lravgerror(lm, xy, npoints);
        ar.avgrelerror = lravgrelerror(lm, xy, npoints);
        ar.cvrmserror = ar.rmserror;
        ar.cvavgerror = ar.avgerror;
        ar.cvavgrelerror = ar.avgrelerror;
        ar.ncvdefects = 0;
        ar.cvdefects.setbounds(0, nvars-1);
        ar.c.setbounds(0, nvars-1, 0, nvars-1);
        for(i = 0; i <= nvars-1; i++)
        {
            for(j = 0; j <= nvars-1; j++)
            {
                ar.c(i,j) = 0;
            }
        }
        return;
    }
    if( ap::fp_less_eq(sv(nvars-1),epstol*ap::machineepsilon*sv(0)) )
    {
        //
        // Degenerate case, non-zero design matrix.
        //
        // We can leave it and solve task in SVD least squares fashion.
        // Solution and covariance matrix will be obtained correctly,
        // but CV error estimates - will not. It is better to reduce
        // it to non-degenerate task and to obtain correct CV estimates.
        //
        for(k = nvars; k >= 1; k--)
        {
            if( ap::fp_greater(sv(k-1),epstol*ap::machineepsilon*sv(0)) )
            {
                //
                // Reduce: project rows of XY onto the K leading right
                // singular vectors, keeping the dependent column.
                //
                xym.setbounds(0, npoints-1, 0, k);
                for(i = 0; i <= npoints-1; i++)
                {
                    for(j = 0; j <= k-1; j++)
                    {
                        r = ap::vdotproduct(&xy(i, 0), &vt(j, 0), ap::vlen(0,nvars-1));
                        xym(i,j) = r;
                    }
                    xym(i,k) = xy(i,nvars);
                }

                //
                // Solve (recursively, on the reduced K-variable task)
                //
                lrinternal(xym, s, npoints, k, info, tlm, ar2);
                if( info!=1 )
                {
                    return;
                }

                //
                // Convert back to un-reduced format
                //
                for(j = 0; j <= nvars-1; j++)
                {
                    lm.w(offs+j) = 0;
                }
                for(j = 0; j <= k-1; j++)
                {
                    r = tlm.w(offs+j);
                    ap::vadd(&lm.w(offs), &vt(j, 0), ap::vlen(offs,offs+nvars-1), r);
                }
                ar.rmserror = ar2.rmserror;
                ar.avgerror = ar2.avgerror;
                ar.avgrelerror = ar2.avgrelerror;
                ar.cvrmserror = ar2.cvrmserror;
                ar.cvavgerror = ar2.cvavgerror;
                ar.cvavgrelerror = ar2.cvavgrelerror;
                ar.ncvdefects = ar2.ncvdefects;
                ar.cvdefects.setbounds(0, nvars-1);
                for(j = 0; j <= ar.ncvdefects-1; j++)
                {
                    ar.cvdefects(j) = ar2.cvdefects(j);
                }
                // Covariance back-transform: C = VT'*(C_reduced*VT)
                ar.c.setbounds(0, nvars-1, 0, nvars-1);
                work.setbounds(1, nvars);
                matrixmatrixmultiply(ar2.c, 0, k-1, 0, k-1, false, vt, 0, k-1, 0, nvars-1, false, 1.0, vm, 0, k-1, 0, nvars-1, 0.0, work);
                matrixmatrixmultiply(vt, 0, k-1, 0, nvars-1, true, vm, 0, k-1, 0, nvars-1, false, 1.0, ar.c, 0, nvars-1, 0, nvars-1, 0.0, work);
                return;
            }
        }
        info = -255;
        return;
    }
    //
    // General case. Pseudo-invert singular values (zeroing those below
    // tolerance), then form coefficients and covariance.
    //
    for(i = 0; i <= nvars-1; i++)
    {
        if( ap::fp_greater(sv(i),epstol*ap::machineepsilon*sv(0)) )
        {
            svi(i) = 1/sv(i);
        }
        else
        {
            svi(i) = 0;
        }
    }
    for(i = 0; i <= nvars-1; i++)
    {
        t(i) = 0;
    }
    for(i = 0; i <= npoints-1; i++)
    {
        r = b(i);
        ap::vadd(&t(0), &u(i, 0), ap::vlen(0,nvars-1), r);
    }
    for(i = 0; i <= nvars-1; i++)
    {
        lm.w(offs+i) = 0;
    }
    for(i = 0; i <= nvars-1; i++)
    {
        r = t(i)*svi(i);
        ap::vadd(&lm.w(offs), &vt(i, 0), ap::vlen(offs,offs+nvars-1), r);
    }
    for(j = 0; j <= nvars-1; j++)
    {
        r = svi(j);
        ap::vmove(vm.getcolumn(j, 0, nvars-1), vt.getrow(j, 0, nvars-1), r);
    }
    for(i = 0; i <= nvars-1; i++)
    {
        for(j = i; j <= nvars-1; j++)
        {
            r = ap::vdotproduct(&vm(i, 0), &vm(j, 0), ap::vlen(0,nvars-1));
            ar.c(i,j) = r;
            ar.c(j,i) = r;
        }
    }

    //
    // Leave-1-out cross-validation error.
    //
    // NOTATIONS:
    // A            design matrix
    // A*x = b      original linear least squares task
    // U*S*V'       SVD of A
    // ai           i-th row of the A
    // bi           i-th element of the b
    // xf           solution of the original LLS task
    //
    // Cross-validation error of i-th element from a sample is
    // calculated using following formula:
    //
    //     ERRi = ai*xf - (ai*xf-bi*(ui*ui'))/(1-ui*ui')     (1)
    //
    // This formula can be derived from normal equations of the
    // original task
    //
    //     (A'*A)x = A'*b                                    (2)
    //
    // by applying modification (zeroing out i-th row of A) to (2):
    //
    //     (A-ai)'*(A-ai) = (A-ai)'*b
    //
    // and using Sherman-Morrison formula for updating matrix inverse
    //
    // NOTE 1: b is not zeroed out since it is much simpler and
    // does not influence final result.
    //
    // NOTE 2: some design matrices A have such ui that 1-ui*ui'=0.
    // Formula (1) can't be applied for such cases and they are skipped
    // from CV calculation (which distorts resulting CV estimate).
    // But from the properties of U we can conclude that there can
    // be no more than NVars such vectors. Usually
    // NVars << NPoints, so in a normal case it only slightly
    // influences result.
    //
    ncv = 0;
    na = 0;
    nacv = 0;
    ar.rmserror = 0;
    ar.avgerror = 0;
    ar.avgrelerror = 0;
    ar.cvrmserror = 0;
    ar.cvavgerror = 0;
    ar.cvavgrelerror = 0;
    ar.ncvdefects = 0;
    ar.cvdefects.setbounds(0, nvars-1);
    for(i = 0; i <= npoints-1; i++)
    {
        //
        // Error on a training set
        //
        r = ap::vdotproduct(&xy(i, 0), &lm.w(offs), ap::vlen(0,nvars-1));
        ar.rmserror = ar.rmserror+ap::sqr(r-xy(i,nvars));
        ar.avgerror = ar.avgerror+fabs(r-xy(i,nvars));
        if( ap::fp_neq(xy(i,nvars),0) )
        {
            ar.avgrelerror = ar.avgrelerror+fabs((r-xy(i,nvars))/xy(i,nvars));
            na = na+1;
        }

        //
        // Error using fast leave-one-out cross-validation.
        // Points with ui*ui' ~ 1 are recorded as CV defects and skipped.
        //
        p = ap::vdotproduct(&u(i, 0), &u(i, 0), ap::vlen(0,nvars-1));
        if( ap::fp_greater(p,1-epstol*ap::machineepsilon) )
        {
            ar.cvdefects(ar.ncvdefects) = i;
            ar.ncvdefects = ar.ncvdefects+1;
            continue;
        }
        r = s(i)*(r/s(i)-b(i)*p)/(1-p);
        ar.cvrmserror = ar.cvrmserror+ap::sqr(r-xy(i,nvars));
        ar.cvavgerror = ar.cvavgerror+fabs(r-xy(i,nvars));
        if( ap::fp_neq(xy(i,nvars),0) )
        {
            ar.cvavgrelerror = ar.cvavgrelerror+fabs((r-xy(i,nvars))/xy(i,nvars));
            nacv = nacv+1;
        }
        ncv = ncv+1;
    }
    if( ncv==0 )
    {
        //
        // Something strange: ALL ui are degenerate.
        // Unexpected...
        //
        info = -255;
        return;
    }
    ar.rmserror = sqrt(ar.rmserror/npoints);
    ar.avgerror = ar.avgerror/npoints;
    if( na!=0 )
    {
        ar.avgrelerror = ar.avgrelerror/na;
    }
    ar.cvrmserror = sqrt(ar.cvrmserror/ncv);
    ar.cvavgerror = ar.cvavgerror/ncv;
    if( nacv!=0 )
    {
        ar.cvavgrelerror = ar.cvavgrelerror/nacv;
    }
}
/*************************************************************************
Dense solver.

This subroutine finds solution of the linear system A*X=B with non-square,
possibly degenerate A. System is solved in the least squares sense, and
general least squares solution X = X0 + CX*y which minimizes |A*X-B| is
returned. If A is non-degenerate, solution in the usual sense is returned.

Additional features include:
* iterative improvement

INPUT PARAMETERS
    A       -   array[0..NRows-1,0..NCols-1], system matrix
    NRows   -   vertical size of A
    NCols   -   horizontal size of A
    B       -   array[0..NCols-1], right part
    Threshold-  a number in [0,1]. Singular values beyond Threshold are
                considered zero. Set it to 0.0, if you don't understand
                what it means, so the solver will choose good value on its
                own.

OUTPUT PARAMETERS
    Info    -   return code:
                * -4    SVD subroutine failed
                * -1    if NRows<=0 or NCols<=0 or Threshold<0 was passed
                *  1    if task is solved
    Rep     -   solver report, see below for more info
    X       -   array[0..N-1,0..M-1], it contains:
                * solution of A*X=B if A is non-singular (well-conditioned
                  or ill-conditioned, but not very close to singular)
                * zeros, if A is singular or VERY close to singular
                  (in this case Info=-3).
                  NOTE(review): the visible code returns Info=1 (not -3)
                  for the zero-matrix case — confirm against callers.

SOLVER REPORT

Subroutine sets following fields of the Rep structure:
* R2        reciprocal of condition number: 1/cond(A), 2-norm.
* N         = NCols
* K         dim(Null(A))
* CX        array[0..N-1,0..K-1], kernel of A.
            Columns of CX store such vectors that A*CX[i]=0.

  -- ALGLIB --
     Copyright 24.08.2009 by Bochkanov Sergey
*************************************************************************/
void rmatrixsolvels(const ap::real_2d_array& a,
     int nrows,
     int ncols,
     const ap::real_1d_array& b,
     double threshold,
     int& info,
     densesolverlsreport& rep,
     ap::real_1d_array& x)
{
    ap::real_1d_array sv;
    ap::real_2d_array u;
    ap::real_2d_array vt;
    ap::real_1d_array rp;
    ap::real_1d_array utb;
    ap::real_1d_array sutb;
    ap::real_1d_array tmp;
    ap::real_1d_array ta;
    ap::real_1d_array tx;
    ap::real_1d_array buf;
    ap::real_1d_array w;
    int i;
    int j;
    int nsv;
    int kernelidx;
    double v;
    double verr;
    bool svdfailed;
    bool zeroa;
    int rfs;
    int nrfs;
    bool terminatenexttime;
    bool smallerr;

    if( nrows<=0||ncols<=0||ap::fp_less(threshold,0) )
    {
        info = -1;
        return;
    }
    // Threshold=0 means "choose default": 1000*eps
    if( ap::fp_eq(threshold,0) )
    {
        threshold = 1000*ap::machineepsilon;
    }

    //
    // Factorize A first
    //
    svdfailed = !rmatrixsvd(a, nrows, ncols, 1, 2, 2, sv, u, vt);
    zeroa = ap::fp_eq(sv(0),0);
    if( svdfailed||zeroa )
    {
        //
        // SVD failure or zero matrix: return X=0 and CX=identity
        // (whole space is the kernel)
        //
        if( svdfailed )
        {
            info = -4;
        }
        else
        {
            info = 1;
        }
        x.setlength(ncols);
        for(i = 0; i <= ncols-1; i++)
        {
            x(i) = 0;
        }
        rep.n = ncols;
        rep.k = ncols;
        rep.cx.setlength(ncols, ncols);
        for(i = 0; i <= ncols-1; i++)
        {
            for(j = 0; j <= ncols-1; j++)
            {
                if( i==j )
                {
                    rep.cx(i,j) = 1;
                }
                else
                {
                    rep.cx(i,j) = 0;
                }
            }
        }
        rep.r2 = 0;
        return;
    }
    nsv = ap::minint(ncols, nrows);
    // R2 (1/cond) is only meaningful when A has full column count of
    // singular values; otherwise report 0
    if( nsv==ncols )
    {
        rep.r2 = sv(nsv-1)/sv(0);
    }
    else
    {
        rep.r2 = 0;
    }
    rep.n = ncols;
    info = 1;

    //
    // Iterative improvement of xc combined with solution:
    // 1. xc = 0
    // 2. calculate r = bc-A*xc using extra-precise dot product
    // 3. solve A*y = r
    // 4. update x:=x+r
    // 5. goto 2
    //
    // This cycle is executed until one of two things happens:
    // 1. maximum number of iterations reached
    // 2. last iteration decreased error to the lower limit
    //
    utb.setlength(nsv);
    sutb.setlength(nsv);
    x.setlength(ncols);
    tmp.setlength(ncols);
    ta.setlength(ncols+1);
    tx.setlength(ncols+1);
    buf.setlength(ncols+1);
    for(i = 0; i <= ncols-1; i++)
    {
        x(i) = 0;
    }
    // KernelIdx = index of first singular value below threshold;
    // sv[KernelIdx..] are treated as zero
    kernelidx = nsv;
    for(i = 0; i <= nsv-1; i++)
    {
        if( ap::fp_less_eq(sv(i),threshold*sv(0)) )
        {
            kernelidx = i;
            break;
        }
    }
    rep.k = ncols-kernelidx;
    nrfs = densesolverrfsmaxv2(ncols, rep.r2);
    terminatenexttime = false;
    rp.setlength(nrows);
    for(rfs = 0; rfs <= nrfs; rfs++)
    {
        if( terminatenexttime )
        {
            break;
        }

        //
        // calculate right part
        //
        if( rfs==0 )
        {
            ap::vmove(&rp(0), &b(0), ap::vlen(0,nrows-1));
        }
        else
        {
            // residual r = b - A*x via extra-precise dot product XDot;
            // stop when |residual| is within 4x its rounding-error bound
            smallerr = true;
            for(i = 0; i <= nrows-1; i++)
            {
                ap::vmove(&ta(0), &a(i, 0), ap::vlen(0,ncols-1));
                ta(ncols) = -1;
                ap::vmove(&tx(0), &x(0), ap::vlen(0,ncols-1));
                tx(ncols) = b(i);
                xdot(ta, tx, ncols+1, buf, v, verr);
                rp(i) = -v;
                smallerr = smallerr&&ap::fp_less(fabs(v),4*verr);
            }
            if( smallerr )
            {
                terminatenexttime = true;
            }
        }

        //
        // solve A*dx = rp (SVD pseudo-inverse; kernel directions zeroed)
        //
        for(i = 0; i <= ncols-1; i++)
        {
            tmp(i) = 0;
        }
        for(i = 0; i <= nsv-1; i++)
        {
            utb(i) = 0;
        }
        for(i = 0; i <= nrows-1; i++)
        {
            v = rp(i);
            ap::vadd(&utb(0), &u(i, 0), ap::vlen(0,nsv-1), v);
        }
        for(i = 0; i <= nsv-1; i++)
        {
            if( i<kernelidx )
            {
                sutb(i) = utb(i)/sv(i);
            }
            else
            {
                sutb(i) = 0;
            }
        }
        for(i = 0; i <= nsv-1; i++)
        {
            v = sutb(i);
            ap::vadd(&tmp(0), &vt(i, 0), ap::vlen(0,ncols-1), v);
        }

        //
        // update x:  x:=x+dx
        //
        ap::vadd(&x(0), &tmp(0), ap::vlen(0,ncols-1));
    }

    //
    // fill CX: kernel basis = right singular vectors for the
    // below-threshold singular values
    //
    if( rep.k>0 )
    {
        rep.cx.setlength(ncols, rep.k);
        for(i = 0; i <= rep.k-1; i++)
        {
            ap::vmove(rep.cx.getcolumn(i, 0, ncols-1), vt.getrow(kernelidx+i, 0, ncols-1));
        }
    }
}
/*************************************************************************
Internal fitting subroutine.

Weighted linear least squares fit: finds C minimizing
sum_i (W[i]*(F(i,:)*C - Y[i]))^2, where FMatrix is the N x M basis
matrix. Reduces the task via LQ (N<M) or QR (N>=M) decomposition, then
solves the reduced M x M system with back-substitution or SVD depending
on its condition number. Also fills error statistics in Rep.

INPUT PARAMETERS:
    Y       -   array[0..N-1], function values
    W       -   array[0..N-1], weights
    FMatrix -   array[0..N-1,0..M-1], basis functions at the points
    N       -   number of points, N>=1
    M       -   number of basis functions, M>=1

OUTPUT PARAMETERS:
    Info    -   return code: * -4 SVD failed, * -1 wrong N/M, * 1 solved
    C       -   array[0..M-1], fit coefficients
    Rep     -   report (TaskRCond, RMS/Avg/AvgRel/Max errors)
*************************************************************************/
static void lsfitlinearinternal(const ap::real_1d_array& y,
     const ap::real_1d_array& w,
     const ap::real_2d_array& fmatrix,
     int n,
     int m,
     int& info,
     ap::real_1d_array& c,
     lsfitreport& rep)
{
    double threshold;
    ap::real_2d_array ft;
    ap::real_2d_array q;
    ap::real_2d_array l;
    ap::real_2d_array r;
    ap::real_1d_array b;
    ap::real_1d_array wmod;
    ap::real_1d_array tau;
    int i;
    int j;
    double v;
    ap::real_1d_array sv;
    ap::real_2d_array u;
    ap::real_2d_array vt;
    ap::real_1d_array tmp;
    ap::real_1d_array utb;
    ap::real_1d_array sutb;
    int relcnt;

    if( n<1||m<1 )
    {
        info = -1;
        return;
    }
    info = 1;
    // Singular values below sqrt(eps)*sv(0) are treated as zero
    threshold = sqrt(ap::machineepsilon);

    //
    // Degenerate case, needs special handling
    //
    if( n<m )
    {
        //
        // Create design matrix (rows scaled by weights).
        //
        ft.setlength(n, m);
        b.setlength(n);
        wmod.setlength(n);
        for(j = 0; j <= n-1; j++)
        {
            v = w(j);
            ap::vmove(&ft(j, 0), 1, &fmatrix(j, 0), 1, ap::vlen(0,m-1), v);
            b(j) = w(j)*y(j);
            wmod(j) = 1;
        }

        //
        // LQ decomposition and reduction to M=N:
        // solve the N x N task on L with unit weights (weights already
        // folded into B), then map the solution back through Q.
        //
        c.setlength(m);
        for(i = 0; i <= m-1; i++)
        {
            c(i) = 0;
        }
        rep.taskrcond = 0;
        rmatrixlq(ft, n, m, tau);
        rmatrixlqunpackq(ft, n, m, tau, n, q);
        rmatrixlqunpackl(ft, n, m, l);
        lsfitlinearinternal(b, wmod, l, n, n, info, tmp, rep);
        if( info<=0 )
        {
            return;
        }
        for(i = 0; i <= n-1; i++)
        {
            v = tmp(i);
            ap::vadd(&c(0), 1, &q(i, 0), 1, ap::vlen(0,m-1), v);
        }
        return;
    }

    //
    // N>=M. Generate design matrix and reduce to N=M using
    // QR decomposition.
    //
    ft.setlength(n, m);
    b.setlength(n);
    for(j = 0; j <= n-1; j++)
    {
        v = w(j);
        ap::vmove(&ft(j, 0), 1, &fmatrix(j, 0), 1, ap::vlen(0,m-1), v);
        b(j) = w(j)*y(j);
    }
    rmatrixqr(ft, n, m, tau);
    rmatrixqrunpackq(ft, n, m, tau, m, q);
    rmatrixqrunpackr(ft, n, m, r);
    // Reduced right part: B := Q'*B (accumulated into Tmp)
    tmp.setlength(m);
    for(i = 0; i <= m-1; i++)
    {
        tmp(i) = 0;
    }
    for(i = 0; i <= n-1; i++)
    {
        v = b(i);
        ap::vadd(&tmp(0), 1, &q(i, 0), 1, ap::vlen(0,m-1), v);
    }
    b.setlength(m);
    ap::vmove(&b(0), 1, &tmp(0), 1, ap::vlen(0,m-1));

    //
    // R contains reduced MxM design upper triangular matrix,
    // B contains reduced Mx1 right part.
    //
    // Determine system condition number and decide
    // should we use triangular solver (faster) or
    // SVD-based solver (more stable).
    //
    // We can use LU-based RCond estimator for this task.
    //
    rep.taskrcond = rmatrixlurcondinf(r, m);
    if( ap::fp_greater(rep.taskrcond,threshold) )
    {
        //
        // use QR-based solver (back-substitution on R)
        //
        c.setlength(m);
        c(m-1) = b(m-1)/r(m-1,m-1);
        for(i = m-2; i >= 0; i--)
        {
            v = ap::vdotproduct(&r(i, i+1), 1, &c(i+1), 1, ap::vlen(i+1,m-1));
            c(i) = (b(i)-v)/r(i,i);
        }
    }
    else
    {
        //
        // use SVD-based solver (pseudo-inverse of R)
        //
        if( !rmatrixsvd(r, m, m, 1, 1, 2, sv, u, vt) )
        {
            info = -4;
            return;
        }
        utb.setlength(m);
        sutb.setlength(m);
        for(i = 0; i <= m-1; i++)
        {
            utb(i) = 0;
        }
        for(i = 0; i <= m-1; i++)
        {
            v = b(i);
            ap::vadd(&utb(0), 1, &u(i, 0), 1, ap::vlen(0,m-1), v);
        }
        if( ap::fp_greater(sv(0),0) )
        {
            // Refine RCond with exact singular-value ratio; zero out
            // contributions from below-threshold singular values
            rep.taskrcond = sv(m-1)/sv(0);
            for(i = 0; i <= m-1; i++)
            {
                if( ap::fp_greater(sv(i),threshold*sv(0)) )
                {
                    sutb(i) = utb(i)/sv(i);
                }
                else
                {
                    sutb(i) = 0;
                }
            }
        }
        else
        {
            rep.taskrcond = 0;
            for(i = 0; i <= m-1; i++)
            {
                sutb(i) = 0;
            }
        }
        c.setlength(m);
        for(i = 0; i <= m-1; i++)
        {
            c(i) = 0;
        }
        for(i = 0; i <= m-1; i++)
        {
            v = sutb(i);
            ap::vadd(&c(0), 1, &vt(i, 0), 1, ap::vlen(0,m-1), v);
        }
    }

    //
    // calculate errors (on the unweighted task)
    //
    rep.rmserror = 0;
    rep.avgerror = 0;
    rep.avgrelerror = 0;
    rep.maxerror = 0;
    relcnt = 0;
    for(i = 0; i <= n-1; i++)
    {
        v = ap::vdotproduct(&fmatrix(i, 0), 1, &c(0), 1, ap::vlen(0,m-1));
        rep.rmserror = rep.rmserror+ap::sqr(v-y(i));
        rep.avgerror = rep.avgerror+fabs(v-y(i));
        if( ap::fp_neq(y(i),0) )
        {
            rep.avgrelerror = rep.avgrelerror+fabs(v-y(i))/fabs(y(i));
            relcnt = relcnt+1;
        }
        rep.maxerror = ap::maxreal(rep.maxerror, fabs(v-y(i)));
    }
    rep.rmserror = sqrt(rep.rmserror/n);
    rep.avgerror = rep.avgerror/n;
    if( relcnt!=0 )
    {
        rep.avgrelerror = rep.avgrelerror/relcnt;
    }
}