/* customized BLAS functions */ void LUdaxpy( ptrdiff_t n, double da, double *dx, ptrdiff_t incx,double *dy, ptrdiff_t incy) { dx++; dy++; daxpy( &n, &da, dx, &incx, dy, &incy); }
void dqrdc(double a[], int lda, int n, int p, double qraux[], int jpvt[], double work[], int job) /******************************************************************************/ /* Purpose: DQRDC computes the QR factorization of a real rectangular matrix. Discussion: DQRDC uses Householder transformations. Column pivoting based on the 2-norms of the reduced columns may be performed at the user's option. Licensing: This code is distributed under the GNU LGPL license. Modified: 07 June 2005 Author: C version by John Burkardt. Reference: Jack Dongarra, Cleve Moler, Jim Bunch and Pete Stewart, LINPACK User's Guide, SIAM, (Society for Industrial and Applied Mathematics), 3600 University City Science Center, Philadelphia, PA, 19104-2688. ISBN 0-89871-172-X Parameters: Input/output, double A(LDA,P). On input, the N by P matrix whose decomposition is to be computed. On output, A contains in its upper triangle the upper triangular matrix R of the QR factorization. Below its diagonal A contains information from which the orthogonal part of the decomposition can be recovered. Note that if pivoting has been requested, the decomposition is not that of the original matrix A but that of A with its columns permuted as described by JPVT. Input, int LDA, the leading dimension of the array A. LDA must be at least N. Input, int N, the number of rows of the matrix A. Input, int P, the number of columns of the matrix A. Output, double QRAUX[P], contains further information required to recover the orthogonal part of the decomposition. Input/output, integer JPVT[P]. On input, JPVT contains integers that control the selection of the pivot columns. The K-th column A(*,K) of A is placed in one of three classes according to the value of JPVT(K). > 0, then A(K) is an initial column. = 0, then A(K) is a free column. < 0, then A(K) is a final column. Before the decomposition is computed, initial columns are moved to the beginning of the array A and final columns to the end. Both initial and final columns are frozen in place during the computation and only free columns are moved. At the K-th stage of the reduction, if A(*,K) is occupied by a free column it is interchanged with the free column of largest reduced norm. JPVT is not referenced if JOB == 0. On output, JPVT(K) contains the index of the column of the original matrix that has been interchanged into the K-th column, if pivoting was requested. Workspace, double WORK[P]. WORK is not referenced if JOB == 0. Input, int JOB, initiates column pivoting. 0, no pivoting is done. nonzero, pivoting is done. */ { int jp; int j; int lup; int maxj; double maxnrm, nrmxl, t, tt; int pl = 1, pu = 0; /* If pivoting is requested, rearrange the columns. */ if (job != 0) { for (j = 1; j <= p; j++) { int swapj = (0 < jpvt[j - 1]); jpvt[j - 1] = (jpvt[j - 1] < 0) ? -j : j; if (swapj) { if (j != pl) dswap(n, a + 0 + (pl - 1)*lda, 1, a + 0 + (j - 1), 1); jpvt[j - 1] = jpvt[pl - 1]; jpvt[pl - 1] = j; pl++; } } pu = p; for (j = p; 1 <= j; j--) { if (jpvt[j - 1] < 0) { jpvt[j - 1] = -jpvt[j - 1]; if (j != pu) { dswap(n, a + 0 + (pu - 1)*lda, 1, a + 0 + (j - 1)*lda, 1); jp = jpvt[pu - 1]; jpvt[pu - 1] = jpvt[j - 1]; jpvt[j - 1] = jp; } pu = pu - 1; } } } /* Compute the norms of the free columns. */ for (j = pl; j <= pu; j++) qraux[j - 1] = dnrm2(n, a + 0 + (j - 1) * lda, 1); for (j = pl; j <= pu; j++) work[j - 1] = qraux[j - 1]; /* Perform the Householder reduction of A. */ lup = i4_min(n, p); for (int l = 1; l <= lup; l++) { /* Bring the column of largest norm into the pivot position. */ if (pl <= l && l < pu) { maxnrm = 0.0; maxj = l; for (j = l; j <= pu; j++) { if (maxnrm < qraux[j - 1]) { maxnrm = qraux[j - 1]; maxj = j; } } if (maxj != l) { dswap(n, a + 0 + (l - 1)*lda, 1, a + 0 + (maxj - 1)*lda, 1); qraux[maxj - 1] = qraux[l - 1]; work[maxj - 1] = work[l - 1]; jp = jpvt[maxj - 1]; jpvt[maxj - 1] = jpvt[l - 1]; jpvt[l - 1] = jp; } } /* Compute the Householder transformation for column L. */ qraux[l - 1] = 0.0; if (l != n) { nrmxl = dnrm2(n - l + 1, a + l - 1 + (l - 1) * lda, 1); if (nrmxl != 0.0) { if (a[l - 1 + (l - 1)*lda] != 0.0) nrmxl = nrmxl * r8_sign(a[l - 1 + (l - 1) * lda]); dscal(n - l + 1, 1.0 / nrmxl, a + l - 1 + (l - 1)*lda, 1); a[l - 1 + (l - 1)*lda] = 1.0 + a[l - 1 + (l - 1) * lda]; /* Apply the transformation to the remaining columns, updating the norms. */ for (j = l + 1; j <= p; j++) { t = -ddot(n - l + 1, a + l - 1 + (l - 1) * lda, 1, a + l - 1 + (j - 1) * lda, 1) / a[l - 1 + (l - 1) * lda]; daxpy(n - l + 1, t, a + l - 1 + (l - 1)*lda, 1, a + l - 1 + (j - 1)*lda, 1); if (pl <= j && j <= pu) { if (qraux[j - 1] != 0.0) { tt = 1.0 - pow(r8_abs(a[l - 1 + (j - 1) * lda]) / qraux[j - 1], 2); tt = r8_max(tt, 0.0); t = tt; tt = 1.0 + 0.05 * tt * pow(qraux[j - 1] / work[j - 1], 2); if (tt != 1.0) qraux[j - 1] = qraux[j - 1] * sqrt(t); else { qraux[j - 1] = dnrm2(n - l, a + l + (j - 1) * lda, 1); work[j - 1] = qraux[j - 1]; } } } } /* Save the transformation. */ qraux[l - 1] = a[l - 1 + (l - 1) * lda]; a[l - 1 + (l - 1)*lda] = -nrmxl; } } } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { /* Variables */ int k,ind,nSamples,maxIter,sparse=0,*covered,*lastVisited,increasing=0,temp, nextpow2,levelMax,nLevels,level; long i,j,nVars,one=1; mwIndex *jc,*ir; double *w, *Xt, *y, lambda, *Li, alpha, innerProd, sig,c=1,*g,*d,nCovered=0,*cumSum,fi,fi_new,gg,precision, *randVals,*Lmax,Lmean,Li_old,*nDescendants,*unCoveredMatrix,*LiMatrix,offset,u,z,Z,wtx,*xtx,scaling; if (nrhs < 10) mexErrMsgTxt("Function needs nine arguments: {w,Xt,y,lambda,Lmax,Li,randVals,d,g,covered[,increasing,xtx]}"); /* Input */ w = mxGetPr(prhs[0]); Xt = mxGetPr(prhs[1]); y = mxGetPr(prhs[2]); lambda = mxGetScalar(prhs[3]); Lmax = mxGetPr(prhs[4]); Li = mxGetPr(prhs[5]); randVals = mxGetPr(prhs[6]); d = mxGetPr(prhs[7]); g = mxGetPr(prhs[8]); covered = (int*)mxGetPr(prhs[9]); if (nrhs >= 11) { increasing = (int)mxGetScalar(prhs[10]); if (!mxIsClass(prhs[10],"int32")) mexErrMsgTxt("increasing must be int32"); } /* Compute Sizes */ nVars = mxGetM(prhs[1]); nSamples = mxGetN(prhs[1]); maxIter = mxGetM(prhs[6]); precision = 1.490116119384765625e-8; if (nVars != mxGetM(prhs[0])) mexErrMsgTxt("w and Xt must have the same number of rows"); if (nSamples != mxGetM(prhs[2])) mexErrMsgTxt("number of columns of Xt must be the same as the number of rows in y"); if (nVars != mxGetM(prhs[7])) mexErrMsgTxt("w and d must have the same number of rows"); if (nSamples != mxGetM(prhs[8])) mexErrMsgTxt("w and g must have the same number of rows"); if (nSamples != mxGetM(prhs[9])) mexErrMsgTxt("covered and y must hvae the same number of rows"); if (mxIsSparse(prhs[1])) { sparse = 1; jc = mxGetJc(prhs[1]); ir = mxGetIr(prhs[1]); } if (sparse && alpha*lambda==1) mexErrMsgTxt("Sorry, I don't like it when Xt is sparse and alpha*lambda=1\n"); /* Allocate memory needed for lazy updates */ if (sparse) { lastVisited = mxCalloc(nVars,sizeof(double)); cumSum = mxCalloc(maxIter,sizeof(double)); } /* Compute mean of covered variables */ Lmean = 0; for(i=0;i<nSamples;i++) { if (covered[i]!=0) { nCovered++; Lmean += Li[i]; } } if(nCovered > 0) Lmean /= nCovered; if (nrhs >= 12) { xtx = mxGetPr(prhs[11]); if (nSamples != mxGetM(prhs[11])) mexErrMsgTxt("covered and xtx must have the same number or rows"); } else { xtx = mxCalloc(nSamples,sizeof(double)); for(i = 0; i < nSamples;i++) { xtx[i] = 0; if (sparse) { for(j=jc[i];j<jc[i+1];j++) xtx[i] += Xt[j]*Xt[j]; } else xtx[i] = ddot(&nVars,&Xt[i*nVars],&one,&Xt[i*nVars],&one); } } /* Do the O(n log n) initialization of the data structures will allow sampling in O(log(n)) time */ nextpow2 = pow(2,ceil(log2(nSamples)/log2(2))); nLevels = 1+(int)ceil(log2(nSamples)); /*printf("next power of 2 is: %d\n",nextpow2); printf("nLevels = %d\n",nLevels);*/ nDescendants = mxCalloc(nextpow2*nLevels,sizeof(double)); /* Counts number of descendents in tree */ unCoveredMatrix = mxCalloc(nextpow2*nLevels,sizeof(double)); /* Counts number of descenents that are still uncovered */ LiMatrix = mxCalloc(nextpow2*nLevels,sizeof(double)); /* Sums Lipschitz constant of loss over descendants */ for(i=0;i<nSamples;i++) { nDescendants[i] = 1; if (covered[i]) LiMatrix[i] = Li[i]; else unCoveredMatrix[i] = 1; } levelMax = nextpow2; for (level=1;level<nLevels;level++) { levelMax = levelMax/2; for(i=0;i<levelMax;i++) { nDescendants[i + nextpow2*level] = nDescendants[2*i + nextpow2*(level-1)] + nDescendants[2*i+1 + nextpow2*(level-1)]; LiMatrix[i + nextpow2*level] = LiMatrix[2*i + nextpow2*(level-1)] + LiMatrix[2*i+1 + nextpow2*(level-1)]; unCoveredMatrix[i + nextpow2*level] = unCoveredMatrix[2*i + nextpow2*(level-1)] + unCoveredMatrix[2*i+1 + nextpow2*(level-1)]; } } /* for(ind=0;ind<nextpow2;ind++) { for(j=0;j<nLevels;j++) { printf("%f ",unCoveredMatrix[ind + nextpow2*j]); } printf("\n"); } */ for(k=0;k<maxIter;k++) { /* Select next training example */ offset = 0; i = 0; u = randVals[k+maxIter]; if(randVals[k] < (double)(nSamples-nCovered)/(double)nSamples) { /* Sample fron uncovered guys */ Z = unCoveredMatrix[nextpow2*(nLevels-1)]; for(level=nLevels-1;level>=0;level--) { z = offset + unCoveredMatrix[2*i + nextpow2*level]; if(u < z/Z) i = 2*i; else { offset = z; i = 2*i+1; } } } else { /* Sample from covered guys according to estimate of Lipschitz constant */ Z = LiMatrix[nextpow2*(nLevels-1)] + (Lmean + 2*lambda)*(nDescendants[nextpow2*(nLevels-1)] - unCoveredMatrix[nextpow2*(nLevels-1)]); for(level=nLevels-1;level>=0;level--) { z = offset + LiMatrix[2*i + nextpow2*level] + (Lmean + 2*lambda)*(nDescendants[2*i + nextpow2*level] - unCoveredMatrix[2*i + nextpow2*level]); if(u < z/Z) i = 2*i; else { offset = z; i = 2*i+1; } } /*printf("i = %d\n",i);*/ } /* Compute current values of needed parameters */ if (sparse && k > 0) { for(j=jc[i];j<jc[i+1];j++) { if (lastVisited[ir[j]]==0) { w[ir[j]] -= d[ir[j]]*cumSum[k-1]; } else { w[ir[j]] -= d[ir[j]]*(cumSum[k-1]-cumSum[lastVisited[ir[j]]-1]); } lastVisited[ir[j]] = k; } } /* Compute derivative of loss */ innerProd = 0; if (sparse) { for(j=jc[i];j<jc[i+1];j++) innerProd += w[ir[j]]*Xt[j]; innerProd *= c; } else innerProd = ddot(&nVars,w,&one,&Xt[nVars*i],&one); sig = -y[i]/(1+exp(y[i]*innerProd)); /* Update direction */ if (sparse) { for(j=jc[i];j<jc[i+1];j++) d[ir[j]] += Xt[j]*(sig - g[i]); } else { scaling = sig-g[i]; daxpy(&nVars,&scaling,&Xt[i*nVars],&one,d,&one); } /* Store derivative of loss */ g[i] = sig; /* Line-search for Li */ Li_old = Li[i]; if(increasing && covered[i]) Li[i] /= 2; fi = log(1 + exp(-y[i]*innerProd)); /* Compute f_new as the function value obtained by taking * a step size of 1/Li in the gradient direction */ wtx = 0; if (sparse) { for(j=jc[i];j<jc[i+1];j++) wtx += c*w[ir[j]]*Xt[j]; } else wtx = ddot(&nVars,&Xt[i*nVars],&one,w,&one); gg = sig*sig*xtx[i]; innerProd = wtx - xtx[i]*sig/Li[i]; fi_new = log(1 + exp(-y[i]*innerProd)); /*printf("fi = %e, fi_new = %e, gg = %e\n",fi,fi_new,gg);*/ while (gg > precision && fi_new > fi - gg/(2*(Li[i]))) { /*printf("Lipschitz Backtracking (k = %d, fi = %e, fi_new = %e, 1/Li = %e)\n",k+1,fi,fi_new,1/(Li[i]));*/ Li[i] *= 2; innerProd = wtx - xtx[i]*sig/Li[i]; fi_new = log(1 + exp(-y[i]*innerProd)); } if(Li[i] > *Lmax) *Lmax = Li[i]; /* Update the number of examples that we have seen */ if (covered[i]==0) { covered[i]=1; nCovered++; Lmean = Lmean*((double)(nCovered-1)/(double)nCovered) + Li[i]/(double)nCovered; /* Update unCoveredMatrix so we don't sample this guy when looking for a new guy */ ind = i; for(level=0;level<nLevels;level++) { unCoveredMatrix[ind + nextpow2*level] -= 1; ind = ind/2; } /* Update LiMatrix so we sample this guy proportional to its Lipschitz constant*/ ind = i; for(level=0;level<nLevels;level++) { LiMatrix[ind + nextpow2*level] += Li[i]; ind = ind/2; } } else if (Li[i] != Li_old) { Lmean = Lmean + (Li[i] - Li_old)/(double)nCovered; /* Update LiMatrix with the new estimate of the Lipscitz constant */ ind = i; for(level=0;level<nLevels;level++) { LiMatrix[ind + nextpow2*level] += (Li[i] - Li_old); ind = ind/2; } } /*for(ind=0;ind<nextpow2;ind++) { for(j=0;j<nLevels;j++) { printf("%f ",LiMatrix[ind + nextpow2*j]); } printf("\n"); } /* Compute step size */ alpha = ((double)(nSamples-nCovered)/(double)nSamples)/(*Lmax + lambda) + ((double)nCovered/(double)nSamples)*(1/(2*(*Lmax + lambda)) + 1/(2*(Lmean + lambda))); /* Update parameters */ if (sparse) { c *= 1-alpha*lambda; if (k==0) cumSum[0] = alpha/(c*nCovered); else cumSum[k] = cumSum[k-1] + alpha/(c*nCovered); } else { scaling = 1-alpha*lambda; dscal(&nVars,&scaling,w,&one); scaling = -alpha/nCovered; daxpy(&nVars,&scaling,d,&one,w,&one); } /* Decrease value of max Lipschitz constant */ if (increasing) *Lmax *= pow(2.0,-1.0/nSamples); } if (sparse) { for(j=0;j<nVars;j++) { if (lastVisited[j]==0) { w[j] -= d[j]*cumSum[maxIter-1]; } else { w[j] -= d[j]*(cumSum[maxIter-1]-cumSum[lastVisited[j]-1]); } } scaling = c; dscal(&nVars,&scaling,w,&one); mxFree(lastVisited); mxFree(cumSum); } mxFree(nDescendants); mxFree(unCoveredMatrix); mxFree(LiMatrix); if (nrhs < 12) mxFree(xtx); }
int dqrsl(double a[], int lda, int n, int k, double qraux[], double y[], double qy[], double qty[], double b[], double rsd[], double ab[], int job) /******************************************************************************/ /* Purpose: DQRSL computes transformations, projections, and least squares solutions. Discussion: DQRSL requires the output of DQRDC. For K <= min(N,P), let AK be the matrix AK = ( A(JPVT[0]), A(JPVT(2)), ..., A(JPVT(K)) ) formed from columns JPVT[0], ..., JPVT(K) of the original N by P matrix A that was input to DQRDC. If no pivoting was done, AK consists of the first K columns of A in their original order. DQRDC produces a factored orthogonal matrix Q and an upper triangular matrix R such that AK = Q * (R) (0) This information is contained in coded form in the arrays A and QRAUX. The parameters QY, QTY, B, RSD, and AB are not referenced if their computation is not requested and in this case can be replaced by dummy variables in the calling program. To save storage, the user may in some cases use the same array for different parameters in the calling sequence. A frequently occurring example is when one wishes to compute any of B, RSD, or AB and does not need Y or QTY. In this case one may identify Y, QTY, and one of B, RSD, or AB, while providing separate arrays for anything else that is to be computed. Thus the calling sequence dqrsl ( a, lda, n, k, qraux, y, dum, y, b, y, dum, 110, info ) will result in the computation of B and RSD, with RSD overwriting Y. More generally, each item in the following list contains groups of permissible identifications for a single calling sequence. 1. (Y,QTY,B) (RSD) (AB) (QY) 2. (Y,QTY,RSD) (B) (AB) (QY) 3. (Y,QTY,AB) (B) (RSD) (QY) 4. (Y,QY) (QTY,B) (RSD) (AB) 5. (Y,QY) (QTY,RSD) (B) (AB) 6. (Y,QY) (QTY,AB) (B) (RSD) In any group the value returned in the array allocated to the group corresponds to the last member of the group. Licensing: This code is distributed under the GNU LGPL license. Modified: 07 June 2005 Author: C version by John Burkardt. Reference: Jack Dongarra, Cleve Moler, Jim Bunch and Pete Stewart, LINPACK User's Guide, SIAM, (Society for Industrial and Applied Mathematics), 3600 University City Science Center, Philadelphia, PA, 19104-2688. ISBN 0-89871-172-X Parameters: Input, double A[LDA*P], contains the output of DQRDC. Input, int LDA, the leading dimension of the array A. Input, int N, the number of rows of the matrix AK. It must have the same value as N in DQRDC. Input, int K, the number of columns of the matrix AK. K must not be greater than min(N,P), where P is the same as in the calling sequence to DQRDC. Input, double QRAUX[P], the auxiliary output from DQRDC. Input, double Y[N], a vector to be manipulated by DQRSL. Output, double QY[N], contains Q * Y, if requested. Output, double QTY[N], contains Q' * Y, if requested. Output, double B[K], the solution of the least squares problem minimize norm2 ( Y - AK * B), if its computation has been requested. Note that if pivoting was requested in DQRDC, the J-th component of B will be associated with column JPVT(J) of the original matrix A that was input into DQRDC. Output, double RSD[N], the least squares residual Y - AK * B, if its computation has been requested. RSD is also the orthogonal projection of Y onto the orthogonal complement of the column space of AK. Output, double AB[N], the least squares approximation Ak * B, if its computation has been requested. AB is also the orthogonal projection of Y onto the column space of A. Input, integer JOB, specifies what is to be computed. JOB has the decimal expansion ABCDE, with the following meaning: if A != 0, compute QY. if B != 0, compute QTY. if C != 0, compute QTY and B. if D != 0, compute QTY and RSD. if E != 0, compute QTY and AB. Note that a request to compute B, RSD, or AB automatically triggers the computation of QTY, for which an array must be provided in the calling sequence. Output, int DQRSL, is zero unless the computation of B has been requested and R is exactly singular. In this case, INFO is the index of the first zero diagonal element of R, and B is left unaltered. */ { int cab; int cb; int cqty; int cqy; int cr; int i; int info; int j; int jj; int ju; double t; double temp; /* Set INFO flag. */ info = 0; /* Determine what is to be computed. */ cqy = ( job / 10000 != 0); cqty = ((job % 10000) != 0); cb = ((job % 1000) / 100 != 0); cr = ((job % 100) / 10 != 0); cab = ((job % 10) != 0); ju = i4_min(k, n - 1); /* Special action when N = 1. */ if (ju == 0) { if (cqy) qy[0] = y[0]; if (cqty) qty[0] = y[0]; if (cab) ab[0] = y[0]; if (cb) { if (a[0 + 0 * lda] == 0.0) info = 1; else b[0] = y[0] / a[0 + 0 * lda]; } if (cr) rsd[0] = 0.0; return info; } /* Set up to compute QY or QTY. */ if (cqy) { for (i = 1; i <= n; i++) qy[i - 1] = y[i - 1]; } if (cqty) { for (i = 1; i <= n; i++) qty[i - 1] = y[i - 1]; } /* Compute QY. */ if (cqy) { for (jj = 1; jj <= ju; jj++) { j = ju - jj + 1; if (qraux[j - 1] != 0.0) { temp = a[j - 1 + (j - 1) * lda]; a[j - 1 + (j - 1)*lda] = qraux[j - 1]; t = -ddot(n - j + 1, a + j - 1 + (j - 1) * lda, 1, qy + j - 1, 1) / a[j - 1 + (j - 1) * lda]; daxpy(n - j + 1, t, a + j - 1 + (j - 1)*lda, 1, qy + j - 1, 1); a[j - 1 + (j - 1)*lda] = temp; } } } /* Compute Q'*Y. */ if (cqty) { for (j = 1; j <= ju; j++) { if (qraux[j - 1] != 0.0) { temp = a[j - 1 + (j - 1) * lda]; a[j - 1 + (j - 1)*lda] = qraux[j - 1]; t = -ddot(n - j + 1, a + j - 1 + (j - 1) * lda, 1, qty + j - 1, 1) / a[j - 1 + (j - 1) * lda]; daxpy(n - j + 1, t, a + j - 1 + (j - 1)*lda, 1, qty + j - 1, 1); a[j - 1 + (j - 1)*lda] = temp; } } } /* Set up to compute B, RSD, or AB. */ if (cb) { for (i = 1; i <= k; i++) b[i - 1] = qty[i - 1]; } if (cab) { for (i = 1; i <= k; i++) ab[i - 1] = qty[i - 1]; } if (cr && k < n) { for (i = k + 1; i <= n; i++) rsd[i - 1] = qty[i - 1]; } if (cab && k + 1 <= n) { for (i = k + 1; i <= n; i++) ab[i - 1] = 0.0; } if (cr) { for (i = 1; i <= k; i++) rsd[i - 1] = 0.0; } /* Compute B. */ if (cb) { for (jj = 1; jj <= k; jj++) { j = k - jj + 1; if (a[j - 1 + (j - 1)*lda] == 0.0) { info = j; break; } b[j - 1] = b[j - 1] / a[j - 1 + (j - 1) * lda]; if (j != 1) { t = -b[j - 1]; daxpy(j - 1, t, a + 0 + (j - 1)*lda, 1, b, 1); } } } /* Compute RSD or AB as required. */ if (cr || cab) { for (jj = 1; jj <= ju; jj++) { j = ju - jj + 1; if (qraux[j - 1] != 0.0) { temp = a[j - 1 + (j - 1) * lda]; a[j - 1 + (j - 1)*lda] = qraux[j - 1]; if (cr) { t = -ddot(n - j + 1, a + j - 1 + (j - 1) * lda, 1, rsd + j - 1, 1) / a[j - 1 + (j - 1) * lda]; daxpy(n - j + 1, t, a + j - 1 + (j - 1)*lda, 1, rsd + j - 1, 1); } if (cab) { t = -ddot(n - j + 1, a + j - 1 + (j - 1) * lda, 1, ab + j - 1, 1) / a[j - 1 + (j - 1) * lda]; daxpy(n - j + 1, t, a + j - 1 + (j - 1)*lda, 1, ab + j - 1, 1); } a[j - 1 + (j - 1)*lda] = temp; } } } return info; }
void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) /* We would like to declare a[][lda], but c does not allow it. In this function, references to a[i][j] are written a[lda*i+j]. */ /* dgesl solves the double precision system a * x = b or trans(a) * x = b using the factors computed by dgeco or dgefa. on entry a double precision[n][lda] the output from dgeco or dgefa. lda integer the leading dimension of the array a . n integer the order of the matrix a . ipvt integer[n] the pivot vector from dgeco or dgefa. b double precision[n] the right hand side vector. job integer = 0 to solve a*x = b , = nonzero to solve trans(a)*x = b where trans(a) is the transpose. on return b the solution vector x . error condition a division by zero will occur if the input factor contains a zero on the diagonal. technically this indicates singularity but it is often caused by improper arguments or improper setting of lda . it will not occur if the subroutines are called correctly and if dgeco has set rcond .gt. 0.0 or dgefa has set info .eq. 0 . to compute inverse(a) * c where c is a matrix with p columns dgeco(a,lda,n,ipvt,rcond,z) if (!rcond is too small){ for (j=0,j<p,j++) dgesl(a,lda,n,ipvt,c[j][0],0); } linpack. this version dated 08/14/78 . cleve moler, university of new mexico, argonne national lab. functions blas daxpy,ddot */ { /* internal variables */ REAL t; int k,kb,l,nm1; nm1 = n - 1; if (job == 0) { /* job = 0 , solve a * x = b first solve l*y = b */ if (nm1 >= 1) { for (k = 0; k < nm1; k++) { l = ipvt[k]; t = b[l]; if (l != k){ b[l] = b[k]; b[k] = t; } daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); } } /* now solve u*x = y */ for (kb = 0; kb < n; kb++) { k = n - (kb + 1); b[k] = b[k]/a[lda*k+k]; t = -b[k]; daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); } } else { /* job = nonzero, solve trans(a) * x = b first solve trans(u)*y = b */ for (k = 0; k < n; k++) { t = ddot(k,&a[lda*k+0],1,&b[0],1); b[k] = (b[k] - t)/a[lda*k+k]; } /* now solve trans(l)*x = y */ if (nm1 >= 1) { for (kb = 1; kb < nm1; kb++) { k = n - (kb+1); b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); l = ipvt[k]; if (l != k) { t = b[l]; b[l] = b[k]; b[k] = t; } } } } return; }
/*----------------------*/ void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) /* We would like to declare a[][lda], but c does not allow it. In this function, references to a[i][j] are written a[lda*i+j]. */ /* dgefa factors a double precision matrix by gaussian elimination. dgefa is usually called by dgeco, but it can be called directly with a saving in time if rcond is not needed. (time for dgeco) = (1 + 9/n)*(time for dgefa) . on entry a REAL precision[n][lda] the matrix to be factored. lda integer the leading dimension of the array a . n integer the order of the matrix a . on return a an upper triangular matrix and the multipliers which were used to obtain it. the factorization can be written a = l*u where l is a product of permutation and unit lower triangular matrices and u is upper triangular. ipvt integer[n] an integer vector of pivot indices. info integer = 0 normal value. = k if u[k][k] .eq. 0.0 . this is not an error condition for this subroutine, but it does indicate that dgesl or dgedi will divide by zero if called. use rcond in dgeco for a reliable indication of singularity. linpack. this version dated 08/14/78 . cleve moler, university of new mexico, argonne national lab. functions blas daxpy,dscal,idamax */ { /* internal variables */ REAL t; int j,k,kp1,l,nm1; /* gaussian elimination with partial pivoting */ *info = 0; nm1 = n - 1; if (nm1 >= 0) { for (k = 0; k < nm1; k++) { kp1 = k + 1; /* find l = pivot index */ l = idamax(n-k,&a[lda*k+k],1) + k; ipvt[k] = l; /* zero pivot implies this column already triangularized */ if (a[lda*k+l] != ZERO) { /* interchange if necessary */ if (l != k) { t = a[lda*k+l]; a[lda*k+l] = a[lda*k+k]; a[lda*k+k] = t; } /* compute multipliers */ t = -ONE/a[lda*k+k]; dscal(n-(k+1),t,&a[lda*k+k+1],1); /* row elimination with column indexing */ for (j = kp1; j < n; j++) { t = a[lda*j+l]; if (l != k) { a[lda*j+l] = a[lda*j+k]; a[lda*j+k] = t; } daxpy(n-(k+1),t,&a[lda*k+k+1],1, &a[lda*j+k+1],1); } } else { *info = k; } } } ipvt[n-1] = n-1; if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; return; }
/* * This function returns the solution of Ax = b where A is posititive definite, * based on the conjugate gradients method; see "An intro to the CG method" by J.R. Shewchuk, p. 50-51 * * A is mxm, b, x are is mx1. Argument niter specifies the maximum number of * iterations and eps is the desired solution accuracy. niter<0 signals that * x contains a valid initial approximation to the solution; if niter>0 then * the starting point is taken to be zero. Argument prec selects the desired * preconditioning method as follows: * 0: no preconditioning * 1: jacobi (diagonal) preconditioning * 2: SSOR preconditioning * Argument iscolmaj specifies whether A is stored in column or row major order. * * The function returns 0 in case of error, * the number of iterations performed if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int sba_Axb_CG(double *A, double *B, double *x, int m, int niter, double eps, int prec, int iscolmaj) { static double *buf=NULL; static int buf_sz=0; register int i, j; register double *aim; int iter, a_sz, res_sz, d_sz, q_sz, s_sz, wk_sz, z_sz, tot_sz; double *a, *res, *d, *q, *s, *wk, *z; double delta0, deltaold, deltanew, alpha, beta, eps_sq=eps*eps; register double sum; int rec_res; if(A==NULL){ if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } /* calculate required memory size */ a_sz=(iscolmaj)? m*m : 0; res_sz=m; d_sz=m; q_sz=m; if(prec!=SBA_CG_NOPREC){ s_sz=m; wk_sz=m; z_sz=(prec==SBA_CG_SSOR)? m : 0; } else s_sz=wk_sz=z_sz=0; tot_sz=a_sz+res_sz+d_sz+q_sz+s_sz+wk_sz+z_sz; if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(double *)malloc(buf_sz*sizeof(double)); if(!buf){ fprintf(stderr, "memory allocation request failed in sba_Axb_CG()\n"); exit(1); } } if(iscolmaj){ a=buf; /* store A (row major!) into a */ for(i=0; i<m; ++i) for(j=0, aim=a+i*m; j<m; ++j) aim[j]=A[i+j*m]; } else a=A; /* no copying required */ res=buf+a_sz; d=res+res_sz; q=d+d_sz; if(prec!=SBA_CG_NOPREC){ s=q+q_sz; wk=s+s_sz; z=(prec==SBA_CG_SSOR)? wk+wk_sz : NULL; for(i=0; i<m; ++i){ // compute jacobi (i.e. diagonal) preconditioners and save them in wk sum=a[i*m+i]; if(sum>DBL_EPSILON || -sum<-DBL_EPSILON) // != 0.0 wk[i]=1.0/sum; else wk[i]=1.0/DBL_EPSILON; } } else{ s=res; wk=z=NULL; } if(niter>0){ for(i=0; i<m; ++i){ // clear solution and initialize residual vector: res <-- B x[i]=0.0; res[i]=B[i]; } } else{ niter=-niter; for(i=0; i<m; ++i){ // initialize residual vector: res <-- B - A*x for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) sum+=aim[j]*x[j]; res[i]=B[i]-sum; } } switch(prec){ case SBA_CG_NOPREC: for(i=0, deltanew=0.0; i<m; ++i){ d[i]=res[i]; deltanew+=res[i]*res[i]; } break; case SBA_CG_JACOBI: // jacobi preconditioning for(i=0, deltanew=0.0; i<m; ++i){ d[i]=res[i]*wk[i]; deltanew+=res[i]*d[i]; } break; case SBA_CG_SSOR: // SSOR preconditioning; see the "templates" book, fig. 3.2, p. 44 for(i=0; i<m; ++i){ for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) sum+=aim[j]*z[j]; z[i]=wk[i]*(res[i]-sum); } for(i=m-1; i>=0; --i){ for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) sum+=aim[j]*d[j]; d[i]=z[i]-wk[i]*sum; } deltanew=dprod(m, res, d); break; default: fprintf(stderr, "unknown preconditioning option %d in sba_Axb_CG\n", prec); exit(1); } delta0=deltanew; for(iter=1; deltanew>eps_sq*delta0 && iter<=niter; ++iter){ for(i=0; i<m; ++i){ // q <-- A d aim=a+i*m; /*** for(j=0, sum=0.0; j<m; ++j) sum+=aim[j]*d[j]; ***/ q[i]=dprod(m, aim, d); //sum; } /*** for(i=0, sum=0.0; i<m; ++i) sum+=d[i]*q[i]; ***/ alpha=deltanew/dprod(m, d, q); // deltanew/sum; /*** for(i=0; i<m; ++i) x[i]+=alpha*d[i]; ***/ daxpy(m, x, x, alpha, d); if(!(iter%50)){ for(i=0; i<m; ++i){ // accurate computation of the residual vector aim=a+i*m; /*** for(j=0, sum=0.0; j<m; ++j) sum+=aim[j]*x[j]; ***/ res[i]=B[i]-dprod(m, aim, x); //B[i]-sum; } rec_res=0; } else{ /*** for(i=0; i<m; ++i) // approximate computation of the residual vector res[i]-=alpha*q[i]; ***/ daxpy(m, res, res, -alpha, q); rec_res=1; } if(prec){ switch(prec){ case SBA_CG_JACOBI: // jacobi for(i=0; i<m; ++i) s[i]=res[i]*wk[i]; break; case SBA_CG_SSOR: // SSOR for(i=0; i<m; ++i){ for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) sum+=aim[j]*z[j]; z[i]=wk[i]*(res[i]-sum); } for(i=m-1; i>=0; --i){ for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) sum+=aim[j]*s[j]; s[i]=z[i]-wk[i]*sum; } break; } } deltaold=deltanew; /*** for(i=0, sum=0.0; i<m; ++i) sum+=res[i]*s[i]; ***/ deltanew=dprod(m, res, s); //sum; /* make sure that we get around small delta that are due to * accumulated floating point roundoff errors */ if(rec_res && deltanew<=eps_sq*delta0){ /* analytically recompute delta */ for(i=0; i<m; ++i){ for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) sum+=aim[j]*x[j]; res[i]=B[i]-sum; } deltanew=dprod(m, res, s); } beta=deltanew/deltaold; /*** for(i=0; i<m; ++i) d[i]=s[i]+beta*d[i]; ***/ daxpy(m, d, s, beta, d); } return iter; }
/*---------------------------------------------------------------------------*/ int main (void) { /*---------------------------------------------------------------------------*/ /* Define arrays for the upper triangle of the coefficient matrix and */ /* preconditioner as well as an array for rhs vector */ /* Compressed sparse row storage is used for sparse representation */ /*---------------------------------------------------------------------------*/ MKL_INT n = 100, rci_request, itercount, lexpected_itercount = 15, uexpected_itercount = 19, i; double rhs[100]; MKL_INT ia[100 + 1]; MKL_INT ja[100 - 1]; double a[100 - 1], a1[100 - 1]; /*---------------------------------------------------------------------------*/ /* Allocate storage for the solver ?par and temporary storage tmp */ /*---------------------------------------------------------------------------*/ MKL_INT length = 128; MKL_INT ipar[128]; double dpar[128], tmp[4 * 100]; /*---------------------------------------------------------------------------*/ /* Some additional variables to use with the RCI (P)CG solver */ /* OMEGA is the relaxation parameter, NITER_SSOR is the maximum number of */ /* iterations for the SSOR preconditioner */ /*---------------------------------------------------------------------------*/ double solution[100]; double expected_sol[100]; double omega = 0.5E0, one = 1.E0, zero = 0.E0, om = 1.E0 - omega; double euclidean_norm, temp[100]; MKL_INT niter_ssor = 20; char matdes[6]; char tr = 'n'; double eone = -1.E0; MKL_INT ione = 1; /*---------------------------------------------------------------------------*/ /* Initialize the coefficient matrix and expected solution */ /*---------------------------------------------------------------------------*/ for (i = 0; i < n; i++) expected_sol[i] = 1.E0; for (i = 0; i < n - 1; i++) { ja[i] = i + 2; ia[i] = i + 1; a[i] = 0.5E0; a1[i] = omega * a[i]; } ia[n - 1] = n; ia[n] = ia[n - 1]; matdes[0] = 's'; matdes[1] = 'u'; matdes[2] = 'u'; matdes[3] = 'f'; /*---------------------------------------------------------------------------*/ /* Initialize vectors rhs, temp, and tmp[n:2*n-1] with zeros as mkl_dcsrmv */ /* routine does not set NAN to zero. Thus, if any of the values in the */ /* vectors above accidentally happens to be NAN, the example will fail */ /* to complete. */ /* Initialize the right hand side through matrix-vector product */ /*---------------------------------------------------------------------------*/ for (i = 0; i < n; i++) { rhs[i] = zero; temp[i] = zero; tmp[n + i] = zero; } mkl_dcsrmv (&tr, &n, &n, &one, matdes, a, ja, ia, &ia[1], expected_sol, &zero, rhs); /*---------------------------------------------------------------------------*/ /* Initialize the initial guess */ /*---------------------------------------------------------------------------*/ for (i = 0; i < n; i++) solution[i] = zero; /*---------------------------------------------------------------------------*/ /* Initialize the solver */ /*---------------------------------------------------------------------------*/ dcg_init (&n, solution, rhs, &rci_request, ipar, dpar, tmp); if (rci_request != 0) goto failure; /*---------------------------------------------------------------------------*/ /* Set the desired parameters: */ /* INTEGER parameters: */ /* set the maximal number of iterations to 100 */ /* LOGICAL parameters: */ /* run the Preconditioned version of RCI (P)CG with preconditioner C_inverse */ /* DOUBLE parameters */ /* - */ /*---------------------------------------------------------------------------*/ ipar[4] = 100; ipar[10] = 1; /*---------------------------------------------------------------------------*/ /* Check the correctness and consistency of the newly set parameters */ /*---------------------------------------------------------------------------*/ dcg_check (&n, solution, rhs, &rci_request, ipar, dpar, tmp); if (rci_request != 0) goto failure; /*---------------------------------------------------------------------------*/ /* Compute the solution by RCI (P)CG solver */ /* Reverse Communications starts here */ /*---------------------------------------------------------------------------*/ rci:dcg (&n, solution, rhs, &rci_request, ipar, dpar, tmp); /*---------------------------------------------------------------------------*/ /* If rci_request=0, then the solution was found according to the requested */ /* stopping tests. In this case, this means that it was found after 100 */ /* iterations. */ /*---------------------------------------------------------------------------*/ if (rci_request == 0) goto getsln; /*---------------------------------------------------------------------------*/ /* If rci_request=1, then compute the vector A*tmp[0] */ /* and put the result in vector tmp[n] */ /*---------------------------------------------------------------------------*/ if (rci_request == 1) { matdes[0] = 's'; mkl_dcsrmv (&tr, &n, &n, &one, matdes, a, ja, ia, &ia[1], tmp, &zero, &tmp[n]); goto rci; } /*---------------------------------------------------------------------------*/ /* If rci_request=2, then do the user-defined stopping test: compute the */ /* Euclidean norm of the actual residual using MKL routines and check if */ /* it is less than 1.E-8 */ /*---------------------------------------------------------------------------*/ if (rci_request == 2) { matdes[0] = 's'; mkl_dcsrmv (&tr, &n, &n, &one, matdes, a, ja, ia, &ia[1], solution, &zero, temp); daxpy (&n, &eone, rhs, &ione, temp, &ione); euclidean_norm = dnrm2 (&n, temp, &ione); /*---------------------------------------------------------------------------*/ /* The solution has not been found yet according to the user-defined stopping */ /* test. Continue RCI (P)CG iterations. */ /*---------------------------------------------------------------------------*/ if (euclidean_norm > 1.E-6) goto rci; /*---------------------------------------------------------------------------*/ /* The solution has been found according to the user-defined stopping test */ /*---------------------------------------------------------------------------*/ else goto getsln; } /*---------------------------------------------------------------------------*/ /* If rci_request=3, then apply the simplest SSOR preconditioning */ /* on vector tmp[2*n] and put the result in vector tmp[3*n] */ /*---------------------------------------------------------------------------*/ if (rci_request == 3) { dcopy (&n, &tmp[2 * n], &ione, &tmp[3 * n], &ione); matdes[0] = 't'; for (i = 1; i <= niter_ssor; i++) { dcopy (&n, &tmp[2 * n], &ione, temp, &ione); matdes[2] = 'n'; tr = 'n'; mkl_dcsrmv (&tr, &n, &n, &eone, matdes, a1, ja, ia, &ia[1], &tmp[3 * n], &omega, temp); daxpy (&n, &om, &tmp[3 * n], &ione, temp, &ione); matdes[2] = 'u'; tr = 't'; mkl_dcsrsv (&tr, &n, &one, matdes, a1, ja, ia, &ia[1], temp, &tmp[3 * n]); } goto rci; } /*---------------------------------------------------------------------------*/ /* If rci_request=anything else, then dcg subroutine failed */ /* to compute the solution vector: solution[n] */ /*---------------------------------------------------------------------------*/ goto failure; /*---------------------------------------------------------------------------*/ /* Reverse Communication ends here */ /* Get the current iteration number into itercount */ /*---------------------------------------------------------------------------*/ getsln:dcg_get (&n, solution, rhs, &rci_request, ipar, dpar, tmp, &itercount); /*---------------------------------------------------------------------------*/ /* Print solution vector: solution[n] and number of iterations: itercount */ /*---------------------------------------------------------------------------*/ printf ("The system has been solved\n"); printf ("The following solution obtained\n"); for (i = 0; i < n / 4; i++) { printf ("%6.3f %6.3f %6.3f %6.3f", solution[4 * i], solution[4 * i + 1], solution[4 * i + 2], solution[4 * i + 3]); printf ("\n"); } printf ("\nExpected solution is\n"); for (i = 0; i < n / 4; i++) { printf ("%6.3f %6.3f %6.3f %6.3f", expected_sol[4 * i], expected_sol[4 * i + 1], expected_sol[4 * i + 2], expected_sol[4 * i + 3]); expected_sol[4 * i] -= solution[4 * i]; printf ("\n"); } printf ("\nNumber of iterations: %d\n", itercount); i = 4; n /= 4; euclidean_norm = dnrm2 (&n, expected_sol, &i); /*-------------------------------------------------------------------------*/ /* Release internal MKL memory that might be used for computations */ /* NOTE: It is important to call the routine below to avoid memory leaks */ /* unless you disable MKL Memory Manager */ /*-------------------------------------------------------------------------*/ MKL_Free_Buffers (); if (lexpected_itercount <= itercount <= uexpected_itercount && euclidean_norm < 1.0e-4) { printf ("This example has successfully PASSED through all steps of computation!"); printf ("\n"); return 0; } else { printf ("This example may have FAILED as either the number of iterations differs"); printf ("\nfrom the expected number of iterations %d-", lexpected_itercount); printf ("-%d, or the computed solution\ndiffers much from ", uexpected_itercount); printf ("the expected solution (Euclidean norm is %e), or both.\n", euclidean_norm); return 1; } /*-------------------------------------------------------------------------*/ /* Release internal MKL memory that might be used for computations */ /* NOTE: It is important to call the routine below to avoid memory leaks */ /* unless you disable MKL Memory Manager */ /*-------------------------------------------------------------------------*/ failure:printf ("This example FAILED as the solver has returned the ERROR "); printf ("code %d", rci_request); MKL_Free_Buffers (); return 1; }