/* compute the Cholesky decompostion of C in W, s.t. C=W^t W and W is upper triangular */ int LEVMAR_CHOLESKY(LM_REAL *C, LM_REAL *W, int m) { register int i, j; int info; /* copy weights array C to W (in column-major order!) so that LAPACK won't destroy it */ for(i=0; i<m; i++) for(j=0; j<m; j++) W[i+j*m]=C[i*m+j]; /* cholesky decomposition */ POTF2("U", (int *)&m, W, (int *)&m, (int *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_DER, "()")); exit(1); } else{ fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info, RCAT("and the cholesky factorization could not be completed in ", LEVMAR_CHOLESKY)); return LM_ERROR; } } /* the decomposition is in the upper part of W (in column-major order!). * copying it to the lower part and zeroing the upper transposes * W in row-major order */ for(i=0; i<m; i++) for(j=0; j<i; j++){ W[i+j*m]=W[j+i*m]; W[j+i*m]=0.0; } return 0; }
/* * This function returns the solution of Ax = b * * The function employs LU decomposition followed by forward/back substitution (see * also the LAPACK-based LU solver above) * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ void *buf=NULL; __STATIC__ int buf_sz=0; register int i, j, k; int *idx, maxi=-1, idx_sz, a_sz, work_sz, tot_sz; LM_REAL *a, *work, max, sum, tmp; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ idx_sz=m; a_sz=m*m; work_sz=m; tot_sz=(a_sz+work_sz)*sizeof(LM_REAL) + idx_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */ // Check inputs for validity for(i=0; i<a_sz; i++) if (!LM_FINITE(A[i])) return 0; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(void *)malloc(tot_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(void *)malloc(tot_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=(LM_REAL*)buf; work=a+a_sz; idx=(int *)(work+work_sz); /* avoid destroying A, B by copying them to a, x resp. */ memcpy(a, A, a_sz*sizeof(LM_REAL)); memcpy(x, B, m*sizeof(LM_REAL)); /* compute the LU decomposition of a row permutation of matrix a; the permutation itself is saved in idx[] */ for(i=0; i<m; ++i){ max=0.0; for(j=0; j<m; ++j) if((tmp=FABS(a[i*m+j]))>max) max=tmp; if(max==0.0){ fprintf(stderr, RCAT("Singular matrix A in ", AX_EQ_B_LU) "()!\n"); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } work[i]=LM_CNST(1.0)/max; } for(j=0; j<m; ++j){ for(i=0; i<j; ++i){ sum=a[i*m+j]; for(k=0; k<i; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; } max=0.0; for(i=j; i<m; ++i){ sum=a[i*m+j]; for(k=0; k<j; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; if((tmp=work[i]*FABS(sum))>=max){ max=tmp; maxi=i; } } if(j!=maxi){ for(k=0; k<m; ++k){ tmp=a[maxi*m+k]; a[maxi*m+k]=a[j*m+k]; a[j*m+k]=tmp; } work[maxi]=work[j]; } idx[j]=maxi; if(a[j*m+j]==0.0) a[j*m+j]=LM_REAL_EPSILON; if(j!=m-1){ tmp=LM_CNST(1.0)/(a[j*m+j]); for(i=j+1; i<m; ++i) a[i*m+j]*=tmp; } } /* The decomposition has now replaced a. Solve the linear system using * forward and back substitution */ for(i=k=0; i<m; ++i){ j=idx[i]; sum=x[j]; x[j]=x[i]; if(k!=0) for(j=k-1; j<i; ++j) sum-=a[i*m+j]*x[j]; else if(sum!=0.0) k=i+1; x[i]=sum; } for(i=m-1; i>=0; --i){ sum=x[i]; for(j=i+1; j<m; ++j) sum-=a[i*m+j]*x[j]; x[i]=sum/a[i*m+i]; } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of Ax = b for a real symmetric matrix A * * The function is based on LDLT factorization with the pivoting * strategy of Bunch and Kaufman: * A is factored as L*D*L^T where L is lower triangular and * D symmetric and block diagonal (aka spectral decomposition, * Banachiewicz factorization, modified Cholesky factorization) * * A is mxm, b is mx1. * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_BK(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0, nb=0; LM_REAL *a, *work; int a_sz, ipiv_sz, work_sz, tot_sz; int info, *ipiv, nrhs=1; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ ipiv_sz=m; a_sz=m*m; if(!nb){ LM_REAL tmp; work_sz=-1; // workspace query; optimal size is returned in tmp SYTRF("L", (int *)&m, NULL, (int *)&m, NULL, (LM_REAL *)&tmp, (int *)&work_sz, (int *)&info); nb=((int)tmp)/m; // optimal worksize is m*nb } work_sz=(nb!=-1)? nb*m : 1; tot_sz=(a_sz + work_sz)*sizeof(LM_REAL) + ipiv_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */ #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_BK) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_BK) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; work=a+a_sz; ipiv=(int *)(work+work_sz); /* store A into a and B into x; A is assumed to be symmetric, hence * the column and row major order representations are the same */ memcpy(a, A, a_sz*sizeof(LM_REAL)); memcpy(x, B, m*sizeof(LM_REAL)); /* LDLt factorization for A */ SYTRF("L", (int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", SYTRF) " in ", AX_EQ_B_BK) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("LAPACK error: singular block diagonal matrix D for", SYTRF) " in ", AX_EQ_B_BK)"() [D(%d, %d) is zero]\n", info, info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* solve the system with the computed factorization */ SYTRS("L", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, x, (int *)&m, (int *)&info); if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", SYTRS) " in ", AX_EQ_B_BK) "()\n", -info); exit(1); } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function computes the inverse of A in B. A and B can coincide * * The function employs LAPACK-free LU decomposition of A to solve m linear * systems A*B_i=I_i, where B_i and I_i are the i-th columns of B and I. * * A and B are mxm * * The function returns 0 in case of error, * 1 if successfull * */ static int LEVMAR_LUINVERSE(LM_REAL *A, LM_REAL *B, int m) { void *buf=NULL; int buf_sz=0; register int i, j, k, l; int *idx, maxi=-1, idx_sz, a_sz, x_sz, work_sz, tot_sz; LM_REAL *a, *x, *work, max, sum, tmp; /* calculate required memory size */ idx_sz=m; a_sz=m*m; x_sz=m; work_sz=m; tot_sz=idx_sz*sizeof(int) + (a_sz+x_sz+work_sz)*sizeof(LM_REAL); buf_sz=tot_sz; buf=(void *)malloc(tot_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", LEVMAR_LUINVERSE) "() failed!\n"); exit(1); } idx=(int *)buf; a=(LM_REAL *)(idx + idx_sz); x=a + a_sz; work=x + x_sz; /* avoid destroying A by copying it to a */ for(i=0; i<a_sz; ++i) a[i]=A[i]; /* compute the LU decomposition of a row permutation of matrix a; the permutation itself is saved in idx[] */ for(i=0; i<m; ++i){ max=0.0; for(j=0; j<m; ++j) if((tmp=FABS(a[i*m+j]))>max) max=tmp; if(max==0.0){ fprintf(stderr, RCAT("Singular matrix A in ", LEVMAR_LUINVERSE) "()!\n"); free(buf); return 0; } work[i]=CNST(1.0)/max; } for(j=0; j<m; ++j){ for(i=0; i<j; ++i){ sum=a[i*m+j]; for(k=0; k<i; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; } max=0.0; for(i=j; i<m; ++i){ sum=a[i*m+j]; for(k=0; k<j; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; if((tmp=work[i]*FABS(sum))>=max){ max=tmp; maxi=i; } } if(j!=maxi){ for(k=0; k<m; ++k){ tmp=a[maxi*m+k]; a[maxi*m+k]=a[j*m+k]; a[j*m+k]=tmp; } work[maxi]=work[j]; } idx[j]=maxi; if(a[j*m+j]==0.0) a[j*m+j]=LM_REAL_EPSILON; if(j!=m-1){ tmp=CNST(1.0)/(a[j*m+j]); for(i=j+1; i<m; ++i) a[i*m+j]*=tmp; } } /* The decomposition has now replaced a. Solve the m linear systems using * forward and back substitution */ for(l=0; l<m; ++l){ for(i=0; i<m; ++i) x[i]=0.0; x[l]=CNST(1.0); for(i=k=0; i<m; ++i){ j=idx[i]; sum=x[j]; x[j]=x[i]; if(k!=0) for(j=k-1; j<i; ++j) sum-=a[i*m+j]*x[j]; else if(sum!=0.0) k=i+1; x[i]=sum; } for(i=m-1; i>=0; --i){ sum=x[i]; for(j=i+1; j<m; ++j) sum-=a[i*m+j]*x[j]; x[i]=sum/a[i*m+i]; } for(i=0; i<m; ++i) B[i*m+l]=x[i]; } free(buf); return 1; }
int LEVMAR_BC_DER( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the jacobian \part x / \part p */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ LM_REAL *lb, /* I: vector of lower bounds. If NULL, no lower bounds apply */ LM_REAL *ub, /* I: vector of upper bounds. If NULL, no upper bounds apply */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu, * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used. * Note that ||J^T e||_inf is computed on free (not equal to lb[i] or ub[i]) variables only. */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * info[7]= # function evaluations * info[8]= # jacobian evaluations */ LM_REAL *work, /* working memory, allocate if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf. * Set to NULL if not needed */ { register int i, j, k, l; int worksz, freework=0, issolved; /* temp work arrays */ LM_REAL *e, /* nx1 */ *hx, /* \hat{x}_i, nx1 */ *jacTe, /* J^T e_i mx1 */ *jac, /* nxm */ *jacTjac, /* mxm */ *Dp, /* mx1 */ *diag_jacTjac, /* diagonal of J^T J, mx1 */ *pDp; /* p + Dp, mx1 */ register LM_REAL mu, /* damping constant */ tmp; /* mainly used in matrix & vector multiplications */ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */ LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL init_p_eL2; int nu=2, nu2, stop, nfev, njev=0; const int nm=n*m; /* variables for constrained LM */ struct FUNC_STATE fstate; LM_REAL alpha=CNST(1e-4), beta=CNST(0.9), gamma=CNST(0.99995), gamma_sq=gamma*gamma, rho=CNST(1e-8); LM_REAL t, t0; LM_REAL steptl=CNST(1e3)*(LM_REAL)sqrt(LM_REAL_EPSILON), jacTeDp; LM_REAL tmin=CNST(1e-12), tming=CNST(1e-18); /* minimum step length for LS and PG steps */ const LM_REAL tini=CNST(1.0); /* initial step length for LS and PG steps */ int nLMsteps=0, nLSsteps=0, nPGsteps=0, gprevtaken=0; int numactive; mu=jacTe_inf=t=0.0; tmin=tmin; /* -Wall */ if(n<m){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m); exit(1); } if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the jacobian in ", LEVMAR_BC_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_BC_DIF) RCAT("() rather than ", LEVMAR_BC_DER) "()\n"); exit(1); } if(!BOXCHECK(lb, ub, m)){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): at least one lower bound exceeds the upper one\n")); exit(1); } if(opts){ tau=opts[0]; eps1=opts[1]; eps2=opts[2]; eps2_sq=opts[2]*opts[2]; eps3=opts[3]; } else{ // use default values tau=CNST(LM_INIT_MU); eps1=CNST(LM_STOP_THRESH); eps2=CNST(LM_STOP_THRESH); eps2_sq=CNST(LM_STOP_THRESH)*CNST(LM_STOP_THRESH); eps3=CNST(LM_STOP_THRESH); } if(!work){ worksz=LM_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m; work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ if(!work){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n")); exit(1); } freework=1; } /* set up work arrays */ e=work; hx=e + n; jacTe=hx + n; jac=jacTe + m; jacTjac=jac + nm; Dp=jacTjac + m*m; diag_jacTjac=Dp + m; pDp=diag_jacTjac + m; fstate.n=n; fstate.hx=hx; fstate.x=x; fstate.adata=adata; fstate.nfev=&nfev; /* see if starting point is within the feasile set */ for(i=0; i<m; ++i) pDp[i]=p[i]; BOXPROJECT(p, lb, ub, m); /* project to feasible set */ for(i=0; i<m; ++i) if(pDp[i]!=p[i]) fprintf(stderr, RCAT("Warning: component %d of starting point not feasible in ", LEVMAR_BC_DER) "()! [%g projected to %g]\n", i, p[i], pDp[i]); /* compute e=x - f(p) and its L2 norm */ (*func)(p, hx, m, n, adata); nfev=1; for(i=0, p_eL2=0.0; i<n; ++i){ e[i]=tmp=x[i]-hx[i]; p_eL2+=tmp*tmp; } init_p_eL2=p_eL2; for(k=stop=0; k<itmax && !stop; ++k){ //printf("%d %.15g\n", k, 0.5*p_eL2); /* Note that p and e have been updated at a previous iteration */ if(p_eL2<=eps3){ /* error is small */ stop=6; break; } /* Compute the jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. * Since J^T J is symmetric, its computation can be speeded up by computing * only its upper triangular part and copying it to the lower part */ (*jacf)(p, jac, m, n, adata); ++njev; /* J^T J, J^T e */ if(nm<__BLOCKSZ__SQ){ // this is a small problem /* This is the straightforward way to compute J^T J, J^T e. However, due to * its noncontinuous memory access pattern, it incures many cache misses when * applied to large minimization problems (i.e. problems involving a large * number of free variables and measurements), in which J is too large to * fit in the L1 cache. For such problems, a cache-efficient blocking scheme * is preferable. * * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * performance problem. * * On the other hand, the straightforward algorithm is faster on small * problems since in this case it avoids the overheads of blocking. */ for(i=0; i<m; ++i){ for(j=i; j<m; ++j){ int lm; for(l=0, tmp=0.0; l<n; ++l){ lm=l*m; tmp+=jac[lm+i]*jac[lm+j]; } /* store tmp in the corresponding upper and lower part elements */ jacTjac[i*m+j]=jacTjac[j*m+i]=tmp; } /* J^T e */ for(l=0, tmp=0.0; l<n; ++l) tmp+=jac[l*m+i]*e[l]; jacTe[i]=tmp; } } else{ // this is a large problem /* Cache efficient computation of J^T J based on blocking */ TRANS_MAT_MAT_MULT(jac, jacTjac, n, m); /* cache efficient computation of J^T e */ for(i=0; i<m; ++i) jacTe[i]=0.0; for(i=0; i<n; ++i){ register LM_REAL *jacrow; for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l) jacTe[l]+=jacrow[l]*tmp; } } /* Compute ||J^T e||_inf and ||p||^2. Note that ||J^T e||_inf * is computed for free (i.e. inactive) variables only. * At a local minimum, if p[i]==ub[i] then g[i]>0; * if p[i]==lb[i] g[i]<0; otherwise g[i]=0 */ for(i=j=numactive=0, p_L2=jacTe_inf=0.0; i<m; ++i){ if(ub && p[i]==ub[i]){ ++numactive; if(jacTe[i]>0.0) ++j; } else if(lb && p[i]==lb[i]){ ++numactive; if(jacTe[i]<0.0) ++j; } else if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp; diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */ p_L2+=p[i]*p[i]; } //p_L2=sqrt(p_L2); #if 0 if(!(k%100)){ printf("Current estimate: "); for(i=0; i<m; ++i) printf("%.9g ", p[i]); printf("-- errors %.9g %0.9g, #active %d [%d]\n", jacTe_inf, p_eL2, numactive, j); } #endif /* check for convergence */ if(j==numactive && (jacTe_inf <= eps1)){ Dp_L2=0.0; /* no increment for p in this case */ stop=1; break; } /* compute initial damping factor */ if(k==0){ if(!lb && !ub){ /* no bounds */ for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */ mu=tau*tmp; } else mu=CNST(0.5)*tau*p_eL2; /* use Kanzow's starting mu */ } /* determine increment using a combination of adaptive damping, line search and projected gradient search */ while(1){ /* augment normal equations */ for(i=0; i<m; ++i) jacTjac[i*m+i]+=mu; /* solve augmented equations */ /* 5 alternatives are available: LU, Cholesky, 2 variants of QR decomposition and SVD. * Cholesky is the fastest but might be inaccurate; QR is slower but more accurate; * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); if(issolved){ for(i=0; i<m; ++i) pDp[i]=p[i] + Dp[i]; /* compute p's new estimate and ||Dp||^2 */ BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ for(i=0, Dp_L2=0.0; i<m; ++i){ Dp[i]=tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ stop=2; break; } if(Dp_L2>=(p_L2+eps2)/(CNST(EPSILON)*CNST(EPSILON))){ /* almost singular */ stop=4; break; } (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } if(pDp_eL2<=gamma_sq*p_eL2){ for(i=0, dL=0.0; i<m; ++i) dL+=Dp[i]*(mu*Dp[i]+jacTe[i]); #if 1 if(dL>0.0){ dF=p_eL2-pDp_eL2; tmp=(CNST(2.0)*dF/dL-CNST(1.0)); tmp=CNST(1.0)-tmp*tmp*tmp; mu=mu*( (tmp>=CNST(ONE_THIRD))? tmp : CNST(ONE_THIRD) ); } else mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */ #else mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */ #endif nu=2; for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; ++nLMsteps; gprevtaken=0; break; } } else{ /* the augmented linear system could not be solved, increase mu */ mu*=nu; nu2=nu<<1; // 2*nu; if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */ stop=5; break; } nu=nu2; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; continue; /* solve again with increased nu */ } /* if this point is reached, the LM step did not reduce the error; * see if it is a descent direction */ /* negate jacTe (i.e. g) & compute g^T * Dp */ for(i=0, jacTeDp=0.0; i<m; ++i){ jacTe[i]=-jacTe[i]; jacTeDp+=jacTe[i]*Dp[i]; } if(jacTeDp<=-rho*pow(Dp_L2, _POW_/CNST(2.0))){ /* Dp is a descent direction; do a line search along it */ int mxtake, iretcd; LM_REAL stepmx; tmp=(LM_REAL)sqrt(p_L2); stepmx=CNST(1e3)*( (tmp>=CNST(1.0))? tmp : CNST(1.0) ); #if 1 /* use Schnabel's backtracking line search; it requires fewer "func" evaluations */ LNSRCH(m, p, p_eL2, jacTe, Dp, alpha, pDp, &pDp_eL2, func, fstate, &mxtake, &iretcd, stepmx, steptl, NULL); /* NOTE: LNSRCH() updates hx */ if(iretcd!=0) goto gradproj; /* rather inelegant but effective way to handle LNSRCH() failures... */ #else /* use the simpler (but slower!) line search described by Kanzow */ for(t=tini; t>tmin; t*=beta){ for(i=0; i<m; ++i){ pDp[i]=p[i] + t*Dp[i]; //pDp[i]=__MEDIAN3(lb[i], pDp[i], ub[i]); /* project to feasible set */ } (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + t*Dp */ for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } //if(CNST(0.5)*pDp_eL2<=CNST(0.5)*p_eL2 + t*alpha*jacTeDp) break; if(pDp_eL2<=p_eL2 + CNST(2.0)*t*alpha*jacTeDp) break; } #endif ++nLSsteps; gprevtaken=0; /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2. * These values are used below to update their corresponding variables */ } else{ gradproj: /* Note that this point can also be reached via a goto when LNSRCH() fails */ /* jacTe is a descent direction; make a projected gradient step */ /* if the previous step was along the gradient descent, try to use the t employed in that step */ /* compute ||g|| */ for(i=0, tmp=0.0; i<m; ++i) tmp=jacTe[i]*jacTe[i]; tmp=(LM_REAL)sqrt(tmp); tmp=CNST(100.0)/(CNST(1.0)+tmp); t0=(tmp<=tini)? tmp : tini; /* guard against poor scaling & large steps; see (3.50) in C.T. Kelley's book */ for(t=(gprevtaken)? t : t0; t>tming; t*=beta){ for(i=0; i<m; ++i) pDp[i]=p[i] - t*jacTe[i]; BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ for(i=0; i<m; ++i) Dp[i]=pDp[i]-p[i]; (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p - t*g */ for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } for(i=0, tmp=0.0; i<m; ++i) /* compute ||g^T * Dp|| */ tmp+=jacTe[i]*Dp[i]; if(gprevtaken && pDp_eL2<=p_eL2 + CNST(2.0)*CNST(0.99999)*tmp){ /* starting t too small */ t=t0; gprevtaken=0; continue; } //if(CNST(0.5)*pDp_eL2<=CNST(0.5)*p_eL2 + alpha*tmp) break; if(pDp_eL2<=p_eL2 + CNST(2.0)*alpha*tmp) break; } ++nPGsteps; gprevtaken=1; /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2 */ } /* update using computed values */ for(i=0, Dp_L2=0.0; i<m; ++i){ tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ stop=2; break; } for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; break; } /* inner loop */ } if(k>=itmax) stop=3; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; if(info){ info[0]=init_p_eL2; info[1]=p_eL2; info[2]=jacTe_inf; info[3]=Dp_L2; for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i]; info[4]=mu/tmp; info[5]=(LM_REAL)k; info[6]=(LM_REAL)stop; info[7]=(LM_REAL)nfev; info[8]=(LM_REAL)njev; } /* covariance matrix */ if(covar){ LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n); } if(freework) free(work); #if 0 printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSsteps, nPGsteps); #endif return (stop!=4)? k : -1; }
/* Similar to the LEVMAR_LEC_DER() function above, except that the jacobian is approximated * with the aid of finite differences (forward or central, see the comment for the opts argument) */ int LEVMAR_LEC_DIF( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ LM_REAL *A, /* I: constraints matrix, kxm */ LM_REAL *b, /* I: right hand constraints vector, kx1 */ int k, /* I: number of contraints (i.e. A's #rows) */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[5], /* I: opts[0-3] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the * scale factor for initial \mu, stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2 and * the step used in difference approximation to the jacobian. Set to NULL for defaults to be used. * If \delta<0, the jacobian is approximated with central differences which are more accurate * (but slower!) compared to the forward differences employed by default. */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * info[7]= # function evaluations * info[8]= # jacobian evaluations */ LM_REAL *work, /* working memory, allocate if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func. * Set to NULL if not needed */ { struct LMLEC_DATA data; LM_REAL *ptr, *Z, *pp, *p0, *Zimm; /* Z is mxmm */ int mm, ret; register int i, j; register LM_REAL tmp; LM_REAL locinfo[LM_INFO_SZ]; mm=m-k; ptr=(LM_REAL *)malloc((2*m + m*mm + mm)*sizeof(LM_REAL)); if(!ptr){ fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n")); exit(1); } data.p=p; p0=ptr; data.c=p0+m; data.Z=Z=data.c+m; data.jac=NULL; pp=data.Z+m*mm; data.ncnstr=k; data.func=func; data.jacf=NULL; data.adata=adata; LMLEC_ELIM(A, b, data.c, NULL, Z, k, m); // compute c, Z /* compute pp s.t. p = c + Z*pp or (Z^T Z)*pp=Z^T*(p-c) * Due to orthogonality, Z^T Z = I and the last equation * becomes pp=Z^T*(p-c). Also, save the starting p in p0 */ for(i=0; i<m; ++i){ p0[i]=p[i]; p[i]-=data.c[i]; } /* Z^T*(p-c) */ for(i=0; i<mm; ++i){ for(j=0, tmp=0.0; j<m; ++j) tmp+=Z[j*mm+i]*p[j]; pp[i]=tmp; } /* compute the p corresponding to pp (i.e. c + Z*pp) and compare with p0 */ for(i=0; i<m; ++i){ Zimm=Z+i*mm; for(j=0, tmp=data.c[i]; j<mm; ++j) tmp+=Zimm[j]*pp[j]; // tmp+=Z[i*mm+j]*pp[j]; if(FABS(tmp-p0[i])>CNST(1E-03)) fprintf(stderr, RCAT("Warning: component %d of starting point not feasible in ", LEVMAR_LEC_DIF) "()! [%.10g reset to %.10g]\n", i, p0[i], tmp); } if(!info) info=locinfo; /* make sure that LEVMAR_DIF() is called with non-null info */ /* note that covariance computation is not requested from LEVMAR_DIF() */ ret=LEVMAR_DIF(LMLEC_FUNC, pp, x, mm, n, itmax, opts, info, work, NULL, (void *)&data); /* p=c + Z*pp */ for(i=0; i<m; ++i){ Zimm=Z+i*mm; for(j=0, tmp=data.c[i]; j<mm; ++j) tmp+=Zimm[j]*pp[j]; // tmp+=Z[i*mm+j]*pp[j]; p[i]=tmp; } /* compute the jacobian with finite differences and use it to estimate the covariance */ if(covar){ LM_REAL *hx, *wrk, *jac; hx=(LM_REAL *)malloc((2*n+n*m)*sizeof(LM_REAL)); if(!work){ fprintf(stderr, LCAT(LEVMAR_LEC_DIF, "(): memory allocation request failed\n")); exit(1); } wrk=hx+n; jac=wrk+n; (*func)(p, hx, m, n, adata); /* evaluate function at p */ FDIF_FORW_JAC_APPROX(func, p, hx, wrk, (LM_REAL)LM_DIFF_DELTA, jac, m, n, adata); /* compute the jacobian at p */ TRANS_MAT_MAT_MULT(jac, covar, n, m, __BLOCKSZ__); /* covar = J^T J */ LEVMAR_COVAR(covar, covar, info[1], m, n); free(hx); } free(ptr); return ret; }
/* * This function returns the solution of Ax = b * * The function employs LU decomposition: * If A=L U with L lower and U upper triangular, then the original system * amounts to solving * L y = b, U x = y * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; int a_sz, ipiv_sz, tot_sz; register int i, j; int info, *ipiv, nrhs=1; LM_REAL *a; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ ipiv_sz=m; a_sz=m*m; tot_sz=a_sz*sizeof(LM_REAL) + ipiv_sz*sizeof(int); /* should be arranged in that order for proper doubles alignment */ #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; ipiv=(int *)(a+a_sz); /* store A (column major!) into a and B into x */ for(i=0; i<m; i++){ for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; x[i]=B[i]; } /* LU decomposition for A */ GETRF((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("argument %d of ", GETRF) " illegal in ", AX_EQ_B_LU) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("singular matrix A for ", GETRF) " in ", AX_EQ_B_LU) "()\n"); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* solve the system with the computed LU */ GETRS("N", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, x, (int *)&m, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("argument %d of ", GETRS) " illegal in ", AX_EQ_B_LU) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("unknown error for ", GETRS) " in ", AX_EQ_B_LU) "()\n"); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of Ax=b * * The function assumes that A is symmetric & postive definite and employs * the Cholesky decomposition: * If A=L L^T with L lower triangular, the system to be solved becomes * (L L^T) x = b * This amounts to solving L y = b for y and then L^T x = y for x * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_CHOL(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL *a; int a_sz, tot_sz; int info, nrhs=1; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ a_sz=m*m; tot_sz=a_sz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_CHOL) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_CHOL) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; /* store A into a and B into x. A is assumed symmetric, * hence no transposition is needed */ memcpy(a, A, a_sz*sizeof(LM_REAL)); memcpy(x, B, m*sizeof(LM_REAL)); /* Cholesky decomposition of A */ //POTF2("L", (int *)&m, a, (int *)&m, (int *)&info); POTRF("L", (int *)&m, a, (int *)&m, (int *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) "/", POTRF) " in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) "/", POTRF) " in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* solve using the computed Cholesky in one lapack call */ POTRS("L", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTRS) " in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } #if 0 /* alternative: solve the linear system L y = b ... */ TRTRS("L", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) " in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* ... solve the linear system L^T x = y */ TRTRS("L", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) "in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } #endif /* 0 */ #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
int LEVMAR_DER( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu, * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * info[7]= # function evaluations * info[8]= # Jacobian evaluations * info[9]= # linear systems solved, i.e. # attempts for reducing error */ LM_REAL *work, /* working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf. * Set to NULL if not needed */ { register int i, j, k, l; int worksz, freework=0, issolved; /* temp work arrays */ LM_REAL *e, /* nx1 */ *hx, /* \hat{x}_i, nx1 */ *jacTe, /* J^T e_i mx1 */ *jac, /* nxm */ *jacTjac, /* mxm */ *Dp, /* mx1 */ *diag_jacTjac, /* diagonal of J^T J, mx1 */ *pDp; /* p + Dp, mx1 */ register LM_REAL mu, /* damping constant */ tmp; /* mainly used in matrix & vector multiplications */ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */ LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL init_p_eL2; int nu=2, nu2, stop=0, nfev, njev=0, nlss=0; const int nm=n*m; int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; mu=jacTe_inf=0.0; /* -Wall */ if(n<m){ fprintf(stderr, LCAT(LEVMAR_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m); return LM_ERROR; } if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_DIF) RCAT("() rather than ", LEVMAR_DER) "()\n"); return LM_ERROR; } if(opts){ tau=opts[0]; eps1=opts[1]; eps2=opts[2]; eps2_sq=opts[2]*opts[2]; eps3=opts[3]; } else{ // use default values tau=LM_CNST(LM_INIT_MU); eps1=LM_CNST(LM_STOP_THRESH); eps2=LM_CNST(LM_STOP_THRESH); eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH); eps3=LM_CNST(LM_STOP_THRESH); } if(!work){ worksz=LM_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m; work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ if(!work){ fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n")); return LM_ERROR; } freework=1; } /* set up work arrays */ e=work; hx=e + n; jacTe=hx + n; jac=jacTe + m; jacTjac=jac + nm; Dp=jacTjac + m*m; diag_jacTjac=Dp + m; pDp=diag_jacTjac + m; /* compute e=x - f(p) and its L2 norm */ (*func)(p, hx, m, n, adata); nfev=1; /* ### e=x-hx, p_eL2=||e|| */ #if 1 p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n); #else for(i=0, p_eL2=0.0; i<n; ++i){ e[i]=tmp=x[i]-hx[i]; p_eL2+=tmp*tmp; } #endif init_p_eL2=p_eL2; if(!LM_FINITE(p_eL2)) stop=7; for(k=0; k<itmax && !stop; ++k){ /* Note that p and e have been updated at a previous iteration */ if(p_eL2<=eps3){ /* error is small */ stop=6; break; } /* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. * Since J^T J is symmetric, its computation can be sped up by computing * only its upper triangular part and copying it to the lower part */ (*jacf)(p, jac, m, n, adata); ++njev; /* J^T J, J^T e */ if(nm<__BLOCKSZ__SQ){ // this is a small problem /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj. * Thus, the product J^T J can be computed using an outer loop for * l that adds J_li*J_lj to each element ij of the result. Note that * with this scheme, the accesses to J and JtJ are always along rows, * therefore induces less cache misses compared to the straightforward * algorithm for computing the product (i.e., l loop is innermost one). * A similar scheme applies to the computation of J^T e. * However, for large minimization problems (i.e., involving a large number * of unknowns and measurements) for which J/J^T J rows are too large to * fit in the L1 cache, even this scheme incures many cache misses. In * such cases, a cache-efficient blocking scheme is preferable. * * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * performance problem. * * Note that the non-blocking algorithm is faster on small * problems since in this case it avoids the overheads of blocking. */ /* looping downwards saves a few computations */ register int l; register LM_REAL alpha, *jaclm, *jacTjacim; for(i=m*m; i-->0; ) jacTjac[i]=0.0; for(i=m; i-->0; ) jacTe[i]=0.0; for(l=n; l-->0; ){ jaclm=jac+l*m; for(i=m; i-->0; ){ jacTjacim=jacTjac+i*m; alpha=jaclm[i]; //jac[l*m+i]; for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */ jacTjacim[j]+=jaclm[j]*alpha; //jacTjac[i*m+j]+=jac[l*m+j]*alpha /* J^T e */ jacTe[i]+=alpha*e[l]; } } for(i=m; i-->0; ) /* copy to upper part */ for(j=i+1; j<m; ++j) jacTjac[i*m+j]=jacTjac[j*m+i]; } else{ // this is a large problem /* Cache efficient computation of J^T J based on blocking */ LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m); /* cache efficient computation of J^T e */ for(i=0; i<m; ++i) jacTe[i]=0.0; for(i=0; i<n; ++i){ register LM_REAL *jacrow; for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l) jacTe[l]+=jacrow[l]*tmp; } } /* Compute ||J^T e||_inf and ||p||^2 */ for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){ if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp; diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */ p_L2+=p[i]*p[i]; } //p_L2=sqrt(p_L2); #if 0 if(!(k%100)){ printf("Current estimate: "); for(i=0; i<m; ++i) printf("%.9g ", p[i]); printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2); } #endif /* check for convergence */ if((jacTe_inf <= eps1)){ Dp_L2=0.0; /* no increment for p in this case */ stop=1; break; } /* compute initial damping factor */ if(k==0){ for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */ mu=tau*tmp; } /* determine increment using adaptive damping */ while(1){ /* augment normal equations */ for(i=0; i<m; ++i) jacTjac[i*m+i]+=mu; /* solve augmented equations */ #ifdef HAVE_LAPACK /* 7 alternatives are available: LU, Cholesky + Cholesky with PLASMA, LDLt, 2 variants of QR decomposition and SVD. * For matrices with dimensions of at least a few hundreds, the PLASMA implementation of Cholesky is the fastest. * From the serial solvers, Cholesky is the fastest but might occasionally be inapplicable due to numerical round-off; * QR is slower but more robust; SVD is the slowest but most robust; LU is quite robust but * slower than LDLt; LDLt offers a good tradeoff between robustness and speed */ issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK; //issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL; #ifdef HAVE_PLASMA //issolved=AX_EQ_B_PLASMA_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_PLASMA_CHOL; #endif //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD; #else /* use the LU included with levmar */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU; #endif /* HAVE_LAPACK */ if(issolved){ /* compute p's new estimate and ||Dp||^2 */ for(i=0, Dp_L2=0.0; i<m; ++i){ pDp[i]=p[i] + (tmp=Dp[i]); Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */ stop=2; break; } if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)*LM_CNST(EPSILON))){ /* almost singular */ //if(Dp_L2>=(p_L2+eps2)/LM_CNST(EPSILON)){ /* almost singular */ stop=4; break; } (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ /* compute ||e(pDp)||_2 */ /* ### hx=x-hx, pDp_eL2=||hx|| */ #if 1 pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n); #else for(i=0, pDp_eL2=0.0; i<n; ++i){ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } #endif if(!LM_FINITE(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error. * This check makes sure that the inner loop does not run indefinitely. * Thanks to Steve Danauskas for reporting such cases */ stop=7; break; } for(i=0, dL=0.0; i<m; ++i) dL+=Dp[i]*(mu*Dp[i]+jacTe[i]); dF=p_eL2-pDp_eL2; if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */ tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0)); tmp=LM_CNST(1.0)-tmp*tmp*tmp; mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) ); nu=2; for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; break; } } /* if this point is reached, either the linear system could not be solved or * the error did not reduce; in any case, the increment must be rejected */ mu*=nu; nu2=nu<<1; // 2*nu; if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */ stop=5; break; } nu=nu2; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; } /* inner loop */ } if(k>=itmax) stop=3; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; if(info){ info[0]=init_p_eL2; info[1]=p_eL2; info[2]=jacTe_inf; info[3]=Dp_L2; for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i]; info[4]=mu/tmp; info[5]=(LM_REAL)k; info[6]=(LM_REAL)stop; info[7]=(LM_REAL)nfev; info[8]=(LM_REAL)njev; info[9]=(LM_REAL)nlss; } /* covariance matrix */ if(covar){ LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n); } if(freework) free(work); #ifdef LINSOLVERS_RETAIN_MEMORY if(linsolver) (*linsolver)(NULL, NULL, NULL, 0); #endif return (stop!=4 && stop!=7)? k : LM_ERROR; }
/* * This function returns the solution of Ax = b * * The function is based on QR decomposition with explicit computation of Q: * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes * Q R x = b or R x = Q^T b. * The last equation can be solved directly. * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_QR(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL *a, *qtb, *tau, *r, *work; int a_sz, qtb_sz, tau_sz, r_sz, tot_sz; register int i, j; blasint info, worksz, nrhs=1; register LM_REAL sum; #ifdef LINSOLVERS_RETAIN_MEMORY if(!A){ if(buf) free(buf); buf_sz=0; return 1; } #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ a_sz=m*m; qtb_sz=m; tau_sz=m; r_sz=m*m; /* only the upper triangular part really needed */ worksz=3*m; /* this is probably too much */ tot_sz=a_sz + qtb_sz + tau_sz + r_sz + worksz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QR) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QR) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; qtb=a+a_sz; tau=qtb+qtb_sz; r=tau+tau_sz; work=r+r_sz; /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; /* QR decomposition of A */ const blasint mm = m; GEQRF((blasint *)&mm, (blasint *)&mm, a, (blasint *)&mm, tau, work, (blasint *)&worksz, (blasint *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", GEQRF) " in ", AX_EQ_B_QR) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("Unknown LAPACK error %ld for ", GEQRF) " in ", AX_EQ_B_QR) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* R is stored in the upper triangular part of a; copy it in r so that ORGQR() below won't destroy it */ for(i=0; i<r_sz; i++) r[i]=a[i]; /* compute Q using the elementary reflectors computed by the above decomposition */ ORGQR((blasint *)&mm, (blasint *)&mm, (blasint *)&mm, a, (blasint *)&mm, tau, work, (blasint *)&worksz, (blasint *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", ORGQR) " in ", AX_EQ_B_QR) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT("Unknown LAPACK error (%ld) in ", AX_EQ_B_QR) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* Q is now in a; compute Q^T b in qtb */ for(i=0; i<m; i++){ for(j=0, sum=0.0; j<m; j++) sum+=a[i*m+j]*B[j]; qtb[i]=sum; } /* solve the linear system R x = Q^t b */ TRTRS("U", "N", "N", (blasint *)&mm, (blasint *)&nrhs, r, (blasint *)&mm, qtb, (blasint *)&mm, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", TRTRS) " in ", AX_EQ_B_QR) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %ld-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QR) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* copy the result in x */ for(i=0; i<m; i++) x[i]=qtb[i]; #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of Ax = b * * The function is based on SVD decomposition: * If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes * (U D V^T) x = b or x=V D^{-1} U^T b * Note that V D^{-1} U^T is the pseudoinverse A^+ * * A is mxm, b is mx1. * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_SVD(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; static LM_REAL eps=CNST(-1.0); register int i, j; LM_REAL *a, *u, *s, *vt, *work; int a_sz, u_sz, s_sz, vt_sz, tot_sz; LM_REAL thresh, one_over_denom; register LM_REAL sum; blasint info, rank, worksz, *iwork, iworksz; #ifdef LINSOLVERS_RETAIN_MEMORY if(!A){ if(buf) free(buf); buf_sz=0; return 1; } #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ worksz=16*m; /* more than needed */ iworksz=8*m; a_sz=m*m; u_sz=m*m; s_sz=m; vt_sz=m*m; tot_sz=iworksz*sizeof(blasint) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL); #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_SVD) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_SVD) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ iwork=(blasint *)buf; a=(LM_REAL *)(iwork+iworksz); /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; u=a + a_sz; s=u+u_sz; vt=s+s_sz; work=vt+vt_sz; const blasint mm = m; /* SVD decomposition of A */ GESVD("A", "A", (blasint *)&mm, (blasint *)&mm, a, (blasint *)&mm, s, u, (blasint *)&mm, vt, (blasint *)&mm, work, (blasint *)&worksz, &info); //GESDD("A", (blasint *)&mm, (blasint *)&mm, a, (blasint *)&mm, s, u, (blasint *)&mm, vt, (blasint *)&mm, work, (blasint *)&worksz, iwork, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", GESVD), "/" GESDD) " in ", AX_EQ_B_SVD) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", AX_EQ_B_SVD) "() [info=%ld]\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } if(eps<0.0){ LM_REAL aux; /* compute machine epsilon */ for(eps=CNST(1.0); aux=eps+CNST(1.0), aux-CNST(1.0)>0.0; eps*=CNST(0.5)) ; eps*=CNST(2.0); } /* compute the pseudoinverse in a */ for(i=0; i<a_sz; i++) a[i]=0.0; /* initialize to zero */ for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){ one_over_denom=CNST(1.0)/s[rank]; for(j=0; j<m; j++) for(i=0; i<m; i++) a[i*m+j]+=vt[rank+i*m]*u[j+rank*m]*one_over_denom; } /* compute A^+ b in x */ for(i=0; i<m; i++){ for(j=0, sum=0.0; j<m; j++) sum+=a[i*m+j]*B[j]; x[i]=sum; } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of min_x ||Ax - b|| * * || . || is the second order (i.e. L2) norm. This is a least squares technique that * is based on QR decomposition: * If A=Q R with Q orthogonal and R upper triangular, the normal equations become * (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b. * This amounts to solving R^T y = A^T b for y and then R x = y for x * Note that Q does not need to be explicitly computed * * A is mxn, b is mx1 * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_QRLS(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m, int n) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL *a, *atb, *tau, *r, *work; int a_sz, atb_sz, tau_sz, r_sz, tot_sz; register int i, j; blasint info, worksz, nrhs=1; register LM_REAL sum; #ifdef LINSOLVERS_RETAIN_MEMORY if(!A){ if(buf) free(buf); buf_sz=0; return 1; } #endif /* LINSOLVERS_RETAIN_MEMORY */ if(m<n){ fprintf(stderr, RCAT("Normal equations require that the number of rows is greater than number of columns in ", AX_EQ_B_QRLS) "() [%d x %d]! -- try transposing\n", m, n); exit(1); } /* calculate required memory size */ a_sz=m*n; atb_sz=n; tau_sz=n; r_sz=n*n; worksz=3*n; /* this is probably too much */ tot_sz=a_sz + atb_sz + tau_sz + r_sz + worksz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QRLS) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QRLS) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; atb=a+a_sz; tau=atb+atb_sz; r=tau+tau_sz; work=r+r_sz; /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<n; j++) a[i+j*m]=A[i*n+j]; /* compute A^T b in atb */ for(i=0; i<n; i++){ for(j=0, sum=0.0; j<m; j++) sum+=A[j*n+i]*B[j]; atb[i]=sum; } const blasint mm = m; const blasint nn = n; /* QR decomposition of A */ GEQRF((blasint *)&mm, (blasint *)&nn, a, (blasint *)&mm, tau, work, (blasint *)&worksz, (blasint *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", GEQRF) " in ", AX_EQ_B_QRLS) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("Unknown LAPACK error %ld for ", GEQRF) " in ", AX_EQ_B_QRLS) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* R is stored in the upper triangular part of a. Note that a is mxn while r nxn */ for(j=0; j<n; j++){ for(i=0; i<=j; i++) r[i+j*n]=a[i+j*m]; /* lower part is zero */ for(i=j+1; i<n; i++) r[i+j*n]=0.0; } /* solve the linear system R^T y = A^t b */ TRTRS("U", "T", "N", (blasint *)&nn, (blasint *)&nrhs, r, (blasint *)&nn, atb, (blasint *)&nn, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", TRTRS) " in ", AX_EQ_B_QRLS) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %ld-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QRLS) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* solve the linear system R x = y */ TRTRS("U", "N", "N", (blasint *)&nn, (blasint *)&nrhs, r, (blasint *)&nn, atb, (blasint *)&nn, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %ld of ", TRTRS) " in ", AX_EQ_B_QRLS) "()\n", (long)-info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %ld-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QRLS) "()\n", (long)info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* copy the result in x */ for(i=0; i<n; i++) x[i]=atb[i]; #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function seeks the parameter vector p that best describes the measurements * vector x under box & linear constraints. * More precisely, given a vector function func : R^m --> R^n with n>=m, * it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of * e=x-func(p) is minimized under the constraints lb[i]<=p[i]<=ub[i] and A p=b; * A is kxm, b kx1. Note that this function DOES NOT check the satisfiability of * the specified box and linear equation constraints. * If no lower bound constraint applies for p[i], use -DBL_MAX/-FLT_MAX for lb[i]; * If no upper bound constraint applies for p[i], use DBL_MAX/FLT_MAX for ub[i]. * * This function requires an analytic Jacobian. In case the latter is unavailable, * use LEVMAR_BLEC_DIF() bellow * * Returns the number of iterations (>=0) if successfull, LM_ERROR if failed * * For more details on the algorithm implemented by this function, please refer to * the comments in the top of this file. * */ int LEVMAR_BLEC_DER( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ LM_REAL *lb, /* I: vector of lower bounds. If NULL, no lower bounds apply */ LM_REAL *ub, /* I: vector of upper bounds. If NULL, no upper bounds apply */ LM_REAL *A, /* I: constraints matrix, kxm */ LM_REAL *b, /* I: right hand constraints vector, kx1 */ int k, /* I: number of constraints (i.e. A's #rows) */ LM_REAL *wghts, /* mx1 weights for penalty terms, defaults used if NULL */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu, * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * info[7]= # function evaluations * info[8]= # Jacobian evaluations */ LM_REAL *work, /* working memory at least LM_BLEC_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf. * Set to NULL if not needed */ { struct LMBLEC_DATA data; int ret; LM_REAL locinfo[LM_INFO_SZ]; register int i; if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_BLEC_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_BLEC_DIF) RCAT("() rather than ", LEVMAR_BLEC_DER) "()\n"); return LM_ERROR; } if(!LEVMAR_BOX_CHECK(lb, ub, m)){ fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n")); return LM_ERROR; } /* measurement vector needs to be extended by m */ if(x){ /* nonzero x */ data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL)); if(!data.x){ fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n")); exit(1); } for(i=0; i<n; ++i) data.x[i]=x[i]; for(i=n; i<n+m; ++i) data.x[i]=0.0; } else data.x=NULL; data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); if(!data.w){ fprintf(stderr, LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n")); exit(1); } data.bctype=(int *)(data.w+m); for(i=0; i<m; ++i){ data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i]; if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__; else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__; else data.bctype[i]=__BC_HIGH__; } data.lb=lb; data.ub=ub; data.func=func; data.jacf=jacf; data.adata=adata; if(!info) info=locinfo; /* make sure that LEVMAR_LEC_DER() is called with non-null info */ ret=LEVMAR_LEC_DER(LMBLEC_FUNC, LMBLEC_JACF, p, data.x, m, n+m, A, b, k, itmax, opts, info, work, covar, (void *)&data); if(data.x) free(data.x); free(data.w); return ret; }
/* * This function returns the solution of Ax = b * * The function is based on QR decomposition with explicit computation of Q: * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes * Q R x = b or R x = Q^T b. * The last equation can be solved directly. * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_QR(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; static int nb=0; /* no __STATIC__ decl. here! */ LM_REAL *a, *tau, *r, *work; int a_sz, tau_sz, r_sz, tot_sz; register int i, j; int info, worksz, nrhs=1; register LM_REAL sum; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ a_sz=m*m; tau_sz=m; r_sz=m*m; /* only the upper triangular part really needed */ if(!nb){ LM_REAL tmp; worksz=-1; // workspace query; optimal size is returned in tmp GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_REAL *)&tmp, (int *)&worksz, (int *)&info); nb=((int)tmp)/m; // optimal worksize is m*nb } worksz=nb*m; tot_sz=a_sz + tau_sz + r_sz + worksz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QR) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QR) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; tau=a+a_sz; r=tau+tau_sz; work=r+r_sz; /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; /* QR decomposition of A */ GEQRF((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQRF) " in ", AX_EQ_B_QR) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("Unknown LAPACK error %d for ", GEQRF) " in ", AX_EQ_B_QR) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* R is stored in the upper triangular part of a; copy it in r so that ORGQR() below won't destroy it */ memcpy(r, a, r_sz*sizeof(LM_REAL)); /* compute Q using the elementary reflectors computed by the above decomposition */ ORGQR((int *)&m, (int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", ORGQR) " in ", AX_EQ_B_QR) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("Unknown LAPACK error (%d) in ", AX_EQ_B_QR) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* Q is now in a; compute Q^T b in x */ for(i=0; i<m; i++){ for(j=0, sum=0.0; j<m; j++) sum+=a[i*m+j]*B[j]; x[i]=sum; } /* solve the linear system R x = Q^t b */ TRTRS("U", "N", "N", (int *)&m, (int *)&nrhs, r, (int *)&m, x, (int *)&m, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) " in ", AX_EQ_B_QR) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QR) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
int LEVMAR_BC_DER( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ LM_REAL *lb, /* I: vector of lower bounds. If NULL, no lower bounds apply */ LM_REAL *ub, /* I: vector of upper bounds. If NULL, no upper bounds apply */ LM_REAL *dscl, /* I: diagonal scaling constants. NULL implies no scaling */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu, * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used. * Note that ||J^T e||_inf is computed on free (not equal to lb[i] or ub[i]) variables only. */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * info[7]= # function evaluations * info[8]= # Jacobian evaluations * info[9]= # linear systems solved, i.e. # attempts for reducing error */ LM_REAL *work, /* working memory at least LM_BC_DER_WORKSZ() reals large, allocated if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf. * Set to NULL if not needed */ { register int i, j, k, l; int worksz, freework=0, issolved; /* temp work arrays */ LM_REAL *e, /* nx1 */ *hx, /* \hat{x}_i, nx1 */ *jacTe, /* J^T e_i mx1 */ *jac, /* nxm */ *jacTjac, /* mxm */ *Dp, /* mx1 */ *diag_jacTjac, /* diagonal of J^T J, mx1 */ *pDp, /* p + Dp, mx1 */ *sp_pDp=NULL; /* dscl*p or dscl*pDp, mx1 */ register LM_REAL mu, /* damping constant */ tmp; /* mainly used in matrix & vector multiplications */ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */ LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL init_p_eL2; int nu=2, nu2, stop=0, nfev, njev=0, nlss=0; const int nm=n*m; /* variables for constrained LM */ struct FUNC_STATE fstate; LM_REAL alpha=LM_CNST(1e-4), beta=LM_CNST(0.9), gamma=LM_CNST(0.99995), rho=LM_CNST(1e-8); LM_REAL t, t0, jacTeDp; LM_REAL tmin=LM_CNST(1e-12), tming=LM_CNST(1e-18); /* minimum step length for LS and PG steps */ const LM_REAL tini=LM_CNST(1.0); /* initial step length for LS and PG steps */ int nLMsteps=0, nLSsteps=0, nPGsteps=0, gprevtaken=0; int numactive; int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; mu=jacTe_inf=t=0.0; /* -Wall */ if(n<m){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m); return LM_ERROR; } if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_BC_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_BC_DIF) RCAT("() rather than ", LEVMAR_BC_DER) "()\n"); return LM_ERROR; } if(!LEVMAR_BOX_CHECK(lb, ub, m)){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): at least one lower bound exceeds the upper one\n")); return LM_ERROR; } if(dscl){ /* check that scaling consts are valid */ for(i=m; i-->0; ) if(dscl[i]<=0.0){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): scaling constants should be positive (scale %d: %g <= 0)\n"), i, dscl[i]); return LM_ERROR; } sp_pDp=(LM_REAL *)malloc(m*sizeof(LM_REAL)); if(!sp_pDp){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n")); return LM_ERROR; } } if(opts){ tau=opts[0]; eps1=opts[1]; eps2=opts[2]; eps2_sq=opts[2]*opts[2]; eps3=opts[3]; } else{ // use default values tau=LM_CNST(LM_INIT_MU); eps1=LM_CNST(LM_STOP_THRESH); eps2=LM_CNST(LM_STOP_THRESH); eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH); eps3=LM_CNST(LM_STOP_THRESH); } if(!work){ worksz=LM_BC_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m; work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ if(!work){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n")); return LM_ERROR; } freework=1; } /* set up work arrays */ e=work; hx=e + n; jacTe=hx + n; jac=jacTe + m; jacTjac=jac + nm; Dp=jacTjac + m*m; diag_jacTjac=Dp + m; pDp=diag_jacTjac + m; fstate.n=n; fstate.hx=hx; fstate.x=x; fstate.lb=lb; fstate.ub=ub; fstate.adata=adata; fstate.nfev=&nfev; /* see if starting point is within the feasible set */ for(i=0; i<m; ++i) pDp[i]=p[i]; BOXPROJECT(p, lb, ub, m); /* project to feasible set */ for(i=0; i<m; ++i) if(pDp[i]!=p[i]) fprintf(stderr, RCAT("Warning: component %d of starting point not feasible in ", LEVMAR_BC_DER) "()! [%g projected to %g]\n", i, pDp[i], p[i]); /* compute e=x - f(p) and its L2 norm */ (*func)(p, hx, m, n, adata); nfev=1; /* ### e=x-hx, p_eL2=||e|| */ #if 1 p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n); #else for(i=0, p_eL2=0.0; i<n; ++i){ e[i]=tmp=x[i]-hx[i]; p_eL2+=tmp*tmp; } #endif init_p_eL2=p_eL2; if(!LM_FINITE(p_eL2)) stop=7; if(dscl){ /* scale starting point and constraints */ for(i=m; i-->0; ) p[i]/=dscl[i]; BOXSCALE(lb, ub, dscl, m, 1); } for(k=0; k<itmax && !stop; ++k){ /* Note that p and e have been updated at a previous iteration */ if(p_eL2<=eps3){ /* error is small */ stop=6; break; } /* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. * Since J^T J is symmetric, its computation can be sped up by computing * only its upper triangular part and copying it to the lower part */ if(!dscl){ (*jacf)(p, jac, m, n, adata); ++njev; } else{ for(i=m; i-->0; ) sp_pDp[i]=p[i]*dscl[i]; (*jacf)(sp_pDp, jac, m, n, adata); ++njev; /* compute jac*D */ for(i=n; i-->0; ){ register LM_REAL *jacim; jacim=jac+i*m; for(j=m; j-->0; ) jacim[j]*=dscl[j]; // jac[i*m+j]*=dscl[j]; } } /* J^T J, J^T e */ if(nm<__BLOCKSZ__SQ){ // this is a small problem /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj. * Thus, the product J^T J can be computed using an outer loop for * l that adds J_li*J_lj to each element ij of the result. Note that * with this scheme, the accesses to J and JtJ are always along rows, * therefore induces less cache misses compared to the straightforward * algorithm for computing the product (i.e., l loop is innermost one). * A similar scheme applies to the computation of J^T e. * However, for large minimization problems (i.e., involving a large number * of unknowns and measurements) for which J/J^T J rows are too large to * fit in the L1 cache, even this scheme incures many cache misses. In * such cases, a cache-efficient blocking scheme is preferable. * * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * performance problem. * * Note that the non-blocking algorithm is faster on small * problems since in this case it avoids the overheads of blocking. */ register LM_REAL alpha, *jaclm, *jacTjacim; /* looping downwards saves a few computations */ for(i=m*m; i-->0; ) jacTjac[i]=0.0; for(i=m; i-->0; ) jacTe[i]=0.0; for(l=n; l-->0; ){ jaclm=jac+l*m; for(i=m; i-->0; ){ jacTjacim=jacTjac+i*m; alpha=jaclm[i]; //jac[l*m+i]; for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */ jacTjacim[j]+=jaclm[j]*alpha; //jacTjac[i*m+j]+=jac[l*m+j]*alpha /* J^T e */ jacTe[i]+=alpha*e[l]; } } for(i=m; i-->0; ) /* copy to upper part */ for(j=i+1; j<m; ++j) jacTjac[i*m+j]=jacTjac[j*m+i]; } else{ // this is a large problem /* Cache efficient computation of J^T J based on blocking */ LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m); /* cache efficient computation of J^T e */ for(i=0; i<m; ++i) jacTe[i]=0.0; for(i=0; i<n; ++i){ register LM_REAL *jacrow; for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l) jacTe[l]+=jacrow[l]*tmp; } } /* Compute ||J^T e||_inf and ||p||^2. Note that ||J^T e||_inf * is computed for free (i.e. inactive) variables only. * At a local minimum, if p[i]==ub[i] then g[i]>0; * if p[i]==lb[i] g[i]<0; otherwise g[i]=0 */ for(i=j=numactive=0, p_L2=jacTe_inf=0.0; i<m; ++i){ if(ub && p[i]==ub[i]){ ++numactive; if(jacTe[i]>0.0) ++j; } else if(lb && p[i]==lb[i]){ ++numactive; if(jacTe[i]<0.0) ++j; } else if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp; diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */ p_L2+=p[i]*p[i]; } //p_L2=sqrt(p_L2); #if 0 if(!(k%100)){ printf("Current estimate: "); for(i=0; i<m; ++i) printf("%.9g ", p[i]); printf("-- errors %.9g %0.9g, #active %d [%d]\n", jacTe_inf, p_eL2, numactive, j); } #endif /* check for convergence */ if(j==numactive && (jacTe_inf <= eps1)){ Dp_L2=0.0; /* no increment for p in this case */ stop=1; break; } /* compute initial damping factor */ if(k==0){ if(!lb && !ub){ /* no bounds */ for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */ mu=tau*tmp; } else mu=LM_CNST(0.5)*tau*p_eL2; /* use Kanzow's starting mu */ } /* determine increment using a combination of adaptive damping, line search and projected gradient search */ while(1){ /* augment normal equations */ for(i=0; i<m; ++i) jacTjac[i*m+i]+=mu; /* solve augmented equations */ #ifdef HAVE_LAPACK /* 7 alternatives are available: LU, Cholesky + Cholesky with PLASMA, LDLt, 2 variants of QR decomposition and SVD. * For matrices with dimensions of at least a few hundreds, the PLASMA implementation of Cholesky is the fastest. * From the serial solvers, Cholesky is the fastest but might occasionally be inapplicable due to numerical round-off; * QR is slower but more robust; SVD is the slowest but most robust; LU is quite robust but * slower than LDLt; LDLt offers a good tradeoff between robustness and speed */ issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK; //issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL; #ifdef HAVE_PLASMA //issolved=AX_EQ_B_PLASMA_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_PLASMA_CHOL; #endif //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD; #else /* use the LU included with levmar */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU; #endif /* HAVE_LAPACK */ if(issolved){ for(i=0; i<m; ++i) pDp[i]=p[i] + Dp[i]; /* compute p's new estimate and ||Dp||^2 */ BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ for(i=0, Dp_L2=0.0; i<m; ++i){ Dp[i]=tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ stop=2; break; } if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)*LM_CNST(EPSILON))){ /* almost singular */ stop=4; break; } if(!dscl){ (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ } else{ for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i]; (*func)(sp_pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ } /* ### hx=x-hx, pDp_eL2=||hx|| */ #if 1 pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n); #else for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } #endif /* the following test ensures that the computation of pDp_eL2 has not overflowed. * Such an overflow does no harm here, thus it is not signalled as an error */ if(!LM_FINITE(pDp_eL2) && !LM_FINITE(VECNORM(hx, n))){ stop=7; break; } if(pDp_eL2<=gamma*p_eL2){ for(i=0, dL=0.0; i<m; ++i) dL+=Dp[i]*(mu*Dp[i]+jacTe[i]); #if 1 if(dL>0.0){ dF=p_eL2-pDp_eL2; tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0)); tmp=LM_CNST(1.0)-tmp*tmp*tmp; mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) ); } else{ tmp=LM_CNST(0.1)*pDp_eL2; /* pDp_eL2 is the new p_eL2 */ mu=(mu>=tmp)? tmp : mu; } #else tmp=LM_CNST(0.1)*pDp_eL2; /* pDp_eL2 is the new p_eL2 */ mu=(mu>=tmp)? tmp : mu; #endif nu=2; for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; ++nLMsteps; gprevtaken=0; break; } /* note that if the LM step is not taken, code falls through to the LM line search below */ } else{ /* the augmented linear system could not be solved, increase mu */ mu*=nu; nu2=nu<<1; // 2*nu; if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */ stop=5; break; } nu=nu2; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; continue; /* solve again with increased nu */ } /* if this point is reached, the LM step did not reduce the error; * see if it is a descent direction */ /* negate jacTe (i.e. g) & compute g^T * Dp */ for(i=0, jacTeDp=0.0; i<m; ++i){ jacTe[i]=-jacTe[i]; jacTeDp+=jacTe[i]*Dp[i]; } if(jacTeDp<=-rho*pow(Dp_L2, LM_CNST(_POW_)/LM_CNST(2.0))){ /* Dp is a descent direction; do a line search along it */ #if 1 /* use Schnabel's backtracking line search; it requires fewer "func" evaluations */ { int mxtake, iretcd; LM_REAL stepmx, steptl=LM_CNST(1e3)*(LM_REAL)sqrt(LM_REAL_EPSILON); tmp=(LM_REAL)sqrt(p_L2); stepmx=LM_CNST(1e3)*( (tmp>=LM_CNST(1.0))? tmp : LM_CNST(1.0) ); LNSRCH(m, p, p_eL2, jacTe, Dp, alpha, pDp, &pDp_eL2, func, &fstate, &mxtake, &iretcd, stepmx, steptl, dscl); /* NOTE: LNSRCH() updates hx */ if(iretcd!=0 || !LM_FINITE(pDp_eL2)) goto gradproj; /* rather inelegant but effective way to handle LNSRCH() failures... */ } #else /* use the simpler (but slower!) line search described by Kanzow et al */ for(t=tini; t>tmin; t*=beta){ for(i=0; i<m; ++i) pDp[i]=p[i] + t*Dp[i]; BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ if(!dscl){ (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + t*Dp */ } else{ for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i]; (*func)(sp_pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + t*Dp */ } /* compute ||e(pDp)||_2 */ /* ### hx=x-hx, pDp_eL2=||hx|| */ #if 1 pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n); #else for(i=0, pDp_eL2=0.0; i<n; ++i){ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } #endif /* ||e(pDp)||_2 */ if(!LM_FINITE(pDp_eL2)) goto gradproj; /* treat as line search failure */ //if(LM_CNST(0.5)*pDp_eL2<=LM_CNST(0.5)*p_eL2 + t*alpha*jacTeDp) break; if(pDp_eL2<=p_eL2 + LM_CNST(2.0)*t*alpha*jacTeDp) break; } #endif /* line search alternatives */ ++nLSsteps; gprevtaken=0; /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2. * These values are used below to update their corresponding variables */ } else{ /* Note that this point can also be reached via a goto when LNSRCH() fails. */ gradproj: /* jacTe has been negated above. Being a descent direction, it is next used * to make a projected gradient step */ /* compute ||g|| */ for(i=0, tmp=0.0; i<m; ++i) tmp+=jacTe[i]*jacTe[i]; tmp=(LM_REAL)sqrt(tmp); tmp=LM_CNST(100.0)/(LM_CNST(1.0)+tmp); t0=(tmp<=tini)? tmp : tini; /* guard against poor scaling & large steps; see (3.50) in C.T. Kelley's book */ /* if the previous step was along the gradient descent, try to use the t employed in that step */ for(t=(gprevtaken)? t : t0; t>tming; t*=beta){ for(i=0; i<m; ++i) pDp[i]=p[i] - t*jacTe[i]; BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ for(i=0, Dp_L2=0.0; i<m; ++i){ Dp[i]=tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } if(!dscl){ (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p - t*g */ } else{ for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i]; (*func)(sp_pDp, hx, m, n, adata); ++nfev; /* evaluate function at p - t*g */ } /* compute ||e(pDp)||_2 */ /* ### hx=x-hx, pDp_eL2=||hx|| */ #if 1 pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n); #else for(i=0, pDp_eL2=0.0; i<n; ++i){ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } #endif /* the following test ensures that the computation of pDp_eL2 has not overflowed. * Such an overflow does no harm here, thus it is not signalled as an error */ if(!LM_FINITE(pDp_eL2) && !LM_FINITE(VECNORM(hx, n))){ stop=7; goto breaknested; } /* compute ||g^T * Dp||. Note that if pDp has not been altered by projection * (i.e. BOXPROJECT), jacTeDp=-t*||g||^2 */ for(i=0, jacTeDp=0.0; i<m; ++i) jacTeDp+=jacTe[i]*Dp[i]; if(gprevtaken && pDp_eL2<=p_eL2 + LM_CNST(2.0)*LM_CNST(0.99999)*jacTeDp){ /* starting t too small */ t=t0; gprevtaken=0; continue; } //if(LM_CNST(0.5)*pDp_eL2<=LM_CNST(0.5)*p_eL2 + alpha*jacTeDp) terminatePGLS; if(pDp_eL2<=p_eL2 + LM_CNST(2.0)*alpha*jacTeDp) goto terminatePGLS; //if(pDp_eL2<=p_eL2 - LM_CNST(2.0)*alpha/t*Dp_L2) goto terminatePGLS; // sufficient decrease condition proposed by Kelley in (5.13) } /* if this point is reached then the gradient line search has failed */ gprevtaken=0; break; terminatePGLS: ++nPGsteps; gprevtaken=1; /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2 */ } /* update using computed values */ for(i=0, Dp_L2=0.0; i<m; ++i){ tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ stop=2; break; } for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; break; } /* inner loop */ } breaknested: /* NOTE: this point is also reached via an explicit goto! */ if(k>=itmax) stop=3; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; if(info){ info[0]=init_p_eL2; info[1]=p_eL2; info[2]=jacTe_inf; info[3]=Dp_L2; for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i]; info[4]=mu/tmp; info[5]=(LM_REAL)k; info[6]=(LM_REAL)stop; info[7]=(LM_REAL)nfev; info[8]=(LM_REAL)njev; info[9]=(LM_REAL)nlss; } /* covariance matrix */ if(covar){ LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n); if(dscl){ /* correct for the scaling */ for(i=m; i-->0; ) for(j=m; j-->0; ) covar[i*m+j]*=(dscl[i]*dscl[j]); } } if(freework) free(work); #ifdef LINSOLVERS_RETAIN_MEMORY if(linsolver) (*linsolver)(NULL, NULL, NULL, 0); #endif #if 0 printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSsteps, nPGsteps); #endif if(dscl){ /* scale final point and constraints */ for(i=0; i<m; ++i) p[i]*=dscl[i]; BOXSCALE(lb, ub, dscl, m, 0); free(sp_pDp); } return (stop!=4 && stop!=7)? k : LM_ERROR; }
/* * This function returns the solution of min_x ||Ax - b|| * * || . || is the second order (i.e. L2) norm. This is a least squares technique that * is based on QR decomposition: * If A=Q R with Q orthogonal and R upper triangular, the normal equations become * (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b. * This amounts to solving R^T y = A^T b for y and then R x = y for x * Note that Q does not need to be explicitly computed * * A is mxn, b is mx1 * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_QRLS(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m, int n) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; static int nb=0; /* no __STATIC__ decl. here! */ LM_REAL *a, *tau, *r, *work; int a_sz, tau_sz, r_sz, tot_sz; register int i, j; int info, worksz, nrhs=1; register LM_REAL sum; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ if(m<n){ fprintf(stderr, RCAT("Normal equations require that the number of rows is greater than number of columns in ", AX_EQ_B_QRLS) "() [%d x %d]! -- try transposing\n", m, n); exit(1); } /* calculate required memory size */ a_sz=m*n; tau_sz=n; r_sz=n*n; if(!nb){ LM_REAL tmp; worksz=-1; // workspace query; optimal size is returned in tmp GEQRF((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (LM_REAL *)&tmp, (int *)&worksz, (int *)&info); nb=((int)tmp)/m; // optimal worksize is m*nb } worksz=nb*m; tot_sz=a_sz + tau_sz + r_sz + worksz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QRLS) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_QRLS) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; tau=a+a_sz; r=tau+tau_sz; work=r+r_sz; /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<n; j++) a[i+j*m]=A[i*n+j]; /* compute A^T b in x */ for(i=0; i<n; i++){ for(j=0, sum=0.0; j<m; j++) sum+=A[j*n+i]*B[j]; x[i]=sum; } /* QR decomposition of A */ GEQRF((int *)&m, (int *)&n, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQRF) " in ", AX_EQ_B_QRLS) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("Unknown LAPACK error %d for ", GEQRF) " in ", AX_EQ_B_QRLS) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* R is stored in the upper triangular part of a. Note that a is mxn while r nxn */ for(j=0; j<n; j++){ for(i=0; i<=j; i++) r[i+j*n]=a[i+j*m]; /* lower part is zero */ for(i=j+1; i<n; i++) r[i+j*n]=0.0; } /* solve the linear system R^T y = A^t b */ TRTRS("U", "T", "N", (int *)&n, (int *)&nrhs, r, (int *)&n, x, (int *)&n, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) " in ", AX_EQ_B_QRLS) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QRLS) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } /* solve the linear system R x = y */ TRTRS("U", "N", "N", (int *)&n, (int *)&nrhs, r, (int *)&n, x, (int *)&n, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) " in ", AX_EQ_B_QRLS) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_QRLS) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* Similar to the LEVMAR_BLEC_DER() function above, except that the Jacobian is approximated * with the aid of finite differences (forward or central, see the comment for the opts argument) */ int LEVMAR_BLEC_DIF( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ LM_REAL *lb, /* I: vector of lower bounds. If NULL, no lower bounds apply */ LM_REAL *ub, /* I: vector of upper bounds. If NULL, no upper bounds apply */ LM_REAL *A, /* I: constraints matrix, kxm */ LM_REAL *b, /* I: right hand constraints vector, kx1 */ int k, /* I: number of constraints (i.e. A's #rows) */ LM_REAL *wghts, /* mx1 weights for penalty terms, defaults used if NULL */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[5], /* I: opts[0-3] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the * scale factor for initial \mu, stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2 and * the step used in difference approximation to the Jacobian. Set to NULL for defaults to be used. * If \delta<0, the Jacobian is approximated with central differences which are more accurate * (but slower!) compared to the forward differences employed by default. */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error * info[7]= # function evaluations * info[8]= # Jacobian evaluations * info[9]= # linear systems solved, i.e. # attempts for reducing error */ LM_REAL *work, /* working memory at least LM_BLEC_DIF_WORKSZ() reals large, allocated if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func. * Set to NULL if not needed */ { struct LMBLEC_DATA data; int ret; register int i; LM_REAL locinfo[LM_INFO_SZ]; if(!lb && !ub){ PRINT_ERROR(RCAT(LCAT(LEVMAR_BLEC_DIF, "(): lower and upper bounds for box constraints cannot be both NULL, use "), LEVMAR_LEC_DIF) "() in this case!\n"); return LM_ERROR_NO_BOX_CONSTRAINTS; } if(!LEVMAR_BOX_CHECK(lb, ub, m)){ PRINT_ERROR(LCAT(LEVMAR_BLEC_DER, "(): at least one lower bound exceeds the upper one\n")); return LM_ERROR_FAILED_BOX_CHECK; } /* measurement vector needs to be extended by m */ if(x){ /* nonzero x */ data.x=(LM_REAL *)malloc((n+m)*sizeof(LM_REAL)); if(!data.x){ PRINT_ERROR(LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #1 failed\n")); return LM_ERROR_MEMORY_ALLOCATION_FAILURE; } for(i=0; i<n; ++i) data.x[i]=x[i]; for(i=n; i<n+m; ++i) data.x[i]=0.0; } else data.x=NULL; data.w=(LM_REAL *)malloc(m*sizeof(LM_REAL) + m*sizeof(int)); /* should be arranged in that order for proper doubles alignment */ if(!data.w){ PRINT_ERROR(LCAT(LEVMAR_BLEC_DER, "(): memory allocation request #2 failed\n")); if(data.x) free(data.x); return LM_ERROR_MEMORY_ALLOCATION_FAILURE; } data.bctype=(int *)(data.w+m); /* note: at this point, one of lb, ub are not NULL */ for(i=0; i<m; ++i){ data.w[i]=(!wghts)? __BC_WEIGHT__ : wghts[i]; if(!lb) data.bctype[i]=__BC_HIGH__; else if(!ub) data.bctype[i]=__BC_LOW__; else if(ub[i]!=LM_REAL_MAX && lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_INTERVAL__; else if(lb[i]!=LM_REAL_MIN) data.bctype[i]=__BC_LOW__; else data.bctype[i]=__BC_HIGH__; } data.lb=lb; data.ub=ub; data.func=func; data.jacf=NULL; data.adata=adata; if(!info) info=locinfo; /* make sure that LEVMAR_LEC_DIF() is called with non-null info */ ret=LEVMAR_LEC_DIF(LMBLEC_FUNC, p, data.x, m, n+m, A, b, k, itmax, opts, info, work, covar, (void *)&data); if(data.x) free(data.x); free(data.w); return ret; }
/* * This function returns the solution of Ax=b * * The function assumes that A is symmetric & positive definite and employs the * Cholesky decomposition implemented by PLASMA for homogeneous multicore processors. * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_PLASMA_CHOL(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL *a; int a_sz, tot_sz; int info, nrhs=1; if(A==NULL){ #ifdef LINSOLVERS_RETAIN_MEMORY if(buf) free(buf); buf=NULL; buf_sz=0; #endif /* LINSOLVERS_RETAIN_MEMORY */ PLASMA_Finalize(); PLASMA_ncores=-PLASMA_ncores; return 1; } /* calculate required memory size */ a_sz=m*m; tot_sz=a_sz; #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_PLASMA_CHOL) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_PLASMA_CHOL) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; /* store A into a and B into x; A is assumed to be symmetric, * hence no transposition is needed */ memcpy(a, A, a_sz*sizeof(LM_REAL)); memcpy(x, B, m*sizeof(LM_REAL)); /* initialize PLASMA */ if(PLASMA_ncores<0){ PLASMA_ncores=-PLASMA_ncores; PLASMA_Init(PLASMA_ncores); fprintf(stderr, RCAT("\n", AX_EQ_B_PLASMA_CHOL) "(): PLASMA is running on %d cores.\n\n", PLASMA_ncores); } /* Solve the linear system */ info=PLASMA_POSV(PlasmaLower, m, 1, a, m, x, m); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", PLASMA_POSV) " in ", AX_EQ_B_PLASMA_CHOL) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\n" "the factorization could not be completed for ", PLASMA_POSV) " in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of Ax=b * * The function assumes that A is symmetric & postive definite and employs * the Cholesky decomposition: * If A=U^T U with U upper triangular, the system to be solved becomes * (U^T U) x = b * This amount to solving U^T y = b for y and then U x = y for x * * A is mxm, b is mx1 * * The function returns 0 in case of error, 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_CHOL(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { LM_REAL stackBuf[16384]; const int stackBuf_sz = 16384; __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL *a, *b; int a_sz, b_sz, tot_sz; register int i, j; int info, nrhs=1; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ a_sz=m*m; b_sz=m; tot_sz=a_sz + b_sz; if(tot_sz <= stackBuf_sz) { a=stackBuf; } else { #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_CHOL) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_CHOL) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; } b=a+a_sz; /* store A (column major!) into a anb B into b */ for(i=0; i<m; i++){ for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; b[i]=B[i]; } /* Cholesky decomposition of A */ POTF2("U", (int *)&m, a, (int *)&m, (int *)&info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ //fprintf(stderr, RCAT(RCAT("LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for ", POTF2) " in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf) free(buf); /* free previously allocated memory */ #endif return 0; } } /* solve the linear system U^T y = b */ TRTRS("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) " in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf) free(buf); /* free previously allocated memory */ #endif return 0; } } /* solve the linear system U x = y */ TRTRS("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRS) "in ", AX_EQ_B_CHOL) "()\n", -info); exit(1); } else{ //fprintf(stderr, RCAT("LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in ", AX_EQ_B_CHOL) "()\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf) free(buf); /* free previously allocated memory */ #endif return 0; } } /* copy the result in x */ for(i=0; i<m; i++) x[i]=b[i]; #ifndef LINSOLVERS_RETAIN_MEMORY if(buf) free(buf); /* free previously allocated memory */ #endif return 1; }
/* * This function returns the solution of Ax = b * * The function is based on SVD decomposition: * If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes * (U D V^T) x = b or x=V D^{-1} U^T b * Note that V D^{-1} U^T is the pseudoinverse A^+ * * A is mxm, b is mx1. * * The function returns 0 in case of error, 1 if successful * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_SVD(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; static LM_REAL eps=LM_CNST(-1.0); register int i, j; LM_REAL *a, *u, *s, *vt, *work; int a_sz, u_sz, s_sz, vt_sz, tot_sz; LM_REAL thresh, one_over_denom; register LM_REAL sum; int info, rank, worksz, *iwork, iworksz; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ #if 1 /* use optimal size */ worksz=-1; // workspace query. Keep in mind that GESDD requires more memory than GESVD /* note that optimal work size is returned in thresh */ GESVD("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, &info); //GESDD("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (LM_REAL *)&thresh, (int *)&worksz, NULL, &info); worksz=(int)thresh; #else /* use minimum size */ worksz=5*m; // min worksize for GESVD //worksz=m*(7*m+4); // min worksize for GESDD #endif iworksz=8*m; a_sz=m*m; u_sz=m*m; s_sz=m; vt_sz=m*m; tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL) + iworksz*sizeof(int); /* should be arranged in that order for proper doubles alignment */ #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_SVD) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_SVD) "() failed!\n"); exit(1); } #endif /* LINSOLVERS_RETAIN_MEMORY */ a=buf; u=a+a_sz; s=u+u_sz; vt=s+s_sz; work=vt+vt_sz; iwork=(int *)(work+worksz); /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; /* SVD decomposition of A */ GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", AX_EQ_B_SVD) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", AX_EQ_B_SVD) "() [info=%d]\n", info); #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 0; } } if(eps<0.0){ LM_REAL aux; /* compute machine epsilon */ for(eps=LM_CNST(1.0); aux=eps+LM_CNST(1.0), aux-LM_CNST(1.0)>0.0; eps*=LM_CNST(0.5)) ; eps*=LM_CNST(2.0); } /* compute the pseudoinverse in a */ for(i=0; i<a_sz; i++) a[i]=0.0; /* initialize to zero */ for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){ one_over_denom=LM_CNST(1.0)/s[rank]; for(j=0; j<m; j++) for(i=0; i<m; i++) a[i*m+j]+=vt[rank+i*m]*u[j+rank*m]*one_over_denom; } /* compute A^+ b in x */ for(i=0; i<m; i++){ for(j=0, sum=0.0; j<m; j++) sum+=a[i*m+j]*B[j]; x[i]=sum; } #ifndef LINSOLVERS_RETAIN_MEMORY free(buf); #endif return 1; }
/* * This function returns the solution of Ax = b * * The function employs LU decomposition: * If A=L U with L lower and U upper triangular, then the original system * amounts to solving * L y = b, U x = y * * A is mxm, b is mx1 * * The function returns 0 in case of error, * 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { __STATIC__ LM_REAL *buf=NULL; __STATIC__ int buf_sz=0; LM_REAL stackBuf[2048]; const int stackBuf_sz=2048; int a_sz, ipiv_sz, b_sz, work_sz, tot_sz; register int i, j; int info, *ipiv, nrhs=1; LM_REAL *a, *b, *work; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ ipiv_sz=m; a_sz=m*m; b_sz=m; work_sz=100*m; /* this is probably too much */ tot_sz=ipiv_sz + a_sz + b_sz + work_sz; // ipiv_sz counted as LM_REAL here, no harm is done though #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf) free(buf); /* free previously allocated memory */ buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } #else buf_sz=tot_sz; if(buf_sz <= stackBuf_sz) { buf=stackBuf; } else { buf=(LM_REAL *)malloc(buf_sz*sizeof(LM_REAL)); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } #endif /* LINSOLVERS_RETAIN_MEMORY */ ipiv=(int *)buf; a=(LM_REAL *)(ipiv + ipiv_sz); b=a+a_sz; work=b+b_sz; /* store A (column major!) into a and B into b */ for(i=0; i<m; i++){ for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; b[i]=B[i]; } /* LU decomposition for A */ GETRF((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("argument %d of ", GETRF) " illegal in ", AX_EQ_B_LU) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("singular matrix A for ", GETRF) " in ", AX_EQ_B_LU) "()\n"); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf != stackBuf) free(buf); #endif return 0; } } /* solve the system with the computed LU */ GETRS("N", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, b, (int *)&m, (int *)&info); if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("argument %d of ", GETRS) " illegal in ", AX_EQ_B_LU) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT(RCAT("unknown error for ", GETRS) " in ", AX_EQ_B_LU) "()\n"); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf != stackBuf) free(buf); #endif return 0; } } /* copy the result in x */ for(i=0; i<m; i++){ x[i]=b[i]; } #ifndef LINSOLVERS_RETAIN_MEMORY if(buf != stackBuf) free(buf); #endif return 1; }
/* * This function implements an elimination strategy for linearly constrained * optimization problems. The strategy relies on QR decomposition to transform * an optimization problem constrained by Ax=b to an equivalent, unconstrained * one. Also referred to as "null space" or "reduced Hessian" method. * See pp. 430-433 (chap. 15) of "Numerical Optimization" by Nocedal-Wright * for details. * * A is mxn with m<=n and rank(A)=m * Two matrices Y and Z of dimensions nxm and nx(n-m) are computed from A^T so that * their columns are orthonormal and every x can be written as x=Y*b + Z*x_z= * c + Z*x_z, where c=Y*b is a fixed vector of dimension n and x_z is an * arbitrary vector of dimension n-m. Then, the problem of minimizing f(x) * subject to Ax=b is equivalent to minimizing f(c + Z*x_z) with no constraints. * The computed Y and Z are such that any solution of Ax=b can be written as * x=Y*x_y + Z*x_z for some x_y, x_z. Furthermore, A*Y is nonsingular, A*Z=0 * and Z spans the null space of A. * * The function accepts A, b and computes c, Y, Z. If b or c is NULL, c is not * computed. Also, Y can be NULL in which case it is not referenced. * The function returns 0 in case of error, A's computed rank if successfull * */ static int LMLEC_ELIM(LM_REAL *A, LM_REAL *b, LM_REAL *c, LM_REAL *Y, LM_REAL *Z, int m, int n) { static LM_REAL eps=CNST(-1.0); LM_REAL *buf=NULL; LM_REAL *a, *tau, *work, *r; register LM_REAL tmp; int a_sz, jpvt_sz, tau_sz, r_sz, Y_sz, worksz; int info, rank, *jpvt, tot_sz, mintmn, tm, tn; register int i, j, k; if(m>n){ fprintf(stderr, RCAT("matrix of constraints cannot have more rows than columns in", LMLEC_ELIM) "()!\n"); exit(1); } tm=n; tn=m; // transpose dimensions mintmn=m; /* calculate required memory size */ a_sz=tm*tm; // tm*tn is enough for xgeqp3() jpvt_sz=tn; tau_sz=mintmn; r_sz=mintmn*mintmn; // actually smaller if a is not of full row rank worksz=2*tn+(tn+1)*32; // more than needed Y_sz=(Y)? 0 : tm*tn; tot_sz=jpvt_sz*sizeof(int) + (a_sz + tau_sz + r_sz + worksz + Y_sz)*sizeof(LM_REAL); buf=(LM_REAL *)malloc(tot_sz); /* allocate a "big" memory chunk at once */ if(!buf){ fprintf(stderr, RCAT("Memory allocation request failed in ", LMLEC_ELIM) "()\n"); exit(1); } a=(LM_REAL *)buf; jpvt=(int *)(a+a_sz); tau=(LM_REAL *)(jpvt + jpvt_sz); r=tau+tau_sz; work=r+r_sz; if(!Y) Y=work+worksz; /* copy input array so that LAPACK won't destroy it. Note that copying is * done in row-major order, which equals A^T in column-major */ for(i=0; i<tm*tn; ++i) a[i]=A[i]; /* clear jpvt */ for(i=0; i<jpvt_sz; ++i) jpvt[i]=0; /* rank revealing QR decomposition of A^T*/ GEQP3((int *)&tm, (int *)&tn, a, (int *)&tm, jpvt, tau, work, (int *)&worksz, &info); //dgeqpf_((int *)&tm, (int *)&tn, a, (int *)&tm, jpvt, tau, work, &info); /* error checking */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GEQP3) " in ", LMLEC_ELIM) "()\n", -info); exit(1); } else if(info>0){ fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", GEQP3) " in ", LMLEC_ELIM) "()\n", info); free(buf); return 0; } } /* the upper triangular part of a now contains the upper triangle of the unpermuted R */ if(eps<0.0){ LM_REAL aux; /* compute machine epsilon. DBL_EPSILON should do also */ for(eps=CNST(1.0); aux=eps+CNST(1.0), aux-CNST(1.0)>0.0; eps*=CNST(0.5)) ; eps*=CNST(2.0); } tmp=tm*CNST(10.0)*eps*FABS(a[0]); // threshold. tm is max(tm, tn) tmp=(tmp>CNST(1E-12))? tmp : CNST(1E-12); // ensure that threshold is not too small /* compute A^T's numerical rank by counting the non-zeros in R's diagonal */ for(i=rank=0; i<mintmn; ++i) if(a[i*(tm+1)]>tmp || a[i*(tm+1)]<-tmp) ++rank; /* loop across R's diagonal elements */ else break; /* diagonal is arranged in absolute decreasing order */ if(rank<tn){ fprintf(stderr, RCAT("\nConstraints matrix in ", LMLEC_ELIM) "() is not of full row rank (i.e. %d < %d)!\n" "Make sure that you do not specify redundant or inconsistent constraints.\n\n", rank, tn); exit(1); } /* compute the permuted inverse transpose of R */ /* first, copy R from the upper triangular part of a to r. R is rank x rank */ for(j=0; j<rank; ++j){ for(i=0; i<=j; ++i) r[i+j*rank]=a[i+j*tm]; for(i=j+1; i<rank; ++i) r[i+j*rank]=0.0; // lower part is zero } /* compute the inverse */ TRTRI("U", "N", (int *)&rank, r, (int *)&rank, &info); /* error checking */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", TRTRI) " in ", LMLEC_ELIM) "()\n", -info); exit(1); } else if(info>0){ fprintf(stderr, RCAT(RCAT("A(%d, %d) is exactly zero for ", TRTRI) " (singular matrix) in ", LMLEC_ELIM) "()\n", info, info); free(buf); return 0; } } /* then, transpose r in place */ for(i=0; i<rank; ++i) for(j=i+1; j<rank; ++j){ tmp=r[i+j*rank]; k=j+i*rank; r[i+j*rank]=r[k]; r[k]=tmp; } /* finally, permute R^-T using Y as intermediate storage */ for(j=0; j<rank; ++j) for(i=0, k=jpvt[j]-1; i<rank; ++i) Y[i+k*rank]=r[i+j*rank]; for(i=0; i<rank*rank; ++i) // copy back to r r[i]=Y[i]; /* resize a to be tm x tm, filling with zeroes */ for(i=tm*tn; i<tm*tm; ++i) a[i]=0.0; /* compute Q in a as the product of elementary reflectors. Q is tm x tm */ ORGQR((int *)&tm, (int *)&tm, (int *)&mintmn, a, (int *)&tm, tau, work, &worksz, &info); /* error checking */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT("LAPACK error: illegal value for argument %d of ", ORGQR) " in ", LMLEC_ELIM) "()\n", -info); exit(1); } else if(info>0){ fprintf(stderr, RCAT(RCAT("unknown LAPACK error (%d) for ", ORGQR) " in ", LMLEC_ELIM) "()\n", info); free(buf); return 0; } } /* compute Y=Q_1*R^-T*P^T. Y is tm x rank */ for(i=0; i<tm; ++i) for(j=0; j<rank; ++j){ for(k=0, tmp=0.0; k<rank; ++k) tmp+=a[i+k*tm]*r[k+j*rank]; Y[i*rank+j]=tmp; } if(b && c){ /* compute c=Y*b */ for(i=0; i<tm; ++i){ for(j=0, tmp=0.0; j<rank; ++j) tmp+=Y[i*rank+j]*b[j]; c[i]=tmp; } } /* copy Q_2 into Z. Z is tm x (tm-rank) */ for(j=0; j<tm-rank; ++j) for(i=0, k=j+rank; i<tm; ++i) Z[i*(tm-rank)+j]=a[i+k*tm]; free(buf); return rank; }
/* * This function returns the solution of Ax = b * * The function employs LU decomposition followed by forward/back substitution (see * also the LAPACK-based LU solver above) * * A is mxm, b is mx1 * * The function returns 0 in case of error, * 1 if successfull * * This function is often called repetitively to solve problems of identical * dimensions. To avoid repetitive malloc's and free's, allocated memory is * retained between calls and free'd-malloc'ed when not of the appropriate size. * A call with NULL as the first argument forces this memory to be released. */ int AX_EQ_B_LU(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m) { #define STATICBUFLEN 32768 static staticBuf[STATICBUFLEN]; __STATIC__ void *buf=NULL; __STATIC__ int buf_sz=0; register int i, j, k; int *idx, maxi=-1, idx_sz, a_sz, work_sz, tot_sz; LM_REAL *a, *work, max, sum, tmp; if(!A) #ifdef LINSOLVERS_RETAIN_MEMORY { if(buf && buf != staticBuf) free(buf); buf=NULL; buf_sz=0; return 1; } #else return 1; /* NOP */ #endif /* LINSOLVERS_RETAIN_MEMORY */ /* calculate required memory size */ idx_sz=m; a_sz=m*m; work_sz=m; tot_sz=idx_sz*sizeof(int) + (a_sz+work_sz)*sizeof(LM_REAL); #ifdef LINSOLVERS_RETAIN_MEMORY if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ if(buf && buf != staticBuf) free(buf); /* free previously allocated memory */ if(tot_sz <= STATICBUFLEN) { buf_sz = tot_sz; buf = staticBuf; } else { buf_sz=tot_sz; buf=(void *)malloc(tot_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } } #else if(tot_sz <= STATICBUFLEN) { buf_sz = tot_sz; buf = staticBuf; } else { buf_sz=tot_sz; buf=(void *)malloc(tot_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", AX_EQ_B_LU) "() failed!\n"); exit(1); } } #endif /* LINSOLVERS_RETAIN_MEMORY */ idx=(int *)buf; a=(LM_REAL *)(idx + idx_sz); work=a + a_sz; /* avoid destroying A, B by copying them to a, x resp. */ for(i=0; i<m; ++i){ // B & 1st row of A a[i]=A[i]; x[i]=B[i]; } for( ; i<a_sz; ++i) a[i]=A[i]; // copy A's remaining rows /**** for(i=0; i<m; ++i){ for(j=0; j<m; ++j) a[i*m+j]=A[i*m+j]; x[i]=B[i]; } ****/ /* compute the LU decomposition of a row permutation of matrix a; the permutation itself is saved in idx[] */ for(i=0; i<m; ++i){ max=0.0; for(j=0; j<m; ++j) if((tmp=FABS(a[i*m+j]))>max) max=tmp; if(max==0.0){ fprintf(stderr, RCAT("Singular matrix A in ", AX_EQ_B_LU) "()!\n"); #ifndef LINSOLVERS_RETAIN_MEMORY if(buf && buf != staticBuf) free(buf); /* free previously allocated memory */ #endif return 0; } work[i]=LM_CNST(1.0)/max; } for(j=0; j<m; ++j){ for(i=0; i<j; ++i){ sum=a[i*m+j]; for(k=0; k<i; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; } max=0.0; for(i=j; i<m; ++i){ sum=a[i*m+j]; for(k=0; k<j; ++k) sum-=a[i*m+k]*a[k*m+j]; a[i*m+j]=sum; if((tmp=work[i]*FABS(sum))>=max){ max=tmp; maxi=i; } } if(j!=maxi){ for(k=0; k<m; ++k){ tmp=a[maxi*m+k]; a[maxi*m+k]=a[j*m+k]; a[j*m+k]=tmp; } work[maxi]=work[j]; } idx[j]=maxi; if(a[j*m+j]==0.0) a[j*m+j]=LM_REAL_EPSILON; if(j!=m-1){ tmp=LM_CNST(1.0)/(a[j*m+j]); for(i=j+1; i<m; ++i) a[i*m+j]*=tmp; } } /* The decomposition has now replaced a. Solve the linear system using * forward and back substitution */ for(i=k=0; i<m; ++i){ j=idx[i]; sum=x[j]; x[j]=x[i]; if(k!=0) for(j=k-1; j<i; ++j) sum-=a[i*m+j]*x[j]; else if(sum!=0.0) k=i+1; x[i]=sum; } for(i=m-1; i>=0; --i){ sum=x[i]; for(j=i+1; j<m; ++j) sum-=a[i*m+j]*x[j]; x[i]=sum/a[i*m+i]; } #ifndef LINSOLVERS_RETAIN_MEMORY if(buf && buf != staticBuf) free(buf); /* free previously allocated memory */ #endif return 1; }
int LEVMAR_DER( void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */ void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the jacobian \part x / \part p */ LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */ LM_REAL *x, /* I: measurement vector */ int m, /* I: parameter vector dimension (i.e. #unknowns) */ int n, /* I: measurement vector dimension */ int itmax, /* I: maximum number of iterations */ LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu, * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used */ LM_REAL info[LM_INFO_SZ], /* O: information regarding the minimization. Set to NULL if don't care * info[0]= ||e||_2 at initial p. * info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p. * info[5]= # iterations, * info[6]=reason for terminating: 1 - stopped by small gradient J^T e * 2 - stopped by small Dp * 3 - stopped by itmax * 4 - singular matrix. Restart from current p with increased mu * 5 - no further error reduction is possible. Restart with increased mu * 6 - stopped by small ||e||_2 * info[7]= # function evaluations * info[8]= # jacobian evaluations */ LM_REAL *work, /* working memory, allocate if NULL */ LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */ void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf. * Set to NULL if not needed */ { register int i, j, k, l; int worksz, freework=0, issolved; /* temp work arrays */ LM_REAL *e, /* nx1 */ *hx, /* \hat{x}_i, nx1 */ *jacTe, /* J^T e_i mx1 */ *jac, /* nxm */ *jacTjac, /* mxm */ *Dp, /* mx1 */ *diag_jacTjac, /* diagonal of J^T J, mx1 */ *pDp; /* p + Dp, mx1 */ register LM_REAL mu, /* damping constant */ tmp; /* mainly used in matrix & vector multiplications */ LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */ LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL; LM_REAL tau, eps1, eps2, eps2_sq, eps3; LM_REAL init_p_eL2; int nu=2, nu2, stop, nfev, njev=0; const int nm=n*m; mu=jacTe_inf=0.0; /* -Wall */ if(n<m){ fprintf(stderr, LCAT(LEVMAR_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m); return -1; } if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the jacobian in ", LEVMAR_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_DIF) RCAT("() rather than ", LEVMAR_DER) "()\n"); return -1; } if(opts){ tau=opts[0]; eps1=opts[1]; eps2=opts[2]; eps2_sq=opts[2]*opts[2]; eps3=opts[3]; } else{ // use default values tau=CNST(LM_INIT_MU); eps1=CNST(LM_STOP_THRESH); eps2=CNST(LM_STOP_THRESH); eps2_sq=CNST(LM_STOP_THRESH)*CNST(LM_STOP_THRESH); eps3=CNST(LM_STOP_THRESH); } if(!work){ worksz=LM_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m; work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ if(!work){ fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n")); return -1; } freework=1; } /* set up work arrays */ e=work; hx=e + n; jacTe=hx + n; jac=jacTe + m; jacTjac=jac + nm; Dp=jacTjac + m*m; diag_jacTjac=Dp + m; pDp=diag_jacTjac + m; /* compute e=x - f(p) and its L2 norm */ (*func)(p, hx, m, n, adata); nfev=1; for(i=0, p_eL2=0.0; i<n; ++i){ e[i]=tmp=x[i]-hx[i]; p_eL2+=tmp*tmp; } init_p_eL2=p_eL2; for(k=stop=0; k<itmax && !stop; ++k){ /* Note that p and e have been updated at a previous iteration */ if(p_eL2<=eps3){ /* error is small */ stop=6; break; } /* Compute the jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. * Since J^T J is symmetric, its computation can be speeded up by computing * only its upper triangular part and copying it to the lower part */ (*jacf)(p, jac, m, n, adata); ++njev; /* J^T J, J^T e */ if(nm<__BLOCKSZ__SQ){ // this is a small problem /* This is the straightforward way to compute J^T J, J^T e. However, due to * its noncontinuous memory access pattern, it incures many cache misses when * applied to large minimization problems (i.e. problems involving a large * number of free variables and measurements), in which J is too large to * fit in the L1 cache. For such problems, a cache-efficient blocking scheme * is preferable. * * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * performance problem. * * On the other hand, the straightforward algorithm is faster on small * problems since in this case it avoids the overheads of blocking. */ for(i=0; i<m; ++i){ for(j=i; j<m; ++j){ int lm; for(l=0, tmp=0.0; l<n; ++l){ lm=l*m; tmp+=jac[lm+i]*jac[lm+j]; } /* store tmp in the corresponding upper and lower part elements */ jacTjac[i*m+j]=jacTjac[j*m+i]=tmp; } /* J^T e */ for(l=0, tmp=0.0; l<n; ++l) tmp+=jac[l*m+i]*e[l]; jacTe[i]=tmp; } } else{ // this is a large problem /* Cache efficient computation of J^T J based on blocking */ TRANS_MAT_MAT_MULT(jac, jacTjac, n, m); /* cache efficient computation of J^T e */ for(i=0; i<m; ++i) jacTe[i]=0.0; for(i=0; i<n; ++i){ register LM_REAL *jacrow; for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l) jacTe[l]+=jacrow[l]*tmp; } } /* Compute ||J^T e||_inf and ||p||^2 */ for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){ if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp; diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */ p_L2+=p[i]*p[i]; } //p_L2=sqrt(p_L2); #if 0 if(!(k%10)){ printf("Iter: %d, estimate: ", k); for(i=0; i<m; ++i) printf("%.9g ", p[i]); printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2); } #endif /* check for convergence */ if((jacTe_inf <= eps1)){ Dp_L2=0.0; /* no increment for p in this case */ stop=1; break; } /* compute initial damping factor */ if(k==0){ for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */ mu=tau*tmp; } /* determine increment using adaptive damping */ while(1){ /* augment normal equations */ for(i=0; i<m; ++i) jacTjac[i*m+i]+=mu; /* solve augmented equations */ #ifdef HAVE_LAPACK /* 5 alternatives are available: LU, Cholesky, 2 variants of QR decomposition and SVD. * Cholesky is the fastest but might be inaccurate; QR is slower but more accurate; * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); #else /* use the LU included with levmar */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); #endif /* HAVE_LAPACK */ if(issolved){ /* compute p's new estimate and ||Dp||^2 */ for(i=0, Dp_L2=0.0; i<m; ++i){ pDp[i]=p[i] + (tmp=Dp[i]); Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */ stop=2; break; } if(Dp_L2>=(p_L2+eps2)/(CNST(EPSILON)*CNST(EPSILON))){ /* almost singular */ //if(Dp_L2>=(p_L2+eps2)/CNST(EPSILON)){ /* almost singular */ stop=4; break; } (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; } for(i=0, dL=0.0; i<m; ++i) dL+=Dp[i]*(mu*Dp[i]+jacTe[i]); dF=p_eL2-pDp_eL2; if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */ tmp=(CNST(2.0)*dF/dL-CNST(1.0)); tmp=CNST(1.0)-tmp*tmp*tmp; mu=mu*( (tmp>=CNST(ONE_THIRD))? tmp : CNST(ONE_THIRD) ); nu=2; for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2; break; } } /* if this point is reached, either the linear system could not be solved or * the error did not reduce; in any case, the increment must be rejected */ mu*=nu; nu2=nu<<1; // 2*nu; if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */ stop=5; break; } nu=nu2; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; } /* inner loop */ } if(k>=itmax) stop=3; for(i=0; i<m; ++i) /* restore diagonal J^T J entries */ jacTjac[i*m+i]=diag_jacTjac[i]; if(info){ info[0]=init_p_eL2; info[1]=p_eL2; info[2]=jacTe_inf; info[3]=Dp_L2; for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i]; info[4]=mu/tmp; info[5]=(LM_REAL)k; info[6]=(LM_REAL)stop; info[7]=(LM_REAL)nfev; info[8]=(LM_REAL)njev; } /* covariance matrix */ if(covar){ LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n); } if(freework) free(work); return (stop!=4)? k : -1; }
/* * This function computes the pseudoinverse of a square matrix A * into B using SVD. A and B can coincide * * The function returns 0 in case of error (e.g. A is singular), * the rank of A if successfull * * A, B are mxm * */ static int LEVMAR_PSEUDOINVERSE(LM_REAL *A, LM_REAL *B, int m) { LM_REAL *buf=NULL; int buf_sz=0; static LM_REAL eps=CNST(-1.0); register int i, j; LM_REAL *a, *u, *s, *vt, *work; int a_sz, u_sz, s_sz, vt_sz, tot_sz; LM_REAL thresh, one_over_denom; int info, rank, worksz, *iwork, iworksz; /* calculate required memory size */ worksz=16*m; /* more than needed */ iworksz=8*m; a_sz=m*m; u_sz=m*m; s_sz=m; vt_sz=m*m; tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(LM_REAL); buf_sz=tot_sz; buf=(LM_REAL *)malloc(buf_sz); if(!buf){ fprintf(stderr, RCAT("memory allocation in ", LEVMAR_PSEUDOINVERSE) "() failed!\n"); exit(1); } iwork=(int *)buf; a=(LM_REAL *)(iwork+iworksz); /* store A (column major!) into a */ for(i=0; i<m; i++) for(j=0; j<m; j++) a[i+j*m]=A[i*m+j]; u=a + a_sz; s=u+u_sz; vt=s+s_sz; work=vt+vt_sz; /* SVD decomposition of A */ GESVD("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); //GESDD("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); /* error treatment */ if(info!=0){ if(info<0){ fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", LEVMAR_PSEUDOINVERSE) "()\n", -info); exit(1); } else{ fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", LEVMAR_PSEUDOINVERSE) "() [info=%d]\n", info); free(buf); return 0; } } if(eps<0.0){ LM_REAL aux; /* compute machine epsilon */ for(eps=CNST(1.0); aux=eps+CNST(1.0), aux-CNST(1.0)>0.0; eps*=CNST(0.5)) ; eps*=CNST(2.0); } /* compute the pseudoinverse in B */ for(i=0; i<a_sz; i++) B[i]=0.0; /* initialize to zero */ for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){ one_over_denom=CNST(1.0)/s[rank]; for(j=0; j<m; j++) for(i=0; i<m; i++) B[i*m+j]+=vt[rank+i*m]*u[j+rank*m]*one_over_denom; } free(buf); return rank; }