/* Computes b = a^T a, where a is an n x m matrix stored row by row
 * (element (k, i) lives at a[k*m+i]) and b receives the m x m product,
 * also stored row by row.
 *
 * a: input matrix, n rows by m columns (read only)
 * b: output matrix, m rows by m columns (fully overwritten)
 * n: number of rows of a
 * m: number of columns of a
 *
 * When HAVE_LAPACK is defined the work is delegated to GEMM. Otherwise a
 * blocked loop nest with block size __BLOCKSZ__ is used: because a^T a is
 * symmetric, only entries on or above the diagonal are accumulated and the
 * strictly lower part is filled in afterwards by mirroring.
 *
 * More details on blocking can be found at
 * http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf
 */
void TRANS_MAT_MAT_MULT(LM_REAL *a, LM_REAL *b, int n, int m)
{
#ifdef HAVE_LAPACK /* use BLAS matrix multiply */

  LM_REAL alpha=CNST(1.0), beta=CNST(0.0);

  /* Fool BLAS into computing a^T*a without transposing a: a row-major a is
   * the same memory as a column-major a^T, so asking BLAS for a*a^T on the
   * column-major view yields a^T*a in row major.
   */
  GEMM("N", "T", &m, &m, &n, &alpha, a, &m, a, &m, &beta, b, &m);

#else /* no LAPACK, use blocking-based multiply */

  const int bsize=__BLOCKSZ__;
  int row, col, k;        /* element indices */
  int colblk, kblk;       /* first index of the current column / k block */
  int colend, kend;       /* one past the last index of the current block */
  LM_REAL acc, *brow, *arow;

#define __MIN__(x, y) (((x)<=(y))? (x) : (y))
#define __MAX__(x, y) (((x)>=(y))? (x) : (y))

  /* compute the upper triangular part, one block of columns at a time */
  for(colblk=0; colblk<m; colblk+=bsize){
    colend=__MIN__(colblk+bsize, m);

    /* clear the on-or-above-diagonal entries of this column block */
    for(row=0; row<m; ++row){
      brow=b+row*m;
      for(col=__MAX__(colblk, row); col<colend; ++col)
        brow[col]=0.0; /* b[row*m+col]=0.0 */
    }

    /* accumulate the contribution of each block of rows of a */
    for(kblk=0; kblk<n; kblk+=bsize){
      kend=__MIN__(kblk+bsize, n);

      for(row=0; row<m; ++row){
        brow=b+row*m;
        /* starting at max(colblk, row) skips entries below the diagonal */
        for(col=__MAX__(colblk, row); col<colend; ++col){
          acc=0.0;
          for(k=kblk; k<kend; ++k){
            arow=a+k*m;
            acc+=arow[row]*arow[col]; /* a[k*m+row]*a[k*m+col] */
          }
          brow[col]+=acc; /* b[row*m+col]+=acc */
        }
      }
    }
  }

  /* mirror the upper triangular part into the strictly lower one */
  for(row=0; row<m; ++row){
    brow=b+row*m;
    for(col=0; col<row; ++col)
      brow[col]=b[col*m+row];
  }

#undef __MIN__
#undef __MAX__

#endif /* HAVE_LAPACK */
}
/* Jacobian for the constrained problem: given the reduced parameter vector pp,
 * evaluate the jacobian at p = c + Z*pp. By the chain rule, the jacobian with
 * respect to pp equals the jacobian with respect to p (at c + Z*pp) times Z.
 *
 * pp:     reduced parameters, mm entries (read only)
 * jacjac: output, n x mm matrix stored row by row; receives jac*Z
 * mm:     number of reduced parameters
 * n:      number of rows of the jacobian
 * adata:  pointer to a struct LMLEC_DATA supplying c, Z, the work buffers
 *         p and jac, the constraint count ncnstr, the user jacobian callback
 *         jacf and the pointer adata that is forwarded to that callback
 *
 * With m = mm + ncnstr, the indexing below treats Z as m x mm and jac as
 * n x m, both stored row by row.
 */
static void LMLEC_JACF(LM_REAL *pp, LM_REAL *jacjac, int mm, int n, void *adata)
{
  struct LMLEC_DATA *data=(struct LMLEC_DATA *)adata;
  int m;
  int row, col, k;
  int colblk, kblk;   /* first index of the current column / k block */
  int colend, kend;   /* one past the last index of the current block */
  LM_REAL acc, *zrow, *jrow, *outrow;
  LM_REAL *c, *Z, *p, *jac;

  m=mm+data->ncnstr;
  c=data->c;
  Z=data->Z;
  p=data->p;
  jac=data->jac;

  /* p = c + Z*pp */
  for(row=0; row<m; ++row){
    zrow=Z+row*mm;
    acc=c[row];
    for(col=0; col<mm; ++col)
      acc+=zrow[col]*pp[col]; /* Z[row*mm+col]*pp[col] */
    p[row]=acc;
  }

  /* evaluate the unconstrained jacobian at p into the jac buffer */
  (*(data->jacf))(p, jac, m, n, data->adata);

  /* compute jac*Z in jacjac */
  if(n*m<=__BLOCKSZ__SQ){ /* this is a small problem */
    /* Straightforward triple loop. Its access to Z is not contiguous, which
     * causes many cache misses on large problems (many free variables and
     * measurements) where jac does not fit in the L1 cache; on small
     * problems it is faster because it avoids the overhead of blocking.
     */
    for(row=0; row<n; ++row){
      jrow=jac+row*m;
      outrow=jacjac+row*mm;
      for(col=0; col<mm; ++col){
        acc=0.0;
        for(k=0; k<m; ++k)
          acc+=jrow[k]*Z[k*mm+col]; /* jac[row*m+k]*Z[k*mm+col] */
        outrow[col]=acc; /* jacjac[row*mm+col]=acc */
      }
    }
    return;
  }

  /* this is a large problem: cache efficient computation of jac*Z based on
   * blocking. __MIN__ is defined here and is not undefined inside this
   * function body.
   */
#define __MIN__(x, y) (((x)<=(y))? (x) : (y))

  for(colblk=0; colblk<mm; colblk+=__BLOCKSZ__){
    colend=__MIN__(colblk+__BLOCKSZ__, mm);

    /* clear the current block of columns of jacjac */
    for(row=0; row<n; ++row){
      outrow=jacjac+row*mm;
      for(col=colblk; col<colend; ++col)
        outrow[col]=0.0; /* jacjac[row*mm+col]=0.0 */
    }

    /* add the partial products coming from each block of the inner index */
    for(kblk=0; kblk<m; kblk+=__BLOCKSZ__){
      kend=__MIN__(kblk+__BLOCKSZ__, m);

      for(row=0; row<n; ++row){
        outrow=jacjac+row*mm;
        jrow=jac+row*m;
        for(col=colblk; col<colend; ++col){
          acc=0.0;
          for(k=kblk; k<kend; ++k)
            acc+=jrow[k]*Z[k*mm+col]; /* jac[row*m+k]*Z[k*mm+col] */
          outrow[col]+=acc; /* jacjac[row*mm+col]+=acc */
        }
      }
    }
  }
}