コード例 #1
0
ファイル: misc_core.c プロジェクト: diehard2/stochfit
/* Compute b = a^T a, where a is an n x m matrix stored row by row and b
 * receives the m x m product.
 *
 * When HAVE_LAPACK is defined the work is handed to GEMM. Otherwise a blocked
 * triple loop with a block size of __BLOCKSZ__ is used; since a^T a is
 * symmetric, only the entries on or above the diagonal are accumulated and
 * the strictly lower part is filled in afterwards by mirroring.
 *
 * More details on blocking can be found at
 * http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf
 */
void TRANS_MAT_MAT_MULT(LM_REAL *a, LM_REAL *b, int n, int m)
{
#ifdef HAVE_LAPACK /* use BLAS matrix multiply */

LM_REAL alpha=CNST(1.0), beta=CNST(0.0);
  /* No explicit transpose is needed: the row-major a is the same memory as
   * a^T in column-major order, so asking BLAS for a*a^T on column-major data
   * yields exactly a^T*a when the result is read back in row-major order.
   */
  GEMM("N", "T", &m, &m, &n, &alpha, a, &m, a, &m, &beta, b, &m);

#else /* no LAPACK, use blocking-based multiply */

int row, col, k, cblk, kblk;
int cbeg, cend, kend;
LM_REAL acc, *brow, *arow;
const int bsize=__BLOCKSZ__;

#define __MIN__(x, y) (((x)<=(y))? (x) : (y))
#define __MAX__(x, y) (((x)>=(y))? (x) : (y))

  /* walk over the columns of b in strips of bsize columns; within a strip
   * only the entries with col>=row (upper triangle) are touched
   */
  for(cblk=0; cblk<m; cblk+=bsize){
    cend=__MIN__(cblk+bsize, m); /* one past the last column of this strip */

    /* clear the upper triangular entries of the current strip */
    for(row=0; row<m; ++row){
      brow=b+row*m;
      for(col=__MAX__(cblk, row); col<cend; ++col)
        brow[col]=0.0;
    }

    /* add the contribution of each block of bsize rows of a */
    for(kblk=0; kblk<n; kblk+=bsize){
      kend=__MIN__(kblk+bsize, n); /* one past the last row of this block */

      for(row=0; row<m; ++row){
        brow=b+row*m;
        cbeg=__MAX__(cblk, row); /* never start left of the diagonal */

        for(col=cbeg; col<cend; ++col){
          /* partial dot product of columns row and col of a */
          acc=0.0;
          for(k=kblk; k<kend; ++k){
            arow=a+k*m;
            acc+=arow[row]*arow[col];
          }
          brow[col]+=acc;
        }
      }
    }
  }

  /* mirror the upper triangle into the strictly lower one */
  for(row=0; row<m; ++row){
    brow=b+row*m;
    for(col=0; col<row; ++col)
      brow[col]=b[col*m+row];
  }

#undef __MIN__
#undef __MAX__

#endif /* HAVE_LAPACK */
}
コード例 #2
0
ファイル: lmlec_core.c プロジェクト: hoogerheide/garefl
/* Jacobian of the constrained problem. Given the reduced parameters pp
 * (mm entries), form p = c + Z*pp, evaluate the jacobian routine stored in
 * the LMLEC_DATA structure at p and, following the chain rule, multiply the
 * resulting n x m matrix by the m x mm matrix Z. The n x mm product is
 * written to jacjac.
 *
 * adata must point to a struct LMLEC_DATA; its p and jac members are used as
 * scratch storage and are overwritten.
 */
static void LMLEC_JACF(LM_REAL *pp, LM_REAL *jacjac, int mm, int n, void *adata)
{
struct LMLEC_DATA *data=(struct LMLEC_DATA *)adata;
const int m=mm+data->ncnstr; /* size of the full parameter vector p */
LM_REAL *c=data->c, *Z=data->Z, *p=data->p, *jac=data->jac;
int i, j, l, jj, ll, jend, lend;
LM_REAL acc, *zrow, *jrow, *orow;

  /* p = c + Z*pp */
  for(i=0; i<m; ++i){
    zrow=Z+i*mm;
    acc=c[i];
    for(j=0; j<mm; ++j)
      acc+=zrow[j]*pp[j];
    p[i]=acc;
  }

  /* jacobian with respect to p, evaluated at c + Z*pp */
  (*(data->jacf))(p, jac, m, n, data->adata);

  /* jacjac = jac*Z */
  if(n*m<=__BLOCKSZ__SQ){
    /* Small problem: plain triple loop. Its strided walk down the columns of
     * Z causes many cache misses once jac no longer fits in the L1 cache, but
     * for small sizes it is faster because it avoids the overhead of blocking.
     */
    for(i=0; i<n; ++i){
      jrow=jac+i*m;
      orow=jacjac+i*mm;
      for(j=0; j<mm; ++j){
        acc=0.0;
        for(l=0; l<m; ++l)
          acc+=jrow[l]*Z[l*mm+j];

        orow[j]=acc;
      }
    }
    return;
  }

  /* Large problem: cache efficient computation of jac*Z based on blocking */
#define __MIN__(x, y) (((x)<=(y))? (x) : (y))

  for(jj=0; jj<mm; jj+=__BLOCKSZ__){
    jend=__MIN__(jj+__BLOCKSZ__, mm); /* one past the last column of this strip */

    /* clear the current strip of columns of jacjac */
    for(i=0; i<n; ++i){
      orow=jacjac+i*mm;
      for(j=jj; j<jend; ++j)
        orow[j]=0.0;
    }

    /* accumulate the partial products block by block along the inner dimension */
    for(ll=0; ll<m; ll+=__BLOCKSZ__){
      lend=__MIN__(ll+__BLOCKSZ__, m); /* one past the last inner index of this block */

      for(i=0; i<n; ++i){
        orow=jacjac+i*mm;
        jrow=jac+i*m;
        for(j=jj; j<jend; ++j){
          acc=0.0;
          for(l=ll; l<lend; ++l)
            acc+=jrow[l]*Z[l*mm+j];
          orow[j]+=acc;
        }
      }
    }
  }
}