Пример #1
0
void commatvec(int l, int u, int i, real_t **ar, real_t **ai,
					real_t br[], real_t bi[], real_t *rr, real_t *ri)
{
	real_t matvec(int, int, int, real_t **, real_t []);
	real_t mv;

	mv=matvec(l,u,i,ar,br)-matvec(l,u,i,ai,bi);
	*ri=matvec(l,u,i,ai,br)+matvec(l,u,i,ar,bi);
	*rr=mv;
}
Пример #2
0
int main()
{
	double **mat;
	double vec1[MAX];
	double vec2[MAX];
	double *vec3;
	int i=0,j=0;

	mat = (double **)malloc(sizeof(double *) * MAX);
	for(i=0;i<MAX;i++)
	{
		// Initializing the vector and matrix with some arbitrary values
		vec1[i] = 2*i + 3;
		mat[i] = (double *)malloc(MAX * sizeof(double));
		for(j=0;j<MAX;j++)
			mat[i][j] = i*j + 3;
	}
	vec3 = matvec(mat,vec1,vec2,MAX);
	for(i=0;i<MAX;i++)
	{
		printf("C[%d] = %f\n",i,vec3[i]);
		free(mat[i]);
	}
	free(mat);
	return 1;
}
Пример #3
0
int main()
{
	double execTime = 0.0;
	double startTime, endTime;

    int k, size1, size2;

// Tell the compiler to align the a, b, and x arrays on 16-byte
// boundaries.  This allows the vectorizer to use aligned instructions
// and produce faster code.
#ifdef ALIGNED
#ifdef _WIN32
	_declspec(align(16)) FTYPE a[ROW][COLWIDTH];
    _declspec(align(16)) FTYPE b[ROW];
    _declspec(align(16)) FTYPE x[COLWIDTH];
#else
	FTYPE a[ROW][COLWIDTH]	__attribute__((aligned(16)));
	FTYPE b[ROW]			__attribute__((aligned(16)));
	FTYPE x[COLWIDTH]		__attribute__((aligned(16)));
#endif // _WIN32
#else
	FTYPE a[ROW][COLWIDTH];
	FTYPE b[ROW];
	FTYPE x[COLWIDTH];
#endif
	size1 = ROW;
	size2 = COLWIDTH;

    printf("\nROW:%d COL: %d\n",ROW,COLWIDTH);

    // initialize the arrays with data
	init_matrix(ROW,COL,1,a);
	init_array(COL,3,x);

	//start timing the matrix multiply code
	startTime = clock_it();
	for (k = 0;k < REPEATNTIMES;k++) {
#ifdef NOFUNCCALL
        int i, j;
		for (i = 0; i < size1; i++) {
			b[i] = 0;
			for (j = 0;j < size2; j++) {
				b[i] += a[i][j] * x[j];
			}
		}
#else
		matvec(size1,size2,a,b,x);
#endif
		x[0] = x[0] + 0.000001;
	}
	endTime = clock_it();
	execTime = endTime - startTime;

	printf("Execution time is %2.3f seconds\n", execTime);
	printf("GigaFlops = %f\n", (((double)REPEATNTIMES * (double)COL * (double)ROW * 2.0) / (double)(execTime))/1000000000.0);
	printsum(COL,b);

	return 0;
}
Пример #4
0
NUMERICS_EXPORT void vecmat(double *x, double **a, int nra, int nca, double *b)
{
    double** t = dmatrix(0, nca - 1, 0, nra - 1);
    
    transpose(a, nra, nca, t);
    matvec(t, nca, nra, x, b);
    
    free_dmatrix(t, 0, nca - 1, 0);
}
Пример #5
0
double *cgsolve(int k)
{
	int i, first_i, last_i;
	int n = k * k;
	int maxiters = 1000 > 5*k ? 1000 : k;

	// partition data
	if (n % size) {
		first_i = (n / size + 1) * rank;
		last_i = (rank != size-1 ? first_i+n/size+1 : n);
	} else {
		first_i = n / size * rank;
		last_i = n / size * (rank + 1);
	}

	double *b_vec = (double *)malloc(n * sizeof(double));
	double *r_vec = (double *)malloc(n * sizeof(double));
	double *d_vec = (double *)malloc(n * sizeof(double));
	double *A_vec = (double *)malloc(n * sizeof(double));
	double *x_vec = (double *)malloc(n * sizeof(double));

	for (i=0; i<n; i++) {
		double tmp = cs240_getB(i, n);
		b_vec[i] = tmp;
		r_vec[i] = tmp;
		d_vec[i] = tmp;
		x_vec[i] = 0;
	}


	double normb = sqrt(ddot(b_vec+first_i, b_vec+first_i, last_i-first_i));
	double rtr = ddot(r_vec+first_i, r_vec+first_i, last_i-first_i);
	double relres = 1;

	i = 0;
	while (relres > 1e-6 && i++ < maxiters) {
	/*while (i++ < 1) {*/

		matvec(A_vec, d_vec, k);
		double alpha = rtr / ddot(d_vec+first_i, A_vec+first_i, last_i-first_i);
		daxpy(x_vec, d_vec, 1, alpha, n);
		daxpy(r_vec, A_vec, 1, -1*alpha, n);
		double rtrold = rtr;
		rtr = ddot(r_vec+first_i, r_vec+first_i, last_i-first_i);
		double beta = rtr / rtrold;
		daxpy(d_vec, r_vec, beta, 1, n);
		relres = sqrt(rtr) / normb;
	}
	return x_vec;
}
Пример #6
0
void init_BICG_Z(Complex_Z *vl, int ldvl, Complex_Z *vr, int ldvr, int nvecs, 
               Complex_Z *xinit, Complex_Z *b, int lde, int n,
               Complex_Z *H, int ldH, int *IPIV, Complex_Z *work,
               void (*matvec) (void *, void *, void *), void *params, int *info)
{

    /* xinit = xinit + (vr inv(H) vl' res) */
    
    int i,ONE=1;
    Complex_Z tempc,tpone={1.00000e+00,0.000000e+00};
    char cN='N';

    //compute res=b-Ax and store it in work[nvecs:nvecs+n-1]
    tempc = wrap_zdot(&n,xinit,&ONE,xinit,&ONE,params);
    if(tempc.r > 0.0) //if nonzero initial guess
      {
           matvec(xinit,&work[nvecs],params);

           for(i=0; i<n; i++)
              { work[nvecs+i].r = b[i].r - work[nvecs+i].r;
                work[nvecs+i].i = b[i].i - work[nvecs+i].i;}
      }
     else
       BLAS_ZCOPY(&n,b,&ONE,&work[nvecs],&ONE);

    for(i=0; i<nvecs; i++){
      work[i] = wrap_zdot(&n, &vl[i*ldvl],&ONE,&work[nvecs],&ONE,params);}


    //solve using LU factorized H
    BLAS_ZGETRS(&cN,&nvecs,&ONE,H,&ldH,IPIV,work,&nvecs,info);
    printf("inside init_bicg\n");

    if( (*info) != 0)
      { fprintf(stderr,"Error in BLAS_ZGESV inside init-BICG_Z. info %d\n",(*info));
        exit(2);
      }

    BLAS_ZGEMV(&cN,&n,&nvecs,&tpone,vr,&ldvr,work,&ONE,&tpone,xinit,&ONE);

    return;
} 
Пример #7
0
int main (int argc, char * argv[]) {

  int nrows = 64;
  int ncols = 32;
  int i,j;

  int **A = (int **)malloc(nrows*sizeof(int*));
  for(i = 0; i < nrows; i++) {
    A[i] = (int *) malloc(ncols*sizeof(int));
  }

  printf("A = \n");
  for (i = 0; i < nrows; i++ ) {
    for (j = 0; j < ncols; j++ ) {
      A[i][j] = i + j;
      printf("%2d ", A[i][j]);
    }
    printf("\n");
  }
  
  int x[ncols];
  printf("x = \n");
  for (j = 0; j < ncols; j++) {
    x[j] = j;
    printf("%d\n",x[j]);
  }
  
  int result[nrows];
  matvec(nrows, ncols, A, x, result);
  printf("A x = \n");
  for (i = 0; i < nrows; i++) {
    printf("%d\n", result[i]);  
  }
  for(i = 0; i < nrows; i++) {
    free(A[i]);
  } 
 free(A);
}
Пример #8
0
double DiscreteAsian(int model,					//modello
                     double spot, 
					 double strike, 
	                 double rf, 
					 double dt, 
					 int ndates,
					 double lowlim,
					 double uplim,
                     int npoints,				//n. of quadrature points
					 long nfft,					//n. of points for the fft inversion
					 double ModelParameters[],  //the parameters of the model
					 double price[],
					 double solution[],double *delta)			//OUTPUT: Contains the solution	
{

int i, j, k;
double  inter_dens=0.;
double upperdens,b;
//parameters for spline interpolation
int ier;
double dfb,ddfb,arg;
//vectors where to store the outputs of the FFT inversion
double *inv, *logk;
//abscissa and weights for Gaussian quadrature with npoints
double *abscissa,*weights, *temp;
double** c,** kernelmatrix;
double optprice,optdelta;
double gamma_price;

inv=dvector(0, nfft-1);		//contains the density
logk=dvector(0, nfft-1);	//contains the abscissa of the density   
 
c = dmatrix(0, nfft-1, 0, 2);
kernelmatrix= dmatrix(0, npoints-1, 0, npoints-1);

abscissa=dvector(1,npoints);
weights=dvector(1,npoints);
temp=dvector(0,npoints-1);


//Generate abscissa and weights for quadrature
gauleg(lowlim, uplim, abscissa, weights, npoints);


b=MAX(fabs(lowlim - log(exp(uplim) + 1)),fabs(uplim - log(exp(lowlim) + 1)));
b=b*1.1;
upperdens=b;
TableIFRT(1, model, rf, dt, nfft, b, 1.5, ModelParameters, inv, logk);

//spline interpolation
i=spline(logk, inv, nfft ,c);
if(i>100) return i;

//construct the kernel matrix
for(i=1;i<=npoints;i++){
	for(j=1;j<=npoints;j++){
//		argument of the density
		arg=abscissa[i] - log(exp(abscissa[j]) + 1);
		if(arg>-upperdens)
		{
			if(arg<upperdens)
			{
			inter_dens=MAX(splevl(arg, nfft, logk, inv, c, &dfb, &ddfb, &ier),0.0);
			}
		}
		
		if(arg<-upperdens) inter_dens=0.0;
		
		if(arg>upperdens) inter_dens=0.0;
//construct the kernel
		kernelmatrix[i-1][ j-1] = weights[j] * MAX(inter_dens,0.0);
        /*printf("%d %d %d %f\n",npoints,i-1,j-1,kernelmatrix[i-1][ j-1]);*/
		}
//construct the initial condition

		arg=abscissa[i];
		if(arg>-upperdens)
		{
			if(arg<upperdens)
			{
			solution[i-1] = MAX(splevl(abscissa[i], nfft, logk, inv, c, &dfb, &ddfb, &ier),0);
			}
		}
        
		if(arg<-upperdens) solution[i-1]=0.0;
		
		if(arg>upperdens) solution[i-1]=0.0;
}
//iterations over the monitoring dates
for(k = 1;k<  ndates;k++){
	//compute K*v_n
	matvec(kernelmatrix, npoints, npoints, solution, temp);	
	//update v_n+1
	for( i = 0;i<= npoints-1;i++){ solution[i]=temp[i]; }
}
 
 optprice=0.0,optdelta=0.0;
 gamma_price=log(strike*((double)(ndates + 1))/spot-1.);
for( i = 0;i<= npoints-1;i++){
//spot price
		price[i] = spot * exp(abscissa[i+1]);
///option price
		optprice=optprice+weights[i+1]*MAX(spot*(1 + exp(abscissa[i+1]))/(ndates + 1) - strike, 0)*solution[i]*exp(-rf*dt*ndates);
        if(abscissa[i+1]>gamma_price)
         optdelta=optdelta+weights[i+1]*MAX((1 + exp(abscissa[i+1])), 0)*solution[i]*exp(-rf*dt*ndates)/(ndates + 1);
	}
 *delta=optdelta;
 
free_dvector(abscissa,1,npoints);
free_dvector(weights,1,npoints);
free_dvector(temp,0,npoints-1);
free_dvector(inv,0,nfft-1);
free_dvector(logk,0,nfft-1);
free_dmatrix(c, 0, nfft-1, 0, 2);
free_dmatrix(kernelmatrix, 0, npoints-1, 0, npoints-1);

return optprice;
  
}
Пример #9
0
void Enqueue::visit(StmtMatmul& s)
{
    if (_failure) return;
    if (_printer) _printer->visit(s);

    // some compute devices are single precision only
    if ( ! OCLhacks::singleton().supportFP64(_memMgr.deviceNum()) &&
         (PrecType::Double == s.precA() ||
          PrecType::Double == s.precB() ||
          PrecType::Double == s.precC()) )
    {
        _failure = true;
        return;
    }

    if (s.isMATMUL())
    {
        // check if device supports the Evergreen matrix multiply
        if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() ))
        {
            // do not allow modulo stream array subscripting for K dimension
            const size_t KfromA = s.isTransposeA() ? s.heightA() : s.widthA();
            const size_t KfromB = s.isTransposeB() ? s.widthB() : s.heightB();
            if (KfromA != KfromB)
            {
                _failure = true;
                return;
            }

            // ATI Evergreen matrix multiply
            Evergreen::MatmulMM matmul( _memMgr.deviceNum() );

            // exogenous parameters
            matmul.setBatching(_vt.numTraces());
            if (s.isSameDataA()) matmul.setSameDataMatrixA();
            if (s.isSameDataB()) matmul.setSameDataMatrixB();
            matmul.setGeneral(s.isGEMM());
            matmul.setPrecision(s.precA(),
                                s.precB(),
                                s.precC());
            matmul.setDimensions(s.heightC(), // M
                                 s.widthC(),  // N
                                 KfromA);     // K
            matmul.setDataLayout(s.isTransposeA(),
                                 s.isTransposeB());

            const vector< size_t > params = _jitMemo.autotuneLookup(matmul);

            if (params.empty())
            {
                _failure = true;
                return;
            }

            matmul.setParams(params);

            ArrayBuf* A
                = s.astvarA()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarA(), _vt);

            ArrayBuf* B
                = s.astvarB()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarB(), _vt);

            ArrayBuf* C
                = s.astvarC()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarC(), _vt);

            const double alpha = s.alpha();
            const double beta  = s.beta();

            const string kernelName = matmul.kernelName();
            stringstream ss;
            ss << matmul;
            const string kernelSource = ss.str();

            OCLkernel ckernel( *_memMgr.computeDevice() );
            ckernel.buildJIT(kernelName,
                             kernelSource);

            if (! ckernel.isOk())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "compile error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            if (! matmul.setArgs(ckernel, A, B, C, alpha, beta))
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "set arguments error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            const vector< size_t > globalDims = matmul.globalWorkItems();
            const vector< size_t > localDims = matmul.localWorkItems();

            for (size_t i = 0; i < globalDims.size(); i++)
            {
                ckernel << OCLWorkIndex(globalDims[i], localDims[i]);

                if (! ckernel.statusOp())
                {
#ifdef __LOGGING_ENABLED__
                    stringstream ss;
                    ss << "set index space dimension error: " << i;
                    LOGGER(ss.str())
#endif
                    _failure = true;
                    return;
                }
            }

            *_memMgr.computeDevice() << ckernel;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "enqueue error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            *_memMgr.computeDevice() << FLUSH;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "wait error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            matmul.clearArgs();
        }
        else
        {
            // there are no other GPU matrix multiply implementations except
            // "Evergreen" (which works on both ATI and NVIDIA), so fail
            _failure = true;
        }
    }
    else if (s.isMATVEC())
    {
        // check if device supports the Evergreen matrix multiply
        if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() ))
        {
            // do not allow modulo stream array subscripting for N dimension
            const size_t NfromA = s.isTransposeA() ? s.heightA() : s.widthA();
            const size_t NfromB = s.widthB();
            if (NfromA != NfromB)
            {
                _failure = true;
                return;
            }

            // ATI Evergreen matrix-vector multiply
            Evergreen::MatmulMV matvec( _memMgr.deviceNum() );

            // exogenous parameters
            matvec.setBatching(_vt.numTraces());
            if (s.isSameDataA()) matvec.setSameDataMatrixA();
            matvec.setGeneral(s.isGEMM());
            matvec.setPrecision(s.precA(),
                                s.precB(),
                                s.precC());
            matvec.setDimensions(s.widthC(), // M
                                 NfromA);    // N
            matvec.setDataLayout(s.isTransposeA());

            const vector< size_t > params = _jitMemo.autotuneLookup(matvec);

            if (params.empty())
            {
                _failure = true;
                return;
            }

            matvec.setParams(params);

            ArrayBuf* A
                = s.astvarA()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarA(), _vt);

            ArrayBuf* B
                = s.astvarB()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarB(), _vt);

            ArrayBuf* C
                = s.astvarC()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarC(), _vt);

            const double alpha = s.alpha();
            const double beta  = s.beta();

            const string kernelName = matvec.kernelName();
            stringstream ss;
            ss << matvec;
            const string kernelSource = ss.str();

            OCLkernel ckernel( *_memMgr.computeDevice() );
            ckernel.buildJIT(kernelName,
                             kernelSource);

            if (! ckernel.isOk())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "compile error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            if (! matvec.setArgs(ckernel, A, B, C, alpha, beta))
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "set arguments error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            const vector< size_t > globalDims = matvec.globalWorkItems();
            const vector< size_t > localDims = matvec.localWorkItems();

            for (size_t i = 0; i < globalDims.size(); i++)
            {
                ckernel << OCLWorkIndex(globalDims[i], localDims[i]);

                if (! ckernel.statusOp())
                {
#ifdef __LOGGING_ENABLED__
                    stringstream ss;
                    ss << "set index space dimension error: " << i;
                    LOGGER(ss.str())
#endif
                    _failure = true;
                    return;
                }
            }

            *_memMgr.computeDevice() << ckernel;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "enqueue error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            *_memMgr.computeDevice() << FLUSH;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "wait error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            matvec.clearArgs();
        }
        else
        {
            // there are no other GPU matrix multiply implementations except
            // "Evergreen" (which works on both ATI and NVIDIA), so fail
            _failure = true;
        }
    }
    else if (s.isVECMAT())
    {
        // check if device supports the Evergreen matrix multiply
        if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() ))
        {
            // do not allow modulo stream array subscripting for N dimension
            const size_t NfromA = s.widthA();
            const size_t NfromB = s.isTransposeB() ? s.widthB() : s.heightB();
            if (NfromA != NfromB)
            {
                _failure = true;
                return;
            }

            // ATI Evergreen matrix-vector multiply
            Evergreen::MatmulMV matvec( _memMgr.deviceNum() );

            // exogenous parameters
            matvec.setBatching(_vt.numTraces());
            if (s.isSameDataB()) matvec.setSameDataMatrixA();
            matvec.setGeneral(s.isGEMM());
            matvec.setPrecision(s.precB(),
                                s.precA(),
                                s.precC());
            matvec.setDimensions(s.widthC(), // M
                                 NfromB);    // N
            matvec.setDataLayout(! s.isTransposeB());

            const vector< size_t > params = _jitMemo.autotuneLookup(matvec);

            if (params.empty())
            {
                _failure = true;
                return;
            }

            matvec.setParams(params);

            ArrayBuf* A
                = s.astvarA()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarA(), _vt);

            ArrayBuf* B
                = s.astvarB()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarB(), _vt);

            ArrayBuf* C
                = s.astvarC()->isTraceVariable()
                      ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt)
                      : _memMgr.arrayBuf(s.astvarC(), _vt);

            const double alpha = s.alpha();
            const double beta  = s.beta();

            const string kernelName = matvec.kernelName();
            stringstream ss;
            ss << matvec;
            const string kernelSource = ss.str();

            OCLkernel ckernel( *_memMgr.computeDevice() );
            ckernel.buildJIT(kernelName,
                             kernelSource);

            if (! ckernel.isOk())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "compile error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            if (! matvec.setArgs(ckernel, B, A, C, alpha, beta))
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "set arguments error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            const vector< size_t > globalDims = matvec.globalWorkItems();
            const vector< size_t > localDims = matvec.localWorkItems();

            for (size_t i = 0; i < globalDims.size(); i++)
            {
                ckernel << OCLWorkIndex(globalDims[i], localDims[i]);

                if (! ckernel.statusOp())
                {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "set index space dimension error: " << i;
                LOGGER(ss.str())
#endif
                    _failure = true;
                    return;
                }
            }

            *_memMgr.computeDevice() << ckernel;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "enqueue error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            *_memMgr.computeDevice() << FLUSH;

            if (! _memMgr.computeDevice()->statusOp())
            {
#ifdef __LOGGING_ENABLED__
                stringstream ss;
                ss << "wait error: " << kernelName;
                LOGGER(ss.str())
#endif
                _failure = true;
                return;
            }

            matvec.clearArgs();
        }
        else
        {
            // there are no other GPU matrix multiply implementations except
            // "Evergreen" (which works on both ATI and NVIDIA), so fail
            _failure = true;
        }
    }
}
Пример #10
0
/***************************************
 *         Conjugate Gradient          *
 *   This function will do the CG      *
 *  algorithm without preconditioning. *
 *    For optimiziation you must not   *
 *        change the algorithm.        *
 ***************************************
 r(0)    = b - Ax(0)
 p(0)    = r(0)
 rho(0)    =  <r(0),r(0)>                
 ***************************************
 for k=0,1,2,...,n-1
   q(k)      = A * p(k)                 
   dot_pq    = <p(k),q(k)>             
   alpha     = rho(k) / dot_pq
   x(k+1)    = x(k) + alpha*p(k)      
   r(k+1)    = r(k) - alpha*q(k)     
   check convergence ||r(k+1)||_2 < eps  
	 rho(k+1)  = <r(k+1), r(k+1)>         
   beta      = rho(k+1) / rho(k)
   p(k+1)    = r(k+1) + beta*p(k)      
***************************************/
void cg(const int n, const int nnz, const int maxNNZ, const floatType* data, const int* indices, const int* length, const floatType* b, floatType* x, struct SolverConfig* sc){

	floatType* r, *p, *q;
	floatType alpha, beta, rho, rho_old, dot_pq, bnrm2;
	int iter;
 	double timeMatvec_s;
 	double timeMatvec=0;
	int i;
	floatType temp;
	
	/* allocate memory */
	r = (floatType*)malloc(n * sizeof(floatType));
	p = (floatType*)malloc(n * sizeof(floatType));
	q = (floatType*)malloc(n * sizeof(floatType));
	
#pragma acc data copyin(data[0:n*maxNNZ], indices[0:n*maxNNZ], length[0:n], n, nnz, maxNNZ, b[0:n]) copy(x[0:n]) create(alpha, beta, r[0:n], p[0:n], q[0:n], i, temp) //eigentlich auch copy(x[0:n]) aber error: not found on device???
{
	DBGMAT("Start matrix A = ", n, nnz, maxNNZ, data, indices, length)
	DBGVEC("b = ", b, n);
	DBGVEC("x = ", x, n);

	/* r(0)    = b - Ax(0) */
	timeMatvec_s = getWTime();
	matvec(n, nnz, maxNNZ, data, indices, length, x, r);
//hier inline ausprobieren
/*int i, j, k;
#pragma acc parallel loop present(data, indices, length, x)
	for (i = 0; i < n; i++) {
		r[i] = 0;
		for (j = 0; j < length[i]; j++) {
			k = j * n + i;
			r[i] += data[k] * x[indices[k]];
		}
	}*/
	timeMatvec += getWTime() - timeMatvec_s;
	xpay(b, -1.0, n, r);
	DBGVEC("r = b - Ax = ", r, n);
	

	/* Calculate initial residuum */
	nrm2(r, n, &bnrm2);
	bnrm2 = 1.0 /bnrm2;

	/* p(0)    = r(0) */
	memcpy(p, r, n*sizeof(floatType));
	DBGVEC("p = r = ", p, n);

	/* rho(0)    =  <r(0),r(0)> */
	vectorDot(r, r, n, &rho);
	printf("rho_0=%e\n", rho);

	for(iter = 0; iter < sc->maxIter; iter++){
		DBGMSG("=============== Iteration %d ======================\n", iter);
		/* q(k)      = A * p(k) */
		timeMatvec_s = getWTime();
		matvec(n, nnz, maxNNZ, data, indices, length, p, q);
		timeMatvec += getWTime() - timeMatvec_s;
		DBGVEC("q = A * p= ", q, n);

		/* dot_pq    = <p(k),q(k)> */
		vectorDot(p, q, n, &dot_pq);
		DBGSCA("dot_pq = <p, q> = ", dot_pq);

		/* alpha     = rho(k) / dot_pq */
		alpha = rho / dot_pq;
		DBGSCA("alpha = rho / dot_pq = ", alpha);

		/* x(k+1)    = x(k) + alpha*p(k) */
		axpy(alpha, p, n, x);
#pragma acc update host(x[0:n])
		DBGVEC("x = x + alpha * p= ", x, n);

		/* r(k+1)    = r(k) - alpha*q(k) */
		axpy(-alpha, q, n, r);
		DBGVEC("r = r - alpha * q= ", r, n);


		rho_old = rho;
		DBGSCA("rho_old = rho = ", rho_old);


		/* rho(k+1)  = <r(k+1), r(k+1)> */
		vectorDot(r, r, n, &rho);
		DBGSCA("rho = <r, r> = ", rho);

		/* Normalize the residual with initial one */
		sc->residual= sqrt(rho) * bnrm2;


   	
		/* Check convergence ||r(k+1)||_2 < eps
		 * If the residual is smaller than the CG
		 * tolerance specified in the CG_TOLERANCE
		 * environment variable our solution vector
		 * is good enough and we can stop the 
		 * algorithm. */
		printf("res_%d=%e\n", iter+1, sc->residual);
		if(sc->residual <= sc->tolerance)
			break;


		/* beta      = rho(k+1) / rho(k) */
		beta = rho / rho_old;
		DBGSCA("beta = rho / rho_old= ", beta);

		/* p(k+1)    = r(k+1) + beta*p(k) */
		xpay(r, beta, n, p);
		DBGVEC("p = r + beta * p> = ", p, n);

	}

	/* Store the number of iterations and the 
	 * time for the sparse matrix vector
	 * product which is the most expensive 
	 * function in the whole CG algorithm. */
	sc->iter = iter;
	sc->timeMatvec = timeMatvec;

	/* Clean up */
	free(r);
	free(p);
	free(q);
}//ende data region
}
Пример #11
0
void LRD_BICGSTAB_Z(Complex_Z *vl, int ldvl, Complex_Z *vr, int ldvr, int nvecs, Complex_Z *x, Complex_Z *b,     
                    int lde, int n, Complex_Z *H, int ldH, int *IPIV, Complex_Z *work, void (*matvec) (void *, void *, void *), 
                    void *params, double AnormEst, int maxiter, double DefTol, double tol, int ConvTestOpt, FILE *outputFile,  int *info)
{
     Complex_Z *xincr, *resid, *tmpH;  //used to solve the correction equation
     int i,j,k,numIts,iters_used,zs,is,ds,allelems,ONE=1;
     Complex_Z tzero={00.00e+00,00.00e+00};
     double resNorm,bnorm,curTol,leftoverTol;
     Complex_Z tempc;
     double *reshist;   //residual norm history
     int flag1,flag2;
     double stoptol_used,stoptol_cur,xnorm,rhsnorm; 
     double MACHEPS=1.e-16;

     
     zs = sizeof(Complex_Z);
     is = sizeof(int);
     ds = sizeof(double);

     xincr = (Complex_Z *) calloc(lde,zs);
     if(xincr==NULL){
        printf("Error in allocating xincr in LRD_BICGSTAB\n");
        exit(1);
     }


     resid = (Complex_Z *) calloc(lde,zs);
     if(resid==NULL){
        printf("Errro in allocating resid in LRD_BICGSTAB\n");
        exit(1);
     }




     tmpH = (Complex_Z  *) calloc(ldH*nvecs,zs);  //temporary copy of H
     if(tmpH == NULL){
        printf("Error in allocating tmpH\n");
        exit(1);
     }

     reshist = (double *) calloc(maxiter,ds);
     if(reshist == NULL){
        printf("Error allocating reshist\n");
        exit(1);
     }

     
     //initial residual std::vector
     tempc = wrap_zdot(&n,x,&ONE,x,&ONE,params);
     xnorm = sqrt(tempc.r);
     if(tempc.r > 0 )
       {
          matvec(x,resid,params);
         for(i=0; i<n; i++){
             resid[i].r = b[i].r - resid[i].r;
             resid[i].i = b[i].i - resid[i].i;}
       }
     else
       BLAS_ZCOPY(&n,b,&ONE,resid,&ONE);


     tempc = wrap_zdot(&n,resid,&ONE,resid,&ONE,params);
     resNorm = sqrt(tempc.r);

     tempc = wrap_zdot(&n,b,&ONE,b,&ONE,params);
     bnorm = sqrt(tempc.r);


     numIts=0; 
     //leftoverTol = DefTol;

     stoptol_used = tol*bnorm;
     if(ConvTestOpt==2)
       {
           stoptol_cur = MACHEPS*(AnormEst*xnorm + bnorm);
           if(stoptol_used < stoptol_cur)
               stoptol_used = stoptol_cur;
       }


     //Factorize the nvecs*nvecs part of H
     allelems = ldH*nvecs; 
     BLAS_ZCOPY(&allelems,H,&ONE,tmpH,&ONE);
     BLAS_ZGETRF(&nvecs,&nvecs,tmpH,&ldH,IPIV,info);
     
     if(info==0)
        {
           printf("ERROR: factorization of tmpH\n");
           exit(2);
        }

     if(DefTol < tol)
       curTol = tol;
     else
       curTol = DefTol;
     
     while( resNorm > stoptol_used){
     //int jrest=0;
     //while( jrest< 2){
     //    fprintf(outputFile,"restart\n");
         iters_used =0;         
         init_BICG_Z(vl,ldvl,vr,ldvr,nvecs,xincr,resid,lde,n,
                     tmpH,ldH,IPIV,work,matvec,params,info);
         
         if( (*info) != 0){
            printf("Error in init_BICG\n");
            exit(2);
         } 


         
         fprintf(outputFile,"bnorm %g, curTol %g\n",bnorm,curTol);

         MyBICGSTAB_Z(n,lde,xincr,resid,curTol,maxiter,&iters_used,reshist,
                     matvec,params,AnormEst, ConvTestOpt, work,&flag1);
         
         //if(flag1==0 || flag1=3)
         //   {
                 for(i=0; i<iters_used; i++){
                     fprintf(outputFile,"%-6d  %-22.16g \n",numIts+i,reshist[i]);} 

                 numIts = numIts + iters_used;

                 for(i=0; i<n; i++){
                     x[i].r = x[i].r + xincr[i].r;
                     x[i].i = x[i].i + xincr[i].i;
                     xincr[i]=tzero;}
                 

                 matvec(x,resid,params);
          
                 for(i=0; i<n; i++){
                     resid[i].r = b[i].r - resid[i].r;
                     resid[i].i = b[i].i - resid[i].i;}

                 tempc = wrap_zdot(&n, resid, &ONE, resid, &ONE, params);
                 resNorm = sqrt(tempc.r);
                 
                     //    }
         // else
             fprintf(outputFile,"BICGSTAB returns with flag %d\n",flag1); 

          if(ConvTestOpt==2)
            { 
                tempc = wrap_zdot(&n,x,&ONE,x,&ONE,params);
                xnorm = sqrt(tempc.r);
                stoptol_cur = MACHEPS*(AnormEst*xnorm+bnorm);
                if(stoptol_used < stoptol_cur)
                    stoptol_used = stoptol_cur;
             }
          leftoverTol = stoptol_used/resNorm;
          if(leftoverTol < DefTol)
            curTol=DefTol;
          else
            curTol=leftoverTol;
          fprintf(outputFile,"leftoverTol %g\n",leftoverTol);
          //jrest = jrest +1;
     }
     
     


     return;
}
Пример #12
0
void IncrEigbicg_Z(  int n, int lde,int nrhs, Complex_Z *X, Complex_Z *B, int *ncurEvals,         	
                     int ldh, Complex_Z *evecsl, Complex_Z *evecsr, Complex_Z *evals, 		
	             Complex_Z *H, void (*matvec) (void *, void *, void *),  
	             void (*mathvec)(void *, void *, void *), void *params, double *AnormEst, 
	             Complex_Z *work, Complex_Z *VL, int ldvl, Complex_Z *VR, int ldvr,        
	             Complex_Z *ework, int esize, double tol, double *restartTol, 	
	             int maxit, char SRT_OPT, double epsi, int ConvTestOpt, int plvl, int nev,
                     int v_max,FILE *outputFile)      
{

  /* Timing vars */
  double wt1,wt2,ut1,ut2,st1,st2,wE,wI;

  /* Pointers */
  Complex_Z  tempc, tempc1, tempc2, *tmpH, *x, *resid, *b;
  double     *rnorms, *reshist, normb, curTol,resNorm,leftTol;
  int        i,j,k, *IPIV, ONE = 1;
  int        zs, ds, tmpsize, is, phase, allelems ;
  int        numIts, flag, flag2,nAdded, nev_used, iters_used, info;
  Complex_Z  tpone = {+1.0e+00,+0.0e00}, tzero = {+0.0e+00,+0.0e00};
  char       cR = 'R'; char cL = 'L'; char cN ='N'; 
  char       cV = 'V'; char cU = 'U'; char cC ='C';
  double     *xrnorms,*xlnorms,*ernorms;
  Complex_Z  *angles;
  

  /* ------------------------------------------------------------------- */
  /* Work allocations */
  /* ------------------------------------------------------------------- */ 
  zs = sizeof(Complex_Z); 
  ds = sizeof(double);
  is = sizeof(int);


  if( (IPIV = (int *) calloc(ldh,is)) == NULL)
    { fprintf(stderr,"ERROR IncrEigbicg could not allocate IPIV\n");
      exit(1);}


  if( (x = (Complex_Z *) calloc(lde,zs)) == NULL)
    { fprintf(stderr,"ERROR IncrEigbicg could not allocate x\n");
      exit(1);}


  if( (resid = (Complex_Z *) calloc(lde,zs)) == NULL)
    { fprintf(stderr,"ERROR IncrEigbicg could not allocate resid\n");
      exit(1);}

  
  if( (tmpH = (Complex_Z *) calloc(ldh*ldh,zs)) == NULL)
    { fprintf(stderr,"ERROR IncrEigbicg could not allocate tmpH\n");
      exit(1);}


  if ((work = (Complex_Z *) calloc(6*lde, zs)) == NULL) 
    {fprintf(stderr, "ERROR IncrEigbicg could not allocate work\n");
     exit(1);}
     
  if ((ework = (Complex_Z *) calloc(esize, zs)) == NULL) 
    {fprintf(stderr, "ERROR IncrEigbicg could not allocate ework\n");
     exit(1);}
  
  if ((VL = (Complex_Z *) calloc(v_max*ldvl, zs)) == NULL) 
    {fprintf(stderr, "ERROR IncrEigbicg could not allocate VL\n");
     exit(1);}

  if ((VR = (Complex_Z *) calloc(v_max*ldvr, zs)) == NULL) 
    {fprintf(stderr, "ERROR IncrEigbicg could not allocate VR\n");
     exit(1);}
     
  if ( (rnorms = (double *) calloc(ldh, ds))  == NULL )
     {fprintf(stderr, "ERROR IncrEigbicg could not allocate rnorms\n");
      exit(1);}

  if ( (reshist = (double *) calloc(maxit, ds)) == NULL)
     {fprintf(stderr, "ERROR IncrEigbicg could not allocate reshist\n");
      exit(1);}



  if( (xlnorms = (double *) calloc(ldh,ds)) == NULL){
     fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate xlnorms\n");
     exit(1);}

  if( (xrnorms = (double *) calloc(ldh,ds)) == NULL){
     fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate xrnorms\n");
     exit(1);}

  if( (ernorms = (double *) calloc(ldh,ds)) == NULL){
     fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate ernorms\n");
     exit(1);}

  if( (angles = (Complex_Z *) calloc(ldh,zs)) == NULL){
     fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate angles\n");
     exit(1);}

  /* ------------------------------------------------------------------- */
  /* end Work allocations */
  /* ------------------------------------------------------------------- */

  /* ---------------------------------------------------------------------------------  */
  /* Solving one by one the nrhs systems with incremental init-eigbicg or init-bicgstab */
  /* ---------------------------------------------------------------------------------- */


  for (j=0; j<nrhs; j++) {

      b = &B[j*lde];
      tempc = wrap_zdot(&n,b,&ONE,b,&ONE,params);
      printf("bnorm=%g\n",sqrt(tempc.r));
      normb = sqrt(tempc.r);

      numIts = 0;
      //choose eigbicg or bicgstab
      if(ldh-(*ncurEvals) >= nev )
        phase=1;
      else
        phase=2;

      if (plvl) fprintf(outputFile, "\n\nSystem %d\n", j);

      wE = 0.0; wI = 0.0;     /* Start accumulator timers */

      if ( (*ncurEvals > 0) && (phase==1)) {
         /* --------------------------------------------------------- */
         /* Perform init-BICG with evecsl and evecsr vectors          */
         /* xinit = xinit + evecsr*inv(H)*evecl'*(b-Ax0) 		      */
         /* --------------------------------------------------------- */
         wt1 = primme_get_wtime(); 
   
        /* copy H into tmpH otherwise it will be changed by CGEV */
        allelems = ldh*(*ncurEvals);
        BLAS_ZCOPY(&allelems,H,&ONE,tmpH,&ONE);
        // LU factorization of tmpH
        BLAS_ZGETRF(ncurEvals,ncurEvals,tmpH,&ldh,IPIV,&info);


        for(i=0; i<n; i++)
            x[i]=tzero;

        matvec(&X[j*lde],resid,params);

        for(i=0; i<n; i++){
         resid[i].r = b[i].r - resid[i].r;
         resid[i].i = b[i].i - resid[i].i;}


        init_BICG_Z(evecsl,ldvl,evecsr,ldvr,(*ncurEvals), 
                    x,resid,lde,n,tmpH,ldh,IPIV,work,matvec,params,&info);

        if(phase==1){
           for(i=0; i<n; i++){
              X[j*lde+i].r = X[j*lde+i].r + x[i].r;
              X[j*lde+i].i = X[j*lde+i].i + x[i].i;}}
        

        wt2 = primme_get_wtime();
	wI = wI + wt2-wt1;
       }
       /* end of init-BICG with evecsl and evecsr vectors              */
       /* ------------------------------------------------------------ */

       if(phase == 1){	
         /* ------------------------------------------------------------ */
         /* Solve Ax = b with x initial guess using eigbicg and compute
            new nev eigenvectors                                         */
         /* ------------------------------------------------------------ */
         wt1 = primme_get_wtime(); 
        
         Zeigbicg(n, lde, &X[j*lde], b, &normb, tol, maxit, SRT_OPT,epsi,ConvTestOpt, &numIts, reshist, 
                  &flag, plvl, work, matvec, mathvec, params, AnormEst, nev, 
                  &evals[(*ncurEvals)], &rnorms[(*ncurEvals)],
  	          v_max, VR,ldvr,VL,ldvl,esize,ework);

         wt2 = primme_get_wtime();
	 wE = wE + wt2-wt1;

         /* ---------- */
         /* Reporting  */
         /* ---------- */
         tempc1 = wrap_zdot(&n,&X[j*lde],&ONE,&X[j*lde],&ONE,params);
         if (plvl) {
            fprintf(outputFile, "For this rhs:\n");
            fprintf(outputFile, "Norm(solution) %-16.12E, AnormEst %-16.12E\n",sqrt(tempc1.r),(*AnormEst));
            fprintf(outputFile, "Total initBICG Wallclock : %-f\n", wI);
            fprintf(outputFile, "Total eigbicg Wallclock : %-f\n", wE);
            fprintf(outputFile, "Iterations: %-d\n", numIts); 
            fprintf(outputFile, "Actual Resid of LinSys  : %e\n", reshist[numIts-1]);
	    if (plvl > 1) 
               for (i=0; i < nev; i++) 
                   fprintf(outputFile, "Eval[%d]: %-22.15E    %-22.15E           rnorm: %-22.15E\n", 
	                       i+1, evals[*ncurEvals+i].r,  evals[*ncurEvals+i].i, rnorms[*ncurEvals+i]); 
            if (plvl >1)
              {
                 fprintf(outputFile,"Residual norm\n");
               for( i=0; i < numIts; i++)
                   fprintf(outputFile,"%-d  %-22.15E\n",i,reshist[i]);
              }  

            if (flag != 0) {
               fprintf(outputFile, "Error: eigbicg returned with nonzero exit status\n");
            return;}
         }

      /* ------------------------------------------------------------------- */
      /* Update the evecsl, evecsr,  and evecsl'*A*evecsr                               */
      /* ------------------------------------------------------------------- */
      wt1 = primme_get_wtime(); 
      primme_get_time(&ut1,&st1);


      /* Append new Ritz pairs to evecs */
      for (i=0; i<nev; i++){
	 BLAS_ZCOPY(&n, &VL[i*ldvl], &ONE, &evecsl[((*ncurEvals)+i)*ldvl], &ONE);
	 BLAS_ZCOPY(&n, &VR[i*ldvr], &ONE, &evecsr[((*ncurEvals)+i)*ldvr], &ONE);}
  
      /* Bi-Orthogonalize the new Ritz vectors */
      /* Use a simple biorthogonalization that uses all vectors */   

      nAdded = nev;  //for the moment, we add all the vectors
      biortho_global_Z(evecsl,ldvl,evecsr,ldvr,n,(*ncurEvals)+1,(*ncurEvals)+nev,3,params);
         
      //check the biorthogonality of the vectors
      /*
      for(i=0; i<(*ncurEvals)+nev; i++)
         for(k=0; k<(*ncurEvals)+nev; k++)
             {
                 tempc = wrap_zdot(&n,&evecsl[i*ldvl],&ONE,&evecsr[k*ldvr],&ONE,params);
                 fprintf(outputFile,"evecsl[%d]'*evecsr[%d]=%g %g\n",i,k,tempc.r,tempc.i);
             } 
      */

      /* Augument H */   
      /* (1:ncurEvals+nAdded,ncurEvals+1:ncurEvals+nAdded) block */
      for(k=(*ncurEvals); k<(*ncurEvals)+nAdded; k++)
          {
             matvec(&evecsr[k*ldvr],VR,params);
             for(i=0; i<(*ncurEvals)+nAdded; i++)
                {
                      tempc=wrap_zdot(&n,&evecsl[i*ldvl],&ONE,VR,&ONE,params);
                      H[i+k*ldh]=tempc;           
                }   
                   
          }


      /* (ncurEvals+1:ncurEvals+nAdded,1:ncurEvals) block */
      for(k=(*ncurEvals); k<(*ncurEvals)+nAdded; k++)
         {
             mathvec(&evecsl[k*ldvl],VL,params);
             for(i=0; i<(*ncurEvals); i++)
                {
                   tempc = wrap_zdot(&n,&evecsr[i*ldvr],&ONE,VL,&ONE,params);
                   H[k+i*ldh].r = tempc.r; 
                   H[k+i*ldh].i = -tempc.i;
                }
         } 
  
      (*ncurEvals) = (*ncurEvals) + nAdded;
   
        
      /* Reporting */
      wt2 = primme_get_wtime();
      primme_get_time(&ut2,&st2);
      if (plvl) {
            fprintf(outputFile, "Update\n");
            fprintf(outputFile, "Added %d vecs\n",nAdded);
            fprintf(outputFile, "U Wallclock : %-f\n", wt2-wt1);
            fprintf(outputFile, "U User Time  : %f seconds\n", ut2-ut1);
            fprintf(outputFile, "U Syst Time  : %f seconds\n", st2-st1);}
      
   } /* if phase==1 */

   /****************************************************/



   if(phase==2) //solve deflated bicgstab the correction equation
     {
        
        fprintf(outputFile,"\n\nDeflated bicgstab\n");;
        LRD_BICGSTAB_Z(evecsl, ldvl,evecsr,ldvr,(*ncurEvals),&X[j*lde],b,     
                     lde,n,H,ldh,IPIV,work,matvec,params,(*AnormEst),maxit,(*restartTol),tol,ConvTestOpt,outputFile,&info);                
     } /*end of if(phase==2)*/
    
  } /*for(j=0; j<nrhs; j++) */





  // compute final evecs,etc.
  ComputeFinalEvecs_Z
    ( (*ncurEvals),n,evecsl,ldvl,evecsr,ldvr,H,ldh,SRT_OPT,epsi, 
       evals, ernorms, xlnorms, xrnorms, angles, 
       matvec,params,work,6*lde);


  fprintf(outputFile,"\n\n Final Evals\n");
  for(i=0; i< (*ncurEvals); i++){
     fprintf(outputFile,"EVAL %-16.12E %-16.12E, ERNORM %-16.12E\n",
                         evals[i].r,evals[i].i,ernorms[i]);
     fprintf(outputFile,"XLNORM %-16.12E, XRNORM %-16.12E, ANGLE %-16.12E %-16.12E\n\n",
                         xlnorms[i],xrnorms[i],angles[i].r,angles[i].i);}
  fprintf(outputFile,"===================================================\n");

      
   return;
}
Пример #13
0
int main(int argc, char* argv[])
{
	if (argc != 4)
	{
		printf("Usage: %s <nx> <ny> <nt>\n", argv[0]);
		exit(1);
	}

	const char* no_timing = getenv("NO_TIMING");

#if defined(_OPENACC)
	char* regcount_fname = getenv("OPENACC_PROFILING_FNAME");
	if (regcount_fname)
	{
		char* regcount_lineno = getenv("OPENACC_PROFILING_LINENO");
		int lineno = -1;
		if (regcount_lineno)
			lineno = atoi(regcount_lineno);
		//kernelgen_enable_openacc_regcount(regcount_fname, lineno);
	}
#endif

	parse_arg(nx, argv[1]);
	parse_arg(ny, argv[2]);
	parse_arg(nt, argv[3]);

	real* A = (real*)memalign(MEMALIGN, nx * ny * sizeof(real));
	real* x = (real*)memalign(MEMALIGN, nx * sizeof(real));
	real* y = (real*)memalign(MEMALIGN, ny * sizeof(real));

	if (!A || !x || !y)
	{
		printf("Error allocating memory for arrays: %p, %p, %p\n", A, x, y);
		exit(1);
	}

	real amean = 0.0f, xmean = 0.0f, ymean = 0.0f;
	for (int i = 0; i < nx * ny; i++)
	{
		A[i] = real_rand();
		amean += A[i];
	}
	for (int i = 0; i < nx; i++)
	{
		x[i] = real_rand();
		xmean += x[i];
	}
	for (int i = 0; i < ny; i++)
	{
		y[i] = real_rand();
		ymean += y[i];
	}
	if (!no_timing) printf("initial mean = %f\n", amean / (nx * ny) + xmean / nx + ymean / ny);

	//
	// MIC or OPENACC:
	//
	// 1) Perform an empty offload, that should strip
	// the initialization time from further offloads.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec init_s, init_f;
#if defined(_MIC)
	get_time(&init_s);
	#pragma offload target(mic) \
		nocopy(A:length(nx * ny) alloc_if(0) free_if(0)), \
		nocopy(x:length(nx) alloc_if(0) free_if(0)), \
		nocopy(y:length(ny) alloc_if(0) free_if(0))
	{ }
	get_time(&init_f);
#endif
#if defined(_OPENACC)
	get_time(&init_s);
	acc_init(acc_device_default);
	get_time(&init_f);
#endif
	double init_t = get_time_diff((struct timespec*)&init_s, (struct timespec*)&init_f);
	if (!no_timing) printf("init time = %f sec\n", init_t);
#endif

	volatile struct timespec total_s, total_f;
	get_time(&total_s);
	//
	// MIC or OPENACC:
	//
	// 2) Allocate data on device, but do not copy anything.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec alloc_s, alloc_f;
#if defined(_MIC)
	get_time(&alloc_s);
	#pragma offload target(mic) \
		nocopy(A:length(nx * ny) alloc_if(1) free_if(0)), \
		nocopy(x:length(nx) alloc_if(1) free_if(0)), \
		nocopy(y:length(ny) alloc_if(1) free_if(0))
	{ }
	get_time(&alloc_f);
#endif
#if defined(_OPENACC)
	get_time(&alloc_s);
	#pragma acc data create (A[0:nx*ny], x[0:nx], y[0:ny])
	{
	get_time(&alloc_f);
#endif
	double alloc_t = get_time_diff((struct timespec*)&alloc_s, (struct timespec*)&alloc_f);
	if (!no_timing) printf("device buffer alloc time = %f sec\n", alloc_t);
#endif

	//
	// MIC or OPENACC:
	//
	// 3) Transfer data from host to device and leave it there,
	// i.e. do not allocate deivce memory buffers.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec load_s, load_f;
#if defined(_MIC)
	get_time(&load_s);
	#pragma offload target(mic) \
		in(A:length(nx * ny) alloc_if(0) free_if(0)), \
		in(x:length(nx) alloc_if(0) free_if(0)), \
		in(y:length(ny) alloc_if(0) free_if(0))
	{ }
	get_time(&load_f);
#endif
#if defined(_OPENACC)
	get_time(&load_s);
	#pragma acc update device(A[0:nx*ny], x[0:nx], y[0:ny])
	get_time(&load_f);
#endif
	double load_t = get_time_diff((struct timespec*)&load_s, (struct timespec*)&load_f);
	if (!no_timing) printf("data load time = %f sec (%f GB/sec)\n", load_t,
		((nx * ny + nx + ny) * sizeof(real)) / (load_t * 1024 * 1024 * 1024));
#endif

	//
	// 4) Perform data processing iterations, keeping all data
	// on device.
	//
	volatile struct timespec compute_s, compute_f;
	get_time(&compute_s);
#if defined(_MIC)
	#pragma offload target(mic) \
		nocopy(A:length(nx * ny) alloc_if(0) free_if(0)), \
		nocopy(x:length(nx) alloc_if(0) free_if(0)), \
		nocopy(y:length(ny) alloc_if(0) free_if(0))
#endif
	{
		for (int it = 0; it < nt; it++)
			matvec(nx, ny, A, x, y);
	}
	get_time(&compute_f);
	double compute_t = get_time_diff((struct timespec*)&compute_s, (struct timespec*)&compute_f);
	if (!no_timing) printf("compute time = %f sec\n", compute_t);

	//
	// MIC or OPENACC:
	//
	// 5) Transfer output data back from device to host.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec save_s, save_f;
#if defined(_MIC)
	get_time(&save_s);
	#pragma offload target(mic) \
		out(y:length(ny) alloc_if(0) free_if(0))
	{ }
	get_time(&save_f);
#endif
#if defined(_OPENACC)
	get_time(&save_s);
	#pragma acc update host (y[0:ny])
	get_time(&save_f);
#endif
	double save_t = get_time_diff((struct timespec*)&save_s, (struct timespec*)&save_f);
	if (!no_timing) printf("data save time = %f sec (%f GB/sec)\n", save_t, (ny * sizeof(real)) / (save_t * 1024 * 1024 * 1024));
#endif

	//
	// MIC or OPENACC:
	//
	// 6) Deallocate device data buffers.
	// OPENACC does not seem to have explicit deallocation.
	//
#if defined(_OPENACC)
	}
#endif
#if defined(_MIC)
	volatile struct timespec free_s, free_f;
	get_time(&free_s);
	#pragma offload target(mic) \
		nocopy(A:length(nx * ny) alloc_if(0) free_if(1)), \
		nocopy(x:length(nx) alloc_if(0) free_if(1)), \
		nocopy(y:length(ny) alloc_if(0) free_if(1))
	{ }
	get_time(&free_f);
	double free_t = get_time_diff((timepsec&)free_s, (timepsec&)free_f);
	// if (!no_timing) printf("device buffer free time = %f sec\n", free_t);
#endif

	get_time(&total_f);
	if (!no_timing) printf("device buffer free time = %f sec\n", get_time_diff((struct timespec*)&total_s, (struct timespec*)&total_f));

	ymean = 0.0f;
	for (int i = 0; i < ny; i++)
		ymean += y[i];
	printf("final mean = %f\n", ymean / ny);

	free(A);
	free(x);
	free(y);

	fflush(stdout);

	return 0;
}
Пример #14
0
void efsirk(real_t *x, real_t xe, int m, real_t y[],
			real_t *delta, void (*derivative)(int, real_t[], real_t *),
			void (*jacobian)(int, real_t **, real_t [], real_t *),
			real_t **j, int *n, real_t aeta, real_t reta, real_t hmin,
			real_t hmax, int linear,
			void (*output)(real_t, real_t, int, real_t [],
								real_t, real_t **, int))
{
	int *allocate_integer_vector(int, int);
	real_t *allocate_real_vector(int, int);
	real_t **allocate_real_matrix(int, int, int, int);
	void free_integer_vector(int *, int);
	void free_real_vector(real_t *, int);
	void free_real_matrix(real_t **, int, int, int);
	real_t vecvec(int, int, int, real_t [], real_t []);
	real_t matmat(int, int, int, int, real_t **, real_t **);
	real_t matvec(int, int, int, real_t **, real_t []);
	void gsselm(real_t **, int, real_t [], int [], int []);
	void solelm(real_t **, int, int [], int [], real_t []);
	int k,l,lin,*ri,*ci;
	real_t step,h,mu0,mu1,mu2,theta0,theta1,nu1,nu2,nu3,yk,fk,c1,c2,
			d,*f,*k0,*labda,**j1,aux[8],discr,eta,s,z1,z2,e,alpha1,a,b;

	ri=allocate_integer_vector(1,m);
	ci=allocate_integer_vector(1,m);
	f=allocate_real_vector(1,m);
	k0=allocate_real_vector(1,m);
	labda=allocate_real_vector(1,m);
	j1=allocate_real_matrix(1,m,1,m);

	aux[2]=FLT_EPSILON;
	aux[4]=8.0;
	for (k=1; k<=m; k++) f[k]=y[k];
	*n = 0;
	(*output)(*x,xe,m,y,*delta,j,*n);
	step=0.0;
	do {
		(*n)++;
		/* difference scheme */
		(*derivative)(m,f,delta);
		/* step size */
		if (linear)
			s=h=hmax;
		else
			if (*n == 1 || hmin == hmax)
				s=h=hmin;
			else {
				eta=aeta+reta*sqrt(vecvec(1,m,0,y,y));
				c1=nu3*step;
				for (k=1; k<=m; k++) labda[k] += c1*f[k]-y[k];
				discr=sqrt(vecvec(1,m,0,labda,labda));
				s=h=(eta/(0.75*(eta+discr))+0.33)*h;
				if (h < hmin)
					s=h=hmin;
				else
					if (h > hmax) s=h=hmax;
			}
		if ((*x)+s > xe) s=xe-(*x);
		lin=((step == s) && linear);
		step=s;
		if (!linear || *n == 1) (*jacobian)(m,j,y,delta);
		if (!lin) {
			/* coefficient */
			z1=step*(*delta);
			if (*n == 1) z2=z1+z1;
			if (fabs(z2-z1) > 1.0e-6*fabs(z1) || z2 > -1.0) {
				a=z1*z1+12.0;
				b=6.0*z1;
				if (fabs(z1) < 0.1)
					alpha1=(z1*z1/140.0-1.0)*z1/30.0;
				else if (z1 < 1.0e-14)
					alpha1=1.0/3.0;
				else if (z1 < -33.0)
					alpha1=(a+b)/(3.0*z1*(2.0+z1));
				else {
					e=((z1 < 230.0) ? exp(z1) : FLT_MAX);
					alpha1=((a-b)*e-a-b)/(((2.0-z1)*e-2.0-z1)*3.0*z1);
				}
				mu2=(1.0/3.0+alpha1)*0.25;
				mu1 = -(1.0+alpha1)*0.5;
				mu0=(6.0*mu1+2.0)/9.0;
				theta0=0.25;
				theta1=0.75;
				a=3.0*alpha1;
				nu3=(1.0+a)/(5.0-a)*0.5;
				a=nu3+nu3;
				nu1=0.5-a;
				nu2=(1.0+a)*0.75;
				z2=z1;
			}
			c1=step*mu1;
			d=step*step*mu2;
			for (k=1; k<=m; k++) {
				for (l=1; l<=m; l++)
					j1[k][l]=d*matmat(1,m,k,l,j,j)+c1*j[k][l];
				j1[k][k] += 1.0;
			}
			gsselm(j1,m,aux,ri,ci);
		}
		c1=step*step*mu0;
		d=step*2.0/3.0;
		for (k=1; k<=m; k++) {
			k0[k]=fk=f[k];
			labda[k]=d*fk+c1*matvec(1,m,k,j,f);
		}
		solelm(j1,m,ri,ci,labda);
		for (k=1; k<=m; k++) f[k]=y[k]+labda[k];
		(*derivative)(m,f,delta);
		c1=theta0*step;
		c2=theta1*step;
		d=nu1*step;
		for (k=1; k<=m; k++) {
			yk=y[k];
			fk=f[k];
			labda[k]=yk+d*fk+nu2*labda[k];
			y[k]=f[k]=yk+c1*k0[k]+c2*fk;
		}
		(*x) += step;
		(*output)(*x,xe,m,y,*delta,j,*n);
	} while (*x < xe);
	free_integer_vector(ri,1);
	free_integer_vector(ci,1);
	free_real_vector(f,1);
	free_real_vector(k0,1);
	free_real_vector(labda,1);
	free_real_matrix(j1,1,m,1);
}
Пример #15
0
/* PCG - Conjugate Gradients Algorithm
 */
void pcg(int n, 
	 double *x, 
	 double *b,
	 double tol, 
	 int maxit,
	 int clvl,
	 int *iter, 
	 double *relres, 
	 int *flag,
	 double *work,
	 void (*matvec)(double *, double *),
	 void (*precon)(double *, double *))
{
  double ALPHA;			/* used for passing parameters */
  int ONE = 1;			/* to BLAS routines */

  double n2b;			/* norm of rhs vector */
  double tolb;			/* requested tolerance for residual */
  double normr;			/* residual norm */
  double alpha, beta;
  double rho, rho1;
  double pq;
  double dmax, ddum;		/* used to detect stagnation */
  int stag;			/* flag to indicate stagnation */
  int it;			/* current iteration number */
  int i;			/* index variable */
  double *r, *z, *p, *q;	/* pointers to vectors in PCG algorithm */
  
  /* setup pointers into work */
  r = work;
  z = work + n;
  p = work + 2*n;
  q = work + 3*n;

  /* Check for all zero right hand side vector => all zero solution */
  n2b = F77(dnrm2)(&n, b, &ONE);/* Norm of rhs vector, b */
  if (n2b == 0.0) {		/* if rhs vector is all zeros */
    for (i = 0; i < n; i ++)	/* then  solution is all zeros */
      x[i] = 0.0;
    *flag = 0;			/* a valid solution has been obtained */
    *relres = 0.0;		/* the relative residual is actually 0/0 */
    *iter = 0;			/* no iterations need be performed */
    if (clvl)
      itermsg(tol,maxit,*flag,*iter,*relres);
    return;
  }
  
  /* Set up for the method */
  *flag = 1;
  tolb = tol * n2b;		/* Relative tolerance */
  matvec(x, r);			/* Zero-th residual: r = b - A * x*/
  for (i = 0; i < n; i ++)	/* then  solution is all zeros */
    r[i] = b[i] - r[i];
  normr = F77(dnrm2)(&n, r, &ONE); /* Norm of residual */
  
  if (normr <= tolb) {		/* Initial guess is a good enough solution */
    *flag = 0;
    *relres = normr / n2b;
    *iter = 0;
    if (clvl)
      itermsg(tol,maxit,*flag,*iter,*relres);
    return;
  }

  rho = 1.0;
  stag = 0;			/* stagnation of the method */

  /* loop over maxit iterations (unless convergence or failure) */
  
  for (it = 1; it <= maxit; it ++) {
    
    if (precon) {
      precon(r, z);
      /*
	if isinf(norm(y,inf))
	flag = 2;
	break
	end
      */
    } else {
      F77(dcopy)(&n, r, &ONE, z, &ONE);
    }
   
    rho1 = rho;
    rho = F77(ddot)(&n, r, &ONE, z, &ONE);
    if (rho == 0.0) {		/* or isinf(rho) */
      *flag = 4;
      break;
    }
    if (it == 1) {
      F77(dcopy)(&n, z, &ONE, p, &ONE);
    } else {
      beta = rho / rho1;
      if (beta == 0.0) {	/* | isinf(beta) */
	*flag = 4;
	break;
      }
      for (i = 0; i < n; i ++)	/* p = z + beta * p; */
	p[i] = z[i] + beta * p[i];
    }
    matvec(p, q);		/* q = A * p */
    pq = F77(ddot)(&n, p, &ONE, q, &ONE); /* pq = p' * q */
    if (pq == 0.0) {		/* | isinf(pq) */
      *flag = 4;
      break;
    } else {
      alpha = rho / pq;
    }
    /* 
       if isinf(alpha)
       flag = 4;
       break
       end
    */
    if (alpha == 0.0)		/* stagnation of the method */
      stag = 1;
   
    /* Check for stagnation of the method */
    if (stag == 0) {
      dmax = 0.0;
      for (i = 0; i < n; i ++)
	if (x[i] != 0.0) {
	  ddum = fabs(alpha * p[i]/x[i]);
	  if (ddum > dmax)
	    dmax = ddum;
	} else
	  if (p[i] != 0.0)
	    dmax = 1.0;
      stag = (1.0 + dmax == 1.0);
    }
    
    F77(daxpy)(&n, &alpha, p, &ONE, x, &ONE); /* form new iterate */
    ALPHA = -alpha;
    F77(daxpy)(&n, &ALPHA, q, &ONE, r, &ONE); /* r = r - alpha * q */
    
    /* check for convergence */
#ifdef EXPENSIVE_CRIT
    matvec(x, z);		/* normr = norm(b - A * x) */
    for (i = 0; i < n; i ++)
      z[i] = b[i] - z[i];
    normr = F77(dnrm2)(&n, z, &ONE);
#else
    normr = F77(dnrm2)(&n, r, &ONE); /* normr = norm(r) */
#endif
    if (normr <= tolb) {
      *flag = 0;
      break;
    }
    
    if (stag == 1) {
      *flag = 3;
      break;
    }
  } /* for it = 1 : maxit */
  
  *iter = it;
  *relres = normr / n2b;

  if (clvl)
    itermsg(tol,maxit,*flag,*iter,*relres);
}
Пример #16
0
int fgmr(int n,
     void (*matvec) (double, double[], double, double[]),
     void (*psolve) (int, double[], double[]),
     double *rhs, double *sol, double tol, int im, int *itmax, FILE * fits)
{
/*----------------------------------------------------------------------
|                 *** Preconditioned FGMRES ***
+-----------------------------------------------------------------------
| This is a simple version of the ARMS preconditioned FGMRES algorithm.
+-----------------------------------------------------------------------
| Y. S. Dec. 2000. -- Apr. 2008
+-----------------------------------------------------------------------
| on entry:
|----------
|
| rhs     = real vector of length n containing the right hand side.
| sol     = real vector of length n containing an initial guess to the
|           solution on input.
| tol     = tolerance for stopping iteration
| im      = Krylov subspace dimension
| (itmax) = max number of iterations allowed.
| fits    = NULL: no output
|        != NULL: file handle to output " resid vs time and its"
|
| on return:
|----------
| fgmr      int =  0 --> successful return.
|           int =  1 --> convergence not achieved in itmax iterations.
| sol     = contains an approximate solution (upon successful return).
| itmax   = has changed. It now contains the number of steps required
|           to converge --
+-----------------------------------------------------------------------
| internal work arrays:
|----------
| vv      = work array of length [im+1][n] (used to store the Arnoldi
|           basis)
| hh      = work array of length [im][im+1] (Householder matrix)
| z       = work array of length [im][n] to store preconditioned vectors
+-----------------------------------------------------------------------
| subroutines called :
| matvec - matrix-vector multiplication operation
| psolve - (right) preconditionning operation
|	   psolve can be a NULL pointer (GMRES without preconditioner)
+---------------------------------------------------------------------*/

    int maxits = *itmax;
    int i, i1, ii, j, k, k1, its, retval, i_1 = 1;
    double **hh, *c, *s, *rs, t, t0;
    double beta, eps1 = 0.0, gam, **vv, **z;

    its = 0;
    vv = (double **)SUPERLU_MALLOC((im + 1) * sizeof(double *));
    for (i = 0; i <= im; i++)
	vv[i] = doubleMalloc(n);
    z = (double **)SUPERLU_MALLOC(im * sizeof(double *));
    hh = (double **)SUPERLU_MALLOC(im * sizeof(double *));
    for (i = 0; i < im; i++)
    {
	hh[i] = doubleMalloc(i + 2);
	z[i] = doubleMalloc(n);
    }
    c = doubleMalloc(im);
    s = doubleMalloc(im);
    rs = doubleMalloc(im + 1);

    /*---- outer loop starts here ----*/
    do
    {
	/*---- compute initial residual vector ----*/
	matvec(1.0, sol, 0.0, vv[0]);
	for (j = 0; j < n; j++)
	    vv[0][j] = rhs[j] - vv[0][j];	/* vv[0]= initial residual */
	beta = dnrm2_(&n, vv[0], &i_1);

	/*---- print info if fits != null ----*/
	if (fits != NULL && its == 0)
	    fprintf(fits, "%8d   %10.2e\n", its, beta);
	/*if ( beta < tol * dnrm2_(&n, rhs, &i_1) )*/
	if ( !(beta >= tol * dnrm2_(&n, rhs, &i_1)) )
	    break;
	t = 1.0 / beta;

	/*---- normalize: vv[0] = vv[0] / beta ----*/
	for (j = 0; j < n; j++)
	    vv[0][j] = vv[0][j] * t;
	if (its == 0)
	    eps1 = tol * beta;

	/*---- initialize 1-st term of rhs of hessenberg system ----*/
	rs[0] = beta;
	for (i = 0; i < im; i++)
	{
	    its++;
	    i1 = i + 1;

	    /*------------------------------------------------------------
	    |  (Right) Preconditioning Operation   z_{j} = M^{-1} v_{j}
	    +-----------------------------------------------------------*/
	    if (psolve)
		psolve(n, z[i], vv[i]);
	    else
		dcopy_(&n, vv[i], &i_1, z[i], &i_1);

	    /*---- matvec operation w = A z_{j} = A M^{-1} v_{j} ----*/
	    matvec(1.0, z[i], 0.0, vv[i1]);

	    /*------------------------------------------------------------
	    |     modified gram - schmidt...
	    |     h_{i,j} = (w,v_{i})
	    |     w  = w - h_{i,j} v_{i}
	    +------------------------------------------------------------*/
	    t0 = dnrm2_(&n, vv[i1], &i_1);
	    for (j = 0; j <= i; j++)
	    {
		double negt;
		t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1);
		hh[i][j] = t;
		negt = -t;
		daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1);
	    }

	    /*---- h_{j+1,j} = ||w||_{2} ----*/
	    t = dnrm2_(&n, vv[i1], &i_1);
	    while (t < 0.5 * t0)
	    {
		t0 = t;
		for (j = 0; j <= i; j++)
		{
		    double negt;
		    t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1);
		    hh[i][j] += t;
		    negt = -t;
		    daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1);
		}
		t = dnrm2_(&n, vv[i1], &i_1);
	    }
	    hh[i][i1] = t;
	    if (t != 0.0)
	    {
		/*---- v_{j+1} = w / h_{j+1,j} ----*/
		t = 1.0 / t;
		for (k = 0; k < n; k++)
		    vv[i1][k] = vv[i1][k] * t;
	    }
	    /*---------------------------------------------------
	    |     done with modified gram schimdt and arnoldi step
	    |     now  update factorization of hh
	    +--------------------------------------------------*/

	    /*--------------------------------------------------------
	    |   perform previous transformations  on i-th column of h
	    +-------------------------------------------------------*/
	    for (k = 1; k <= i; k++)
	    {
		k1 = k - 1;
		t = hh[i][k1];
		hh[i][k1] = c[k1] * t + s[k1] * hh[i][k];
		hh[i][k] = -s[k1] * t + c[k1] * hh[i][k];
	    }
	    gam = sqrt(pow(hh[i][i], 2) + pow(hh[i][i1], 2));

	    /*---------------------------------------------------
	    |     if gamma is zero then any small value will do
	    |     affect only residual estimate
	    +--------------------------------------------------*/
	    /* if (gam == 0.0) gam = epsmac; */

	    /*---- get next plane rotation ---*/
	    if (gam > 0.0)
	    {
		c[i] = hh[i][i] / gam;
		s[i] = hh[i][i1] / gam;
	    }
	    else
	    {
		c[i] = 1.0;
		s[i] = 0.0;
	    }
	    rs[i1] = -s[i] * rs[i];
	    rs[i] = c[i] * rs[i];

	    /*----------------------------------------------------
	    |   determine residual norm and test for convergence
	    +---------------------------------------------------*/
	    hh[i][i] = c[i] * hh[i][i] + s[i] * hh[i][i1];
	    beta = fabs(rs[i1]);
	    if (fits != NULL)
		fprintf(fits, "%8d   %10.2e\n", its, beta);
	    if (beta <= eps1 || its >= maxits)
		break;
	}

	if (i == im) i--;
	/*---- now compute solution. 1st, solve upper triangular system ----*/
	rs[i] = rs[i] / hh[i][i];
	for (ii = 1; ii <= i; ii++)
	{
	    k = i - ii;
	    k1 = k + 1;
	    t = rs[k];
	    for (j = k1; j <= i; j++)
		t = t - hh[j][k] * rs[j];
	    rs[k] = t / hh[k][k];
	}

	/*---- linear combination of v[i]'s to get sol. ----*/
	for (j = 0; j <= i; j++)
	{
	    t = rs[j];
	    for (k = 0; k < n; k++)
		sol[k] += t * z[j][k];
	}

	/* calculate the residual and output */
	matvec(1.0, sol, 0.0, vv[0]);
	for (j = 0; j < n; j++)
	    vv[0][j] = rhs[j] - vv[0][j];	/* vv[0]= initial residual */

	/*---- print info if fits != null ----*/
	beta = dnrm2_(&n, vv[0], &i_1);

	/*---- restart outer loop if needed ----*/
	/*if (beta >= eps1 / tol)*/
	if ( !(beta < eps1 / tol) )
	{
	    its = maxits + 10;
	    break;
	}
	if (beta <= eps1)
	    break;
    } while(its < maxits);

    retval = (its >= maxits);
    for (i = 0; i <= im; i++)
	SUPERLU_FREE(vv[i]);
    SUPERLU_FREE(vv);
    for (i = 0; i < im; i++)
    {
	SUPERLU_FREE(hh[i]);
	SUPERLU_FREE(z[i]);
    }
    SUPERLU_FREE(hh);
    SUPERLU_FREE(z);
    SUPERLU_FREE(c);
    SUPERLU_FREE(s);
    SUPERLU_FREE(rs);

    *itmax = its;

    return retval;
} /*----end of fgmr ----*/
Пример #17
0
int main() {
  vector x,b;
  vector r,p,Ap;
  matrix A;
  
  double one=1.0, zero=0.0;
  double normr, rtrans, oldtrans, p_ap_dot , alpha, beta;
  int iter=0;

  //create matrix
  allocate_3d_poission_matrix(A,N);
    
  printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]);

  allocate_vector(x,A.num_rows);
  allocate_vector(Ap,A.num_rows);
  allocate_vector(r,A.num_rows);
  allocate_vector(p,A.num_rows);
  allocate_vector(b,A.num_rows);

  initialize_vector(x,100000);
  initialize_vector(b,1);
 

  waxpby(one, x, zero, x, p);
  matvec(A,p,Ap);
  waxpby(one, b, -one, Ap, r);
  
  rtrans=dot(r,r);
  normr=sqrt(rtrans);
  
  double st = omp_get_wtime();
  do {
    if(iter==0) {
      waxpby(one,r,zero,r,p);
    } else {
      oldtrans=rtrans;
      rtrans = dot(r,r);
      beta = rtrans/oldtrans;
      waxpby(one,r,beta,p,p);
    }
    
    normr=sqrt(rtrans);
  
    matvec(A,p,Ap);
    p_ap_dot = dot(Ap,p);

    alpha = rtrans/p_ap_dot;

    waxpby(one,x,alpha,p,x);
    waxpby(one,r,-alpha,Ap,r);

    if(iter%10==0)
      printf("Iteration: %d, Tolerance: %.4e\n", iter, normr);
    iter++;
  } while(iter<MAX_ITERS && normr>TOL);
  double et = omp_get_wtime();

  printf("Total Iterations: %d\n", iter);
  printf("Total Time: %lf s\n", (et-st));

  free_vector(x);
  free_vector(r);
  free_vector(p);
  free_vector(Ap);
  free_matrix(A);

  return 0;
}
Пример #18
0
void power_method(void (*matvec)(void *, int, int, real*, real **, int, int*),
		  void *A, int n, int K, int random_seed, int maxit, real tol, real **eigv, real **eigs){
  /* find k-largest eigenvectors of a matrix A. Result in eigv. if eigv == NULL; memory will be allocated.
     maxium of maxit iterations will be done, and tol is the convergence criterion

     This converges only if the largest eigenvectors/values are real (e.g., if A is symmetric) and the 
     next largest eigenvalues separate from the largest ones

     input:
     matvec: a function point that takes a matrix M and a vector u, produce v = M.u
     A: the matrix
     n: dimension of matrix A
     K: number of eigenes to find
     random_seed: seed for eigenvector initialization
     matrix: max number f iterations
     tol: accuracy control

     output:
     eigv: eigenvectors. The i-th is at eigvs[i*n, i*(n+1) - 1]
     eigs: eigenvalues.  The i-th is at eigs[i]


     Function PowerIteration (A – m × m matrix )
     % This function computes u1, u2, . . . , uk, the first k eigenvectors of S.
     const tol ← 0.001
     for i = 1 to k do
     . ui ← random
     . ui ← ui/||ui||
     . do
     .   vi ← ui
     .   % orthogonalize against previous eigenvectors
     .   for j = 1 to i − 1 do
     .     vi ← vi − (vi^Tvi)vj
     .   end for
     .   ui ← A vi/||A vi||
     . while (ui^T vi < 1-tol) (halt when direction change is small)
     . vi = ui
     end for
     return v1,v2,...
   */
  real **v, *u, *vv;
  int iter = 0;
  real res, unorm;
  int i, j, k;
  real uij;
  int flag;

  K = MAX(0, MIN(n, K));
  assert(K <= n && K > 0);

  if (!(*eigv)) *eigv = MALLOC(sizeof(real)*n*K);
  if (!(*eigs)) *eigs = MALLOC(sizeof(real)*K);
  v = MALLOC(sizeof(real*)*K);

  vv = MALLOC(sizeof(real)*n);
  u = MALLOC(sizeof(real)*n);

  srand(random_seed);

  for (k = 0; k < K; k++){
    //fprintf(stderr,"calculating eig k ==================== %d\n",k);
    v[k] = &((*eigv)[k*n]);
    for (i = 0; i < n; i++) u[i] = drand();
    res = sqrt(vector_product(n, u, u));
    if (res > 0) res =  1/res;
    for (i = 0; i < n; i++) {
      u[i] = u[i]*res;
      v[k][i] = u[i];
    }
    /*
    fprintf(stderr,"inital vec=");
    for (i = 0; i < n; i++) fprintf(stderr,"%f,",u[i]);fprintf(stderr,"\n");
    */
    iter = 0;
    do {


      /* normalize against previous eigens */
      for (j = 0; j < k; j++){
	uij = vector_product(n, u, v[j]);
	for (i = 0; i < n; i++) {
	  u[i] = u[i] - uij *v[j][i];
	}
      }
      matvec(A, n, n, u, &vv, FALSE, &flag);
      assert(!flag);

      /*
      fprintf(stderr,"normalized aginst prev vec=");
      for (i = 0; i < n; i++) fprintf(stderr,"%f,",u[i]);fprintf(stderr,"\n");
      */

      unorm = vector_product(n, vv, vv);/* ||u||^2 */    
      unorm = sqrt(unorm);
      (*eigs)[k] = unorm;
      if (unorm > 0) {
	unorm = 1/unorm;
      } else {
	// ||A.v||=0, so v must be an eigenvec correspond to eigenvalue zero
	for (i = 0; i < n; i++) vv[i] = u[i];
	unorm = sqrt(vector_product(n, vv, vv));
	if (unorm > 0) unorm = 1/unorm;
      }
      res = 0.;

      for (i = 0; i < n; i++) {
	//res = MAX(res, ABS(vv[i]-(*eigs)[k]*u[i]));
	u[i] = vv[i]*unorm;
	res = res + u[i]*v[k][i];
	v[k][i] = u[i];
      }
      //fprintf(stderr,"res=%g, tol = %g, res < 1-tol=%d\n",res, tol,res < 1 - tol);
    } while (res < 1 - tol && iter++ < maxit);
    //} while (iter++ < maxit);
    //fprintf(stderr,"iter= %d, res=%f\n",iter, res);
  }
  FREE(u);
  FREE(vv);  
}
Пример #19
0
void peide(int n, int m, int nobs, int *nbp, real_t par[],
		real_t res[], int bp[], real_t **jtjinv,
		real_t in[], real_t out[],
		int (*deriv)(int,int,real_t [],real_t [],real_t,real_t []),
		int (*jacdfdy)(int,int,real_t [],real_t [],real_t,real_t **),
		int (*jacdfdp)(int,int,real_t [],real_t [],real_t,real_t **),
		void (*callystart)(int,int,real_t [],real_t [],real_t[]),
		void (*data)(int,real_t [],real_t [],int[]),
		void (*monitor)(int,int,int,real_t [],real_t [],int,int))
{
	int i,j,weight,ncol,nrow,away,max,nfe,nis,*cobs,
			first,sec,clean,nbpold,maxfe,fe,it,err,emergency;
	real_t eps1,res1,in3,in4,fac3,fac4,aux[4],*obs,*save,*tobs,
			**yp,*ymax,*y,**fy,**fp,w,**aid,temp,
			vv,ww,w2,mu,res2,fpar,fparpres,lambda,lambdamin,p,pw,
			reltolres,abstolres,em[8],*val,*b,*bb,*parpres,**jaco;
	static real_t save1[35]={1.0, 1.0, 9.0, 4.0, 0.0, 2.0/3.0, 1.0,
			1.0/3.0, 36.0, 20.25, 1.0, 6.0/11.0, 1.0, 6.0/11.0,
			1.0/11.0, 84.028, 53.778, 0.25, 0.48, 1.0, 0.7, 0.2,
			0.02, 156.25, 108.51, 0.027778, 120.0/274.0, 1.0,
			225.0/274.0, 85.0/274.0, 15.0/274.0, 1.0/274.0, 0.0,
			187.69, 0.0047361};

	nbpold=(*nbp);
	cobs=allocate_integer_vector(1,nobs);
	obs=allocate_real_vector(1,nobs);
	save=allocate_real_vector(-38,6*n);
	tobs=allocate_real_vector(0,nobs);
	ymax=allocate_real_vector(1,n);
	y=allocate_real_vector(1,6*n*(nbpold+m+1));
	yp=allocate_real_matrix(1,nbpold+nobs,1,nbpold+m);
	fy=allocate_real_matrix(1,n,1,n);
	fp=allocate_real_matrix(1,n,1,m+nbpold);
	aid=allocate_real_matrix(1,m+nbpold,1,m+nbpold);

	for (i=0; i<=34; i++) save[-38+i]=save1[i];
	(*data)(nobs,tobs,obs,cobs);
	weight=1;
	first=sec=0;
	clean=(*nbp > 0);
	aux[2]=FLT_EPSILON;
	eps1=1.0e10;
	out[1]=0.0;
	bp[0]=max=0;
	/* smooth integration without break-points */
	if (!peidefunct(nobs,m,par,res,
			n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp,
			save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean,deriv,
			jacdfdy,jacdfdp,callystart,monitor)) goto Escape;
	res1=sqrt(vecvec(1,nobs,0,res,res));
	nfe=1;
	if (in[5] == 1.0) {
		out[1]=1.0;
		goto Escape;
	}
	if (clean) {
		first=1;
		clean=0;
		fac3=sqrt(sqrt(in[3]/res1));
		fac4=sqrt(sqrt(in[4]/res1));
		eps1=res1*fac4;
		if (!peidefunct(nobs,m,par,res,
				n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp,
				save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean,deriv,
				jacdfdy,jacdfdp,callystart,monitor)) goto Escape;
		first=0;
	} else
		nfe=0;
	ncol=m+(*nbp);
	nrow=nobs+(*nbp);
	sec=1;
	in3=in[3];
	in4=in[4];
	in[3]=res1;
	weight=away=0;
	out[4]=out[5]=w=0.0;
	temp=sqrt(weight)+1.0;
	weight=temp*temp;
	while (weight != 16 && *nbp > 0) {
		if (away == 0 && w != 0.0) {
			/* if no break-points were omitted then one function
				function evaluation is saved */
			w=weight/w;
			for (i=nobs+1; i<=nrow; i++) {
				for (j=1; j<=ncol; j++) yp[i][j] *= w;
				res[i] *= w;
			}
			sec=1;
			nfe--;
		}
		in[3] *= fac3*weight;
		in[4]=eps1;
		(*monitor)(2,ncol,nrow,par,res,weight,nis);
		/* marquardt's method */
		val=allocate_real_vector(1,ncol);
		b=allocate_real_vector(1,ncol);
		bb=allocate_real_vector(1,ncol);
		parpres=allocate_real_vector(1,ncol);
		jaco=allocate_real_matrix(1,nrow,1,ncol);
		vv=10.0;
		w2=0.5;
		mu=0.01;
		ww = (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6];
		em[0]=em[2]=em[6]=in[0];
		em[4]=10*ncol;
		reltolres=in[3];
		abstolres=in[4]*in[4];
		maxfe=in[5];
		err=0;
		fe=it=1;
		p=fpar=res2=0.0;
		pw = -log(ww*in[0])/2.30;
		if (!peidefunct(nrow,ncol,par,res,
					n,m,nobs,nbp,first,&sec,&max,&nis,eps1,
					weight,bp,save,ymax,y,yp,fy,fp,cobs,tobs,obs,
					in,aux,clean,deriv,jacdfdy,jacdfdp,
					callystart,monitor))
			err=3;
		else {
			fpar=vecvec(1,nrow,0,res,res);
			out[3]=sqrt(fpar);
			emergency=0;
			it=1;
			do {
				dupmat(1,nrow,1,ncol,jaco,yp);
				i=qrisngvaldec(jaco,nrow,ncol,val,aid,em);
				if (it == 1)
					lambda=in[6]*vecvec(1,ncol,0,val,val);
				else
					if (p == 0.0) lambda *= w2;
				for (i=1; i<=ncol; i++)
					b[i]=val[i]*tamvec(1,nrow,i,jaco,res);
				while (1) {
					for (i=1; i<=ncol; i++)
						bb[i]=b[i]/(val[i]*val[i]+lambda);
					for (i=1; i<=ncol; i++)
						parpres[i]=par[i]-matvec(1,ncol,i,aid,bb);
					fe++;
					if (fe >= maxfe)
						err=1;
					else
						if (!peidefunct(nrow,ncol,parpres,res,
								n,m,nobs,nbp,first,&sec,&max,&nis,
								eps1,weight,bp,save,ymax,y,yp,fy,fp,
								cobs,tobs,obs,in,aux,clean,deriv,
								jacdfdy,jacdfdp,callystart,monitor))
							err=2;
					if (err != 0) {
						emergency=1;
						break;
					}
					fparpres=vecvec(1,nrow,0,res,res);
					res2=fpar-fparpres;
					if (res2 < mu*vecvec(1,ncol,0,b,bb)) {
						p += 1.0;
						lambda *= vv;
						if (p == 1.0) {
							lambdamin=ww*vecvec(1,ncol,0,val,val);
							if (lambda < lambdamin) lambda=lambdamin;
						}
						if (p >= pw) {
							err=4;
							emergency=1;
							break;
						}
					} else {
						dupvec(1,ncol,0,par,parpres);
						fpar=fparpres;
						break;
					}
				}
				if (emergency) break;
				it++;
			} while (fpar>abstolres &&
							res2>reltolres*fpar+abstolres);
			for (i=1; i<=ncol; i++)
				mulcol(1,ncol,i,i,jaco,aid,1.0/(val[i]+in[0]));
			for (i=1; i<=ncol; i++)
				for (j=1; j<=i; j++)
					aid[i][j]=aid[j][i]=mattam(1,ncol,i,j,jaco,jaco);
			lambda=lambdamin=val[1];
			for (i=2; i<=ncol; i++)
				if (val[i] > lambda)
					lambda=val[i];
				else
					if (val[i] < lambdamin) lambdamin=val[i];
			temp=lambda/(lambdamin+in[0]);
			out[7]=temp*temp;
			out[2]=sqrt(fpar);
			out[6]=sqrt(res2+fpar)-out[2];
		}
		out[4]=fe;
		out[5]=it-1;
		out[1]=err;
		free_real_vector(val,1);
		free_real_vector(b,1);
		free_real_vector(bb,1);
		free_real_vector(parpres,1);
		free_real_matrix(jaco,1,nrow,1);
		if (out[1] > 0.0) goto Escape;
		/* the relative starting value of lambda is adjusted
			to the last value of lambda used */
		away=out[4]-out[5]-1.0;
		in[6] *= pow(5.0,away)*pow(2.0,away-out[5]);
		nfe += out[4];
		w=weight;
		temp=sqrt(weight)+1.0;
		eps1=temp*temp*in[4]*fac4;
		away=0;
		/* omit useless break-points */
		for (j=1; j<=(*nbp); j++)
			if (fabs(obs[bp[j]]+res[bp[j]]-par[j+m]) < eps1) {
				(*nbp)--;
				for (i=j; i<=(*nbp); i++) bp[i]=bp[i+1];
				dupvec(j+m,(*nbp)+m,1,par,par);
				j--;
				away++;
				bp[*nbp+1]=0;
			}
		ncol -= away;
		nrow -= away;
		temp=sqrt(weight)+1.0;
		weight=temp*temp;
	}
	in[3]=in3;
	in[4]=in4;
	*nbp=0;
	weight=1;
	(*monitor)(2,m,nobs,par,res,weight,nis);
	/* marquardt's method */
	val=allocate_real_vector(1,m);
	b=allocate_real_vector(1,m);
	bb=allocate_real_vector(1,m);
	parpres=allocate_real_vector(1,m);
	jaco=allocate_real_matrix(1,nobs,1,m);
	vv=10.0;
	w2=0.5;
	mu=0.01;
	ww = (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6];
	em[0]=em[2]=em[6]=in[0];
	em[4]=10*m;
	reltolres=in[3];
	abstolres=in[4]*in[4];
	maxfe=in[5];
	err=0;
	fe=it=1;
	p=fpar=res2=0.0;
	pw = -log(ww*in[0])/2.30;
	if (!peidefunct(nobs,m,par,res,
				n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp,
				save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean,
				deriv,jacdfdy,jacdfdp,callystart,monitor))
		err=3;
	else {
		fpar=vecvec(1,nobs,0,res,res);
		out[3]=sqrt(fpar);
		emergency=0;
		it=1;
		do {
			dupmat(1,nobs,1,m,jaco,yp);
			i=qrisngvaldec(jaco,nobs,m,val,jtjinv,em);
			if (it == 1)
				lambda=in[6]*vecvec(1,m,0,val,val);
			else
				if (p == 0.0) lambda *= w2;
			for (i=1; i<=m; i++)
				b[i]=val[i]*tamvec(1,nobs,i,jaco,res);
			while (1) {
				for (i=1; i<=m; i++)
					bb[i]=b[i]/(val[i]*val[i]+lambda);
				for (i=1; i<=m; i++)
					parpres[i]=par[i]-matvec(1,m,i,jtjinv,bb);
				fe++;
				if (fe >= maxfe)
					err=1;
				else
					if (!peidefunct(nobs,m,parpres,res,
							n,m,nobs,nbp,first,&sec,&max,&nis,eps1,
							weight,bp,save,ymax,y,yp,fy,fp,cobs,tobs,
							obs,in,aux,clean,deriv,jacdfdy,jacdfdp,
							callystart,monitor))
						err=2;
				if (err != 0) {
					emergency=1;
					break;
				}
				fparpres=vecvec(1,nobs,0,res,res);
				res2=fpar-fparpres;
				if (res2 < mu*vecvec(1,m,0,b,bb)) {
					p += 1.0;
					lambda *= vv;
					if (p == 1.0) {
						lambdamin=ww*vecvec(1,m,0,val,val);
						if (lambda < lambdamin) lambda=lambdamin;
					}
					if (p >= pw) {
						err=4;
						emergency=1;
						break;
					}
				} else {
					dupvec(1,m,0,par,parpres);
					fpar=fparpres;
					break;
				}
			}
			if (emergency) break;
			it++;
		} while (fpar>abstolres && res2>reltolres*fpar+abstolres);
		for (i=1; i<=m; i++)
			mulcol(1,m,i,i,jaco,jtjinv,1.0/(val[i]+in[0]));
		for (i=1; i<=m; i++)
			for (j=1; j<=i; j++)
				jtjinv[i][j]=jtjinv[j][i]=mattam(1,m,i,j,jaco,jaco);
		lambda=lambdamin=val[1];
		for (i=2; i<=m; i++)
			if (val[i] > lambda)
				lambda=val[i];
			else
				if (val[i] < lambdamin) lambdamin=val[i];
		temp=lambda/(lambdamin+in[0]);
		out[7]=temp*temp;
		out[2]=sqrt(fpar);
		out[6]=sqrt(res2+fpar)-out[2];
	}
	out[4]=fe;
	out[5]=it-1;
	out[1]=err;
	free_real_vector(val,1);
	free_real_vector(b,1);
	free_real_vector(bb,1);
	free_real_vector(parpres,1);
	free_real_matrix(jaco,1,nobs,1);
	nfe += out[4];

	Escape:
	if (out[1] == 3.0)
		out[1]=2.0;
	else
		if (out[1] == 4.0) out[1]=6.0;
	if (save[-3] != 0.0) out[1]=save[-3];
	out[3]=res1;
	out[4]=nfe;
	out[5]=max;
	free_integer_vector(cobs,1);
	free_real_vector(obs,1);
	free_real_vector(save,-38);
	free_real_vector(tobs,0);
	free_real_vector(ymax,1);
	free_real_vector(y,1);
	free_real_matrix(yp,1,nbpold+nobs,1);
	free_real_matrix(fy,1,n,1);
	free_real_matrix(fp,1,n,1);
	free_real_matrix(aid,1,m+nbpold,1);
}
Пример #20
0
int peidefunct(int nrow, int ncol, real_t par[], real_t res[],
		int n, int m, int nobs, int *nbp, int first, int *sec,
		int *max, int *nis, real_t eps1, int weight, int bp[],
		real_t save[], real_t ymax[], real_t y[], real_t **yp,
		real_t **fy, real_t **fp, int cobs[], real_t tobs[],
		real_t obs[], real_t in[], real_t aux[], int clean,
		int (*deriv)(int,int,real_t [],real_t [],real_t,real_t []),
		int (*jacdfdy)(int,int,real_t [],real_t [],real_t,real_t **),
		int (*jacdfdp)(int,int,real_t [],real_t [],real_t,real_t **),
		void (*callystart)(int,int,real_t [],real_t [],real_t[]),
		void (*monitor)(int,int,int,real_t [],real_t [],int,int))
{
	/* this function is internally used by PEIDE */

	void peidereset(int, int, real_t, real_t, real_t, real_t, real_t [],
				real_t [], real_t *, real_t *, real_t *, int *);
	void peideorder(int, int, real_t, real_t [], real_t [],
			real_t *, real_t *, real_t *, real_t *, real_t *, int *);
	void peidestep(int, int, int, real_t, real_t, real_t, real_t,
			real_t [], real_t [], real_t [], real_t [], int *, real_t *);
	real_t peideinterpol(int, int, int, real_t, real_t []);
	int l,k,knew,fails,same,kpold,n6,nnpar,j5n,cobsii,*p,evaluate,
			evaluated,decompose,conv,extra,npar,i,j,jj,ii;
	real_t xold,hold,a0,tolup,tol,toldwn,tolconv,h,ch,chnew,error,
			dfi,tobsdif,a[6],*delta,*lastdelta,*df,*y0,**jacob,xend,
			hmax,hmin,eps,s,aa,x,t,c;

	p=allocate_integer_vector(1,n);
	delta=allocate_real_vector(1,n);
	lastdelta=allocate_real_vector(1,n);
	df=allocate_real_vector(1,n);
	y0=allocate_real_vector(1,n);
	jacob=allocate_real_matrix(1,n,1,n);

	if (*sec) {
		*sec=0;
		goto Finish;
	}
	xend=tobs[nobs];
	eps=in[2];
	npar=m;
	extra=(*nis)=0;
	ii=1;
	jj = (*nbp == 0) ? 0 : 1;
	n6=n*6;
	inivec(-3,-1,save,0.0);
	inivec(n6+1,(6+m)*n,y,0.0);
	inimat(1,nobs+(*nbp),1,m+(*nbp),yp,0.0);
	t=tobs[1];
	x=tobs[0];
	(*callystart)(n,m,par,y,ymax);
	hmax=tobs[1]-tobs[0];
	hmin=hmax*in[1];
	/* evaluate jacobian */
	evaluate=0;
	decompose=evaluated=1;
	if (!(*jacdfdy)(n,m,par,y,x,fy)) {
		save[-3]=4.0;
		goto Finish;
	}
	nnpar=n*npar;

	Newstart:
	k=1;
	kpold=0;
	same=2;
	peideorder(n,k,eps,a,save,&tol,&tolup,&toldwn,&tolconv,
					&a0,&decompose);
	if (!(*deriv)(n,m,par,y,x,df)) {
		save[-3]=3.0;
		goto Finish;
	}
	s=FLT_MIN;
	for (i=1; i<=n; i++) {
		aa=matvec(1,n,i,fy,df)/ymax[i];
		s += aa*aa;
	}
	h=sqrt(2.0*eps/sqrt(s));
	if (h > hmax)
		h=hmax;
	else
		if (h < hmin) h=hmin;
	xold=x;
	hold=h;
	ch=1.0;
	for (i=1; i<=n; i++) {
		save[i]=y[i];
		save[n+i]=y[n+i]=df[i]*h;
	}
	fails=0;
	while (x < xend) {
		if (x+h <= xend)
			x += h;
		else {
			h=xend-x;
			x=xend;
			ch=h/hold;
			c=1.0;
			for (j=n; j<=k*n; j += n) {
				c *= ch;
				for (i=j+1; i<=j+n; i++) y[i] *= c;
			}
			same = (same < 3) ? 3 : same+1;
		}
		/* prediction */
		for (l=1; l<=n; l++) {
			for (i=l; i<=(k-1)*n+l; i += n)
				for (j=(k-1)*n+l; j>=i; j -= n) y[j] += y[j+n];
			delta[l]=0.0;
		}
		evaluated=0;
		/* correction and estimation local error */
		for (l=1; l<=3; l++) {
			if (!(*deriv)(n,m,par,y,x,df)) {
				save[-3]=3;
				goto Finish;
			}
			for (i=1; i<=n; i++) df[i]=df[i]*h-y[n+i];
			if (evaluate) {
				/* evaluate jacobian */
				evaluate=0;
				decompose=evaluated=1;
				if (!(*jacdfdy)(n,m,par,y,x,fy)) {
					save[-3]=4.0;
					goto Finish;
				}
			}
			if (decompose) {
				/* decompose jacobian */
				decompose=0;
				c = -a0*h;
				for (j=1; j<=n; j++) {
					for (i=1; i<=n; i++) jacob[i][j]=fy[i][j]*c;
					jacob[j][j] += 1.0;
				}
				dec(jacob,n,aux,p);
			}
			sol(jacob,n,p,df);
			conv=1;
			for (i=1; i<=n; i++) {
				dfi=df[i];
				y[i] += a0*dfi;
				y[n+i] += dfi;
				delta[i] += dfi;
				conv=(conv && (fabs(dfi) < tolconv*ymax[i]));
			}
			if (conv) {
				s=FLT_MIN;
				for (i=1; i<=n; i++) {
					aa=delta[i]/ymax[i];
					s += aa*aa;
				}
				error=s;
				break;
			}
		}
		/* acceptance or rejection */
		if (!conv) {
			if (!evaluated)
				evaluate=1;
			else {
				ch /= 4.0;
				if (h < 4.0*hmin) {
					save[-1] += 10.0;
					hmin /= 10.0;
					if (save[-1] > 40.0) goto Finish;
				}
			}
			peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x,
							&h,&decompose);
		} else if (error > tol) {
			fails++;
			if (h > 1.1*hmin) {
				if (fails > 2) {
					peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x,
								&h,&decompose);
					goto Newstart;
				} else {
					/* calculate step and order */
					peidestep(n,k,fails,tolup,toldwn,tol,error,delta,
								lastdelta,y,ymax,&knew,&chnew);
					if (knew != k) {
						k=knew;
						peideorder(n,k,eps,a,save,&tol,&tolup,
									&toldwn,&tolconv,&a0,&decompose);
					}
					ch *= chnew;
					peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x,
								&h,&decompose);
				}
			} else {
				if (k == 1) {
					/* violate eps criterion */
					save[-2] += 1.0;
					same=4;
					goto Errortestok;
				}
				k=1;
				peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x,
							&h,&decompose);
				peideorder(n,k,eps,a,save,&tol,&tolup,
							&toldwn,&tolconv,&a0,&decompose);
				same=2;
			}
		} else {
			Errortestok:
			fails=0;
			for (i=1; i<=n; i++) {
				c=delta[i];
				for (l=2; l<=k; l++) y[l*n+i] += a[l]*c;
				if (fabs(y[i]) > ymax[i]) ymax[i]=fabs(y[i]);
			}
			same--;
			if (same == 1)
				dupvec(1,n,0,lastdelta,delta);
			else if (same == 0) {
				/* calculate step and order */
				peidestep(n,k,fails,tolup,toldwn,tol,error,delta,
							lastdelta,y,ymax,&knew,&chnew);
				if (chnew > 1.1) {
					if (k != knew) {
						if (knew > k)
							mulvec(knew*n+1,knew*n+n,-knew*n,y,delta,
									a[k]/knew);
						k=knew;
						peideorder(n,k,eps,a,save,&tol,&tolup,
									&toldwn,&tolconv,&a0,&decompose);
					}
					same=k+1;
					if (chnew*h > hmax) chnew=hmax/h;
					h *= chnew;
					c=1.0;
					for (j=n; j<=k*n; j += n) {
						c *= chnew;
						mulvec(j+1,j+n,0,y,y,c);
					}
					decompose=1;
				} else
					same=10;
			}
			(*nis)++;
			/* start of an integration step of yp */
			if (clean) {
				hold=h;
				xold=x;
				kpold=k;
				ch=1.0;
				dupvec(1,k*n+n,0,save,y);
			} else {
				if (h != hold) {
					ch=h/hold;
					c=1.0;
					for (j=n6+nnpar; j<=kpold*nnpar+n6; j += nnpar) {
						c *= ch;
						for (i=j+1; i<=j+nnpar; i++) y[i] *= c;
					}
					hold=h;
				}
				if (k > kpold)
					inivec(n6+k*nnpar+1,n6+k*nnpar+nnpar,y,0.0);
				xold=x;
				kpold=k;
				ch=1.0;
				dupvec(1,k*n+n,0,save,y);
				/* evaluate jacobian */
				evaluate=0;
				decompose=evaluated=1;
				if (!(*jacdfdy)(n,m,par,y,x,fy)) {
					save[-3]=4.0;
					goto Finish;
				}
				/* decompose jacobian */
				decompose=0;
				c = -a0*h;
				for (j=1; j<=n; j++) {
					for (i=1; i<=n; i++) jacob[i][j]=fy[i][j]*c;
					jacob[j][j] += 1.0;
				}
				dec(jacob,n,aux,p);
				if (!(*jacdfdp)(n,m,par,y,x,fp)) {
					save[-3]=5.0;
					goto Finish;
				}
				if (npar > m) inimat(1,n,m+1,npar,fp,0.0);
				/* prediction */
				for (l=0; l<=k-1; l++)
					for (j=k-1; j>=l; j--)
						elmvec(j*nnpar+n6+1,j*nnpar+n6+nnpar,nnpar,
									y,y,1.0);
				/* correction */
				for (j=1; j<=npar; j++) {
					j5n=(j+5)*n;
					dupvec(1,n,j5n,y0,y);
					for (i=1; i<=n; i++)
						df[i]=h*(fp[i][j]+matvec(1,n,i,fy,y0))-
									y[nnpar+j5n+i];
					sol(jacob,n,p,df);
					for (l=0; l<=k; l++) {
						i=l*nnpar+j5n;
						elmvec(i+1,i+n,-i,y,df,a[l]);
					}
				}
			}
			while (x >= t) {
				/* calculate a row of the jacobian matrix and an
					element of the residual vector */
				tobsdif=(tobs[ii]-x)/h;
				cobsii=cobs[ii];
				res[ii]=peideinterpol(cobsii,n,k,tobsdif,y)-obs[ii];
				if (!clean) {
					for (i=1; i<=npar; i++)
						yp[ii][i]=peideinterpol(cobsii+(i+5)*n,nnpar,k,
														tobsdif,y);
					/* introducing break-points */
					if (bp[jj] != ii) {
					} else if (first && fabs(res[ii]) < eps1) {
						(*nbp)--;
						for (i=jj; i<=(*nbp); i++) bp[i]=bp[i+1];
						bp[*nbp+1]=0;
					} else {
						extra++;
						if (first) par[m+jj]=obs[ii];
						/* introducing a jacobian row and a residual
							vector element for continuity requirements */
						yp[nobs+jj][m+jj] = -weight;
						mulrow(1,npar,nobs+jj,ii,yp,yp,weight);
						res[nobs+jj]=weight*(res[ii]+obs[ii]-par[m+jj]);
					}
				}
				if (ii == nobs)
					goto Finish;
				else {
					t=tobs[ii+1];
					if (bp[jj] == ii && jj < *nbp) jj++;
					hmax=t-tobs[ii];
					hmin=hmax*in[1];
					ii++;
				}
			}
			/* break-points introduce new initial values for y & yp */
			if (extra > 0) {
				for (i=1; i<=n; i++) {
					y[i]=peideinterpol(i,n,k,tobsdif,y);
					for (j=1; j<=npar; j++)
						y[i+(j+5)*n]=peideinterpol(i+(j+5)*n,nnpar,
															k,tobsdif,y);
				}
				for (l=1; l<=extra; l++) {
					cobsii=cobs[bp[npar-m+l]];
					y[cobsii]=par[npar+l];
					for (i=1; i<=npar+extra; i++) y[cobsii+(5+i)*n]=0.0;
					inivec(1+nnpar+(l+5)*n,nnpar+(l+6)*n,y,0.0);
					y[cobsii+(5+npar+l)*n]=1.0;
				}
				npar += extra;
				extra=0;
				x=tobs[ii-1];
				/* evaluate jacobian */
				evaluate=0;
				decompose=evaluated=1;
				if (!(*jacdfdy)(n,m,par,y,x,fy)) {
					save[-3]=4.0;
					goto Finish;
				}
				nnpar=n*npar;
				goto Newstart;
			}
		}
	}
	Finish:
	if (save[-2] > *max) *max=save[-2];
	if (!first) (*monitor)(1,ncol,nrow,par,res,weight,*nis);
	free_integer_vector(p,1);
	free_real_vector(delta,1);
	free_real_vector(lastdelta,1);
	free_real_vector(df,1);
	free_real_vector(y0,1);
	free_real_matrix(jacob,1,n,1);
	return (save[-1] <= 40.0 && save[-3] == 0.0);
}
Пример #21
0
void semip_gc(

		//Output Arguments

        double *bdraw, // draws for beta (ndraw - nomit,k) matrix
		double *adraw, // draws for regional effects, a, (m,1) vector
        double *pdraw, // draws for rho (ndraw - nomit,1) vector
		double *sdraw, // draws for sige (ndraw - nomit,1) vector
		double *rdraw, // draws for rval (ndraw - nomit,1) vector (if mm != 0)
		double *vmean, // mean of vi draws (n,1) vector			
        double *amean, // mean of a-draws (m,1) vector
        double *zmean, // mean of latent z-draws (n,1) vector
        double *yhat,  // mean of posterior predicted y (n,1) vector
            
		//Input Arguments

		double *y,		// (n,1) lhs vector with 0,1 values
 		double *x,		// (n,k) matrix of explanatory variables   
 		double *W,		// (m,m) weight matrix
        int ndraw,		// # of draws
        int nomit,		// # of burn-in draws to omit
		int nsave,		// # of draws saved (= ndraw - nomit)
        int n,			// # of observations
		int k,			// # of explanatory variables
        int m,			// # of regions
        int *mobs,		// (m,1) vector of obs numbers in each region
		double *a,		// (m,1) vector of regional effects
		double nu,		// prior parameter for sige	 
        double d0,		// prior parameter for sige
        double rval,	// hyperparameter r            
        double mm,		// prior parameter for rval
        double kk,		// prior parameter for rval
        double *detval, // (ngrid,2) matrix with [rho , log det values]            
        int ngrid,		// # of values in detval (rows)
        double *TI,		// prior var-cov for beta (inverted in matlab)
		double *TIc)	// prior var-cov * prior mean	

{
    
// Local Variables

	int i,j,l,iter,invt,accept,rcount,cobs,obsi,zflag;
    double rmin,rmax,sige,chi,ee,ratio,p,rho,rho_new;
	double junk,aBBa,vsqrt,ru,rhox,rhoy,phi,awi,aw2i,bi,di;
    double *z,*tvec,*e,*e0,*xb,*mn,*ys,*limit1,*limit2,*zmt;
	double *bhat,*b0,*b0tmp,*v,*vv,*b1,*b1tmp,*Bpa,*rdet,*ldet,*w1;
	double **xs,**xst,**xsxs,**A0,**A1,**Bpt,**Bp,**xmat,**W2; 	
	double *Wa, **Wmat;
	double c0, c1,c2;

		   
// Allocate Matrices and Vectors


	  xs     = dmatrix(0,n-1,0,k-1);	  
	  xst    = dmatrix(0,k-1,0,n-1);      
	  xsxs   = dmatrix(0,k-1,0,k-1);
	  A0     = dmatrix(0,k-1,0,k-1);	  
	  A1     = dmatrix(0,m-1,0,m-1);
	  Bp     = dmatrix(0,m-1,0,m-1);
	  Bpt    = dmatrix(0,m-1,0,m-1);
	  xmat	 = dmatrix(0,n-1,0,k-1);
	  W2     = dmatrix(0,m-1,0,m-1);
	  Wmat   = dmatrix(0,m-1,0,m-1);

	   z     = dvector(0,n-1);
	   tvec  = dvector(0,n-1);
	   e     = dvector(0,n-1);
	   e0    = dvector(0,n-1);
	   xb    = dvector(0,n-1);
	   mn    = dvector(0,n-1);
	   ys    = dvector(0,n-1);      
      limit1 = dvector(0,n-1);			
	  limit2 = dvector(0,n-1);
	  zmt	 = dvector(0,n-1);
	  vv	 = dvector(0,n-1);
	  bhat   = dvector(0,k-1);
	  b0     = dvector(0,k-1);
	  b0tmp  = dvector(0,k-1);
	  v      = dvector(0,m-1);
	  b1     = dvector(0,m-1);
	  b1tmp  = dvector(0,m-1);
	  Bpa    = dvector(0,m-1);
	  w1     = dvector(0,m-1);
	  rdet   = dvector(0,ngrid-1);
	  ldet   = dvector(0,ngrid-1);	 
	  Wa     = dvector(0,m-1);
   
// Initializations


    junk = 0.0;  // Placeholder for mean in meanvar()
  	rho = 0.5;  
	sige = 1.0;
	zflag = 0;  // a flag for 0,1 y-values

// Initialize (z,vv,limits,tvec) 

	for(i=0; i<n; i++){
        z[i] = y[i];
		vv[i] = 1.0;
		tvec[i] = 1.0;
	    if (y[i] == 0.0){
			limit1[i] = -10000;
			limit2[i] = 0.0;
			zflag = 1; // we have 0,1 y-values so sample z
	    }else{
			limit1[i] = 0.0;
			limit2[i] = 10000;
		}
	}
        
	
	// Initialize v, b1tmp, Bp


	for(i=0; i<m; i++){
		v[i] = 1.0;
	    b1tmp[i] = 1.0;		                 
        for(j=0; j<m; j++){
              if (j == i){
                 Bp[i][j] = 1 - rho*W[i + j*m];
			  }else{
                 Bp[i][j] = - rho*W[i + j*m];
              }
              Wmat[i][j] = W[i + j*m];
		}
	}


	
	// Parse detval into rdet and ldet vectors 
	//       and define (rmin,rmax)
	
	for(i=0; i<ngrid; i++){
	    j=0;
		rdet[i] = detval[i + j*ngrid];
	    j=1;
		ldet[i] = detval[i + j*ngrid];
	}
	
	rmin = rdet[0];
    rmax = rdet[ngrid-1];  

	
	// Put x into xmat

	for(i=0; i<n; i++){
		for(j=0; j<k; j++)
            xmat[i][j] = x[i + j*n];
	}


	// Compute matrices to be used for updating a-vector

	if(m > 100){  // Used only for large m

		for(i=0; i<m; i++){

			w1[i] = 0.0;							// Form w1[i]
			for(j=0;j<m;j++){
				w1[i] = w1[i] + (W[j + i*m]*W[j + i*m]);
			}

			for(j=0;j<m;j++){						// Form W2[i][*]
				W2[i][j] = 0.0;
				if(j != i){					
					for(l=0;l<m;l++){
						W2[i][j] = W2[i][j] + W[l + i*m]*W[l + j*m];
					}
				}		// Note that W2[i][i] = 0.0 by construction
			}			
		}
	} // End if(m > 10)



// ======================================
// Start the Sampler
// ======================================

	for(iter=0; iter<ndraw; iter++){


// UPDATE: beta

// (1) Apply stnd devs (vsqrt) to x, z, z - tvec
	
	for(i=0; i<n; i++){
		vsqrt = sqrt(vv[i]);
	    zmt[i] = (z[i] - tvec[i])/vsqrt;
		for(j=0; j<k; j++)
			xs[i][j] = x[i + j*n]/vsqrt;
	}
	      

//  (2) Construct A0 matrix


	transpose(xs,n,k,xst);				// form xs'
	  matmat(xst,k,n,xs,k,xsxs);		// form xs'*xs
      for(i=0; i<k; i++){				// form xs'*xs + TI
            for(j=0; j<k; j++)
                 A0[i][j] = xsxs[i][j] + TI[i + j*k];
          }      

      invt = inverse(A0, k);			// replace A0 = inv(A0)
	  if (invt != 1)
		 mexPrintf("semip_gc: Inversion error in beta conditional \n");

// (3) Construct b0 vector

	matvec(xst,k,n,zmt,b0tmp);			//form xs'*zmt
      for(i=0; i<k; i++){
		b0tmp[i] = b0tmp[i] + TIc[i];		//form b0tmp = xs'*zmt + TIc
      }
	matvec(A0,k,k,b0tmp,b0);				//form b0 = A0*b0tmp

// (4) Do multivariate normal draw from N(b0,A0)

	normal_rndc(A0, k, bhat);			// generate N(0,A0) vector
	for(i=0; i<k; i++){
		bhat[i] = bhat[i] + b0[i];		// add mean vector, b0
	}

// (5) Now update related values:


	matvec(xmat,n,k,bhat,xb);			// form xb = xmat*bhat


	for(i=0; i<n; i++){					// form e0 = z - x*bhat
		e0[i] = z[i] - xb[i];	
	}

	cobs = 0;
	for(i=0; i<m; i++){					// form b1tmp = e0i/v[i]
		obsi = mobs[i];
		b1tmp[i] = 0.0;
		for(j=cobs; j<cobs + obsi; j++){
			b1tmp[i] = b1tmp[i] + (e0[j]/v[i]);	
		}
		cobs = cobs + obsi;
	}
	
        

// UPDATE: a 
	
	if(m <= 100){  // For small m use straight inverse

		// (1) Define A1 and b1

		transpose(Bp,m,m,Bpt);			    // form Bp'
		matmat(Bpt,m,m,Bp,m,A1);			// form Bp'*Bp
	
		for(i=0; i<m; i++){					// form A1 = (1/sige)*Bp'Bp + diag(mobs/v)
			for(j=0; j<m; j++){
				if (j == i){
					A1[i][j] = (1/sige)*A1[i][j] + ((double)mobs[i])/v[i];
				}else{
					A1[i][j] = (1/sige)*A1[i][j];
				}
			}
		}
	

		inverse(A1,m);						// set A1 = inv(A1)
	
		matvec(A1,m,m,b1tmp,b1);			// form b1

	
		// (2) Do multivariate normal draw from N(b1,A1)

		normal_rndc(A1,m,a);				// generate N(0,A1) vector
		for(i=0; i<m; i++){
			a[i] = a[i] + b1[i];			// add mean vector, b1
		}

	}else{   // For large m use marginal distributions

		cobs = 0;
		for(i=0;i<m;i++){			
			obsi = mobs[i];
			
			phi = 0.0;
			for(j=cobs;j<cobs+obsi;j++){
				phi = phi + ((z[j] - xb[j])/v[i]);  // form phi
			}		
			awi = 0.0;
			aw2i = 0.0;
			for(j=0;j<m;j++){						// form awi and aw2i
				aw2i = aw2i + W2[i][j]*a[j];		// (W2[i][i] = 0)	
				if(j != i){
					awi = awi + (W[i + j*m] + W[j + i*m])*a[j];
				}
			}
			bi = phi + (rho/sige)*awi - ((rho*rho)/sige)*aw2i;
			di = (1/sige) + ((rho*rho)/sige)*w1[i] + (obsi/v[i]);		
			a[i] = (bi/di) + sqrt(1/di)*snorm();	// Form a[i]           

			cobs = cobs + obsi;
		}
	} // End update of a
	


	// Compute tvec = del*a	
	cobs = 0;
	for(i=0; i<m; i++){
		obsi = mobs[i];
		for(j=cobs; j<cobs + obsi; j++){
			tvec[j] = a[i];
		}
		cobs = cobs + obsi;
	}

//UPDATE: sige

	matvec(Bp,m,m,a,Bpa);			//form Bp*a
	aBBa = 0.0;						//form aBBa = a'*Bp'*Bp*a
	for(i=0; i<m; i++){		
		aBBa = aBBa + Bpa[i]*Bpa[i];
	}
	
	chi = genchi(m + 2.0*nu);

	sige = (aBBa + 2.0*d0)/chi;

	
//UPDATE: v (and vv = del*v)	
	

	for(i=0; i<n; i++){
		e[i] = e0[i] - tvec[i];		//form e  = z - x*bhat - tvec
	}
	cobs = 0;
	for(i=0; i<m; i++){
		obsi = mobs[i];
		chi = genchi(rval + obsi);
		ee = 0.0;
		for(j=cobs; j<cobs + obsi; j++){		// form ee
			ee = ee + e[j]*e[j];
		}
		v[i] = (ee + rval)/chi;					// form v
		for(j=cobs; j<cobs + obsi; j++){
			vv[j] = v[i];						// form vv
		}
		cobs = cobs + obsi;
	}
	


//UPDATE: rval (if necessary)

	if (mm != 0.0){
    rval = gengam(mm,kk);
    }

//UPDATE: rho (using univariate integration)

	      matvec(Wmat,m,m,a,Wa); // form Wa vector

		  	c0 = 0.0;						// form a'*a
		  	c1 = 0.0;                       // form a'*Wa
		  	c2 = 0.0;                       // form (Wa)'*Wa
	        for(i=0; i<m; i++){		
		    c0 = c0 + a[i]*a[i];
		    c1 = c1 + a[i]*Wa[i];
		    c2 = c2 + Wa[i]*Wa[i];
	        }

	        rho = draw_rho(rdet,ldet,c0,c1,c2,sige,ngrid,m,k,rho);



// (4) Update Bp matrix using new rho

	 
	for(i=0; i<m; i++){		                 
        for(j=0; j<m; j++){
              if (j == i){
                 Bp[i][j] = 1 - rho*W[i + j*m];
			  }else{
                 Bp[i][j] = - rho*W[i + j*m];
              }
		}
	}

	 
//UPDATE: z 
if (zflag == 1){ // skip this for continuous y-values
	 // (1) Generate vector of means

	 cobs = 0;
	 for(i=0; i<m; i++){
		 obsi = mobs[i];
		 for(j=cobs; j<cobs + obsi; j++){
			 mn[j] = xb[j] + a[i];
		 }			 		 
		 cobs = cobs + obsi;		 
	 }
	 

	 // (2) Sample truncated normal for latent z values

	 normal_truncc(n,mn,vv,limit1,limit2,z);

	 // (3) Compute associated sample y vector: ys

	 for(i=0; i<n; i++){
		 if(z[i]<0){
			 ys[i] = 0.0;
		 }else{
			 ys[i] = 1.0;
		 }
	 }

} // end of if zflag == 1


// ===================
// Save sample draws 	
// ===================


	 
	 if (iter > nomit-1){
		 pdraw[iter - nomit] = rho;							//save rho-draws
		 sdraw[iter - nomit] = sige;					    //save sige-draws	 
		 if(mm != 0.0)
			rdraw[iter - nomit] = rval;						//save r-draws (if necessary)
	     for(j=0; j<k; j++)
			bdraw[(iter - nomit) + j*nsave] = bhat[j];		//save beta-draws
		 for(i=0; i<m; i++){
			vmean[i] = vmean[i] + v[i]/((double) (nsave));	//save mean v-draws
			adraw[(iter - nomit) + i*nsave] = a[i];			//save a-draws
			amean[i] = amean[i] + a[i]/((double) (nsave));	//save mean a-draws
		 }
	     for(i=0; i<n; i++){
			zmean[i] = zmean[i] + z[i]/((double) (nsave));	//save mean z-draws
			yhat[i] = yhat[i] + mn[i]/((double) (nsave));   //save mean y-values
		 }
	 }

	 } // End iteration loop
	 

// ===============================
// END SAMPLER
// ===============================


// Free up allocated vectors

	free_dmatrix(xs,0,n-1,0);	  	
	free_dmatrix(xst,0,k-1,0);       
	free_dmatrix(xsxs,0,k-1,0);
	free_dmatrix(A0,0,k-1,0);
	free_dmatrix(A1,0,m-1,0);  
	free_dmatrix(Bp,0,m-1,0);  
    free_dmatrix(Bpt,0,m-1,0);
	free_dmatrix(W2,0,m-1,0);
	free_dmatrix(xmat,0,n-1,0);
	free_dmatrix(Wmat,0,m-1,0);


	free_dvector(z,0);
	free_dvector(tvec,0);   
	free_dvector(e,0);   
	free_dvector(e0,0);
	free_dvector(xb,0);
	free_dvector(mn,0);
	free_dvector(ys,0);      
    free_dvector(limit1,0);			
	free_dvector(limit2,0);
	free_dvector(zmt,0);
	free_dvector(vv,0);
	free_dvector(bhat,0);
	free_dvector(b0,0);
	free_dvector(b0tmp,0);
	free_dvector(v,0);
	free_dvector(b1,0);
	free_dvector(b1tmp,0);
	free_dvector(Bpa,0);
	free_dvector(rdet,0);
	free_dvector(ldet,0);
	free_dvector(w1,0);
	free_dvector(Wa,0);


} // End of semip_gc
Пример #22
0
/*-------------------------------------------------------------------------------
 calculate the least squares solution of an overdetermined system of nonlinear equations
 with Marquardt's method 

-------------------------------------------------------------------------------*/
void Ti_Optimization::MarquardtforCylinderFitting(
			   int m,
			   int n, 
			   double**g_pnt,
			   double* const par, 
			   double*& g, 
			   double**v,
			   int (*funct)(int m, int n, double* const par, double* g,double**g_pnt),
			   void (*jacobian)(int m, int n, double* const par, double*& g, double **jac,double**g_pnt),
			   double in[], 
			   double out[]
			   )
{
	int		maxfe,fe,it,i,j,err,emergency;
	double	vv,ww,w,mu,res,fpar,fparpres,lambda,lambdamin,p,pw,reltolres,
			abstolres,em[8],*val,*b,*bb,*parpres,**jac,temp;

	val		= allocate_real_vector(1,n);
	b		= allocate_real_vector(1,n);
	bb		= allocate_real_vector(1,n);
	parpres = allocate_real_vector(1,n);
	jac		= allocate_real_matrix(1,m,1,n);
	assert( (val != NULL) &&
		    (b   != NULL) &&
			(bb  != NULL) &&
			(parpres!= NULL)&&
			(jac != NULL)
			);
	vv      = 10.0;
	w		= 0.5;
	mu		= 0.01;
	ww		= (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6];
	em[0] = em[2] = em[6] = in[0];
	em[4] = 10*n;
	reltolres =in[3];
	abstolres=in[4]*in[4];
	maxfe=(int)in[5];
	err=0;
	fe=it=1;
	p=fpar=res=0.0;
	pw = -log(ww*in[0])/2.30;
	if (!(*funct)(m,n,par,g,g_pnt))
	{
		err=3;
		out[4]=fe;
		out[5]=it-1;
		out[1]=err;
		free_real_vector(val,1);
		free_real_vector(b,1);
		free_real_vector(bb,1);
		free_real_vector(parpres,1);
		free_real_matrix(jac,1,m,1);
		return;
	}
	fpar=vecvec(1,m,0,g,g);// norm of residual vector
	out[3]=sqrt(fpar);
	emergency=0;
	it=1;
	do {
 		(*jacobian)(m,n,par,g,jac,g_pnt);
		i = qrisngvaldec(jac,m,n,val,v,em);
		if (it == 1)
			lambda = in[6]*vecvec(1,n,0,val,val);
		else
			if (p == 0.0)
				lambda *= w;
		for (i=1; i<=n; i++) 
			b[i] = val[i]*tamvec(1,m,i,jac,g);
		while (1)
		{
			for (i=1; i<=n; i++) 
				bb[i]=b[i]/(val[i]*val[i]+lambda);
			for (i=1; i<=n; i++)
				parpres[i]=par[i]-matvec(1,n,i,v,bb);

	     	//normalization ,this section only used for cylinder fitting, 
			//when it is used in other situations,it should be removed
			  temp = sqrt(parpres[4]*parpres[4]+parpres[5]*parpres[5]+parpres[6]*parpres[6]);
			  parpres[4] /= temp;
			  parpres[5] /= temp;
			  parpres[6] /= temp;
            //end normalization
			fe++;
			if (fe >= maxfe)
				err=1;
			else
				if (!(*funct)(m,n,parpres,g,g_pnt))
					err=2;
			if (err != 0) 
			{
				emergency = 1;
				break;
			}
			fparpres=vecvec(1,m,0,g,g);
			res=fpar-fparpres;
			if (res < mu*vecvec(1,n,0,b,bb)) 
			{
				p += 1.0;
				lambda *= vv;
				if (p == 1.0) 
				{
					lambdamin=ww*vecvec(1,n,0,val,val);
					if (lambda < lambdamin)
						lambda=lambdamin;
				}
				if (p >= pw)
				{
					err=4;
					emergency=1;
					break;
				}
			} // end if
			else 
			{

				dupvec(1,n,0,par,parpres);
				fpar=fparpres;
				break;
			} // end else
		} // end while
		if (emergency) 
			break;
		it++;
	} 
	while ( 
			(fpar > abstolres) &&   
		    (res > reltolres*fpar+abstolres)
		  );

	for (i=1; i<=n; i++) 
		mulcol(1,n,i,i,jac,v,1.0/(val[i]+in[0]));
	for (i=1; i<=n; i++)
	{
		for (j=1; j<=i; j++) 
			v[i][j]=v[j][i]=mattam(1,n,i,j,jac,jac);
		lambda=lambdamin=val[1];
	}
	for (i=2; i<=n; i++)
	{
		if (val[i] > lambda)
			lambda=val[i];
		else
		{
			if (val[i] < lambdamin) 
				lambdamin=val[i];
		}
	}
	temp=lambda/(lambdamin+in[0]);
	out[7]=temp*temp;
	out[2]=sqrt(fpar);
	out[6]=sqrt(res+fpar)-out[2];
	out[4]=fe;
	out[5]=it-1;
	out[1]=err;

    if(val != NULL)
	{
		free_real_vector(val,1);
		val = NULL;
	}
	if (b != NULL)
	{
		free_real_vector(b,1);
		b = NULL;
	}
	if(bb!=NULL)
	{
		free_real_vector(bb,1);
		bb  = NULL;
	}
	if(parpres != NULL)
	{
		free_real_vector(parpres,1);
		parpres = NULL;
	}
	if (jac != NULL)
	{
		free_real_matrix(jac,1,m,1);
		jac = NULL;
	}
}