void commatvec(int l, int u, int i, real_t **ar, real_t **ai, real_t br[], real_t bi[], real_t *rr, real_t *ri) { real_t matvec(int, int, int, real_t **, real_t []); real_t mv; mv=matvec(l,u,i,ar,br)-matvec(l,u,i,ai,bi); *ri=matvec(l,u,i,ai,br)+matvec(l,u,i,ar,bi); *rr=mv; }
int main() { double **mat; double vec1[MAX]; double vec2[MAX]; double *vec3; int i=0,j=0; mat = (double **)malloc(sizeof(double *) * MAX); for(i=0;i<MAX;i++) { // Initializing the vector and matrix with some arbitrary values vec1[i] = 2*i + 3; mat[i] = (double *)malloc(MAX * sizeof(double)); for(j=0;j<MAX;j++) mat[i][j] = i*j + 3; } vec3 = matvec(mat,vec1,vec2,MAX); for(i=0;i<MAX;i++) { printf("C[%d] = %f\n",i,vec3[i]); free(mat[i]); } free(mat); return 1; }
int main() { double execTime = 0.0; double startTime, endTime; int k, size1, size2; // Tell the compiler to align the a, b, and x arrays on 16-byte // boundaries. This allows the vectorizer to use aligned instructions // and produce faster code. #ifdef ALIGNED #ifdef _WIN32 _declspec(align(16)) FTYPE a[ROW][COLWIDTH]; _declspec(align(16)) FTYPE b[ROW]; _declspec(align(16)) FTYPE x[COLWIDTH]; #else FTYPE a[ROW][COLWIDTH] __attribute__((aligned(16))); FTYPE b[ROW] __attribute__((aligned(16))); FTYPE x[COLWIDTH] __attribute__((aligned(16))); #endif // _WIN32 #else FTYPE a[ROW][COLWIDTH]; FTYPE b[ROW]; FTYPE x[COLWIDTH]; #endif size1 = ROW; size2 = COLWIDTH; printf("\nROW:%d COL: %d\n",ROW,COLWIDTH); // initialize the arrays with data init_matrix(ROW,COL,1,a); init_array(COL,3,x); //start timing the matrix multiply code startTime = clock_it(); for (k = 0;k < REPEATNTIMES;k++) { #ifdef NOFUNCCALL int i, j; for (i = 0; i < size1; i++) { b[i] = 0; for (j = 0;j < size2; j++) { b[i] += a[i][j] * x[j]; } } #else matvec(size1,size2,a,b,x); #endif x[0] = x[0] + 0.000001; } endTime = clock_it(); execTime = endTime - startTime; printf("Execution time is %2.3f seconds\n", execTime); printf("GigaFlops = %f\n", (((double)REPEATNTIMES * (double)COL * (double)ROW * 2.0) / (double)(execTime))/1000000000.0); printsum(COL,b); return 0; }
NUMERICS_EXPORT void vecmat(double *x, double **a, int nra, int nca, double *b) { double** t = dmatrix(0, nca - 1, 0, nra - 1); transpose(a, nra, nca, t); matvec(t, nca, nra, x, b); free_dmatrix(t, 0, nca - 1, 0); }
double *cgsolve(int k) { int i, first_i, last_i; int n = k * k; int maxiters = 1000 > 5*k ? 1000 : k; // partition data if (n % size) { first_i = (n / size + 1) * rank; last_i = (rank != size-1 ? first_i+n/size+1 : n); } else { first_i = n / size * rank; last_i = n / size * (rank + 1); } double *b_vec = (double *)malloc(n * sizeof(double)); double *r_vec = (double *)malloc(n * sizeof(double)); double *d_vec = (double *)malloc(n * sizeof(double)); double *A_vec = (double *)malloc(n * sizeof(double)); double *x_vec = (double *)malloc(n * sizeof(double)); for (i=0; i<n; i++) { double tmp = cs240_getB(i, n); b_vec[i] = tmp; r_vec[i] = tmp; d_vec[i] = tmp; x_vec[i] = 0; } double normb = sqrt(ddot(b_vec+first_i, b_vec+first_i, last_i-first_i)); double rtr = ddot(r_vec+first_i, r_vec+first_i, last_i-first_i); double relres = 1; i = 0; while (relres > 1e-6 && i++ < maxiters) { /*while (i++ < 1) {*/ matvec(A_vec, d_vec, k); double alpha = rtr / ddot(d_vec+first_i, A_vec+first_i, last_i-first_i); daxpy(x_vec, d_vec, 1, alpha, n); daxpy(r_vec, A_vec, 1, -1*alpha, n); double rtrold = rtr; rtr = ddot(r_vec+first_i, r_vec+first_i, last_i-first_i); double beta = rtr / rtrold; daxpy(d_vec, r_vec, beta, 1, n); relres = sqrt(rtr) / normb; } return x_vec; }
void init_BICG_Z(Complex_Z *vl, int ldvl, Complex_Z *vr, int ldvr, int nvecs, Complex_Z *xinit, Complex_Z *b, int lde, int n, Complex_Z *H, int ldH, int *IPIV, Complex_Z *work, void (*matvec) (void *, void *, void *), void *params, int *info) { /* xinit = xinit + (vr inv(H) vl' res) */ int i,ONE=1; Complex_Z tempc,tpone={1.00000e+00,0.000000e+00}; char cN='N'; //compute res=b-Ax and store it in work[nvecs:nvecs+n-1] tempc = wrap_zdot(&n,xinit,&ONE,xinit,&ONE,params); if(tempc.r > 0.0) //if nonzero initial guess { matvec(xinit,&work[nvecs],params); for(i=0; i<n; i++) { work[nvecs+i].r = b[i].r - work[nvecs+i].r; work[nvecs+i].i = b[i].i - work[nvecs+i].i;} } else BLAS_ZCOPY(&n,b,&ONE,&work[nvecs],&ONE); for(i=0; i<nvecs; i++){ work[i] = wrap_zdot(&n, &vl[i*ldvl],&ONE,&work[nvecs],&ONE,params);} //solve using LU factorized H BLAS_ZGETRS(&cN,&nvecs,&ONE,H,&ldH,IPIV,work,&nvecs,info); printf("inside init_bicg\n"); if( (*info) != 0) { fprintf(stderr,"Error in BLAS_ZGESV inside init-BICG_Z. info %d\n",(*info)); exit(2); } BLAS_ZGEMV(&cN,&n,&nvecs,&tpone,vr,&ldvr,work,&ONE,&tpone,xinit,&ONE); return; }
int main (int argc, char * argv[]) { int nrows = 64; int ncols = 32; int i,j; int **A = (int **)malloc(nrows*sizeof(int*)); for(i = 0; i < nrows; i++) { A[i] = (int *) malloc(ncols*sizeof(int)); } printf("A = \n"); for (i = 0; i < nrows; i++ ) { for (j = 0; j < ncols; j++ ) { A[i][j] = i + j; printf("%2d ", A[i][j]); } printf("\n"); } int x[ncols]; printf("x = \n"); for (j = 0; j < ncols; j++) { x[j] = j; printf("%d\n",x[j]); } int result[nrows]; matvec(nrows, ncols, A, x, result); printf("A x = \n"); for (i = 0; i < nrows; i++) { printf("%d\n", result[i]); } for(i = 0; i < nrows; i++) { free(A[i]); } free(A); }
double DiscreteAsian(int model, //modello double spot, double strike, double rf, double dt, int ndates, double lowlim, double uplim, int npoints, //n. of quadrature points long nfft, //n. of points for the fft inversion double ModelParameters[], //the parameters of the model double price[], double solution[],double *delta) //OUTPUT: Contains the solution { int i, j, k; double inter_dens=0.; double upperdens,b; //parameters for spline interpolation int ier; double dfb,ddfb,arg; //vectors where to store the outputs of the FFT inversion double *inv, *logk; //abscissa and weights for Gaussian quadrature with npoints double *abscissa,*weights, *temp; double** c,** kernelmatrix; double optprice,optdelta; double gamma_price; inv=dvector(0, nfft-1); //contains the density logk=dvector(0, nfft-1); //contains the abscissa of the density c = dmatrix(0, nfft-1, 0, 2); kernelmatrix= dmatrix(0, npoints-1, 0, npoints-1); abscissa=dvector(1,npoints); weights=dvector(1,npoints); temp=dvector(0,npoints-1); //Generate abscissa and weights for quadrature gauleg(lowlim, uplim, abscissa, weights, npoints); b=MAX(fabs(lowlim - log(exp(uplim) + 1)),fabs(uplim - log(exp(lowlim) + 1))); b=b*1.1; upperdens=b; TableIFRT(1, model, rf, dt, nfft, b, 1.5, ModelParameters, inv, logk); //spline interpolation i=spline(logk, inv, nfft ,c); if(i>100) return i; //construct the kernel matrix for(i=1;i<=npoints;i++){ for(j=1;j<=npoints;j++){ // argument of the density arg=abscissa[i] - log(exp(abscissa[j]) + 1); if(arg>-upperdens) { if(arg<upperdens) { inter_dens=MAX(splevl(arg, nfft, logk, inv, c, &dfb, &ddfb, &ier),0.0); } } if(arg<-upperdens) inter_dens=0.0; if(arg>upperdens) inter_dens=0.0; //construct the kernel kernelmatrix[i-1][ j-1] = weights[j] * MAX(inter_dens,0.0); /*printf("%d %d %d %f\n",npoints,i-1,j-1,kernelmatrix[i-1][ j-1]);*/ } //construct the initial condition arg=abscissa[i]; if(arg>-upperdens) { if(arg<upperdens) { solution[i-1] = MAX(splevl(abscissa[i], nfft, logk, inv, c, &dfb, &ddfb, &ier),0); } } if(arg<-upperdens) solution[i-1]=0.0; if(arg>upperdens) solution[i-1]=0.0; } //iterations over the monitoring dates for(k = 1;k< ndates;k++){ //compute K*v_n matvec(kernelmatrix, npoints, npoints, solution, temp); //update v_n+1 for( i = 0;i<= npoints-1;i++){ solution[i]=temp[i]; } } optprice=0.0,optdelta=0.0; gamma_price=log(strike*((double)(ndates + 1))/spot-1.); for( i = 0;i<= npoints-1;i++){ //spot price price[i] = spot * exp(abscissa[i+1]); ///option price optprice=optprice+weights[i+1]*MAX(spot*(1 + exp(abscissa[i+1]))/(ndates + 1) - strike, 0)*solution[i]*exp(-rf*dt*ndates); if(abscissa[i+1]>gamma_price) optdelta=optdelta+weights[i+1]*MAX((1 + exp(abscissa[i+1])), 0)*solution[i]*exp(-rf*dt*ndates)/(ndates + 1); } *delta=optdelta; free_dvector(abscissa,1,npoints); free_dvector(weights,1,npoints); free_dvector(temp,0,npoints-1); free_dvector(inv,0,nfft-1); free_dvector(logk,0,nfft-1); free_dmatrix(c, 0, nfft-1, 0, 2); free_dmatrix(kernelmatrix, 0, npoints-1, 0, npoints-1); return optprice; }
void Enqueue::visit(StmtMatmul& s) { if (_failure) return; if (_printer) _printer->visit(s); // some compute devices are single precision only if ( ! OCLhacks::singleton().supportFP64(_memMgr.deviceNum()) && (PrecType::Double == s.precA() || PrecType::Double == s.precB() || PrecType::Double == s.precC()) ) { _failure = true; return; } if (s.isMATMUL()) { // check if device supports the Evergreen matrix multiply if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() )) { // do not allow modulo stream array subscripting for K dimension const size_t KfromA = s.isTransposeA() ? s.heightA() : s.widthA(); const size_t KfromB = s.isTransposeB() ? s.widthB() : s.heightB(); if (KfromA != KfromB) { _failure = true; return; } // ATI Evergreen matrix multiply Evergreen::MatmulMM matmul( _memMgr.deviceNum() ); // exogenous parameters matmul.setBatching(_vt.numTraces()); if (s.isSameDataA()) matmul.setSameDataMatrixA(); if (s.isSameDataB()) matmul.setSameDataMatrixB(); matmul.setGeneral(s.isGEMM()); matmul.setPrecision(s.precA(), s.precB(), s.precC()); matmul.setDimensions(s.heightC(), // M s.widthC(), // N KfromA); // K matmul.setDataLayout(s.isTransposeA(), s.isTransposeB()); const vector< size_t > params = _jitMemo.autotuneLookup(matmul); if (params.empty()) { _failure = true; return; } matmul.setParams(params); ArrayBuf* A = s.astvarA()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt) : _memMgr.arrayBuf(s.astvarA(), _vt); ArrayBuf* B = s.astvarB()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt) : _memMgr.arrayBuf(s.astvarB(), _vt); ArrayBuf* C = s.astvarC()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt) : _memMgr.arrayBuf(s.astvarC(), _vt); const double alpha = s.alpha(); const double beta = s.beta(); const string kernelName = matmul.kernelName(); stringstream ss; ss << matmul; const string kernelSource = ss.str(); OCLkernel ckernel( *_memMgr.computeDevice() ); ckernel.buildJIT(kernelName, kernelSource); if (! ckernel.isOk()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "compile error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } if (! matmul.setArgs(ckernel, A, B, C, alpha, beta)) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set arguments error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } const vector< size_t > globalDims = matmul.globalWorkItems(); const vector< size_t > localDims = matmul.localWorkItems(); for (size_t i = 0; i < globalDims.size(); i++) { ckernel << OCLWorkIndex(globalDims[i], localDims[i]); if (! ckernel.statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set index space dimension error: " << i; LOGGER(ss.str()) #endif _failure = true; return; } } *_memMgr.computeDevice() << ckernel; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "enqueue error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } *_memMgr.computeDevice() << FLUSH; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "wait error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } matmul.clearArgs(); } else { // there are no other GPU matrix multiply implementations except // "Evergreen" (which works on both ATI and NVIDIA), so fail _failure = true; } } else if (s.isMATVEC()) { // check if device supports the Evergreen matrix multiply if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() )) { // do not allow modulo stream array subscripting for N dimension const size_t NfromA = s.isTransposeA() ? s.heightA() : s.widthA(); const size_t NfromB = s.widthB(); if (NfromA != NfromB) { _failure = true; return; } // ATI Evergreen matrix-vector multiply Evergreen::MatmulMV matvec( _memMgr.deviceNum() ); // exogenous parameters matvec.setBatching(_vt.numTraces()); if (s.isSameDataA()) matvec.setSameDataMatrixA(); matvec.setGeneral(s.isGEMM()); matvec.setPrecision(s.precA(), s.precB(), s.precC()); matvec.setDimensions(s.widthC(), // M NfromA); // N matvec.setDataLayout(s.isTransposeA()); const vector< size_t > params = _jitMemo.autotuneLookup(matvec); if (params.empty()) { _failure = true; return; } matvec.setParams(params); ArrayBuf* A = s.astvarA()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt) : _memMgr.arrayBuf(s.astvarA(), _vt); ArrayBuf* B = s.astvarB()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt) : _memMgr.arrayBuf(s.astvarB(), _vt); ArrayBuf* C = s.astvarC()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt) : _memMgr.arrayBuf(s.astvarC(), _vt); const double alpha = s.alpha(); const double beta = s.beta(); const string kernelName = matvec.kernelName(); stringstream ss; ss << matvec; const string kernelSource = ss.str(); OCLkernel ckernel( *_memMgr.computeDevice() ); ckernel.buildJIT(kernelName, kernelSource); if (! ckernel.isOk()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "compile error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } if (! matvec.setArgs(ckernel, A, B, C, alpha, beta)) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set arguments error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } const vector< size_t > globalDims = matvec.globalWorkItems(); const vector< size_t > localDims = matvec.localWorkItems(); for (size_t i = 0; i < globalDims.size(); i++) { ckernel << OCLWorkIndex(globalDims[i], localDims[i]); if (! ckernel.statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set index space dimension error: " << i; LOGGER(ss.str()) #endif _failure = true; return; } } *_memMgr.computeDevice() << ckernel; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "enqueue error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } *_memMgr.computeDevice() << FLUSH; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "wait error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } matvec.clearArgs(); } else { // there are no other GPU matrix multiply implementations except // "Evergreen" (which works on both ATI and NVIDIA), so fail _failure = true; } } else if (s.isVECMAT()) { // check if device supports the Evergreen matrix multiply if (OCLhacks::singleton().supportEvergreen( _memMgr.deviceNum() )) { // do not allow modulo stream array subscripting for N dimension const size_t NfromA = s.widthA(); const size_t NfromB = s.isTransposeB() ? s.widthB() : s.heightB(); if (NfromA != NfromB) { _failure = true; return; } // ATI Evergreen matrix-vector multiply Evergreen::MatmulMV matvec( _memMgr.deviceNum() ); // exogenous parameters matvec.setBatching(_vt.numTraces()); if (s.isSameDataB()) matvec.setSameDataMatrixA(); matvec.setGeneral(s.isGEMM()); matvec.setPrecision(s.precB(), s.precA(), s.precC()); matvec.setDimensions(s.widthC(), // M NfromB); // N matvec.setDataLayout(! s.isTransposeB()); const vector< size_t > params = _jitMemo.autotuneLookup(matvec); if (params.empty()) { _failure = true; return; } matvec.setParams(params); ArrayBuf* A = s.astvarA()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarA()->variable(), _vt) : _memMgr.arrayBuf(s.astvarA(), _vt); ArrayBuf* B = s.astvarB()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarB()->variable(), _vt) : _memMgr.arrayBuf(s.astvarB(), _vt); ArrayBuf* C = s.astvarC()->isTraceVariable() ? _memMgr.arrayBuf(s.astvarC()->variable(), _vt) : _memMgr.arrayBuf(s.astvarC(), _vt); const double alpha = s.alpha(); const double beta = s.beta(); const string kernelName = matvec.kernelName(); stringstream ss; ss << matvec; const string kernelSource = ss.str(); OCLkernel ckernel( *_memMgr.computeDevice() ); ckernel.buildJIT(kernelName, kernelSource); if (! ckernel.isOk()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "compile error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } if (! matvec.setArgs(ckernel, B, A, C, alpha, beta)) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set arguments error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } const vector< size_t > globalDims = matvec.globalWorkItems(); const vector< size_t > localDims = matvec.localWorkItems(); for (size_t i = 0; i < globalDims.size(); i++) { ckernel << OCLWorkIndex(globalDims[i], localDims[i]); if (! ckernel.statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "set index space dimension error: " << i; LOGGER(ss.str()) #endif _failure = true; return; } } *_memMgr.computeDevice() << ckernel; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "enqueue error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } *_memMgr.computeDevice() << FLUSH; if (! _memMgr.computeDevice()->statusOp()) { #ifdef __LOGGING_ENABLED__ stringstream ss; ss << "wait error: " << kernelName; LOGGER(ss.str()) #endif _failure = true; return; } matvec.clearArgs(); } else { // there are no other GPU matrix multiply implementations except // "Evergreen" (which works on both ATI and NVIDIA), so fail _failure = true; } } }
/*************************************** * Conjugate Gradient * * This function will do the CG * * algorithm without preconditioning. * * For optimiziation you must not * * change the algorithm. * *************************************** r(0) = b - Ax(0) p(0) = r(0) rho(0) = <r(0),r(0)> *************************************** for k=0,1,2,...,n-1 q(k) = A * p(k) dot_pq = <p(k),q(k)> alpha = rho(k) / dot_pq x(k+1) = x(k) + alpha*p(k) r(k+1) = r(k) - alpha*q(k) check convergence ||r(k+1)||_2 < eps rho(k+1) = <r(k+1), r(k+1)> beta = rho(k+1) / rho(k) p(k+1) = r(k+1) + beta*p(k) ***************************************/ void cg(const int n, const int nnz, const int maxNNZ, const floatType* data, const int* indices, const int* length, const floatType* b, floatType* x, struct SolverConfig* sc){ floatType* r, *p, *q; floatType alpha, beta, rho, rho_old, dot_pq, bnrm2; int iter; double timeMatvec_s; double timeMatvec=0; int i; floatType temp; /* allocate memory */ r = (floatType*)malloc(n * sizeof(floatType)); p = (floatType*)malloc(n * sizeof(floatType)); q = (floatType*)malloc(n * sizeof(floatType)); #pragma acc data copyin(data[0:n*maxNNZ], indices[0:n*maxNNZ], length[0:n], n, nnz, maxNNZ, b[0:n]) copy(x[0:n]) create(alpha, beta, r[0:n], p[0:n], q[0:n], i, temp) //eigentlich auch copy(x[0:n]) aber error: not found on device??? { DBGMAT("Start matrix A = ", n, nnz, maxNNZ, data, indices, length) DBGVEC("b = ", b, n); DBGVEC("x = ", x, n); /* r(0) = b - Ax(0) */ timeMatvec_s = getWTime(); matvec(n, nnz, maxNNZ, data, indices, length, x, r); //hier inline ausprobieren /*int i, j, k; #pragma acc parallel loop present(data, indices, length, x) for (i = 0; i < n; i++) { r[i] = 0; for (j = 0; j < length[i]; j++) { k = j * n + i; r[i] += data[k] * x[indices[k]]; } }*/ timeMatvec += getWTime() - timeMatvec_s; xpay(b, -1.0, n, r); DBGVEC("r = b - Ax = ", r, n); /* Calculate initial residuum */ nrm2(r, n, &bnrm2); bnrm2 = 1.0 /bnrm2; /* p(0) = r(0) */ memcpy(p, r, n*sizeof(floatType)); DBGVEC("p = r = ", p, n); /* rho(0) = <r(0),r(0)> */ vectorDot(r, r, n, &rho); printf("rho_0=%e\n", rho); for(iter = 0; iter < sc->maxIter; iter++){ DBGMSG("=============== Iteration %d ======================\n", iter); /* q(k) = A * p(k) */ timeMatvec_s = getWTime(); matvec(n, nnz, maxNNZ, data, indices, length, p, q); timeMatvec += getWTime() - timeMatvec_s; DBGVEC("q = A * p= ", q, n); /* dot_pq = <p(k),q(k)> */ vectorDot(p, q, n, &dot_pq); DBGSCA("dot_pq = <p, q> = ", dot_pq); /* alpha = rho(k) / dot_pq */ alpha = rho / dot_pq; DBGSCA("alpha = rho / dot_pq = ", alpha); /* x(k+1) = x(k) + alpha*p(k) */ axpy(alpha, p, n, x); #pragma acc update host(x[0:n]) DBGVEC("x = x + alpha * p= ", x, n); /* r(k+1) = r(k) - alpha*q(k) */ axpy(-alpha, q, n, r); DBGVEC("r = r - alpha * q= ", r, n); rho_old = rho; DBGSCA("rho_old = rho = ", rho_old); /* rho(k+1) = <r(k+1), r(k+1)> */ vectorDot(r, r, n, &rho); DBGSCA("rho = <r, r> = ", rho); /* Normalize the residual with initial one */ sc->residual= sqrt(rho) * bnrm2; /* Check convergence ||r(k+1)||_2 < eps * If the residual is smaller than the CG * tolerance specified in the CG_TOLERANCE * environment variable our solution vector * is good enough and we can stop the * algorithm. */ printf("res_%d=%e\n", iter+1, sc->residual); if(sc->residual <= sc->tolerance) break; /* beta = rho(k+1) / rho(k) */ beta = rho / rho_old; DBGSCA("beta = rho / rho_old= ", beta); /* p(k+1) = r(k+1) + beta*p(k) */ xpay(r, beta, n, p); DBGVEC("p = r + beta * p> = ", p, n); } /* Store the number of iterations and the * time for the sparse matrix vector * product which is the most expensive * function in the whole CG algorithm. */ sc->iter = iter; sc->timeMatvec = timeMatvec; /* Clean up */ free(r); free(p); free(q); }//ende data region }
void LRD_BICGSTAB_Z(Complex_Z *vl, int ldvl, Complex_Z *vr, int ldvr, int nvecs, Complex_Z *x, Complex_Z *b, int lde, int n, Complex_Z *H, int ldH, int *IPIV, Complex_Z *work, void (*matvec) (void *, void *, void *), void *params, double AnormEst, int maxiter, double DefTol, double tol, int ConvTestOpt, FILE *outputFile, int *info) { Complex_Z *xincr, *resid, *tmpH; //used to solve the correction equation int i,j,k,numIts,iters_used,zs,is,ds,allelems,ONE=1; Complex_Z tzero={00.00e+00,00.00e+00}; double resNorm,bnorm,curTol,leftoverTol; Complex_Z tempc; double *reshist; //residual norm history int flag1,flag2; double stoptol_used,stoptol_cur,xnorm,rhsnorm; double MACHEPS=1.e-16; zs = sizeof(Complex_Z); is = sizeof(int); ds = sizeof(double); xincr = (Complex_Z *) calloc(lde,zs); if(xincr==NULL){ printf("Error in allocating xincr in LRD_BICGSTAB\n"); exit(1); } resid = (Complex_Z *) calloc(lde,zs); if(resid==NULL){ printf("Errro in allocating resid in LRD_BICGSTAB\n"); exit(1); } tmpH = (Complex_Z *) calloc(ldH*nvecs,zs); //temporary copy of H if(tmpH == NULL){ printf("Error in allocating tmpH\n"); exit(1); } reshist = (double *) calloc(maxiter,ds); if(reshist == NULL){ printf("Error allocating reshist\n"); exit(1); } //initial residual std::vector tempc = wrap_zdot(&n,x,&ONE,x,&ONE,params); xnorm = sqrt(tempc.r); if(tempc.r > 0 ) { matvec(x,resid,params); for(i=0; i<n; i++){ resid[i].r = b[i].r - resid[i].r; resid[i].i = b[i].i - resid[i].i;} } else BLAS_ZCOPY(&n,b,&ONE,resid,&ONE); tempc = wrap_zdot(&n,resid,&ONE,resid,&ONE,params); resNorm = sqrt(tempc.r); tempc = wrap_zdot(&n,b,&ONE,b,&ONE,params); bnorm = sqrt(tempc.r); numIts=0; //leftoverTol = DefTol; stoptol_used = tol*bnorm; if(ConvTestOpt==2) { stoptol_cur = MACHEPS*(AnormEst*xnorm + bnorm); if(stoptol_used < stoptol_cur) stoptol_used = stoptol_cur; } //Factorize the nvecs*nvecs part of H allelems = ldH*nvecs; BLAS_ZCOPY(&allelems,H,&ONE,tmpH,&ONE); BLAS_ZGETRF(&nvecs,&nvecs,tmpH,&ldH,IPIV,info); if(info==0) { printf("ERROR: factorization of tmpH\n"); exit(2); } if(DefTol < tol) curTol = tol; else curTol = DefTol; while( resNorm > stoptol_used){ //int jrest=0; //while( jrest< 2){ // fprintf(outputFile,"restart\n"); iters_used =0; init_BICG_Z(vl,ldvl,vr,ldvr,nvecs,xincr,resid,lde,n, tmpH,ldH,IPIV,work,matvec,params,info); if( (*info) != 0){ printf("Error in init_BICG\n"); exit(2); } fprintf(outputFile,"bnorm %g, curTol %g\n",bnorm,curTol); MyBICGSTAB_Z(n,lde,xincr,resid,curTol,maxiter,&iters_used,reshist, matvec,params,AnormEst, ConvTestOpt, work,&flag1); //if(flag1==0 || flag1=3) // { for(i=0; i<iters_used; i++){ fprintf(outputFile,"%-6d %-22.16g \n",numIts+i,reshist[i]);} numIts = numIts + iters_used; for(i=0; i<n; i++){ x[i].r = x[i].r + xincr[i].r; x[i].i = x[i].i + xincr[i].i; xincr[i]=tzero;} matvec(x,resid,params); for(i=0; i<n; i++){ resid[i].r = b[i].r - resid[i].r; resid[i].i = b[i].i - resid[i].i;} tempc = wrap_zdot(&n, resid, &ONE, resid, &ONE, params); resNorm = sqrt(tempc.r); // } // else fprintf(outputFile,"BICGSTAB returns with flag %d\n",flag1); if(ConvTestOpt==2) { tempc = wrap_zdot(&n,x,&ONE,x,&ONE,params); xnorm = sqrt(tempc.r); stoptol_cur = MACHEPS*(AnormEst*xnorm+bnorm); if(stoptol_used < stoptol_cur) stoptol_used = stoptol_cur; } leftoverTol = stoptol_used/resNorm; if(leftoverTol < DefTol) curTol=DefTol; else curTol=leftoverTol; fprintf(outputFile,"leftoverTol %g\n",leftoverTol); //jrest = jrest +1; } return; }
void IncrEigbicg_Z( int n, int lde,int nrhs, Complex_Z *X, Complex_Z *B, int *ncurEvals, int ldh, Complex_Z *evecsl, Complex_Z *evecsr, Complex_Z *evals, Complex_Z *H, void (*matvec) (void *, void *, void *), void (*mathvec)(void *, void *, void *), void *params, double *AnormEst, Complex_Z *work, Complex_Z *VL, int ldvl, Complex_Z *VR, int ldvr, Complex_Z *ework, int esize, double tol, double *restartTol, int maxit, char SRT_OPT, double epsi, int ConvTestOpt, int plvl, int nev, int v_max,FILE *outputFile) { /* Timing vars */ double wt1,wt2,ut1,ut2,st1,st2,wE,wI; /* Pointers */ Complex_Z tempc, tempc1, tempc2, *tmpH, *x, *resid, *b; double *rnorms, *reshist, normb, curTol,resNorm,leftTol; int i,j,k, *IPIV, ONE = 1; int zs, ds, tmpsize, is, phase, allelems ; int numIts, flag, flag2,nAdded, nev_used, iters_used, info; Complex_Z tpone = {+1.0e+00,+0.0e00}, tzero = {+0.0e+00,+0.0e00}; char cR = 'R'; char cL = 'L'; char cN ='N'; char cV = 'V'; char cU = 'U'; char cC ='C'; double *xrnorms,*xlnorms,*ernorms; Complex_Z *angles; /* ------------------------------------------------------------------- */ /* Work allocations */ /* ------------------------------------------------------------------- */ zs = sizeof(Complex_Z); ds = sizeof(double); is = sizeof(int); if( (IPIV = (int *) calloc(ldh,is)) == NULL) { fprintf(stderr,"ERROR IncrEigbicg could not allocate IPIV\n"); exit(1);} if( (x = (Complex_Z *) calloc(lde,zs)) == NULL) { fprintf(stderr,"ERROR IncrEigbicg could not allocate x\n"); exit(1);} if( (resid = (Complex_Z *) calloc(lde,zs)) == NULL) { fprintf(stderr,"ERROR IncrEigbicg could not allocate resid\n"); exit(1);} if( (tmpH = (Complex_Z *) calloc(ldh*ldh,zs)) == NULL) { fprintf(stderr,"ERROR IncrEigbicg could not allocate tmpH\n"); exit(1);} if ((work = (Complex_Z *) calloc(6*lde, zs)) == NULL) {fprintf(stderr, "ERROR IncrEigbicg could not allocate work\n"); exit(1);} if ((ework = (Complex_Z *) calloc(esize, zs)) == NULL) {fprintf(stderr, "ERROR IncrEigbicg could not allocate ework\n"); exit(1);} if ((VL = (Complex_Z *) calloc(v_max*ldvl, zs)) == NULL) {fprintf(stderr, "ERROR IncrEigbicg could not allocate VL\n"); exit(1);} if ((VR = (Complex_Z *) calloc(v_max*ldvr, zs)) == NULL) {fprintf(stderr, "ERROR IncrEigbicg could not allocate VR\n"); exit(1);} if ( (rnorms = (double *) calloc(ldh, ds)) == NULL ) {fprintf(stderr, "ERROR IncrEigbicg could not allocate rnorms\n"); exit(1);} if ( (reshist = (double *) calloc(maxit, ds)) == NULL) {fprintf(stderr, "ERROR IncrEigbicg could not allocate reshist\n"); exit(1);} if( (xlnorms = (double *) calloc(ldh,ds)) == NULL){ fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate xlnorms\n"); exit(1);} if( (xrnorms = (double *) calloc(ldh,ds)) == NULL){ fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate xrnorms\n"); exit(1);} if( (ernorms = (double *) calloc(ldh,ds)) == NULL){ fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate ernorms\n"); exit(1);} if( (angles = (Complex_Z *) calloc(ldh,zs)) == NULL){ fprintf(stderr,"ERROR: IncrEigbicg couldn't allocate angles\n"); exit(1);} /* ------------------------------------------------------------------- */ /* end Work allocations */ /* ------------------------------------------------------------------- */ /* --------------------------------------------------------------------------------- */ /* Solving one by one the nrhs systems with incremental init-eigbicg or init-bicgstab */ /* ---------------------------------------------------------------------------------- */ for (j=0; j<nrhs; j++) { b = &B[j*lde]; tempc = wrap_zdot(&n,b,&ONE,b,&ONE,params); printf("bnorm=%g\n",sqrt(tempc.r)); normb = sqrt(tempc.r); numIts = 0; //choose eigbicg or bicgstab if(ldh-(*ncurEvals) >= nev ) phase=1; else phase=2; if (plvl) fprintf(outputFile, "\n\nSystem %d\n", j); wE = 0.0; wI = 0.0; /* Start accumulator timers */ if ( (*ncurEvals > 0) && (phase==1)) { /* --------------------------------------------------------- */ /* Perform init-BICG with evecsl and evecsr vectors */ /* xinit = xinit + evecsr*inv(H)*evecl'*(b-Ax0) */ /* --------------------------------------------------------- */ wt1 = primme_get_wtime(); /* copy H into tmpH otherwise it will be changed by CGEV */ allelems = ldh*(*ncurEvals); BLAS_ZCOPY(&allelems,H,&ONE,tmpH,&ONE); // LU factorization of tmpH BLAS_ZGETRF(ncurEvals,ncurEvals,tmpH,&ldh,IPIV,&info); for(i=0; i<n; i++) x[i]=tzero; matvec(&X[j*lde],resid,params); for(i=0; i<n; i++){ resid[i].r = b[i].r - resid[i].r; resid[i].i = b[i].i - resid[i].i;} init_BICG_Z(evecsl,ldvl,evecsr,ldvr,(*ncurEvals), x,resid,lde,n,tmpH,ldh,IPIV,work,matvec,params,&info); if(phase==1){ for(i=0; i<n; i++){ X[j*lde+i].r = X[j*lde+i].r + x[i].r; X[j*lde+i].i = X[j*lde+i].i + x[i].i;}} wt2 = primme_get_wtime(); wI = wI + wt2-wt1; } /* end of init-BICG with evecsl and evecsr vectors */ /* ------------------------------------------------------------ */ if(phase == 1){ /* ------------------------------------------------------------ */ /* Solve Ax = b with x initial guess using eigbicg and compute new nev eigenvectors */ /* ------------------------------------------------------------ */ wt1 = primme_get_wtime(); Zeigbicg(n, lde, &X[j*lde], b, &normb, tol, maxit, SRT_OPT,epsi,ConvTestOpt, &numIts, reshist, &flag, plvl, work, matvec, mathvec, params, AnormEst, nev, &evals[(*ncurEvals)], &rnorms[(*ncurEvals)], v_max, VR,ldvr,VL,ldvl,esize,ework); wt2 = primme_get_wtime(); wE = wE + wt2-wt1; /* ---------- */ /* Reporting */ /* ---------- */ tempc1 = wrap_zdot(&n,&X[j*lde],&ONE,&X[j*lde],&ONE,params); if (plvl) { fprintf(outputFile, "For this rhs:\n"); fprintf(outputFile, "Norm(solution) %-16.12E, AnormEst %-16.12E\n",sqrt(tempc1.r),(*AnormEst)); fprintf(outputFile, "Total initBICG Wallclock : %-f\n", wI); fprintf(outputFile, "Total eigbicg Wallclock : %-f\n", wE); fprintf(outputFile, "Iterations: %-d\n", numIts); fprintf(outputFile, "Actual Resid of LinSys : %e\n", reshist[numIts-1]); if (plvl > 1) for (i=0; i < nev; i++) fprintf(outputFile, "Eval[%d]: %-22.15E %-22.15E rnorm: %-22.15E\n", i+1, evals[*ncurEvals+i].r, evals[*ncurEvals+i].i, rnorms[*ncurEvals+i]); if (plvl >1) { fprintf(outputFile,"Residual norm\n"); for( i=0; i < numIts; i++) fprintf(outputFile,"%-d %-22.15E\n",i,reshist[i]); } if (flag != 0) { fprintf(outputFile, "Error: eigbicg returned with nonzero exit status\n"); return;} } /* ------------------------------------------------------------------- */ /* Update the evecsl, evecsr, and evecsl'*A*evecsr */ /* ------------------------------------------------------------------- */ wt1 = primme_get_wtime(); primme_get_time(&ut1,&st1); /* Append new Ritz pairs to evecs */ for (i=0; i<nev; i++){ BLAS_ZCOPY(&n, &VL[i*ldvl], &ONE, &evecsl[((*ncurEvals)+i)*ldvl], &ONE); BLAS_ZCOPY(&n, &VR[i*ldvr], &ONE, &evecsr[((*ncurEvals)+i)*ldvr], &ONE);} /* Bi-Orthogonalize the new Ritz vectors */ /* Use a simple biorthogonalization that uses all vectors */ nAdded = nev; //for the moment, we add all the vectors biortho_global_Z(evecsl,ldvl,evecsr,ldvr,n,(*ncurEvals)+1,(*ncurEvals)+nev,3,params); //check the biorthogonality of the vectors /* for(i=0; i<(*ncurEvals)+nev; i++) for(k=0; k<(*ncurEvals)+nev; k++) { tempc = wrap_zdot(&n,&evecsl[i*ldvl],&ONE,&evecsr[k*ldvr],&ONE,params); fprintf(outputFile,"evecsl[%d]'*evecsr[%d]=%g %g\n",i,k,tempc.r,tempc.i); } */ /* Augument H */ /* (1:ncurEvals+nAdded,ncurEvals+1:ncurEvals+nAdded) block */ for(k=(*ncurEvals); k<(*ncurEvals)+nAdded; k++) { matvec(&evecsr[k*ldvr],VR,params); for(i=0; i<(*ncurEvals)+nAdded; i++) { tempc=wrap_zdot(&n,&evecsl[i*ldvl],&ONE,VR,&ONE,params); H[i+k*ldh]=tempc; } } /* (ncurEvals+1:ncurEvals+nAdded,1:ncurEvals) block */ for(k=(*ncurEvals); k<(*ncurEvals)+nAdded; k++) { mathvec(&evecsl[k*ldvl],VL,params); for(i=0; i<(*ncurEvals); i++) { tempc = wrap_zdot(&n,&evecsr[i*ldvr],&ONE,VL,&ONE,params); H[k+i*ldh].r = tempc.r; H[k+i*ldh].i = -tempc.i; } } (*ncurEvals) = (*ncurEvals) + nAdded; /* Reporting */ wt2 = primme_get_wtime(); primme_get_time(&ut2,&st2); if (plvl) { fprintf(outputFile, "Update\n"); fprintf(outputFile, "Added %d vecs\n",nAdded); fprintf(outputFile, "U Wallclock : %-f\n", wt2-wt1); fprintf(outputFile, "U User Time : %f seconds\n", ut2-ut1); fprintf(outputFile, "U Syst Time : %f seconds\n", st2-st1);} } /* if phase==1 */ /****************************************************/ if(phase==2) //solve deflated bicgstab the correction equation { fprintf(outputFile,"\n\nDeflated bicgstab\n");; LRD_BICGSTAB_Z(evecsl, ldvl,evecsr,ldvr,(*ncurEvals),&X[j*lde],b, lde,n,H,ldh,IPIV,work,matvec,params,(*AnormEst),maxit,(*restartTol),tol,ConvTestOpt,outputFile,&info); } /*end of if(phase==2)*/ } /*for(j=0; j<nrhs; j++) */ // compute final evecs,etc. ComputeFinalEvecs_Z ( (*ncurEvals),n,evecsl,ldvl,evecsr,ldvr,H,ldh,SRT_OPT,epsi, evals, ernorms, xlnorms, xrnorms, angles, matvec,params,work,6*lde); fprintf(outputFile,"\n\n Final Evals\n"); for(i=0; i< (*ncurEvals); i++){ fprintf(outputFile,"EVAL %-16.12E %-16.12E, ERNORM %-16.12E\n", evals[i].r,evals[i].i,ernorms[i]); fprintf(outputFile,"XLNORM %-16.12E, XRNORM %-16.12E, ANGLE %-16.12E %-16.12E\n\n", xlnorms[i],xrnorms[i],angles[i].r,angles[i].i);} fprintf(outputFile,"===================================================\n"); return; }
int main(int argc, char* argv[]) { if (argc != 4) { printf("Usage: %s <nx> <ny> <nt>\n", argv[0]); exit(1); } const char* no_timing = getenv("NO_TIMING"); #if defined(_OPENACC) char* regcount_fname = getenv("OPENACC_PROFILING_FNAME"); if (regcount_fname) { char* regcount_lineno = getenv("OPENACC_PROFILING_LINENO"); int lineno = -1; if (regcount_lineno) lineno = atoi(regcount_lineno); //kernelgen_enable_openacc_regcount(regcount_fname, lineno); } #endif parse_arg(nx, argv[1]); parse_arg(ny, argv[2]); parse_arg(nt, argv[3]); real* A = (real*)memalign(MEMALIGN, nx * ny * sizeof(real)); real* x = (real*)memalign(MEMALIGN, nx * sizeof(real)); real* y = (real*)memalign(MEMALIGN, ny * sizeof(real)); if (!A || !x || !y) { printf("Error allocating memory for arrays: %p, %p, %p\n", A, x, y); exit(1); } real amean = 0.0f, xmean = 0.0f, ymean = 0.0f; for (int i = 0; i < nx * ny; i++) { A[i] = real_rand(); amean += A[i]; } for (int i = 0; i < nx; i++) { x[i] = real_rand(); xmean += x[i]; } for (int i = 0; i < ny; i++) { y[i] = real_rand(); ymean += y[i]; } if (!no_timing) printf("initial mean = %f\n", amean / (nx * ny) + xmean / nx + ymean / ny); // // MIC or OPENACC: // // 1) Perform an empty offload, that should strip // the initialization time from further offloads. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec init_s, init_f; #if defined(_MIC) get_time(&init_s); #pragma offload target(mic) \ nocopy(A:length(nx * ny) alloc_if(0) free_if(0)), \ nocopy(x:length(nx) alloc_if(0) free_if(0)), \ nocopy(y:length(ny) alloc_if(0) free_if(0)) { } get_time(&init_f); #endif #if defined(_OPENACC) get_time(&init_s); acc_init(acc_device_default); get_time(&init_f); #endif double init_t = get_time_diff((struct timespec*)&init_s, (struct timespec*)&init_f); if (!no_timing) printf("init time = %f sec\n", init_t); #endif volatile struct timespec total_s, total_f; get_time(&total_s); // // MIC or OPENACC: // // 2) Allocate data on device, but do not copy anything. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec alloc_s, alloc_f; #if defined(_MIC) get_time(&alloc_s); #pragma offload target(mic) \ nocopy(A:length(nx * ny) alloc_if(1) free_if(0)), \ nocopy(x:length(nx) alloc_if(1) free_if(0)), \ nocopy(y:length(ny) alloc_if(1) free_if(0)) { } get_time(&alloc_f); #endif #if defined(_OPENACC) get_time(&alloc_s); #pragma acc data create (A[0:nx*ny], x[0:nx], y[0:ny]) { get_time(&alloc_f); #endif double alloc_t = get_time_diff((struct timespec*)&alloc_s, (struct timespec*)&alloc_f); if (!no_timing) printf("device buffer alloc time = %f sec\n", alloc_t); #endif // // MIC or OPENACC: // // 3) Transfer data from host to device and leave it there, // i.e. do not allocate deivce memory buffers. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec load_s, load_f; #if defined(_MIC) get_time(&load_s); #pragma offload target(mic) \ in(A:length(nx * ny) alloc_if(0) free_if(0)), \ in(x:length(nx) alloc_if(0) free_if(0)), \ in(y:length(ny) alloc_if(0) free_if(0)) { } get_time(&load_f); #endif #if defined(_OPENACC) get_time(&load_s); #pragma acc update device(A[0:nx*ny], x[0:nx], y[0:ny]) get_time(&load_f); #endif double load_t = get_time_diff((struct timespec*)&load_s, (struct timespec*)&load_f); if (!no_timing) printf("data load time = %f sec (%f GB/sec)\n", load_t, ((nx * ny + nx + ny) * sizeof(real)) / (load_t * 1024 * 1024 * 1024)); #endif // // 4) Perform data processing iterations, keeping all data // on device. // volatile struct timespec compute_s, compute_f; get_time(&compute_s); #if defined(_MIC) #pragma offload target(mic) \ nocopy(A:length(nx * ny) alloc_if(0) free_if(0)), \ nocopy(x:length(nx) alloc_if(0) free_if(0)), \ nocopy(y:length(ny) alloc_if(0) free_if(0)) #endif { for (int it = 0; it < nt; it++) matvec(nx, ny, A, x, y); } get_time(&compute_f); double compute_t = get_time_diff((struct timespec*)&compute_s, (struct timespec*)&compute_f); if (!no_timing) printf("compute time = %f sec\n", compute_t); // // MIC or OPENACC: // // 5) Transfer output data back from device to host. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec save_s, save_f; #if defined(_MIC) get_time(&save_s); #pragma offload target(mic) \ out(y:length(ny) alloc_if(0) free_if(0)) { } get_time(&save_f); #endif #if defined(_OPENACC) get_time(&save_s); #pragma acc update host (y[0:ny]) get_time(&save_f); #endif double save_t = get_time_diff((struct timespec*)&save_s, (struct timespec*)&save_f); if (!no_timing) printf("data save time = %f sec (%f GB/sec)\n", save_t, (ny * sizeof(real)) / (save_t * 1024 * 1024 * 1024)); #endif // // MIC or OPENACC: // // 6) Deallocate device data buffers. // OPENACC does not seem to have explicit deallocation. // #if defined(_OPENACC) } #endif #if defined(_MIC) volatile struct timespec free_s, free_f; get_time(&free_s); #pragma offload target(mic) \ nocopy(A:length(nx * ny) alloc_if(0) free_if(1)), \ nocopy(x:length(nx) alloc_if(0) free_if(1)), \ nocopy(y:length(ny) alloc_if(0) free_if(1)) { } get_time(&free_f); double free_t = get_time_diff((timepsec&)free_s, (timepsec&)free_f); // if (!no_timing) printf("device buffer free time = %f sec\n", free_t); #endif get_time(&total_f); if (!no_timing) printf("device buffer free time = %f sec\n", get_time_diff((struct timespec*)&total_s, (struct timespec*)&total_f)); ymean = 0.0f; for (int i = 0; i < ny; i++) ymean += y[i]; printf("final mean = %f\n", ymean / ny); free(A); free(x); free(y); fflush(stdout); return 0; }
void efsirk(real_t *x, real_t xe, int m, real_t y[], real_t *delta, void (*derivative)(int, real_t[], real_t *), void (*jacobian)(int, real_t **, real_t [], real_t *), real_t **j, int *n, real_t aeta, real_t reta, real_t hmin, real_t hmax, int linear, void (*output)(real_t, real_t, int, real_t [], real_t, real_t **, int)) { int *allocate_integer_vector(int, int); real_t *allocate_real_vector(int, int); real_t **allocate_real_matrix(int, int, int, int); void free_integer_vector(int *, int); void free_real_vector(real_t *, int); void free_real_matrix(real_t **, int, int, int); real_t vecvec(int, int, int, real_t [], real_t []); real_t matmat(int, int, int, int, real_t **, real_t **); real_t matvec(int, int, int, real_t **, real_t []); void gsselm(real_t **, int, real_t [], int [], int []); void solelm(real_t **, int, int [], int [], real_t []); int k,l,lin,*ri,*ci; real_t step,h,mu0,mu1,mu2,theta0,theta1,nu1,nu2,nu3,yk,fk,c1,c2, d,*f,*k0,*labda,**j1,aux[8],discr,eta,s,z1,z2,e,alpha1,a,b; ri=allocate_integer_vector(1,m); ci=allocate_integer_vector(1,m); f=allocate_real_vector(1,m); k0=allocate_real_vector(1,m); labda=allocate_real_vector(1,m); j1=allocate_real_matrix(1,m,1,m); aux[2]=FLT_EPSILON; aux[4]=8.0; for (k=1; k<=m; k++) f[k]=y[k]; *n = 0; (*output)(*x,xe,m,y,*delta,j,*n); step=0.0; do { (*n)++; /* difference scheme */ (*derivative)(m,f,delta); /* step size */ if (linear) s=h=hmax; else if (*n == 1 || hmin == hmax) s=h=hmin; else { eta=aeta+reta*sqrt(vecvec(1,m,0,y,y)); c1=nu3*step; for (k=1; k<=m; k++) labda[k] += c1*f[k]-y[k]; discr=sqrt(vecvec(1,m,0,labda,labda)); s=h=(eta/(0.75*(eta+discr))+0.33)*h; if (h < hmin) s=h=hmin; else if (h > hmax) s=h=hmax; } if ((*x)+s > xe) s=xe-(*x); lin=((step == s) && linear); step=s; if (!linear || *n == 1) (*jacobian)(m,j,y,delta); if (!lin) { /* coefficient */ z1=step*(*delta); if (*n == 1) z2=z1+z1; if (fabs(z2-z1) > 1.0e-6*fabs(z1) || z2 > -1.0) { a=z1*z1+12.0; b=6.0*z1; if (fabs(z1) < 0.1) alpha1=(z1*z1/140.0-1.0)*z1/30.0; else if (z1 < 1.0e-14) alpha1=1.0/3.0; else if (z1 < -33.0) alpha1=(a+b)/(3.0*z1*(2.0+z1)); else { e=((z1 < 230.0) ? exp(z1) : FLT_MAX); alpha1=((a-b)*e-a-b)/(((2.0-z1)*e-2.0-z1)*3.0*z1); } mu2=(1.0/3.0+alpha1)*0.25; mu1 = -(1.0+alpha1)*0.5; mu0=(6.0*mu1+2.0)/9.0; theta0=0.25; theta1=0.75; a=3.0*alpha1; nu3=(1.0+a)/(5.0-a)*0.5; a=nu3+nu3; nu1=0.5-a; nu2=(1.0+a)*0.75; z2=z1; } c1=step*mu1; d=step*step*mu2; for (k=1; k<=m; k++) { for (l=1; l<=m; l++) j1[k][l]=d*matmat(1,m,k,l,j,j)+c1*j[k][l]; j1[k][k] += 1.0; } gsselm(j1,m,aux,ri,ci); } c1=step*step*mu0; d=step*2.0/3.0; for (k=1; k<=m; k++) { k0[k]=fk=f[k]; labda[k]=d*fk+c1*matvec(1,m,k,j,f); } solelm(j1,m,ri,ci,labda); for (k=1; k<=m; k++) f[k]=y[k]+labda[k]; (*derivative)(m,f,delta); c1=theta0*step; c2=theta1*step; d=nu1*step; for (k=1; k<=m; k++) { yk=y[k]; fk=f[k]; labda[k]=yk+d*fk+nu2*labda[k]; y[k]=f[k]=yk+c1*k0[k]+c2*fk; } (*x) += step; (*output)(*x,xe,m,y,*delta,j,*n); } while (*x < xe); free_integer_vector(ri,1); free_integer_vector(ci,1); free_real_vector(f,1); free_real_vector(k0,1); free_real_vector(labda,1); free_real_matrix(j1,1,m,1); }
/* PCG - Conjugate Gradients Algorithm */ void pcg(int n, double *x, double *b, double tol, int maxit, int clvl, int *iter, double *relres, int *flag, double *work, void (*matvec)(double *, double *), void (*precon)(double *, double *)) { double ALPHA; /* used for passing parameters */ int ONE = 1; /* to BLAS routines */ double n2b; /* norm of rhs vector */ double tolb; /* requested tolerance for residual */ double normr; /* residual norm */ double alpha, beta; double rho, rho1; double pq; double dmax, ddum; /* used to detect stagnation */ int stag; /* flag to indicate stagnation */ int it; /* current iteration number */ int i; /* index variable */ double *r, *z, *p, *q; /* pointers to vectors in PCG algorithm */ /* setup pointers into work */ r = work; z = work + n; p = work + 2*n; q = work + 3*n; /* Check for all zero right hand side vector => all zero solution */ n2b = F77(dnrm2)(&n, b, &ONE);/* Norm of rhs vector, b */ if (n2b == 0.0) { /* if rhs vector is all zeros */ for (i = 0; i < n; i ++) /* then solution is all zeros */ x[i] = 0.0; *flag = 0; /* a valid solution has been obtained */ *relres = 0.0; /* the relative residual is actually 0/0 */ *iter = 0; /* no iterations need be performed */ if (clvl) itermsg(tol,maxit,*flag,*iter,*relres); return; } /* Set up for the method */ *flag = 1; tolb = tol * n2b; /* Relative tolerance */ matvec(x, r); /* Zero-th residual: r = b - A * x*/ for (i = 0; i < n; i ++) /* then solution is all zeros */ r[i] = b[i] - r[i]; normr = F77(dnrm2)(&n, r, &ONE); /* Norm of residual */ if (normr <= tolb) { /* Initial guess is a good enough solution */ *flag = 0; *relres = normr / n2b; *iter = 0; if (clvl) itermsg(tol,maxit,*flag,*iter,*relres); return; } rho = 1.0; stag = 0; /* stagnation of the method */ /* loop over maxit iterations (unless convergence or failure) */ for (it = 1; it <= maxit; it ++) { if (precon) { precon(r, z); /* if isinf(norm(y,inf)) flag = 2; break end */ } else { F77(dcopy)(&n, r, &ONE, z, &ONE); } rho1 = rho; rho = F77(ddot)(&n, r, &ONE, z, &ONE); if (rho == 0.0) { /* or isinf(rho) */ *flag = 4; break; } if (it == 1) { F77(dcopy)(&n, z, &ONE, p, &ONE); } else { beta = rho / rho1; if (beta == 0.0) { /* | isinf(beta) */ *flag = 4; break; } for (i = 0; i < n; i ++) /* p = z + beta * p; */ p[i] = z[i] + beta * p[i]; } matvec(p, q); /* q = A * p */ pq = F77(ddot)(&n, p, &ONE, q, &ONE); /* pq = p' * q */ if (pq == 0.0) { /* | isinf(pq) */ *flag = 4; break; } else { alpha = rho / pq; } /* if isinf(alpha) flag = 4; break end */ if (alpha == 0.0) /* stagnation of the method */ stag = 1; /* Check for stagnation of the method */ if (stag == 0) { dmax = 0.0; for (i = 0; i < n; i ++) if (x[i] != 0.0) { ddum = fabs(alpha * p[i]/x[i]); if (ddum > dmax) dmax = ddum; } else if (p[i] != 0.0) dmax = 1.0; stag = (1.0 + dmax == 1.0); } F77(daxpy)(&n, &alpha, p, &ONE, x, &ONE); /* form new iterate */ ALPHA = -alpha; F77(daxpy)(&n, &ALPHA, q, &ONE, r, &ONE); /* r = r - alpha * q */ /* check for convergence */ #ifdef EXPENSIVE_CRIT matvec(x, z); /* normr = norm(b - A * x) */ for (i = 0; i < n; i ++) z[i] = b[i] - z[i]; normr = F77(dnrm2)(&n, z, &ONE); #else normr = F77(dnrm2)(&n, r, &ONE); /* normr = norm(r) */ #endif if (normr <= tolb) { *flag = 0; break; } if (stag == 1) { *flag = 3; break; } } /* for it = 1 : maxit */ *iter = it; *relres = normr / n2b; if (clvl) itermsg(tol,maxit,*flag,*iter,*relres); }
int fgmr(int n, void (*matvec) (double, double[], double, double[]), void (*psolve) (int, double[], double[]), double *rhs, double *sol, double tol, int im, int *itmax, FILE * fits) { /*---------------------------------------------------------------------- | *** Preconditioned FGMRES *** +----------------------------------------------------------------------- | This is a simple version of the ARMS preconditioned FGMRES algorithm. +----------------------------------------------------------------------- | Y. S. Dec. 2000. -- Apr. 2008 +----------------------------------------------------------------------- | on entry: |---------- | | rhs = real vector of length n containing the right hand side. | sol = real vector of length n containing an initial guess to the | solution on input. | tol = tolerance for stopping iteration | im = Krylov subspace dimension | (itmax) = max number of iterations allowed. | fits = NULL: no output | != NULL: file handle to output " resid vs time and its" | | on return: |---------- | fgmr int = 0 --> successful return. | int = 1 --> convergence not achieved in itmax iterations. | sol = contains an approximate solution (upon successful return). | itmax = has changed. It now contains the number of steps required | to converge -- +----------------------------------------------------------------------- | internal work arrays: |---------- | vv = work array of length [im+1][n] (used to store the Arnoldi | basis) | hh = work array of length [im][im+1] (Householder matrix) | z = work array of length [im][n] to store preconditioned vectors +----------------------------------------------------------------------- | subroutines called : | matvec - matrix-vector multiplication operation | psolve - (right) preconditionning operation | psolve can be a NULL pointer (GMRES without preconditioner) +---------------------------------------------------------------------*/ int maxits = *itmax; int i, i1, ii, j, k, k1, its, retval, i_1 = 1; double **hh, *c, *s, *rs, t, t0; double beta, eps1 = 0.0, gam, **vv, **z; its = 0; vv = (double **)SUPERLU_MALLOC((im + 1) * sizeof(double *)); for (i = 0; i <= im; i++) vv[i] = doubleMalloc(n); z = (double **)SUPERLU_MALLOC(im * sizeof(double *)); hh = (double **)SUPERLU_MALLOC(im * sizeof(double *)); for (i = 0; i < im; i++) { hh[i] = doubleMalloc(i + 2); z[i] = doubleMalloc(n); } c = doubleMalloc(im); s = doubleMalloc(im); rs = doubleMalloc(im + 1); /*---- outer loop starts here ----*/ do { /*---- compute initial residual vector ----*/ matvec(1.0, sol, 0.0, vv[0]); for (j = 0; j < n; j++) vv[0][j] = rhs[j] - vv[0][j]; /* vv[0]= initial residual */ beta = dnrm2_(&n, vv[0], &i_1); /*---- print info if fits != null ----*/ if (fits != NULL && its == 0) fprintf(fits, "%8d %10.2e\n", its, beta); /*if ( beta < tol * dnrm2_(&n, rhs, &i_1) )*/ if ( !(beta >= tol * dnrm2_(&n, rhs, &i_1)) ) break; t = 1.0 / beta; /*---- normalize: vv[0] = vv[0] / beta ----*/ for (j = 0; j < n; j++) vv[0][j] = vv[0][j] * t; if (its == 0) eps1 = tol * beta; /*---- initialize 1-st term of rhs of hessenberg system ----*/ rs[0] = beta; for (i = 0; i < im; i++) { its++; i1 = i + 1; /*------------------------------------------------------------ | (Right) Preconditioning Operation z_{j} = M^{-1} v_{j} +-----------------------------------------------------------*/ if (psolve) psolve(n, z[i], vv[i]); else dcopy_(&n, vv[i], &i_1, z[i], &i_1); /*---- matvec operation w = A z_{j} = A M^{-1} v_{j} ----*/ matvec(1.0, z[i], 0.0, vv[i1]); /*------------------------------------------------------------ | modified gram - schmidt... | h_{i,j} = (w,v_{i}) | w = w - h_{i,j} v_{i} +------------------------------------------------------------*/ t0 = dnrm2_(&n, vv[i1], &i_1); for (j = 0; j <= i; j++) { double negt; t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1); hh[i][j] = t; negt = -t; daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1); } /*---- h_{j+1,j} = ||w||_{2} ----*/ t = dnrm2_(&n, vv[i1], &i_1); while (t < 0.5 * t0) { t0 = t; for (j = 0; j <= i; j++) { double negt; t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1); hh[i][j] += t; negt = -t; daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1); } t = dnrm2_(&n, vv[i1], &i_1); } hh[i][i1] = t; if (t != 0.0) { /*---- v_{j+1} = w / h_{j+1,j} ----*/ t = 1.0 / t; for (k = 0; k < n; k++) vv[i1][k] = vv[i1][k] * t; } /*--------------------------------------------------- | done with modified gram schimdt and arnoldi step | now update factorization of hh +--------------------------------------------------*/ /*-------------------------------------------------------- | perform previous transformations on i-th column of h +-------------------------------------------------------*/ for (k = 1; k <= i; k++) { k1 = k - 1; t = hh[i][k1]; hh[i][k1] = c[k1] * t + s[k1] * hh[i][k]; hh[i][k] = -s[k1] * t + c[k1] * hh[i][k]; } gam = sqrt(pow(hh[i][i], 2) + pow(hh[i][i1], 2)); /*--------------------------------------------------- | if gamma is zero then any small value will do | affect only residual estimate +--------------------------------------------------*/ /* if (gam == 0.0) gam = epsmac; */ /*---- get next plane rotation ---*/ if (gam > 0.0) { c[i] = hh[i][i] / gam; s[i] = hh[i][i1] / gam; } else { c[i] = 1.0; s[i] = 0.0; } rs[i1] = -s[i] * rs[i]; rs[i] = c[i] * rs[i]; /*---------------------------------------------------- | determine residual norm and test for convergence +---------------------------------------------------*/ hh[i][i] = c[i] * hh[i][i] + s[i] * hh[i][i1]; beta = fabs(rs[i1]); if (fits != NULL) fprintf(fits, "%8d %10.2e\n", its, beta); if (beta <= eps1 || its >= maxits) break; } if (i == im) i--; /*---- now compute solution. 1st, solve upper triangular system ----*/ rs[i] = rs[i] / hh[i][i]; for (ii = 1; ii <= i; ii++) { k = i - ii; k1 = k + 1; t = rs[k]; for (j = k1; j <= i; j++) t = t - hh[j][k] * rs[j]; rs[k] = t / hh[k][k]; } /*---- linear combination of v[i]'s to get sol. ----*/ for (j = 0; j <= i; j++) { t = rs[j]; for (k = 0; k < n; k++) sol[k] += t * z[j][k]; } /* calculate the residual and output */ matvec(1.0, sol, 0.0, vv[0]); for (j = 0; j < n; j++) vv[0][j] = rhs[j] - vv[0][j]; /* vv[0]= initial residual */ /*---- print info if fits != null ----*/ beta = dnrm2_(&n, vv[0], &i_1); /*---- restart outer loop if needed ----*/ /*if (beta >= eps1 / tol)*/ if ( !(beta < eps1 / tol) ) { its = maxits + 10; break; } if (beta <= eps1) break; } while(its < maxits); retval = (its >= maxits); for (i = 0; i <= im; i++) SUPERLU_FREE(vv[i]); SUPERLU_FREE(vv); for (i = 0; i < im; i++) { SUPERLU_FREE(hh[i]); SUPERLU_FREE(z[i]); } SUPERLU_FREE(hh); SUPERLU_FREE(z); SUPERLU_FREE(c); SUPERLU_FREE(s); SUPERLU_FREE(rs); *itmax = its; return retval; } /*----end of fgmr ----*/
int main() { vector x,b; vector r,p,Ap; matrix A; double one=1.0, zero=0.0; double normr, rtrans, oldtrans, p_ap_dot , alpha, beta; int iter=0; //create matrix allocate_3d_poission_matrix(A,N); printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]); allocate_vector(x,A.num_rows); allocate_vector(Ap,A.num_rows); allocate_vector(r,A.num_rows); allocate_vector(p,A.num_rows); allocate_vector(b,A.num_rows); initialize_vector(x,100000); initialize_vector(b,1); waxpby(one, x, zero, x, p); matvec(A,p,Ap); waxpby(one, b, -one, Ap, r); rtrans=dot(r,r); normr=sqrt(rtrans); double st = omp_get_wtime(); do { if(iter==0) { waxpby(one,r,zero,r,p); } else { oldtrans=rtrans; rtrans = dot(r,r); beta = rtrans/oldtrans; waxpby(one,r,beta,p,p); } normr=sqrt(rtrans); matvec(A,p,Ap); p_ap_dot = dot(Ap,p); alpha = rtrans/p_ap_dot; waxpby(one,x,alpha,p,x); waxpby(one,r,-alpha,Ap,r); if(iter%10==0) printf("Iteration: %d, Tolerance: %.4e\n", iter, normr); iter++; } while(iter<MAX_ITERS && normr>TOL); double et = omp_get_wtime(); printf("Total Iterations: %d\n", iter); printf("Total Time: %lf s\n", (et-st)); free_vector(x); free_vector(r); free_vector(p); free_vector(Ap); free_matrix(A); return 0; }
void power_method(void (*matvec)(void *, int, int, real*, real **, int, int*), void *A, int n, int K, int random_seed, int maxit, real tol, real **eigv, real **eigs){ /* find k-largest eigenvectors of a matrix A. Result in eigv. if eigv == NULL; memory will be allocated. maxium of maxit iterations will be done, and tol is the convergence criterion This converges only if the largest eigenvectors/values are real (e.g., if A is symmetric) and the next largest eigenvalues separate from the largest ones input: matvec: a function point that takes a matrix M and a vector u, produce v = M.u A: the matrix n: dimension of matrix A K: number of eigenes to find random_seed: seed for eigenvector initialization matrix: max number f iterations tol: accuracy control output: eigv: eigenvectors. The i-th is at eigvs[i*n, i*(n+1) - 1] eigs: eigenvalues. The i-th is at eigs[i] Function PowerIteration (A – m × m matrix ) % This function computes u1, u2, . . . , uk, the first k eigenvectors of S. const tol ← 0.001 for i = 1 to k do . ui ← random . ui ← ui/||ui|| . do . vi ← ui . % orthogonalize against previous eigenvectors . for j = 1 to i − 1 do . vi ← vi − (vi^Tvi)vj . end for . ui ← A vi/||A vi|| . while (ui^T vi < 1-tol) (halt when direction change is small) . vi = ui end for return v1,v2,... */ real **v, *u, *vv; int iter = 0; real res, unorm; int i, j, k; real uij; int flag; K = MAX(0, MIN(n, K)); assert(K <= n && K > 0); if (!(*eigv)) *eigv = MALLOC(sizeof(real)*n*K); if (!(*eigs)) *eigs = MALLOC(sizeof(real)*K); v = MALLOC(sizeof(real*)*K); vv = MALLOC(sizeof(real)*n); u = MALLOC(sizeof(real)*n); srand(random_seed); for (k = 0; k < K; k++){ //fprintf(stderr,"calculating eig k ==================== %d\n",k); v[k] = &((*eigv)[k*n]); for (i = 0; i < n; i++) u[i] = drand(); res = sqrt(vector_product(n, u, u)); if (res > 0) res = 1/res; for (i = 0; i < n; i++) { u[i] = u[i]*res; v[k][i] = u[i]; } /* fprintf(stderr,"inital vec="); for (i = 0; i < n; i++) fprintf(stderr,"%f,",u[i]);fprintf(stderr,"\n"); */ iter = 0; do { /* normalize against previous eigens */ for (j = 0; j < k; j++){ uij = vector_product(n, u, v[j]); for (i = 0; i < n; i++) { u[i] = u[i] - uij *v[j][i]; } } matvec(A, n, n, u, &vv, FALSE, &flag); assert(!flag); /* fprintf(stderr,"normalized aginst prev vec="); for (i = 0; i < n; i++) fprintf(stderr,"%f,",u[i]);fprintf(stderr,"\n"); */ unorm = vector_product(n, vv, vv);/* ||u||^2 */ unorm = sqrt(unorm); (*eigs)[k] = unorm; if (unorm > 0) { unorm = 1/unorm; } else { // ||A.v||=0, so v must be an eigenvec correspond to eigenvalue zero for (i = 0; i < n; i++) vv[i] = u[i]; unorm = sqrt(vector_product(n, vv, vv)); if (unorm > 0) unorm = 1/unorm; } res = 0.; for (i = 0; i < n; i++) { //res = MAX(res, ABS(vv[i]-(*eigs)[k]*u[i])); u[i] = vv[i]*unorm; res = res + u[i]*v[k][i]; v[k][i] = u[i]; } //fprintf(stderr,"res=%g, tol = %g, res < 1-tol=%d\n",res, tol,res < 1 - tol); } while (res < 1 - tol && iter++ < maxit); //} while (iter++ < maxit); //fprintf(stderr,"iter= %d, res=%f\n",iter, res); } FREE(u); FREE(vv); }
void peide(int n, int m, int nobs, int *nbp, real_t par[], real_t res[], int bp[], real_t **jtjinv, real_t in[], real_t out[], int (*deriv)(int,int,real_t [],real_t [],real_t,real_t []), int (*jacdfdy)(int,int,real_t [],real_t [],real_t,real_t **), int (*jacdfdp)(int,int,real_t [],real_t [],real_t,real_t **), void (*callystart)(int,int,real_t [],real_t [],real_t[]), void (*data)(int,real_t [],real_t [],int[]), void (*monitor)(int,int,int,real_t [],real_t [],int,int)) { int i,j,weight,ncol,nrow,away,max,nfe,nis,*cobs, first,sec,clean,nbpold,maxfe,fe,it,err,emergency; real_t eps1,res1,in3,in4,fac3,fac4,aux[4],*obs,*save,*tobs, **yp,*ymax,*y,**fy,**fp,w,**aid,temp, vv,ww,w2,mu,res2,fpar,fparpres,lambda,lambdamin,p,pw, reltolres,abstolres,em[8],*val,*b,*bb,*parpres,**jaco; static real_t save1[35]={1.0, 1.0, 9.0, 4.0, 0.0, 2.0/3.0, 1.0, 1.0/3.0, 36.0, 20.25, 1.0, 6.0/11.0, 1.0, 6.0/11.0, 1.0/11.0, 84.028, 53.778, 0.25, 0.48, 1.0, 0.7, 0.2, 0.02, 156.25, 108.51, 0.027778, 120.0/274.0, 1.0, 225.0/274.0, 85.0/274.0, 15.0/274.0, 1.0/274.0, 0.0, 187.69, 0.0047361}; nbpold=(*nbp); cobs=allocate_integer_vector(1,nobs); obs=allocate_real_vector(1,nobs); save=allocate_real_vector(-38,6*n); tobs=allocate_real_vector(0,nobs); ymax=allocate_real_vector(1,n); y=allocate_real_vector(1,6*n*(nbpold+m+1)); yp=allocate_real_matrix(1,nbpold+nobs,1,nbpold+m); fy=allocate_real_matrix(1,n,1,n); fp=allocate_real_matrix(1,n,1,m+nbpold); aid=allocate_real_matrix(1,m+nbpold,1,m+nbpold); for (i=0; i<=34; i++) save[-38+i]=save1[i]; (*data)(nobs,tobs,obs,cobs); weight=1; first=sec=0; clean=(*nbp > 0); aux[2]=FLT_EPSILON; eps1=1.0e10; out[1]=0.0; bp[0]=max=0; /* smooth integration without break-points */ if (!peidefunct(nobs,m,par,res, n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp, save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean,deriv, jacdfdy,jacdfdp,callystart,monitor)) goto Escape; res1=sqrt(vecvec(1,nobs,0,res,res)); nfe=1; if (in[5] == 1.0) { out[1]=1.0; goto Escape; } if (clean) { first=1; clean=0; fac3=sqrt(sqrt(in[3]/res1)); fac4=sqrt(sqrt(in[4]/res1)); eps1=res1*fac4; if (!peidefunct(nobs,m,par,res, n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp, save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean,deriv, jacdfdy,jacdfdp,callystart,monitor)) goto Escape; first=0; } else nfe=0; ncol=m+(*nbp); nrow=nobs+(*nbp); sec=1; in3=in[3]; in4=in[4]; in[3]=res1; weight=away=0; out[4]=out[5]=w=0.0; temp=sqrt(weight)+1.0; weight=temp*temp; while (weight != 16 && *nbp > 0) { if (away == 0 && w != 0.0) { /* if no break-points were omitted then one function function evaluation is saved */ w=weight/w; for (i=nobs+1; i<=nrow; i++) { for (j=1; j<=ncol; j++) yp[i][j] *= w; res[i] *= w; } sec=1; nfe--; } in[3] *= fac3*weight; in[4]=eps1; (*monitor)(2,ncol,nrow,par,res,weight,nis); /* marquardt's method */ val=allocate_real_vector(1,ncol); b=allocate_real_vector(1,ncol); bb=allocate_real_vector(1,ncol); parpres=allocate_real_vector(1,ncol); jaco=allocate_real_matrix(1,nrow,1,ncol); vv=10.0; w2=0.5; mu=0.01; ww = (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6]; em[0]=em[2]=em[6]=in[0]; em[4]=10*ncol; reltolres=in[3]; abstolres=in[4]*in[4]; maxfe=in[5]; err=0; fe=it=1; p=fpar=res2=0.0; pw = -log(ww*in[0])/2.30; if (!peidefunct(nrow,ncol,par,res, n,m,nobs,nbp,first,&sec,&max,&nis,eps1, weight,bp,save,ymax,y,yp,fy,fp,cobs,tobs,obs, in,aux,clean,deriv,jacdfdy,jacdfdp, callystart,monitor)) err=3; else { fpar=vecvec(1,nrow,0,res,res); out[3]=sqrt(fpar); emergency=0; it=1; do { dupmat(1,nrow,1,ncol,jaco,yp); i=qrisngvaldec(jaco,nrow,ncol,val,aid,em); if (it == 1) lambda=in[6]*vecvec(1,ncol,0,val,val); else if (p == 0.0) lambda *= w2; for (i=1; i<=ncol; i++) b[i]=val[i]*tamvec(1,nrow,i,jaco,res); while (1) { for (i=1; i<=ncol; i++) bb[i]=b[i]/(val[i]*val[i]+lambda); for (i=1; i<=ncol; i++) parpres[i]=par[i]-matvec(1,ncol,i,aid,bb); fe++; if (fe >= maxfe) err=1; else if (!peidefunct(nrow,ncol,parpres,res, n,m,nobs,nbp,first,&sec,&max,&nis, eps1,weight,bp,save,ymax,y,yp,fy,fp, cobs,tobs,obs,in,aux,clean,deriv, jacdfdy,jacdfdp,callystart,monitor)) err=2; if (err != 0) { emergency=1; break; } fparpres=vecvec(1,nrow,0,res,res); res2=fpar-fparpres; if (res2 < mu*vecvec(1,ncol,0,b,bb)) { p += 1.0; lambda *= vv; if (p == 1.0) { lambdamin=ww*vecvec(1,ncol,0,val,val); if (lambda < lambdamin) lambda=lambdamin; } if (p >= pw) { err=4; emergency=1; break; } } else { dupvec(1,ncol,0,par,parpres); fpar=fparpres; break; } } if (emergency) break; it++; } while (fpar>abstolres && res2>reltolres*fpar+abstolres); for (i=1; i<=ncol; i++) mulcol(1,ncol,i,i,jaco,aid,1.0/(val[i]+in[0])); for (i=1; i<=ncol; i++) for (j=1; j<=i; j++) aid[i][j]=aid[j][i]=mattam(1,ncol,i,j,jaco,jaco); lambda=lambdamin=val[1]; for (i=2; i<=ncol; i++) if (val[i] > lambda) lambda=val[i]; else if (val[i] < lambdamin) lambdamin=val[i]; temp=lambda/(lambdamin+in[0]); out[7]=temp*temp; out[2]=sqrt(fpar); out[6]=sqrt(res2+fpar)-out[2]; } out[4]=fe; out[5]=it-1; out[1]=err; free_real_vector(val,1); free_real_vector(b,1); free_real_vector(bb,1); free_real_vector(parpres,1); free_real_matrix(jaco,1,nrow,1); if (out[1] > 0.0) goto Escape; /* the relative starting value of lambda is adjusted to the last value of lambda used */ away=out[4]-out[5]-1.0; in[6] *= pow(5.0,away)*pow(2.0,away-out[5]); nfe += out[4]; w=weight; temp=sqrt(weight)+1.0; eps1=temp*temp*in[4]*fac4; away=0; /* omit useless break-points */ for (j=1; j<=(*nbp); j++) if (fabs(obs[bp[j]]+res[bp[j]]-par[j+m]) < eps1) { (*nbp)--; for (i=j; i<=(*nbp); i++) bp[i]=bp[i+1]; dupvec(j+m,(*nbp)+m,1,par,par); j--; away++; bp[*nbp+1]=0; } ncol -= away; nrow -= away; temp=sqrt(weight)+1.0; weight=temp*temp; } in[3]=in3; in[4]=in4; *nbp=0; weight=1; (*monitor)(2,m,nobs,par,res,weight,nis); /* marquardt's method */ val=allocate_real_vector(1,m); b=allocate_real_vector(1,m); bb=allocate_real_vector(1,m); parpres=allocate_real_vector(1,m); jaco=allocate_real_matrix(1,nobs,1,m); vv=10.0; w2=0.5; mu=0.01; ww = (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6]; em[0]=em[2]=em[6]=in[0]; em[4]=10*m; reltolres=in[3]; abstolres=in[4]*in[4]; maxfe=in[5]; err=0; fe=it=1; p=fpar=res2=0.0; pw = -log(ww*in[0])/2.30; if (!peidefunct(nobs,m,par,res, n,m,nobs,nbp,first,&sec,&max,&nis,eps1,weight,bp, save,ymax,y,yp,fy,fp,cobs,tobs,obs,in,aux,clean, deriv,jacdfdy,jacdfdp,callystart,monitor)) err=3; else { fpar=vecvec(1,nobs,0,res,res); out[3]=sqrt(fpar); emergency=0; it=1; do { dupmat(1,nobs,1,m,jaco,yp); i=qrisngvaldec(jaco,nobs,m,val,jtjinv,em); if (it == 1) lambda=in[6]*vecvec(1,m,0,val,val); else if (p == 0.0) lambda *= w2; for (i=1; i<=m; i++) b[i]=val[i]*tamvec(1,nobs,i,jaco,res); while (1) { for (i=1; i<=m; i++) bb[i]=b[i]/(val[i]*val[i]+lambda); for (i=1; i<=m; i++) parpres[i]=par[i]-matvec(1,m,i,jtjinv,bb); fe++; if (fe >= maxfe) err=1; else if (!peidefunct(nobs,m,parpres,res, n,m,nobs,nbp,first,&sec,&max,&nis,eps1, weight,bp,save,ymax,y,yp,fy,fp,cobs,tobs, obs,in,aux,clean,deriv,jacdfdy,jacdfdp, callystart,monitor)) err=2; if (err != 0) { emergency=1; break; } fparpres=vecvec(1,nobs,0,res,res); res2=fpar-fparpres; if (res2 < mu*vecvec(1,m,0,b,bb)) { p += 1.0; lambda *= vv; if (p == 1.0) { lambdamin=ww*vecvec(1,m,0,val,val); if (lambda < lambdamin) lambda=lambdamin; } if (p >= pw) { err=4; emergency=1; break; } } else { dupvec(1,m,0,par,parpres); fpar=fparpres; break; } } if (emergency) break; it++; } while (fpar>abstolres && res2>reltolres*fpar+abstolres); for (i=1; i<=m; i++) mulcol(1,m,i,i,jaco,jtjinv,1.0/(val[i]+in[0])); for (i=1; i<=m; i++) for (j=1; j<=i; j++) jtjinv[i][j]=jtjinv[j][i]=mattam(1,m,i,j,jaco,jaco); lambda=lambdamin=val[1]; for (i=2; i<=m; i++) if (val[i] > lambda) lambda=val[i]; else if (val[i] < lambdamin) lambdamin=val[i]; temp=lambda/(lambdamin+in[0]); out[7]=temp*temp; out[2]=sqrt(fpar); out[6]=sqrt(res2+fpar)-out[2]; } out[4]=fe; out[5]=it-1; out[1]=err; free_real_vector(val,1); free_real_vector(b,1); free_real_vector(bb,1); free_real_vector(parpres,1); free_real_matrix(jaco,1,nobs,1); nfe += out[4]; Escape: if (out[1] == 3.0) out[1]=2.0; else if (out[1] == 4.0) out[1]=6.0; if (save[-3] != 0.0) out[1]=save[-3]; out[3]=res1; out[4]=nfe; out[5]=max; free_integer_vector(cobs,1); free_real_vector(obs,1); free_real_vector(save,-38); free_real_vector(tobs,0); free_real_vector(ymax,1); free_real_vector(y,1); free_real_matrix(yp,1,nbpold+nobs,1); free_real_matrix(fy,1,n,1); free_real_matrix(fp,1,n,1); free_real_matrix(aid,1,m+nbpold,1); }
int peidefunct(int nrow, int ncol, real_t par[], real_t res[], int n, int m, int nobs, int *nbp, int first, int *sec, int *max, int *nis, real_t eps1, int weight, int bp[], real_t save[], real_t ymax[], real_t y[], real_t **yp, real_t **fy, real_t **fp, int cobs[], real_t tobs[], real_t obs[], real_t in[], real_t aux[], int clean, int (*deriv)(int,int,real_t [],real_t [],real_t,real_t []), int (*jacdfdy)(int,int,real_t [],real_t [],real_t,real_t **), int (*jacdfdp)(int,int,real_t [],real_t [],real_t,real_t **), void (*callystart)(int,int,real_t [],real_t [],real_t[]), void (*monitor)(int,int,int,real_t [],real_t [],int,int)) { /* this function is internally used by PEIDE */ void peidereset(int, int, real_t, real_t, real_t, real_t, real_t [], real_t [], real_t *, real_t *, real_t *, int *); void peideorder(int, int, real_t, real_t [], real_t [], real_t *, real_t *, real_t *, real_t *, real_t *, int *); void peidestep(int, int, int, real_t, real_t, real_t, real_t, real_t [], real_t [], real_t [], real_t [], int *, real_t *); real_t peideinterpol(int, int, int, real_t, real_t []); int l,k,knew,fails,same,kpold,n6,nnpar,j5n,cobsii,*p,evaluate, evaluated,decompose,conv,extra,npar,i,j,jj,ii; real_t xold,hold,a0,tolup,tol,toldwn,tolconv,h,ch,chnew,error, dfi,tobsdif,a[6],*delta,*lastdelta,*df,*y0,**jacob,xend, hmax,hmin,eps,s,aa,x,t,c; p=allocate_integer_vector(1,n); delta=allocate_real_vector(1,n); lastdelta=allocate_real_vector(1,n); df=allocate_real_vector(1,n); y0=allocate_real_vector(1,n); jacob=allocate_real_matrix(1,n,1,n); if (*sec) { *sec=0; goto Finish; } xend=tobs[nobs]; eps=in[2]; npar=m; extra=(*nis)=0; ii=1; jj = (*nbp == 0) ? 0 : 1; n6=n*6; inivec(-3,-1,save,0.0); inivec(n6+1,(6+m)*n,y,0.0); inimat(1,nobs+(*nbp),1,m+(*nbp),yp,0.0); t=tobs[1]; x=tobs[0]; (*callystart)(n,m,par,y,ymax); hmax=tobs[1]-tobs[0]; hmin=hmax*in[1]; /* evaluate jacobian */ evaluate=0; decompose=evaluated=1; if (!(*jacdfdy)(n,m,par,y,x,fy)) { save[-3]=4.0; goto Finish; } nnpar=n*npar; Newstart: k=1; kpold=0; same=2; peideorder(n,k,eps,a,save,&tol,&tolup,&toldwn,&tolconv, &a0,&decompose); if (!(*deriv)(n,m,par,y,x,df)) { save[-3]=3.0; goto Finish; } s=FLT_MIN; for (i=1; i<=n; i++) { aa=matvec(1,n,i,fy,df)/ymax[i]; s += aa*aa; } h=sqrt(2.0*eps/sqrt(s)); if (h > hmax) h=hmax; else if (h < hmin) h=hmin; xold=x; hold=h; ch=1.0; for (i=1; i<=n; i++) { save[i]=y[i]; save[n+i]=y[n+i]=df[i]*h; } fails=0; while (x < xend) { if (x+h <= xend) x += h; else { h=xend-x; x=xend; ch=h/hold; c=1.0; for (j=n; j<=k*n; j += n) { c *= ch; for (i=j+1; i<=j+n; i++) y[i] *= c; } same = (same < 3) ? 3 : same+1; } /* prediction */ for (l=1; l<=n; l++) { for (i=l; i<=(k-1)*n+l; i += n) for (j=(k-1)*n+l; j>=i; j -= n) y[j] += y[j+n]; delta[l]=0.0; } evaluated=0; /* correction and estimation local error */ for (l=1; l<=3; l++) { if (!(*deriv)(n,m,par,y,x,df)) { save[-3]=3; goto Finish; } for (i=1; i<=n; i++) df[i]=df[i]*h-y[n+i]; if (evaluate) { /* evaluate jacobian */ evaluate=0; decompose=evaluated=1; if (!(*jacdfdy)(n,m,par,y,x,fy)) { save[-3]=4.0; goto Finish; } } if (decompose) { /* decompose jacobian */ decompose=0; c = -a0*h; for (j=1; j<=n; j++) { for (i=1; i<=n; i++) jacob[i][j]=fy[i][j]*c; jacob[j][j] += 1.0; } dec(jacob,n,aux,p); } sol(jacob,n,p,df); conv=1; for (i=1; i<=n; i++) { dfi=df[i]; y[i] += a0*dfi; y[n+i] += dfi; delta[i] += dfi; conv=(conv && (fabs(dfi) < tolconv*ymax[i])); } if (conv) { s=FLT_MIN; for (i=1; i<=n; i++) { aa=delta[i]/ymax[i]; s += aa*aa; } error=s; break; } } /* acceptance or rejection */ if (!conv) { if (!evaluated) evaluate=1; else { ch /= 4.0; if (h < 4.0*hmin) { save[-1] += 10.0; hmin /= 10.0; if (save[-1] > 40.0) goto Finish; } } peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x, &h,&decompose); } else if (error > tol) { fails++; if (h > 1.1*hmin) { if (fails > 2) { peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x, &h,&decompose); goto Newstart; } else { /* calculate step and order */ peidestep(n,k,fails,tolup,toldwn,tol,error,delta, lastdelta,y,ymax,&knew,&chnew); if (knew != k) { k=knew; peideorder(n,k,eps,a,save,&tol,&tolup, &toldwn,&tolconv,&a0,&decompose); } ch *= chnew; peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x, &h,&decompose); } } else { if (k == 1) { /* violate eps criterion */ save[-2] += 1.0; same=4; goto Errortestok; } k=1; peidereset(n,k,hmin,hmax,hold,xold,y,save,&ch,&x, &h,&decompose); peideorder(n,k,eps,a,save,&tol,&tolup, &toldwn,&tolconv,&a0,&decompose); same=2; } } else { Errortestok: fails=0; for (i=1; i<=n; i++) { c=delta[i]; for (l=2; l<=k; l++) y[l*n+i] += a[l]*c; if (fabs(y[i]) > ymax[i]) ymax[i]=fabs(y[i]); } same--; if (same == 1) dupvec(1,n,0,lastdelta,delta); else if (same == 0) { /* calculate step and order */ peidestep(n,k,fails,tolup,toldwn,tol,error,delta, lastdelta,y,ymax,&knew,&chnew); if (chnew > 1.1) { if (k != knew) { if (knew > k) mulvec(knew*n+1,knew*n+n,-knew*n,y,delta, a[k]/knew); k=knew; peideorder(n,k,eps,a,save,&tol,&tolup, &toldwn,&tolconv,&a0,&decompose); } same=k+1; if (chnew*h > hmax) chnew=hmax/h; h *= chnew; c=1.0; for (j=n; j<=k*n; j += n) { c *= chnew; mulvec(j+1,j+n,0,y,y,c); } decompose=1; } else same=10; } (*nis)++; /* start of an integration step of yp */ if (clean) { hold=h; xold=x; kpold=k; ch=1.0; dupvec(1,k*n+n,0,save,y); } else { if (h != hold) { ch=h/hold; c=1.0; for (j=n6+nnpar; j<=kpold*nnpar+n6; j += nnpar) { c *= ch; for (i=j+1; i<=j+nnpar; i++) y[i] *= c; } hold=h; } if (k > kpold) inivec(n6+k*nnpar+1,n6+k*nnpar+nnpar,y,0.0); xold=x; kpold=k; ch=1.0; dupvec(1,k*n+n,0,save,y); /* evaluate jacobian */ evaluate=0; decompose=evaluated=1; if (!(*jacdfdy)(n,m,par,y,x,fy)) { save[-3]=4.0; goto Finish; } /* decompose jacobian */ decompose=0; c = -a0*h; for (j=1; j<=n; j++) { for (i=1; i<=n; i++) jacob[i][j]=fy[i][j]*c; jacob[j][j] += 1.0; } dec(jacob,n,aux,p); if (!(*jacdfdp)(n,m,par,y,x,fp)) { save[-3]=5.0; goto Finish; } if (npar > m) inimat(1,n,m+1,npar,fp,0.0); /* prediction */ for (l=0; l<=k-1; l++) for (j=k-1; j>=l; j--) elmvec(j*nnpar+n6+1,j*nnpar+n6+nnpar,nnpar, y,y,1.0); /* correction */ for (j=1; j<=npar; j++) { j5n=(j+5)*n; dupvec(1,n,j5n,y0,y); for (i=1; i<=n; i++) df[i]=h*(fp[i][j]+matvec(1,n,i,fy,y0))- y[nnpar+j5n+i]; sol(jacob,n,p,df); for (l=0; l<=k; l++) { i=l*nnpar+j5n; elmvec(i+1,i+n,-i,y,df,a[l]); } } } while (x >= t) { /* calculate a row of the jacobian matrix and an element of the residual vector */ tobsdif=(tobs[ii]-x)/h; cobsii=cobs[ii]; res[ii]=peideinterpol(cobsii,n,k,tobsdif,y)-obs[ii]; if (!clean) { for (i=1; i<=npar; i++) yp[ii][i]=peideinterpol(cobsii+(i+5)*n,nnpar,k, tobsdif,y); /* introducing break-points */ if (bp[jj] != ii) { } else if (first && fabs(res[ii]) < eps1) { (*nbp)--; for (i=jj; i<=(*nbp); i++) bp[i]=bp[i+1]; bp[*nbp+1]=0; } else { extra++; if (first) par[m+jj]=obs[ii]; /* introducing a jacobian row and a residual vector element for continuity requirements */ yp[nobs+jj][m+jj] = -weight; mulrow(1,npar,nobs+jj,ii,yp,yp,weight); res[nobs+jj]=weight*(res[ii]+obs[ii]-par[m+jj]); } } if (ii == nobs) goto Finish; else { t=tobs[ii+1]; if (bp[jj] == ii && jj < *nbp) jj++; hmax=t-tobs[ii]; hmin=hmax*in[1]; ii++; } } /* break-points introduce new initial values for y & yp */ if (extra > 0) { for (i=1; i<=n; i++) { y[i]=peideinterpol(i,n,k,tobsdif,y); for (j=1; j<=npar; j++) y[i+(j+5)*n]=peideinterpol(i+(j+5)*n,nnpar, k,tobsdif,y); } for (l=1; l<=extra; l++) { cobsii=cobs[bp[npar-m+l]]; y[cobsii]=par[npar+l]; for (i=1; i<=npar+extra; i++) y[cobsii+(5+i)*n]=0.0; inivec(1+nnpar+(l+5)*n,nnpar+(l+6)*n,y,0.0); y[cobsii+(5+npar+l)*n]=1.0; } npar += extra; extra=0; x=tobs[ii-1]; /* evaluate jacobian */ evaluate=0; decompose=evaluated=1; if (!(*jacdfdy)(n,m,par,y,x,fy)) { save[-3]=4.0; goto Finish; } nnpar=n*npar; goto Newstart; } } } Finish: if (save[-2] > *max) *max=save[-2]; if (!first) (*monitor)(1,ncol,nrow,par,res,weight,*nis); free_integer_vector(p,1); free_real_vector(delta,1); free_real_vector(lastdelta,1); free_real_vector(df,1); free_real_vector(y0,1); free_real_matrix(jacob,1,n,1); return (save[-1] <= 40.0 && save[-3] == 0.0); }
void semip_gc( //Output Arguments double *bdraw, // draws for beta (ndraw - nomit,k) matrix double *adraw, // draws for regional effects, a, (m,1) vector double *pdraw, // draws for rho (ndraw - nomit,1) vector double *sdraw, // draws for sige (ndraw - nomit,1) vector double *rdraw, // draws for rval (ndraw - nomit,1) vector (if mm != 0) double *vmean, // mean of vi draws (n,1) vector double *amean, // mean of a-draws (m,1) vector double *zmean, // mean of latent z-draws (n,1) vector double *yhat, // mean of posterior predicted y (n,1) vector //Input Arguments double *y, // (n,1) lhs vector with 0,1 values double *x, // (n,k) matrix of explanatory variables double *W, // (m,m) weight matrix int ndraw, // # of draws int nomit, // # of burn-in draws to omit int nsave, // # of draws saved (= ndraw - nomit) int n, // # of observations int k, // # of explanatory variables int m, // # of regions int *mobs, // (m,1) vector of obs numbers in each region double *a, // (m,1) vector of regional effects double nu, // prior parameter for sige double d0, // prior parameter for sige double rval, // hyperparameter r double mm, // prior parameter for rval double kk, // prior parameter for rval double *detval, // (ngrid,2) matrix with [rho , log det values] int ngrid, // # of values in detval (rows) double *TI, // prior var-cov for beta (inverted in matlab) double *TIc) // prior var-cov * prior mean { // Local Variables int i,j,l,iter,invt,accept,rcount,cobs,obsi,zflag; double rmin,rmax,sige,chi,ee,ratio,p,rho,rho_new; double junk,aBBa,vsqrt,ru,rhox,rhoy,phi,awi,aw2i,bi,di; double *z,*tvec,*e,*e0,*xb,*mn,*ys,*limit1,*limit2,*zmt; double *bhat,*b0,*b0tmp,*v,*vv,*b1,*b1tmp,*Bpa,*rdet,*ldet,*w1; double **xs,**xst,**xsxs,**A0,**A1,**Bpt,**Bp,**xmat,**W2; double *Wa, **Wmat; double c0, c1,c2; // Allocate Matrices and Vectors xs = dmatrix(0,n-1,0,k-1); xst = dmatrix(0,k-1,0,n-1); xsxs = dmatrix(0,k-1,0,k-1); A0 = dmatrix(0,k-1,0,k-1); A1 = dmatrix(0,m-1,0,m-1); Bp = dmatrix(0,m-1,0,m-1); Bpt = dmatrix(0,m-1,0,m-1); xmat = dmatrix(0,n-1,0,k-1); W2 = dmatrix(0,m-1,0,m-1); Wmat = dmatrix(0,m-1,0,m-1); z = dvector(0,n-1); tvec = dvector(0,n-1); e = dvector(0,n-1); e0 = dvector(0,n-1); xb = dvector(0,n-1); mn = dvector(0,n-1); ys = dvector(0,n-1); limit1 = dvector(0,n-1); limit2 = dvector(0,n-1); zmt = dvector(0,n-1); vv = dvector(0,n-1); bhat = dvector(0,k-1); b0 = dvector(0,k-1); b0tmp = dvector(0,k-1); v = dvector(0,m-1); b1 = dvector(0,m-1); b1tmp = dvector(0,m-1); Bpa = dvector(0,m-1); w1 = dvector(0,m-1); rdet = dvector(0,ngrid-1); ldet = dvector(0,ngrid-1); Wa = dvector(0,m-1); // Initializations junk = 0.0; // Placeholder for mean in meanvar() rho = 0.5; sige = 1.0; zflag = 0; // a flag for 0,1 y-values // Initialize (z,vv,limits,tvec) for(i=0; i<n; i++){ z[i] = y[i]; vv[i] = 1.0; tvec[i] = 1.0; if (y[i] == 0.0){ limit1[i] = -10000; limit2[i] = 0.0; zflag = 1; // we have 0,1 y-values so sample z }else{ limit1[i] = 0.0; limit2[i] = 10000; } } // Initialize v, b1tmp, Bp for(i=0; i<m; i++){ v[i] = 1.0; b1tmp[i] = 1.0; for(j=0; j<m; j++){ if (j == i){ Bp[i][j] = 1 - rho*W[i + j*m]; }else{ Bp[i][j] = - rho*W[i + j*m]; } Wmat[i][j] = W[i + j*m]; } } // Parse detval into rdet and ldet vectors // and define (rmin,rmax) for(i=0; i<ngrid; i++){ j=0; rdet[i] = detval[i + j*ngrid]; j=1; ldet[i] = detval[i + j*ngrid]; } rmin = rdet[0]; rmax = rdet[ngrid-1]; // Put x into xmat for(i=0; i<n; i++){ for(j=0; j<k; j++) xmat[i][j] = x[i + j*n]; } // Compute matrices to be used for updating a-vector if(m > 100){ // Used only for large m for(i=0; i<m; i++){ w1[i] = 0.0; // Form w1[i] for(j=0;j<m;j++){ w1[i] = w1[i] + (W[j + i*m]*W[j + i*m]); } for(j=0;j<m;j++){ // Form W2[i][*] W2[i][j] = 0.0; if(j != i){ for(l=0;l<m;l++){ W2[i][j] = W2[i][j] + W[l + i*m]*W[l + j*m]; } } // Note that W2[i][i] = 0.0 by construction } } } // End if(m > 10) // ====================================== // Start the Sampler // ====================================== for(iter=0; iter<ndraw; iter++){ // UPDATE: beta // (1) Apply stnd devs (vsqrt) to x, z, z - tvec for(i=0; i<n; i++){ vsqrt = sqrt(vv[i]); zmt[i] = (z[i] - tvec[i])/vsqrt; for(j=0; j<k; j++) xs[i][j] = x[i + j*n]/vsqrt; } // (2) Construct A0 matrix transpose(xs,n,k,xst); // form xs' matmat(xst,k,n,xs,k,xsxs); // form xs'*xs for(i=0; i<k; i++){ // form xs'*xs + TI for(j=0; j<k; j++) A0[i][j] = xsxs[i][j] + TI[i + j*k]; } invt = inverse(A0, k); // replace A0 = inv(A0) if (invt != 1) mexPrintf("semip_gc: Inversion error in beta conditional \n"); // (3) Construct b0 vector matvec(xst,k,n,zmt,b0tmp); //form xs'*zmt for(i=0; i<k; i++){ b0tmp[i] = b0tmp[i] + TIc[i]; //form b0tmp = xs'*zmt + TIc } matvec(A0,k,k,b0tmp,b0); //form b0 = A0*b0tmp // (4) Do multivariate normal draw from N(b0,A0) normal_rndc(A0, k, bhat); // generate N(0,A0) vector for(i=0; i<k; i++){ bhat[i] = bhat[i] + b0[i]; // add mean vector, b0 } // (5) Now update related values: matvec(xmat,n,k,bhat,xb); // form xb = xmat*bhat for(i=0; i<n; i++){ // form e0 = z - x*bhat e0[i] = z[i] - xb[i]; } cobs = 0; for(i=0; i<m; i++){ // form b1tmp = e0i/v[i] obsi = mobs[i]; b1tmp[i] = 0.0; for(j=cobs; j<cobs + obsi; j++){ b1tmp[i] = b1tmp[i] + (e0[j]/v[i]); } cobs = cobs + obsi; } // UPDATE: a if(m <= 100){ // For small m use straight inverse // (1) Define A1 and b1 transpose(Bp,m,m,Bpt); // form Bp' matmat(Bpt,m,m,Bp,m,A1); // form Bp'*Bp for(i=0; i<m; i++){ // form A1 = (1/sige)*Bp'Bp + diag(mobs/v) for(j=0; j<m; j++){ if (j == i){ A1[i][j] = (1/sige)*A1[i][j] + ((double)mobs[i])/v[i]; }else{ A1[i][j] = (1/sige)*A1[i][j]; } } } inverse(A1,m); // set A1 = inv(A1) matvec(A1,m,m,b1tmp,b1); // form b1 // (2) Do multivariate normal draw from N(b1,A1) normal_rndc(A1,m,a); // generate N(0,A1) vector for(i=0; i<m; i++){ a[i] = a[i] + b1[i]; // add mean vector, b1 } }else{ // For large m use marginal distributions cobs = 0; for(i=0;i<m;i++){ obsi = mobs[i]; phi = 0.0; for(j=cobs;j<cobs+obsi;j++){ phi = phi + ((z[j] - xb[j])/v[i]); // form phi } awi = 0.0; aw2i = 0.0; for(j=0;j<m;j++){ // form awi and aw2i aw2i = aw2i + W2[i][j]*a[j]; // (W2[i][i] = 0) if(j != i){ awi = awi + (W[i + j*m] + W[j + i*m])*a[j]; } } bi = phi + (rho/sige)*awi - ((rho*rho)/sige)*aw2i; di = (1/sige) + ((rho*rho)/sige)*w1[i] + (obsi/v[i]); a[i] = (bi/di) + sqrt(1/di)*snorm(); // Form a[i] cobs = cobs + obsi; } } // End update of a // Compute tvec = del*a cobs = 0; for(i=0; i<m; i++){ obsi = mobs[i]; for(j=cobs; j<cobs + obsi; j++){ tvec[j] = a[i]; } cobs = cobs + obsi; } //UPDATE: sige matvec(Bp,m,m,a,Bpa); //form Bp*a aBBa = 0.0; //form aBBa = a'*Bp'*Bp*a for(i=0; i<m; i++){ aBBa = aBBa + Bpa[i]*Bpa[i]; } chi = genchi(m + 2.0*nu); sige = (aBBa + 2.0*d0)/chi; //UPDATE: v (and vv = del*v) for(i=0; i<n; i++){ e[i] = e0[i] - tvec[i]; //form e = z - x*bhat - tvec } cobs = 0; for(i=0; i<m; i++){ obsi = mobs[i]; chi = genchi(rval + obsi); ee = 0.0; for(j=cobs; j<cobs + obsi; j++){ // form ee ee = ee + e[j]*e[j]; } v[i] = (ee + rval)/chi; // form v for(j=cobs; j<cobs + obsi; j++){ vv[j] = v[i]; // form vv } cobs = cobs + obsi; } //UPDATE: rval (if necessary) if (mm != 0.0){ rval = gengam(mm,kk); } //UPDATE: rho (using univariate integration) matvec(Wmat,m,m,a,Wa); // form Wa vector c0 = 0.0; // form a'*a c1 = 0.0; // form a'*Wa c2 = 0.0; // form (Wa)'*Wa for(i=0; i<m; i++){ c0 = c0 + a[i]*a[i]; c1 = c1 + a[i]*Wa[i]; c2 = c2 + Wa[i]*Wa[i]; } rho = draw_rho(rdet,ldet,c0,c1,c2,sige,ngrid,m,k,rho); // (4) Update Bp matrix using new rho for(i=0; i<m; i++){ for(j=0; j<m; j++){ if (j == i){ Bp[i][j] = 1 - rho*W[i + j*m]; }else{ Bp[i][j] = - rho*W[i + j*m]; } } } //UPDATE: z if (zflag == 1){ // skip this for continuous y-values // (1) Generate vector of means cobs = 0; for(i=0; i<m; i++){ obsi = mobs[i]; for(j=cobs; j<cobs + obsi; j++){ mn[j] = xb[j] + a[i]; } cobs = cobs + obsi; } // (2) Sample truncated normal for latent z values normal_truncc(n,mn,vv,limit1,limit2,z); // (3) Compute associated sample y vector: ys for(i=0; i<n; i++){ if(z[i]<0){ ys[i] = 0.0; }else{ ys[i] = 1.0; } } } // end of if zflag == 1 // =================== // Save sample draws // =================== if (iter > nomit-1){ pdraw[iter - nomit] = rho; //save rho-draws sdraw[iter - nomit] = sige; //save sige-draws if(mm != 0.0) rdraw[iter - nomit] = rval; //save r-draws (if necessary) for(j=0; j<k; j++) bdraw[(iter - nomit) + j*nsave] = bhat[j]; //save beta-draws for(i=0; i<m; i++){ vmean[i] = vmean[i] + v[i]/((double) (nsave)); //save mean v-draws adraw[(iter - nomit) + i*nsave] = a[i]; //save a-draws amean[i] = amean[i] + a[i]/((double) (nsave)); //save mean a-draws } for(i=0; i<n; i++){ zmean[i] = zmean[i] + z[i]/((double) (nsave)); //save mean z-draws yhat[i] = yhat[i] + mn[i]/((double) (nsave)); //save mean y-values } } } // End iteration loop // =============================== // END SAMPLER // =============================== // Free up allocated vectors free_dmatrix(xs,0,n-1,0); free_dmatrix(xst,0,k-1,0); free_dmatrix(xsxs,0,k-1,0); free_dmatrix(A0,0,k-1,0); free_dmatrix(A1,0,m-1,0); free_dmatrix(Bp,0,m-1,0); free_dmatrix(Bpt,0,m-1,0); free_dmatrix(W2,0,m-1,0); free_dmatrix(xmat,0,n-1,0); free_dmatrix(Wmat,0,m-1,0); free_dvector(z,0); free_dvector(tvec,0); free_dvector(e,0); free_dvector(e0,0); free_dvector(xb,0); free_dvector(mn,0); free_dvector(ys,0); free_dvector(limit1,0); free_dvector(limit2,0); free_dvector(zmt,0); free_dvector(vv,0); free_dvector(bhat,0); free_dvector(b0,0); free_dvector(b0tmp,0); free_dvector(v,0); free_dvector(b1,0); free_dvector(b1tmp,0); free_dvector(Bpa,0); free_dvector(rdet,0); free_dvector(ldet,0); free_dvector(w1,0); free_dvector(Wa,0); } // End of semip_gc
/*------------------------------------------------------------------------------- calculate the least squares solution of an overdetermined system of nonlinear equations with Marquardt's method -------------------------------------------------------------------------------*/ void Ti_Optimization::MarquardtforCylinderFitting( int m, int n, double**g_pnt, double* const par, double*& g, double**v, int (*funct)(int m, int n, double* const par, double* g,double**g_pnt), void (*jacobian)(int m, int n, double* const par, double*& g, double **jac,double**g_pnt), double in[], double out[] ) { int maxfe,fe,it,i,j,err,emergency; double vv,ww,w,mu,res,fpar,fparpres,lambda,lambdamin,p,pw,reltolres, abstolres,em[8],*val,*b,*bb,*parpres,**jac,temp; val = allocate_real_vector(1,n); b = allocate_real_vector(1,n); bb = allocate_real_vector(1,n); parpres = allocate_real_vector(1,n); jac = allocate_real_matrix(1,m,1,n); assert( (val != NULL) && (b != NULL) && (bb != NULL) && (parpres!= NULL)&& (jac != NULL) ); vv = 10.0; w = 0.5; mu = 0.01; ww = (in[6] < 1.0e-7) ? 1.0e-8 : 1.0e-1*in[6]; em[0] = em[2] = em[6] = in[0]; em[4] = 10*n; reltolres =in[3]; abstolres=in[4]*in[4]; maxfe=(int)in[5]; err=0; fe=it=1; p=fpar=res=0.0; pw = -log(ww*in[0])/2.30; if (!(*funct)(m,n,par,g,g_pnt)) { err=3; out[4]=fe; out[5]=it-1; out[1]=err; free_real_vector(val,1); free_real_vector(b,1); free_real_vector(bb,1); free_real_vector(parpres,1); free_real_matrix(jac,1,m,1); return; } fpar=vecvec(1,m,0,g,g);// norm of residual vector out[3]=sqrt(fpar); emergency=0; it=1; do { (*jacobian)(m,n,par,g,jac,g_pnt); i = qrisngvaldec(jac,m,n,val,v,em); if (it == 1) lambda = in[6]*vecvec(1,n,0,val,val); else if (p == 0.0) lambda *= w; for (i=1; i<=n; i++) b[i] = val[i]*tamvec(1,m,i,jac,g); while (1) { for (i=1; i<=n; i++) bb[i]=b[i]/(val[i]*val[i]+lambda); for (i=1; i<=n; i++) parpres[i]=par[i]-matvec(1,n,i,v,bb); //normalization ,this section only used for cylinder fitting, //when it is used in other situations,it should be removed temp = sqrt(parpres[4]*parpres[4]+parpres[5]*parpres[5]+parpres[6]*parpres[6]); parpres[4] /= temp; parpres[5] /= temp; parpres[6] /= temp; //end normalization fe++; if (fe >= maxfe) err=1; else if (!(*funct)(m,n,parpres,g,g_pnt)) err=2; if (err != 0) { emergency = 1; break; } fparpres=vecvec(1,m,0,g,g); res=fpar-fparpres; if (res < mu*vecvec(1,n,0,b,bb)) { p += 1.0; lambda *= vv; if (p == 1.0) { lambdamin=ww*vecvec(1,n,0,val,val); if (lambda < lambdamin) lambda=lambdamin; } if (p >= pw) { err=4; emergency=1; break; } } // end if else { dupvec(1,n,0,par,parpres); fpar=fparpres; break; } // end else } // end while if (emergency) break; it++; } while ( (fpar > abstolres) && (res > reltolres*fpar+abstolres) ); for (i=1; i<=n; i++) mulcol(1,n,i,i,jac,v,1.0/(val[i]+in[0])); for (i=1; i<=n; i++) { for (j=1; j<=i; j++) v[i][j]=v[j][i]=mattam(1,n,i,j,jac,jac); lambda=lambdamin=val[1]; } for (i=2; i<=n; i++) { if (val[i] > lambda) lambda=val[i]; else { if (val[i] < lambdamin) lambdamin=val[i]; } } temp=lambda/(lambdamin+in[0]); out[7]=temp*temp; out[2]=sqrt(fpar); out[6]=sqrt(res+fpar)-out[2]; out[4]=fe; out[5]=it-1; out[1]=err; if(val != NULL) { free_real_vector(val,1); val = NULL; } if (b != NULL) { free_real_vector(b,1); b = NULL; } if(bb!=NULL) { free_real_vector(bb,1); bb = NULL; } if(parpres != NULL) { free_real_vector(parpres,1); parpres = NULL; } if (jac != NULL) { free_real_matrix(jac,1,m,1); jac = NULL; } }