/* Ref: Weiss, Algorithm 12 BiCGSTAB * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int bicgstab (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *rs = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *s = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "bicgstab"); CHECK_MALLOC (rs, "bicgstab"); CHECK_MALLOC (p, "bicgstab"); CHECK_MALLOC (ap, "bicgstab"); CHECK_MALLOC (s, "bicgstab"); CHECK_MALLOC (t, "bicgstab"); double rsap; // (r*, A.p) double st; double t2; double rho, rho1; double delta; double gamma; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... cblas_daxpy (n, -1.0, b, 1, r, 1); // - b cblas_dcopy (n, r, 1, rs, 1); // r* = r cblas_dcopy (n, r, 1, p, 1); // p = r rho = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = cblas_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; cblas_dcopy (n, r, 1, s, 1); // s = r ... cblas_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = cblas_ddot (n, s, 1, t, 1); // st = (s, t) t2 = cblas_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; cblas_dcopy (n, s, 1, r, 1); // r = s ... cblas_daxpy (n, gamma, t, 1, r, 1); // + gamma t cblas_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... cblas_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(cblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; cblas_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p cblas_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // - b dcopy_ (&n, r, &i_1, rs, &i_1); // r* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r rho = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = ddot_ (&n, rs, &i_1, ap, &i_1); // rsap = (r*, A.p) delta = - rho / rsap; dcopy_ (&n, r, &i_1, s, &i_1); // s = r ... daxpy_ (&n, &delta, ap, &i_1, s, &i_1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = ddot_ (&n, s, &i_1, t, &i_1); // st = (s, t) t2 = ddot_ (&n, t, &i_1, t, &i_1); // t2 = (t, t) gamma = - st / t2; dcopy_ (&n, s, &i_1, r, &i_1); // r = s ... daxpy_ (&n, &gamma, t, &i_1, r, &i_1); // + gamma t daxpy_ (&n, &delta, p, &i_1, x, &i_1); // x = x + delta p... daxpy_ (&n, &gamma, s, &i_1, x, &i_1); // + gamma s res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(blas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } if (res2 > 1.0e20) { // already too big residual break; } rho1 = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; daxpy_ (&n, &gamma, ap, &i_1, p, &i_1); // p = p + gamma A.p dscal_ (&n, &beta, p, &i_1); // p = beta (p + gamma A.p) daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta(p + gamma A.p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... my_daxpy (n, -1.0, b, 1, r, 1); // - b my_dcopy (n, r, 1, rs, 1); // r* = r my_dcopy (n, r, 1, p, 1); // p = r rho = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = my_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; my_dcopy (n, r, 1, s, 1); // s = r ... my_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = my_ddot (n, s, 1, t, 1); // st = (s, t) t2 = my_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; my_dcopy (n, s, 1, r, 1); // r = s ... my_daxpy (n, gamma, t, 1, r, 1); // + gamma t my_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... my_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(myblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; my_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p my_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (rs); free (p); free (ap); free (s); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-bicgstab %d %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) { /* deal with the underscores now so that they can be ignored after this point */ #ifdef WINDOWS double (*my_ddot)( int *, double *, int *, double *, int *) = ddot; complex16 (*my_zdot)( int *, complex16 *, int *, complex16 *, int *) = zdotu; void (*my_dgemm)(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *) = dgemm; void (*my_zgemm)(char *, char *, int *, int *, int *, complex16 *, complex16 *, int *, complex16 *, int *, complex16 *, complex16 *, int *) = zgemm; #else double (*my_ddot)( int *, double *, int *, double *, int *) = ddot_; complex16 (*my_zdot)( int *, complex16 *, int *, complex16 *, int *) = zdotu_; void (*my_dgemm)(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *) = dgemm_; void (*my_zgemm)(char *, char *, int *, int *, int *, complex16 *, complex16 *, int *, complex16 *, int *, complex16 *, complex16 *, int *) = zgemm_; #endif mwSize M, N, K, K2; mwSize nOmega1, nOmega2, nOmega; mwIndex i,j,k,m; double *U, *V, *output, *outputBLAS; double *U_imag, *V_imag, *output_imag; /* for complex data */ complex16 *U_cplx, *V_cplx, temp_cplx; /* for complex data */ double *omega, *omegaX, *omegaY; mwIndex *omegaI, *omegaJ; /* for sparse version */ int SPARSE = false; int COMPLEX = false; int USE_BLAS = false; int LARGE_BIT = false; int MODE_THREE = false; complex16 alpha_cplx, beta_cplx; complex16 *output_cplx; int one = 1; char transA = 'N', transB = 'T'; mwSize LDA, LDB; double alpha, beta; double BLAS_CUTOFF; /* Check for proper number of input and output arguments */ if ( (nrhs < 3) || (nrhs > 5) ) { printUsage(); mexErrMsgTxt("Three to five input argument required."); } if(nlhs > 1){ printUsage(); mexErrMsgTxt("Too many output arguments."); } /* Check data type of input argument */ if (!(mxIsDouble(prhs[0])) || !((mxIsDouble(prhs[1]))) ){ printUsage(); mexErrMsgTxt("Input arguments wrong data-type (must be doubles)."); } /* Get the size and pointers to input data */ /* TRANSPOSE VERSION: switch mxGetM and mxGetN */ M = mxGetN(prhs[0]); K = mxGetM(prhs[0]); N = mxGetN(prhs[1]); K2 = mxGetM(prhs[1]); if ( K != K2 ) { printUsage(); mexErrMsgTxt("Inner dimension of U and V' must agree."); } COMPLEX = (( (mxIsComplex(prhs[0])) ) || (mxIsComplex(prhs[1])) ); nOmega1 = mxGetM( prhs[2] ); nOmega2 = mxGetN( prhs[2] ); /* on 64-bit systems, these may be longs, but I really want * them to be ints so that they work with the BLAS calls */ mxAssert(M<INT_MAX,"Matrix is too large for 32-bit FORTRAN"); mxAssert(N<INT_MAX,"Matrix is too large for 32-bit FORTRAN"); mxAssert(K<INT_MAX,"Matrix is too large for 32-bit FORTRAN"); if ( (nOmega1 != 1) && (nOmega2 != 1) ) { /* mexErrMsgTxt("Omega must be a vector"); */ /* Update: * if this happens, we assume Omega is really a sparse matrix * and everything is OK */ if ( ( nOmega1 != M ) || ( nOmega2 != N ) || (!mxIsSparse( prhs[2] )) ){ printUsage(); mexErrMsgTxt("Omega must be a vector or a sparse matrix"); } nOmega = mxGetNzmax( prhs[2] ); SPARSE = true; } else { nOmega = nOmega1 < nOmega2 ? nOmega2 : nOmega1; if ( nOmega > N*M ) { printUsage(); mexErrMsgTxt("Omega must have M*N or fewer entries"); } } U = mxGetPr(prhs[0]); V = mxGetPr(prhs[1]); if (COMPLEX) { U_imag = mxGetPi(prhs[0]); V_imag = mxGetPi(prhs[1]); plhs[0] = mxCreateDoubleMatrix(nOmega, 1, mxCOMPLEX); output = mxGetPr(plhs[0]); output_imag = mxGetPi(plhs[0]); U_cplx = (complex16 *)mxMalloc( M*K*sizeof(complex16) ); V_cplx = (complex16 *)mxMalloc( N*K*sizeof(complex16) ); if ( (U_cplx == NULL) || ( V_cplx == NULL ) ) { /* this should be very rare */ mexErrMsgTxt("Unable to allocate memory. Matrix too large?"); } for ( i=0 ; i < M*K ; i++ ){ U_cplx[i].re = (double)U[i]; U_cplx[i].im = (double)U_imag[i]; } for ( i=0 ; i < N*K ; i++ ){ V_cplx[i].re = (double)V[i]; V_cplx[i].im = -(double)V_imag[i]; /* minus sign, since adjoint, not transpose */ } } else { plhs[0] = mxCreateDoubleMatrix(nOmega, 1, mxREAL); /* mxCreateDoubleMatrix automatically zeros out the entries */ output = mxGetPr(plhs[0]); } /* Check for the optional input argument that specifies the cutoff * for level-3 BLAS */ BLAS_CUTOFF = 0.4; MODE_THREE = false; if ( nrhs > 4 ) { BLAS_CUTOFF = (double)*mxGetPr( prhs[4] ); MODE_THREE = true; } else if (nrhs > 3 ) { /* decide if this third argument is BLAS_CUTOFF, * or omegaX */ nOmega1 = mxGetM( prhs[3] ); nOmega2 = mxGetN( prhs[3] ); if ( (nOmega1 == 1) && (nOmega2 == 1) ) { BLAS_CUTOFF = (double)*mxGetPr( prhs[3] ); MODE_THREE = false; } else { MODE_THREE = true; } } /* Decide if we'll make a level-3 BLAS call */ USE_BLAS = ( nOmega >= BLAS_CUTOFF * (M*N) ); /* * We have 3 "modes": * Mode 1 * if omega is a vector of linear indices * Mode 2 * if omega is given only by the nonzero elements of an input * sparse matrix Y * Mode 3 * if omega is given as a set of subscripts, i.e. omegaX, omegaY * then the "for" loop is slightly different * * October 29, 2009: changing this a bit. * For any of the modes, we first check if length(omega) > .8*M*N * If so, we make a level-3 BLAS call (dgemm), and then use this * matrix as needed. * */ /* determine if we're on a 64-bit processor */ LARGE_BIT = ( sizeof( size_t ) > 4 ); if ( USE_BLAS ) { /* we need to compute A itself, so use level-3 BLAS */ /* TRANSPOSE VERSION: switch N and T below, and change the step size and LDA */ /* transA = 'N'; transB = 'T'; LDA = M; LDB = N; */ transA = 'T'; transB = 'N'; LDA = K; LDB = K; if (COMPLEX) { alpha_cplx.re = 1.0; alpha_cplx.im = 0.0; beta_cplx.re = 0.0; beta_cplx.im = 0.0; /* need to make a new complex data structure */ output_cplx = (complex16 *) mxMalloc( M*N*sizeof(complex16) ); if ( output_cplx == NULL ) { /* we don't have enough RAM available */ USE_BLAS = false; } else { my_zgemm(&transA,&transB,(int*)&M,(int*)&N,(int*)&K, &alpha_cplx,U_cplx,(int*)&LDA,V_cplx,(int*)&LDB,&beta_cplx,output_cplx,(int *)&M ); } } else { outputBLAS = (double *)mxMalloc( M*N*sizeof(double) ); if ( outputBLAS == NULL ) { /* we don't have enough RAM available */ USE_BLAS = false; } else { alpha = 1.0; beta = 0.0; my_dgemm(&transA,&transB,(int*)&M,(int*)&N,(int*)&K, &alpha,U,(int*)&LDA,V,(int*)&LDB,&beta,outputBLAS,(int*)&M ); } } } /* --- MODE 1 ---*/ if ( (!MODE_THREE) && (!SPARSE) ) { /* by default, make output the same shape (i.e. row- or column- * vector) as the input "omega" */ mxSetM( plhs[0], mxGetM( prhs[2] ) ); mxSetN( plhs[0], mxGetN( prhs[2] ) ); omega = mxGetPr(prhs[2]); if ( !USE_BLAS ) { if ( (COMPLEX) && (LARGE_BIT) ) { /* TRANSPOSE VERSION: this needs to be tested!!! */ for ( k=0 ; k < nOmega ; k++ ){ i = (mwIndex) ( (mwIndex)(omega[k]-1) % M); j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M); /* implement the BLAS call myself, since BLAS isn't working for me * ZDOTU(N,ZX,INCX,ZY,INCY) * */ temp_cplx.re = 0.0; temp_cplx.im = 0.0; for ( m=0 ; m < K ; m++ ){ /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re - U_cplx[i+m*M].im * V_cplx[j+m*N].im; temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re + U_cplx[i+m*M].re * V_cplx[j+m*N].im; */ temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re - U_cplx[K*i+m].im * V_cplx[K*j+m].im; temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re + U_cplx[K*i+m].re * V_cplx[K*j+m].im; } output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } else if (COMPLEX) { /* TRANSPOSE VERSION: UPDATED */ for ( k=0 ; k < nOmega ; k++ ){ i = (mwIndex) ( (mwIndex)(omega[k]-1) % M); j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M); /*temp_cplx = my_zdot( (int*) &K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */ temp_cplx = my_zdot( (int*) &K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one ); output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } else { /* TRANSPOSE VERSION: UPDATED */ for ( k=0 ; k < nOmega ; k++ ){ /* don't forget that Matlab is 1-indexed, C is 0-indexed */ i = (mwIndex) ( (mwIndex)(omega[k]-1) % M); j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M); /*output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N );*/ output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one ); } } } else { /* (USE_BLAS) is true */ /* We already have the full matrix, so just find the entries */ if (COMPLEX) { for ( k=0 ; k < nOmega ; k++ ){ output[k] = output_cplx[ (mwIndex)omega[k] -1 ].re; output_imag[k] = output_cplx[ (mwIndex)omega[k] - 1 ].im; } mxFree( output_cplx ); } else { for ( k=0 ; k < nOmega ; k++ ){ /* i = (mwIndex) ( (mwIndex)(omega[k]-1) % M); j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M); output[k] = outputBLAS( j*M + i ); */ /* This now simplifies a lot! */ output[k] = outputBLAS[ (mwIndex)omega[k] - 1 ]; } mxFree( outputBLAS ); } } } else { /* ----------- MODE 2 ------------------------------- */ if (SPARSE) { /* sparse array indices in Matlab are rather confusing; * see the mxSetJc help file to get started. The Ir index * is straightforward: it contains rows indices of nonzeros, * in column-major order. But the Jc index is tricky... * Basically, Jc (which has N+1 entries, not nnz entries like Ir) * tells you which Ir entries correspond to the jth row, thus fully * specifying the indices. Ir[ Jc[j]:Jc[J+1] ] are the rows * that correspond to column j. This works because Ir is * in column-major order. For this to work (and match A(omega)), * we need omega to be sorted! *Note: everything is already 0-based, not 1-based * */ omegaI = mxGetIr( prhs[2] ); omegaJ = mxGetJc( prhs[2] ); if ( USE_BLAS ) { /* We already have the full matrix, so just find the entries */ if (COMPLEX) { for ( j=0 ; j < N ; j++ ){ for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) { i = omegaI[k]; output[k] = output_cplx[ j*M + i ].re; output_imag[k] = output_cplx[ j*M + i ].im; } } mxFree( output_cplx ); } else { for ( j=0 ; j < N ; j++ ){ for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) { i = omegaI[k]; output[k] = outputBLAS[ j*M + i ]; } } mxFree( outputBLAS ); } } else if ((COMPLEX)&&(LARGE_BIT)) { /* TRANSPOSE VERSION: this needs to be tested!!! */ for ( j=0 ; j < N ; j++ ){ for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) { i = omegaI[k]; temp_cplx.re = 0.0; temp_cplx.im = 0.0; for ( m=0 ; m < K ; m++ ){ /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re - U_cplx[i+m*M].im * V_cplx[j+m*N].im; temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re + U_cplx[i+m*M].re * V_cplx[j+m*N].im;*/ temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re - U_cplx[K*i+m].im * V_cplx[K*j+m].im; temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re + U_cplx[K*i+m].re * V_cplx[K*j+m].im; } output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } } else if (COMPLEX) { /* TRANSPOSE VERSION: UPDATED */ for ( j=0 ; j < N ; j++ ){ for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) { i = omegaI[k]; /* temp_cplx = my_zdot((int*) &K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */ temp_cplx = my_zdot((int*) &K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one ); output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } } else { /* simple case */ /* TRANSPOSE VERSION: UPDATED */ for ( j=0 ; j < N ; j++ ){ for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) { i = omegaI[k]; /*output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N );*/ output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one ); } } } /* ----------- MODE 3 ------------------------------- */ } else { /* we have omegaX and omegaY, the row and column indices */ nOmega1 = mxGetM( prhs[3] ); nOmega2 = mxGetN( prhs[3] ); if ( (nOmega1 != 1) && (nOmega2 != 1) ) { printUsage(); mexErrMsgTxt("OmegaY must be a vector"); } nOmega1 = nOmega1 < nOmega2 ? nOmega2 : nOmega1; if ( nOmega1 != nOmega ) { printUsage(); mexErrMsgTxt("In subscript index format, subscript vectors must be same length."); } omegaX = mxGetPr(prhs[2]); omegaY = mxGetPr(prhs[3]); if ( USE_BLAS ) { /* We already have the full matrix, so just find the entries */ if (COMPLEX) { for ( k=0 ; k < nOmega ; k++ ){ i = omegaX[k] - 1; j = omegaY[k] - 1; output[k] = output_cplx[ j*M + i ].re; output_imag[k] = output_cplx[ j*M + i ].im; } mxFree( output_cplx ); } else { for ( k=0 ; k < nOmega ; k++ ){ i = omegaX[k] - 1; j = omegaY[k] - 1; output[k] = outputBLAS[ j*M + i ]; } mxFree( outputBLAS ); } } else if ((COMPLEX)&&(LARGE_BIT)) { /* TRANSPOSE VERSION: this needs to be tested!!! */ for ( k=0 ; k < nOmega ; k++ ){ i = omegaX[k] - 1; j = omegaY[k] - 1; temp_cplx.re = 0.0; temp_cplx.im = 0.0; for ( m=0 ; m < K ; m++ ){ /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re - U_cplx[i+m*M].im * V_cplx[j+m*N].im; temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re + U_cplx[i+m*M].re * V_cplx[j+m*N].im; */ temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re - U_cplx[K*i+m].im * V_cplx[K*j+m].im; temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re + U_cplx[K*i+m].re * V_cplx[K*j+m].im; } output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } else if (COMPLEX) { /* TRANSPOSE VERSION: UPDATED */ for ( k=0 ; k < nOmega ; k++ ){ i = omegaX[k] - 1; j = omegaY[k] - 1; /* temp_cplx = my_zdot( (int*)&K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */ temp_cplx = my_zdot( (int*)&K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one ); output[k] = temp_cplx.re; output_imag[k] = temp_cplx.im; } } else { /* TRANSPOSE VERSION: UPDATED */ for ( k=0 ; k < nOmega ; k++ ){ i = omegaX[k] - 1; j = omegaY[k] - 1; /* output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N ); */ output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one ); } } } } }
/* Ref: Weiss, Algorithm 11 CGS * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cgs (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_m1 = -1.0; double d_2 = 2.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *r0 = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *u = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *q = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "cgs"); CHECK_MALLOC (r0, "cgs"); CHECK_MALLOC (p, "cgs"); CHECK_MALLOC (u, "cgs"); CHECK_MALLOC (ap, "cgs"); CHECK_MALLOC (q, "cgs"); CHECK_MALLOC (t, "cgs"); double r0ap; double rho, rho1; double delta; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, r0, 1); // r0* = r cblas_dcopy (n, r, 1, p, 1); // p = r cblas_dcopy (n, r, 1, u, 1); // u = r rho = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = cblas_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; cblas_dcopy (n, u, 1, q, 1); // q = u cblas_dscal (n, 2.0, q, 1); // q = 2 u cblas_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q cblas_daxpy (n, delta, t, 1, r, 1); // r = r + delta t cblas_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; cblas_dcopy (n, q, 1, qu, 1); // qu = q cblas_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u cblas_dcopy (n, r, 1, u, 1); // u = r cblas_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) cblas_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p cblas_dcopy (n, u, 1, p, 1); // p = u cblas_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, r0, &i_1); // r0* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r dcopy_ (&n, r, &i_1, u, &i_1); // u = r rho = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = ddot_ (&n, r0, &i_1, ap, &i_1); // r0ap = (r0*, A.p) delta = - rho / r0ap; dcopy_ (&n, u, &i_1, q, &i_1); // q = u dscal_ (&n, &d_2, q, &i_1); // q = 2 u daxpy_ (&n, &delta, ap, &i_1, q, &i_1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q daxpy_ (&n, &delta, t, &i_1, r, &i_1); // r = r + delta t daxpy_ (&n, &delta, q, &i_1, x, &i_1); // x = x + delta q res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; dcopy_ (&n, q, &i_1, qu, &i_1); // qu = q daxpy_ (&n, &d_m1, u, &i_1, qu, &i_1); // qu = q - u dcopy_ (&n, r, &i_1, u, &i_1); // u = r daxpy_ (&n, &beta, qu, &i_1, u, &i_1); // u = r + beta (q - u) daxpy_ (&n, &beta, p, &i_1, qu, &i_1); // qu = q - u + beta * p dcopy_ (&n, u, &i_1, p, &i_1); // p = u daxpy_ (&n, &beta, qu, &i_1, p, &i_1); // p = u + beta (q - u + b * p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, r0, 1); // r0* = r my_dcopy (n, r, 1, p, 1); // p = r my_dcopy (n, r, 1, u, 1); // u = r rho = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = my_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; my_dcopy (n, u, 1, q, 1); // q = u my_dscal (n, 2.0, q, 1); // q = 2 u my_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q my_daxpy (n, delta, t, 1, r, 1); // r = r + delta t my_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; my_dcopy (n, q, 1, qu, 1); // qu = q my_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u my_dcopy (n, r, 1, u, 1); // u = r my_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) my_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p my_dcopy (n, u, 1, p, 1); // p = u my_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (r0); free (p); free (u); free (ap); free (q); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-cgs it= %d res^2= %e\n", i, res2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
/* Classical CG method -- Weiss' Algorithm 2 * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cg (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *p = (double *)malloc (sizeof (double) * n); double *r = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (p, "cg"); CHECK_MALLOC (r, "cg"); CHECK_MALLOC (ap, "cg"); double r2; double res2 = 0.0; double pap; double gamma; double beta; int i; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = cblas_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = cblas_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) cblas_daxpy (n, gamma, p, 1, x, 1); // x += gamma p cblas_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = cblas_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) cblas_dscal (n, beta, p, 1); // p *= beta cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, p, &i_1); // p = r for (i = 0; i < itmax; i ++) { r2 = ddot_ (&n, r, &i_1, r, &i_1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = ddot_ (&n, p, &i_1, ap, &i_1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) daxpy_ (&n, &gamma, p, &i_1, x, &i_1); // x += gamma p daxpy_ (&n, &gamma, ap, &i_1, r, &i_1); // r += gamma Ap // new norm of r res2 = ddot_ (&n, r, &i_1, r, &i_1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) dscal_ (&n, &beta, p, &i_1); // p *= beta daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta p r2 = res2; } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = my_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = my_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) my_daxpy (n, gamma, p, 1, x, 1); // x += gamma p my_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = my_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) my_dscal (n, beta, p, 1); // p *= beta my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (p); free (r); free (ap); if (it->debug == 1) { fprintf (it->out, "libiter-cg it= %d res^2= %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }