/** * @brief Normalizes all FSR scalar fluxes and Track boundary angular * fluxes to the total fission source (times \f$ \nu \f$). */ void VectorizedSolver::normalizeFluxes() { FP_PRECISION* nu_sigma_f; FP_PRECISION volume; FP_PRECISION tot_fission_source; FP_PRECISION norm_factor; /* Compute total fission source for each FSR, energy group */ #pragma omp parallel for private(volume, nu_sigma_f) \ reduction(+:tot_fission_source) schedule(guided) for (int r=0; r < _num_FSRs; r++) { /* Get pointers to important data structures */ nu_sigma_f = _FSR_materials[r]->getNuSigmaF(); volume = _FSR_volumes[r]; /* Loop over energy group vector lengths */ for (int v=0; v < _num_vector_lengths; v++) { /* Loop over each energy group within this vector */ #pragma simd vectorlength(VEC_LENGTH) for (int e=v*VEC_LENGTH; e < (v+1)*VEC_LENGTH; e++) { _fission_sources(r,e) = nu_sigma_f[e] * _scalar_flux(r,e); _fission_sources(r,e) *= volume; } } } /* Compute the total fission source */ int size = _num_FSRs * _num_groups; #ifdef SINGLE tot_fission_source = cblas_sasum(size, _fission_sources, 1); #else tot_fission_source = cblas_dasum(size, _fission_sources, 1); #endif /* Compute the normalization factor */ norm_factor = 1.0 / tot_fission_source; log_printf(DEBUG, "Tot. Fiss. Src. = %f, Normalization factor = %f", tot_fission_source, norm_factor); /* Normalize the FSR scalar fluxes */ #ifdef SINGLE cblas_sscal(size, norm_factor, _scalar_flux, 1); #else cblas_dscal(size, norm_factor, _scalar_flux, 1); #endif /* Normalize the Track angular boundary fluxes */ size = 2 * _tot_num_tracks * _num_polar * _num_groups; #ifdef SINGLE cblas_sscal(size, norm_factor, _boundary_flux, 1); #else cblas_dscal(size, norm_factor, _boundary_flux, 1); #endif return; }
double maxeig(double *xmat, mwSignedIndex n) { // xmat is symmetric n x n matrix mwSignedIndex incx=1,indmax,maxloop=10000,k=0; double alpha, beta, dmax, dmax_temp,dmax_tol; double *bufveca=(double *) calloc(n,sizeof(double)); double *bufvecb=(double *) calloc(n,sizeof(double)); dmax_tol=.001; // do power series to get approximation to max eigenvalue of A+X alpha=0.0;cblas_dscal(n,alpha,bufveca,incx);bufveca[0]=1.0; // x_0 = [1,0,0,...] do something better later beta=0.0;dmax=1.0;dmax_temp=0.0; while ((dabsf(dmax-dmax_temp)>dmax_tol)&&(k<=maxloop)){ dmax_temp=dmax; alpha=1.0; cblas_dgemv(CblasColMajor,CblasNoTrans,n,n,alpha,xmat,n,bufveca,incx,beta,bufvecb,incx); indmax=idxmax(bufvecb,n);dmax=bufvecb[indmax]; alpha=1.0/dmax;cblas_dscal(n,alpha,bufvecb,incx); cblas_dcopy(n,bufvecb,incx,bufveca,incx); k++; } alpha=1.0; // compute Rayleigh Quotient to approximate max eigenvalue of A+X cblas_dgemv(CblasColMajor,CblasNoTrans,n,n,alpha,xmat,n,bufvecb,incx,beta,bufveca,incx); dmax=doubdot(bufvecb,bufveca,n);alpha=doubnorm2(bufvecb,n);dmax=dmax/alpha/alpha; free(bufveca);free(bufvecb); return dmax; }
void get_class(crbm *m, double *h, double *py, int batch_size){ int i; double sum; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, batch_size, m->ncat, m->nhidden, 1.0, h, m->nhidden, m->u, m->ncat, 0, py, m->ncat); cblas_dger(CblasRowMajor, batch_size, m->ncat, 1, I, 1, m->by, 1, py, m->ncat); for(i = 0; i < batch_size * m->ncat; i++){ py[i] = exp(py[i]); } //sum cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, batch_size, 1, m->ncat, 1, py, m->ncat, I, 1, 0, a, 1); for(i = 0; i < batch_size; i++){ cblas_dscal(m->ncat, 1.0 / a[i], py + i * m->ncat, 1); //printf("sum:%.2lf\n", cblas_dasum(m->ncat, py + i * m->ncat, 1)); } }
void THBlas_(scal)(long n, real a, real *x, long incx) { if(n == 1) incx = 1; #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) if( (n <= INT_MAX) && (incx <= INT_MAX) ) { int i_n = (int)n; int i_incx = (int)incx; #if defined(TH_REAL_IS_DOUBLE) cblas_dscal(i_n, a, x, i_incx); #else cblas_sscal(i_n, a, x, i_incx); #endif return; } #endif { long i; for(i = 0; i < n; i++) x[i*incx] *= a; } }
JNIEXPORT void JNICALL Java_uncomplicate_neanderthal_CBLAS_dscal (JNIEnv *env, jclass clazz, jint N, jdouble alpha, jobject X, jint offsetX, jint incX) { double *cX = (double *) (*env)->GetDirectBufferAddress(env, X); cblas_dscal(N, alpha, cX + offsetX, incX); };
/* * Class: com_intel_analytics_bigdl_mkl_MKL * Method: vdscal * Signature: (II[DII)V */ JNIEXPORT void JNICALL Java_com_intel_analytics_bigdl_mkl_MKL_vdscal (JNIEnv * env, jclass cls, jint n, jdouble a, jdoubleArray x, jint xOffset, jint incx) { jdouble * jni_x = (*env)->GetPrimitiveArrayCritical(env, x, JNI_FALSE); cblas_dscal(n, a, jni_x + xOffset, incx); (*env)->ReleasePrimitiveArrayCritical(env, x, jni_x, 0); }
void eblas_dscal(int N, double a, double* x, int incx) { #ifdef MKL_PROVIDES_BLAS cblas_dscal(N, a, x, incx); #else threadLaunch((N<100000) ? 1 : 0, eblas_dscal_sub, N, a, x, incx); #endif }
/* Computes: y <- alpha A^T*x + beta y */ int mfiles_dgemv1(double alpha, const mxArray *A, const mxArray *x, double beta, mxArray *y) { size_t rA = mxGetM(A); size_t cA = mxGetN(A); size_t rx = mxGetM(x); size_t cx = mxGetN(x); size_t ry = mxGetM(y); size_t cy = mxGetN(y); if (mxIsSparse(x) || mxIsSparse(y)) { mexErrMsgIdAndTxt("mfiles:BadType", "Sparse vectors are not supported."); } if (mxIsComplex(A) || mxIsComplex(x) || mxIsComplex(y)) { mexErrMsgIdAndTxt("mfiles:BadType", "Complex data is not supported."); } if ((rA != rx) || (cA != ry) || (cx != 1) || (cy != 1)) { mexErrMsgIdAndTxt("mfiles:BadDim", "Dimensions of matrices do not match."); } if (mxIsSparse(A)) { double *px = mxGetPr(x); double *py = mxGetPr(y); double *pz = mxCalloc(ry, sizeof (double)); cs *cs_A = cs_calloc(1, sizeof (cs)); mfiles_mx2cs(A, cs_A); /* Transpose A */ cs *cs_AT = cs_transpose(cs_A, 1); /* Compute z <- A^T*x */ cs_gaxpy(cs_AT, px, pz); /* Compute y <- beta y */ cblas_dscal(ry, beta, py, 1); /* Compute y <- alpha*z+y */ cblas_daxpy(ry, alpha, pz, 1, py, 1); cs_free(cs_A); /* Check this cs_free and cs_spfree ? */ cs_spfree(cs_AT); mxFree(pz); } else { double *pA = mxGetPr(A); double *px = mxGetPr(x); double *py = mxGetPr(y); cblas_dgemv(CblasRowMajor, CblasTrans, rA, cA, alpha, pA, rA, px, 1, beta, py, 1); } return EXIT_SUCCESS; }
void Scope::process(const double* inputs) { double max = 1.; m_decoder->process(inputs, m_matrix); max = fabs(m_matrix[cblas_idamax(m_number_of_points, m_matrix, 1)]); if(max > 1.) { cblas_dscal(m_number_of_points, (1. / max), m_matrix, 1.); } }
void Space_trans::for_space() { fftw_execute(pf); cblas_dscal(n3*md*2,1./md,realdata[0],1); /* for(int i = 0; i<n3*md; i++) */ /* { */ /* realdata[i][0]/=md; */ /* realdata[i][1]/=md; */ /* } */ return; }
/* * 最大最大激励化 第 unitdx 个单元 * */ double LayerWiseRBMs::maximizeUnit(int layerIdx, int unitIdx, double * unitSample, double argvNorm, int epoch){ int AMnumIn = layers[0]->numVis; // unitsample 归一化 double curNorm = squareNorm(unitSample, AMnumIn, 1); cblas_dscal(AMnumIn, argvNorm / curNorm, unitSample, 1); double maxValue =0; for(int k=0; k<epoch; k++){ // forward for(int i=0; i<=layerIdx; i++){ if(i==0) layers[i]->setInput(unitSample); else layers[i]->setInput(layers[i-1]->getOutput()); layers[i]->setBatchSize(1); layers[i]->runBatch(); } maxValue = layers[layerIdx]->getOutput()[unitIdx]; //back propagate for(int i=layerIdx; i>=0; i--){ if(i==layerIdx) layers[i]->getAMDelta(unitIdx, NULL) ; else layers[i]->getAMDelta(-1, layers[i+1]->AMDelta); } double lr = 0.01 * cblas_dasum(AMnumIn, unitSample, 1) / cblas_dasum(AMnumIn, layers[0]->AMDelta, 1); // update unitSample cblas_daxpy(AMnumIn, lr, layers[0]->AMDelta, 1, unitSample, 1); //归一化 unitSample curNorm = squareNorm(unitSample, AMnumIn, 1); cblas_dscal(AMnumIn, argvNorm / curNorm, unitSample, 1); } return maxValue; }
/* compute psi function */ void ACPsi( GlobalFrictionContactProblem* problem, AlartCurnierFun3x3Ptr computeACFun3x3, double *globalVelocity, double *reaction, double *velocity, double *rho, double *psi) { assert(problem->H->size1 == problem->dimension * problem->numberOfContacts); unsigned int m = problem->H->size1; unsigned int n = problem->M->size0; unsigned int problem_size = n + 2*m ; cblas_dscal(problem_size, 0., psi, 1); /* -problem->M * globalVelocity + problem->H * reaction + problem->q ==> psi */ cblas_dscal(problem_size, 0., psi, 1); cblas_dcopy(n, problem->q, 1, psi, 1); NM_gemv(1., problem->H, reaction, 1, psi); NM_gemv(-1., problem->M, globalVelocity, 1, psi); /* -velocity + trans(problem->H) * globalVelocity + problem->b ==> psi + n */ cblas_daxpy(m, -1., velocity, 1, psi + n, 1); cblas_daxpy(m, 1, problem->b, 1, psi + n, 1); NM_tgemv(1., problem->H, globalVelocity, 1, psi + n); /* compute AC function */ fc3d_AlartCurnierFunction(m, computeACFun3x3, reaction, velocity, problem->mu, rho, psi+n+m, NULL, NULL); }
void Scope::process(const float* inputs) { double max = 1.; for(unsigned int i = 0; i < m_number_of_harmonics; i++) { m_harmonics[i] = inputs[i]; } m_decoder->process(m_harmonics, m_matrix); max = fabs(m_matrix[cblas_idamax(m_number_of_points, m_matrix, 1)]); if(max > 1.) { cblas_dscal(m_number_of_points, (1. / max), m_matrix, 1.); } }
void Orthogonalize(OrthoContext* c, double* p, int numBases, double* orthonormalBases) { memcpy(c->Pv->Data, p, c->Pv->Count * sizeof(double)); memcpy(c->Bases->Data, orthonormalBases, numBases * c->Pv->Count * sizeof(double)); c->Bases->RowCount = numBases; c->Dp->Count = numBases; int basisLen = c->Pv->Count; GEMV(1, c->Bases, c->Pv, 0, c->Dp); for (int i = 0, offset = 0; i < numBases; i++, offset += basisLen) AXPY2(-1 * c->Dp->Data[i], c->Bases->Data + offset, basisLen, c->Pv->Data); double mag = cblas_dnrm2(basisLen, c->Pv->Data, 1); cblas_dscal(basisLen, 1.0 / mag, c->Pv->Data, 1); memcpy(p, c->Pv->Data, basisLen * sizeof(double)); }
ECLBLAS_CALL void dpotf2(bool & __isAllResult, size32_t & __lenResult, void * & __result, uint8_t tri, uint32_t r, bool isAllA, size32_t lenA, const void * A, bool clear) { unsigned int cells = r*r; __isAllResult = false; __lenResult = cells * sizeof(double); double *new_a = (double*) rtlMalloc(__lenResult); memcpy(new_a, A, __lenResult); double ajj; // x and y refer to the embedded vectors for the multiply, not an axis unsigned int diag, a_pos, x_pos, y_pos; unsigned int col_step = r; // between columns unsigned int row_step = 1; // between rows unsigned int x_step = (tri==UPPER_TRIANGLE) ? row_step : col_step; unsigned int y_step = (tri==UPPER_TRIANGLE) ? col_step : row_step; for (unsigned int j=0; j<r; j++) { diag = (j * r) + j; // diagonal x_pos = j * ((tri==UPPER_TRIANGLE) ? col_step : row_step); a_pos = (j+1) * ((tri==UPPER_TRIANGLE) ? col_step : row_step); y_pos = diag + y_step; // ddot.value <- x'*y ajj = new_a[diag] - cblas_ddot(j, (new_a+x_pos), x_step, (new_a+x_pos), x_step); //if ajj is 0, negative or NaN, then error if (ajj <= 0.0) { rtlFree(new_a); rtlFail(0, "Not a positive definite matrix"); } ajj = sqrt(ajj); new_a[diag] = ajj; if ( j < r-1) { // y <- alpha*op(A)*x + beta*y cblas_dgemv(CblasColMajor, (tri==UPPER_TRIANGLE) ? CblasTrans : CblasNoTrans, (tri==UPPER_TRIANGLE) ? j : r-1-j, // M (tri==UPPER_TRIANGLE) ? r-1-j : j, // N -1.0, // alpha (new_a+a_pos), r, //A (new_a+x_pos), x_step, //X 1.0, (new_a+y_pos), y_step); // beta and Y // x <- alpha * x cblas_dscal(r-1-j, 1.0/ajj, (new_a+y_pos), y_step); } // clear lower or upper part if clear flag set for(unsigned int k=1; clear && k<r-j; k++) new_a[(k*x_step)+diag] = 0.0; } __result = (void*) new_a; }
/* compute psi function */ void ACPsi( GlobalFrictionContactProblem* problem, AlartCurnierFun3x3Ptr computeACFun3x3, double *globalVelocity, double *reaction, double *velocity, double *rho, double *psi) { assert(problem->H->size1 == problem->dimension * problem->numberOfContacts); unsigned int localProblemSize = problem->H->size1; unsigned int ACProblemSize = sizeOfPsi(NM_triplet(problem->M), NM_triplet(problem->H)); unsigned int globalProblemSize = problem->M->size0; /* psi <- compute -problem->M * globalVelocity + problem->H * reaction + problem->q ... */ cblas_dscal(ACProblemSize, 0., psi, 1); cblas_dcopy(globalProblemSize, problem->q, 1, psi, 1); NM_gemv(1., problem->H, reaction, 1, psi); NM_gemv(-1., problem->M, globalVelocity, 1, psi); /* psi + globalProblemSize <- compute -velocity + trans(problem->H) * globalVelocity + problem->b ... */ cblas_daxpy(localProblemSize, -1., velocity, 1, psi + globalProblemSize, 1); cblas_daxpy(localProblemSize, 1, problem->b, 1, psi + globalProblemSize, 1); NM_tgemv(1., problem->H, globalVelocity, 1, psi + globalProblemSize); /* compute AC function */ fc3d_AlartCurnierFunction(localProblemSize, computeACFun3x3, reaction, velocity, problem->mu, rho, psi+globalProblemSize+ problem->H->size1, NULL, NULL); }
// rescales the rows of A by the given weights // weights only needs to be defined on the root process void rescaleRows(double *localRowChunk, double *weights, distMatrixInfo *matInfo) { int mpi_rank = matInfo->mpi_rank; int numcols = matInfo->numcols; int localrows = matInfo->localrows; int * rowcounts = matInfo->rowcounts; int * rowoffsets = matInfo->rowoffsets; MPI_Comm *comm = matInfo->comm; double *localweights = (double *) malloc(localrows * sizeof(double)); if(mpi_rank != 0) { MPI_Scatterv(NULL, rowcounts, rowoffsets, MPI_DOUBLE, localweights, localrows, MPI_DOUBLE, 0, *comm); } else { MPI_Scatterv(weights, rowcounts, rowoffsets, MPI_DOUBLE, localweights, localrows, MPI_DOUBLE, 0, *comm); } for(int rowIdx = 0; rowIdx < localrows; rowIdx = rowIdx + 1) cblas_dscal(numcols, localweights[rowIdx], localRowChunk + (rowIdx*numcols), 1); free(localweights); }
void hoa_scope_perform64(t_hoa_scope *x, t_object *dsp64, double **ins, long numins, double **outs, long numouts, long sampleframes, long flags, void *userparam) { for(int i = 0; i < numins; i++) { cblas_dcopy(sampleframes, ins[i], 1, x->f_signals+i, numins); } cblas_dscal(numins * sampleframes, x->f_gain, x->f_signals, 1); x->f_index = 0; while(--sampleframes) { x->f_index++; } if(x->f_startclock) { x->f_startclock = 0; clock_delay(x->f_clock,0); } }
/** Create the Householder symmetric matrix v*v^T * * This matrix is used to process the rows and columns of the input matrix. */ static void create_house_matrix_packed(size_t order, double shift, double *source, size_t incs, double *hhp) { double h[order]; int i; /* zero out the destination hhp (it's packed aka triangular, so the size is * non-square) */ for (i = 0; i < order * (order + 1) / 2; i++) { hhp[i] = 0; } /* create and normalize householder vector h */ cblas_dcopy(order, source, incs, h, 1); h[0] += MYSIGN(h[0]) * cblas_dnrm2(order, h, 1); h[0] -= shift; cblas_dscal(order, 1.0 / cblas_dnrm2(order, h, 1), h, 1); /* hhp = h h^T */ cblas_dspr(CblasRowMajor, CblasUpper, order, 1.0, h, 1, hhp); }
/* mB = mC => C <- alpha*A + beta*C otherwise C <- alpha*A + beta*B */ void geam(double alpha, Mat mA, double beta, Mat mB, Mat mC) { const int n = MatN(mA); const int n2 = MatN2(mA); const void* const a = MatElems(mA); const void* const b = MatElems(mB); void* const c = MatElems(mC); const bool dev = MatDev(mA); switch (MatElemSize(mA)) { case 4: if (dev) { float alpha32 = alpha, beta32 = beta; cublasSgeam(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &alpha32, a, n, &beta32, b, n, c, n); } else { if (b == c) { cblas_sscal(n2, beta, c, 1); } else { memset(c, 0, MatSize(mC)); cblas_saxpy(n2, beta, b, 1, c, 1); } cblas_saxpy(n2, alpha, a, 1, c, 1); } break; case 8: if (dev) { cublasDgeam(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &alpha, a, n, &beta, b, n, c, n); } else { if (b == c) { cblas_dscal(n2, beta, c, 1); } else { memset(c, 0, MatSize(mC)); cblas_daxpy(n2, beta, b, 1, c, 1); } cblas_daxpy(n2, alpha, a, 1, c, 1); } break; } }
// The main chebyshev iteration void cheby_solver_iterate( const int x, const int y, const int z, const int halo_depth, double alpha, double beta, double* vec_u, double* vec_u0, double* vec_p, double* vec_r, double* vec_w, double* vec_kx, double* vec_ky, double* vec_kz, int* a_row_index, int* a_col_index, double* a_non_zeros) { int m = x*y*z; mkl_cspblas_dcsrgemv( "n", &m, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w); int x_inner = x - 2*halo_depth; #pragma omp parallel for for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { const int offset = ii*x*y + jj*x + halo_depth; cblas_dcopy(x_inner, vec_u0 + offset, 1, vec_r + offset, 1); cblas_daxpy(x_inner, -1.0, vec_w + offset, 1, vec_r + offset, 1); cblas_dscal(x_inner, alpha, vec_p + offset, 1); cblas_daxpy(x_inner, beta, vec_r + offset, 1, vec_p + offset, 1); } } cheby_calc_u(x, y, z, halo_depth, vec_u, vec_p); }
void THBlas_scale(long size, real alpha, real *y, long yStride) { if(size == 1) yStride = 1; #if USE_CBLAS if( (size < INT_MAX) && (yStride < INT_MAX) ) { #ifdef USE_DOUBLE cblas_dscal(size, alpha, y, yStride); #else cblas_sscal(size, alpha, y, yStride); #endif return; } #endif { long i; for(i = 0; i < size; i++) y[i*yStride] *= alpha; } }
// Calculates p void cg_solver_calc_p( const int x, const int y, const int z, const int halo_depth, const double beta, double* vec_p, double* vec_r) { int x_inner = x - 2*halo_depth; #pragma omp parallel for for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { const int offset = ii*x*y + jj*x + halo_depth; cblas_dscal(x_inner, beta, vec_p + offset, 1); cblas_daxpy(x_inner, 1.0, vec_r + offset, 1, vec_p + offset, 1); } } }
/* Ref: Weiss, Algorithm 12 BiCGSTAB * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int bicgstab (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *rs = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *s = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "bicgstab"); CHECK_MALLOC (rs, "bicgstab"); CHECK_MALLOC (p, "bicgstab"); CHECK_MALLOC (ap, "bicgstab"); CHECK_MALLOC (s, "bicgstab"); CHECK_MALLOC (t, "bicgstab"); double rsap; // (r*, A.p) double st; double t2; double rho, rho1; double delta; double gamma; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... cblas_daxpy (n, -1.0, b, 1, r, 1); // - b cblas_dcopy (n, r, 1, rs, 1); // r* = r cblas_dcopy (n, r, 1, p, 1); // p = r rho = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = cblas_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; cblas_dcopy (n, r, 1, s, 1); // s = r ... cblas_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = cblas_ddot (n, s, 1, t, 1); // st = (s, t) t2 = cblas_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; cblas_dcopy (n, s, 1, r, 1); // r = s ... cblas_daxpy (n, gamma, t, 1, r, 1); // + gamma t cblas_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... cblas_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(cblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; cblas_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p cblas_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // - b dcopy_ (&n, r, &i_1, rs, &i_1); // r* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r rho = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = ddot_ (&n, rs, &i_1, ap, &i_1); // rsap = (r*, A.p) delta = - rho / rsap; dcopy_ (&n, r, &i_1, s, &i_1); // s = r ... daxpy_ (&n, &delta, ap, &i_1, s, &i_1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = ddot_ (&n, s, &i_1, t, &i_1); // st = (s, t) t2 = ddot_ (&n, t, &i_1, t, &i_1); // t2 = (t, t) gamma = - st / t2; dcopy_ (&n, s, &i_1, r, &i_1); // r = s ... daxpy_ (&n, &gamma, t, &i_1, r, &i_1); // + gamma t daxpy_ (&n, &delta, p, &i_1, x, &i_1); // x = x + delta p... daxpy_ (&n, &gamma, s, &i_1, x, &i_1); // + gamma s res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(blas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } if (res2 > 1.0e20) { // already too big residual break; } rho1 = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; daxpy_ (&n, &gamma, ap, &i_1, p, &i_1); // p = p + gamma A.p dscal_ (&n, &beta, p, &i_1); // p = beta (p + gamma A.p) daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta(p + gamma A.p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... my_daxpy (n, -1.0, b, 1, r, 1); // - b my_dcopy (n, r, 1, rs, 1); // r* = r my_dcopy (n, r, 1, p, 1); // p = r rho = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = my_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; my_dcopy (n, r, 1, s, 1); // s = r ... my_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = my_ddot (n, s, 1, t, 1); // st = (s, t) t2 = my_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; my_dcopy (n, s, 1, r, 1); // r = s ... my_daxpy (n, gamma, t, 1, r, 1); // + gamma t my_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... my_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(myblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; my_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p my_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (rs); free (p); free (ap); free (s); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-bicgstab %d %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
int nonSmoothNewtonNeigh(int n, double* z, NewtonFunctionPtr* phi, NewtonFunctionPtr* jacobianPhi, int* iparam, double* dparam) { int itermax = iparam[0]; // maximum number of iterations allowed int iterMaxWithSameZ = itermax / 4; int niter = 0; // current iteration number double tolerance = dparam[0]; /* double coef; */ sFphi = phi; sFjacobianPhi = jacobianPhi; // verbose=1; if (verbose > 0) { printf(" ============= Starting of Newton process =============\n"); printf(" - tolerance: %14.7e\n - maximum number of iterations: %i\n", tolerance, itermax); } int incx = 1; /* int n2 = n*n; */ int infoDGESV; /** merit function and its jacobian */ double psi_z; /** The algorithm is alg 4.1 of the paper of Kanzow and Kleinmichel, "A new class of semismooth Newton-type methods for nonlinear complementarity problems", in Computational Optimization and Applications, 11, 227-251 (1998). We try to keep the same notations */ double rho = 1e-8; double descentCondition, criterion, norm_jacobian_psi_z, normPhi_z; double p = 2.1; double terminationCriterion = 1; double norm; int findNewZ, i, j, NbLookingForANewZ; /* int naux=0; */ double aux = 0; /* double aux1=0; */ int ii; int resls = 1; /* char c; */ /* double * oldz; */ /* oldz=(double*)malloc(n*sizeof(double));*/ NbLookingForANewZ = 0; /** Iterations ... */ while ((niter < itermax) && (terminationCriterion > tolerance)) { scmp++; ++niter; /** Computes phi and its jacobian */ if (sZsol) { for (ii = 0; ii < sN; ii++) szzaux[ii] = sZsol[ii] - z[ii]; printf("dist zzsol %.32e.\n", cblas_dnrm2(n, szzaux, 1)); } (*sFphi)(n, z, sphi_z, 0); (*sFjacobianPhi)(n, z, sjacobianPhi_z, 1); /* Computes the jacobian of the merit function, jacobian_psi = transpose(jacobianPhiMatrix).phiVector */ cblas_dgemv(CblasColMajor,CblasTrans, n, n, 1.0, sjacobianPhi_z, n, sphi_z, incx, 0.0, sgrad_psi_z, incx); norm_jacobian_psi_z = cblas_dnrm2(n, sgrad_psi_z, 1); /* Computes norm2(phi) */ normPhi_z = cblas_dnrm2(n, sphi_z, 1); /* Computes merit function */ psi_z = 0.5 * normPhi_z * normPhi_z; if (normPhi_z < tolerance) { /*it is the solution*/ terminationCriterion = tolerance / 2.0; break; } if (verbose > 0) { printf("Non Smooth Newton, iteration number %i, norm grad psi= %14.7e , psi = %14.7e, normPhi = %e .\n", niter, norm_jacobian_psi_z, psi_z, normPhi_z); printf(" -----------------------------------------------------------------------\n"); } NbLookingForANewZ++; if (niter > 2) { if (10 * norm_jacobian_psi_z < tolerance || !resls || NbLookingForANewZ > iterMaxWithSameZ) { NbLookingForANewZ = 0; resls = 1; /* if (NbLookingForANewZ % 10 ==1 && 0){ printf("Try NonMonotomnelineSearch\n"); cblas_dcopy(n,sgrad_psi_z,1,sdir_descent,1); cblas_dscal( n , -1.0 ,sdir_descent,incx); NonMonotomnelineSearch( z, phi, 10); continue; } */ /* FOR DEBUG ONLY*/ if (sZsol) { printf("begin plot prev dir\n"); plotMerit(z, 0, 0); printf("end\n"); /* gets(&c);*/ (*sFphi)(n, sZsol, szaux, 0); printf("value psi(zsol)=%e\n", cblas_dnrm2(n, szaux, 1)); cblas_dcopy(n, sZsol, incx, szaux, incx); cblas_daxpy(n , -1 , z , 1 , szaux , 1); printf("dist to sol %e \n", cblas_dnrm2(n, szaux, 1)); for (ii = 0; ii < n; ii++) sdir_descent[ii] = sZsol[ii] - z[ii]; aux = norm; norm = 1; printf("begin plot zzsol dir\n"); plotMerit(z, 0, 0); printf("end\n"); /* gets(&c);*/ norm = aux; } printf("looking for a new Z...\n"); /*may be a local minimal*/ /*find a gradiant going out of this cul-de-sac.*/ norm = n / 2; findNewZ = 0; for (j = 0; j < 20; j++) { for (i = 0; i < n; i++) { if (sZsol) { /* FOR DEBUG ONLY*/ (*sFphi)(n, sZsol, sphi_zaux, 0); norm = cblas_dnrm2(n, sphi_zaux, 1); printf("Norm of the sol %e.\n", norm); for (ii = 0; ii < n; ii++) sdir_descent[ii] = sZsol[ii] - z[ii]; norm = 1; } else { for (ii = 0; ii < n; ii++) { sdir_descent[ii] = 1.0 * rand(); } cblas_dscal(n, 1 / cblas_dnrm2(n, sdir_descent, 1), sdir_descent, incx); cblas_dscal(n, norm, sdir_descent, incx); } cblas_dcopy(n, z, incx, szaux, incx); // cblas_dscal(n,0.0,zaux,incx); /* zaux = z + dir */ cblas_daxpy(n , norm , sdir_descent , 1 , szaux , 1); /* Computes the jacobian of the merit function, jacobian_psi_zaux = transpose(jacobianPhi_zaux).phi_zaux */ (*sFphi)(n, szaux, sphi_zaux, 0); (*sFjacobianPhi)(n, szaux, sjacobianPhi_zaux, 1); /* FOR DEBUG ONLY*/ if (sZsol) { aux = cblas_dnrm2(n, sphi_zaux, 1); printf("Norm of the sol is now %e.\n", aux); for (ii = 0; ii < n; ii++) printf("zsol %e zaux %e \n", sZsol[ii], szaux[ii]); } cblas_dgemv(CblasColMajor, CblasTrans, n, n, 1.0, sjacobianPhi_zaux, n, sphi_zaux, incx, 0.0, sgrad_psi_zaux, incx); cblas_dcopy(n, szaux, 1, szzaux, 1); cblas_daxpy(n , -1 , z , incx , szzaux , incx); /*zzaux must be a descente direction.*/ /*ie jacobian_psi_zaux.zzaux <0 printf("jacobian_psi_zaux : \n");*/ /*cblas_dcopy(n,sdir,incx,sdir_descent,incx); plotMerit(z, phi);*/ aux = cblas_ddot(n, sgrad_psi_zaux, 1, szzaux, 1); /* aux1 = cblas_dnrm2(n,szzaux,1); aux1 = cblas_dnrm2(n,sgrad_psi_zaux,1);*/ aux = aux / (cblas_dnrm2(n, szzaux, 1) * cblas_dnrm2(n, sgrad_psi_zaux, 1)); /* printf("aux: %e\n",aux);*/ if (aux < 0.1 * (j + 1)) { //zaux is the new point. findNewZ = 1; cblas_dcopy(n, szaux, incx, z, incx); break; } } if (findNewZ) break; if (j == 10) { norm = n / 2; } else if (j > 10) norm = -2 * norm; else norm = -norm / 2.0; } if (! findNewZ) { printf("failed to find a new z\n"); /* exit(1);*/ continue; } else continue; } } /* Stops if the termination criterion is satisfied */ terminationCriterion = norm_jacobian_psi_z; /* if(terminationCriterion < tolerance){ break; }*/ /* Search direction calculation Find a solution dk of jacobianPhiMatrix.d = -phiVector. dk is saved in phiVector. */ cblas_dscal(n , -1.0 , sphi_z, incx); DGESV(n, 1, sjacobianPhi_z, n, sipiv, sphi_z, n, &infoDGESV); if (infoDGESV) { printf("DGEV error %d.\n", infoDGESV); } cblas_dcopy(n, sphi_z, 1, sdir_descent, 1); criterion = cblas_dnrm2(n, sdir_descent, 1); /* printf("norm dir descent %e\n",criterion);*/ /*printf("begin plot descent dir\n"); plotMerit(z, phi); printf("end\n"); gets(&c);*/ /*printf("begin plot zzsol dir\n"); plotMeritToZsol(z,phi); printf("end\n"); gets(&c);*/ /* norm = cblas_dnrm2(n,sdir_descent,1); printf("norm desc %e \n",norm); cblas_dscal( n , 1/norm , sdir_descent, 1); */ /* descentCondition = jacobian_psi.dk */ descentCondition = cblas_ddot(n, sgrad_psi_z, 1, sdir_descent, 1); /* Criterion to be satisfied: error < -rho*norm(dk)^p */ criterion = -rho * pow(criterion, p); /* printf("ddddddd %d\n",scmp); if (scmp>100){ displayMat(sjacobianPhi_z,n,n,n); exit(1); }*/ // if ((infoDGESV != 0 || descentCondition > criterion) && 0) // { // printf("no a desc dir, get grad psy\n"); /* dk = - jacobian_psi (remind that dk is saved in phi_z) */ // cblas_dcopy(n, sgrad_psi_z, 1, sdir_descent, 1); // cblas_dscal(n , -1.0 , sdir_descent, incx); /*DEBUG ONLY*/ /*printf("begin plot new descent dir\n"); plotMerit(z); printf("end\n"); gets(&c);*/ // } /* coef=fabs(norm_jacobian_psi_z*norm_jacobian_psi_z/descentCondition); if (coef <1){ cblas_dscal(n,coef,sdir_descent,incx); printf("coef %e norm dir descent is now %e\n",coef,cblas_dnrm2(n,sdir_descent,1)); }*/ /* Step-3 Line search: computes z_k+1 */ /*linesearch_Armijo(n,z,sdir_descent,psi_z, descentCondition, phi);*/ /* if (niter == 10){ printf("begin plot new descent dir\n"); plotMerit(z); printf("end\n"); gets(&c); }*/ /* memcpy(oldz,z,n*sizeof(double));*/ resls = linesearch2_Armijo(n, z, psi_z, descentCondition); if (!resls && niter > 1) { /* displayMat(sjacobianPhi_z,n,n,n); printf("begin plot new descent dir\n"); plotMerit(oldz,psi_z, descentCondition); printf("end\n"); gets(&c);*/ } /* lineSearch_Wolfe(z, descentCondition, phi,jacobianPhi);*/ /* if (niter>3){ printf("angle between prev dir %e.\n",acos(cblas_ddot(n, sdir_descent, 1, sPrevDirDescent, 1)/(cblas_dnrm2(n,sdir_descent,1)*cblas_dnrm2(n,sPrevDirDescent,1)))); }*/ cblas_dcopy(n, sdir_descent, 1, sPrevDirDescent, 1); /* for (j=20;j<32;j++){ if (z[j]<0) z[j]=0; }*/ /* if( 1 || verbose>0) { printf("Non Smooth Newton, iteration number %i, error grad equal to %14.7e , psi value is %14.7e .\n",niter, terminationCriterion,psi_z); printf(" -----------------------------------------------------------------------\n"); }*/ } /* Total number of iterations */ iparam[1] = niter; /* Final error */ dparam[1] = terminationCriterion; /** Free memory*/ if (verbose > 0) { if (dparam[1] > tolerance) printf("Non Smooth Newton warning: no convergence after %i iterations\n" , niter); else printf("Non Smooth Newton: convergence after %i iterations\n" , niter); printf(" The residue is : %e \n", dparam[1]); } /* free(oldz);*/ if (dparam[1] > tolerance) return 1; else return 0; }
void caffe_cpu_scale<double>(const int n, const double alpha, const double *x, double* y) { cblas_dcopy(n, x, 1, y, 1); cblas_dscal(n, alpha, y, 1); }
void caffe_scal<double>(const int N, const double alpha, double *X) { cblas_dscal(N, alpha, X, 1); }
void eblas_dscal_sub(size_t iStart, size_t iStop, double a, double* x, int incx) { cblas_dscal(iStop-iStart, a, x+incx*iStart, incx); }
void FrictionContact2D_latin(FrictionContactProblem* problem , double *reaction , double *velocity , int *info, SolverOptions* options) { int nc = problem->numberOfContacts; assert(nc>0); double * vec = problem->M->matrix0; double *qq = problem->q; double * mu = problem->mu; int info77 = 0; int i, j, kk, iter1, ino, ddl, nrhs; int info2 = 0; int n = 2 * nc; size_t idim, nbno; int incx = 1, incy = 1; size_t taille, taillet, taillen, itt; int *ddln; int *ddlt, *vectnt; assert(n>0); double errmax, alpha, beta, maxa, k_latin; double aa, nt, wn, tc, zc0; double err1, num11, err0; double den11, den22, knz0, ktz0, *ktz, *wf; double *wc, *zc, *wt, *maxwt, *wnum1, *znum1; double *zt, *maxzt; double *kn, *kt; // char trans='T', diag='N'; // char uplo='U', notrans='N'; double *k, *DPO, *kf, *kninv; double *kinvwden1, *kzden1, *kfinv, *knz, *wtnc; /* Recup input */ itt = options->iparam[0]; errmax = options->dparam[0]; k_latin = options->dparam[2]; /* Initialize output */ options->iparam[1] = 0; options->dparam[1] = 0.0; /* Allocations */ k = (double*) malloc(n * n * sizeof(double)); DPO = (double*) malloc(n * n * sizeof(double)); kf = (double*) malloc(n * n * sizeof(double)); kfinv = (double*) malloc(n * n * sizeof(double)); kninv = (double*) malloc(nc * nc * sizeof(double)); kn = (double*) malloc(nc * nc * sizeof(double)); kt = (double*) malloc(nc * nc * sizeof(double)); kinvwden1 = (double*) malloc(n * sizeof(double)); kzden1 = (double*) malloc(n * sizeof(double)); wc = (double*) malloc(n * sizeof(double)); zc = (double*) malloc(n * sizeof(double)); znum1 = (double*) malloc(n * sizeof(double)); wnum1 = (double*) malloc(n * sizeof(double)); wt = (double*) malloc(n * sizeof(double)); maxzt = (double*) malloc(n * sizeof(double)); knz = (double*) malloc(nc * sizeof(double)); wtnc = (double*) malloc(nc * sizeof(double)); ktz = (double*) malloc(nc * sizeof(double)); wf = (double*) malloc(nc * sizeof(double)); maxwt = (double*) malloc(nc * sizeof(double)); zt = (double*) malloc(nc * sizeof(double)); vectnt = (int*) malloc(n * sizeof(int)); ddln = (int*) malloc(nc * sizeof(int)); ddlt = (int*) malloc(nc * sizeof(int)); /* Initialization */ for (i = 0; i < n * n; i++) { k[i] = 0.; kf[i] = 0.; kfinv[i] = 0.; if (i < nc * nc) { kn[i] = 0.0; kt[i] = 0.0; kninv[i] = 0.0; if (i < n) { wc[i] = 0.0; zc[i] = 0.; reaction[i] = 0.; velocity[i] = 0.; znum1[i] = 0.; wnum1[i] = 0.; wt[i] = 0.; maxzt[i] = 0.; if (i < nc) { maxwt[i] = 0.; zt[i] = 0.; knz[i] = 0.; ktz[i] = 0.; wf[i] = 0.; wtnc[i] = 0.; } } } } for (i = 0; i < n; i++) { if (fabs(vec[i * n + i]) < DBL_EPSILON) { if (verbose > 0) printf("\n Warning nul diagonal term in M matrix \n"); free(k); free(DPO); free(kf); free(kfinv); free(kninv); free(kn); free(kt); free(kinvwden1); free(kzden1); free(wc); free(zc); free(znum1); free(wnum1); free(wt); free(maxzt); free(knz); free(wtnc); free(ktz); free(wf); free(maxwt); free(zt); free(vectnt); free(ddln); free(ddlt); *info = 3; return; } else { k[i + n * i] = k_latin / vec[i * n + i]; vectnt[i] = i + 1; } } for (i = 0; i < nc; i++) { ddln[i] = vectnt[2 * i]; if (i != 0) ddlt[i] = vectnt[2 * i - 1]; else ddlt[i] = 0; } for (i = 0; i < nc; i++) { kn[i + nc * i] = k[ddln[i] + n * ddln[i]]; kt[i + nc * i] = k[ddlt[i] + n * ddlt[i]]; } taillen = sizeof(ddln) / sizeof(ddln[0]); taillet = sizeof(ddlt) / sizeof(ddlt[0]); idim = 1 + taillen / taillet; taille = 0; for (i = 0; i < n; i++) taille = sizeof(qq[i]) + taille; taille = taille / sizeof(qq[0]); nbno = taille / idim; for (i = 0; i < nc; i++) { kf[ddln[i] + n * ddln[i]] = kn[i + nc * i]; kf[ddlt[i] + n * ddlt[i]] = kt[i + nc * i]; } for (i = 0; i < n; i++) { kfinv[i + n * i] = 1. / kf[i + n * i]; if (i < nc) kninv[i + nc * i] = 1. / kt[i + nc * i]; } for (i = 0; i < n; i++) for (j = 0; j < n; j++) DPO[i + n * j] = vec[j * n + i] + kfinv[i + n * j]; DPOTRF(LA_UP, n, DPO , n, &info2); if (info2 != 0) { if (verbose > 0) printf("\n Matter with Cholesky factorization \n"); free(k); free(DPO); free(kf); free(kfinv); free(kninv); free(kn); free(kt); free(kinvwden1); free(kzden1); free(wc); free(zc); free(znum1); free(wnum1); free(wt); free(maxzt); free(knz); free(wtnc); free(ktz); free(wf); free(maxwt); free(zt); free(vectnt); free(ddln); free(ddlt); *info = 2; return; } /* Iteration loops */ iter1 = 0; err1 = 1.; while ((iter1 < itt) && (err1 > errmax)) { /* Linear stage (zc,wc) -> (z,w) */ alpha = 1.; beta = 1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kfinv, n, zc, incx, beta, wc, incy); cblas_dcopy(n, qq, incx, znum1, incy); alpha = -1.; cblas_dscal(n , alpha , znum1 , incx); alpha = 1.; cblas_daxpy(n, alpha, wc, incx, znum1, incy); nrhs = 1; DTRTRS(LA_UP, LA_TRANS, LA_NONUNIT, n, nrhs, DPO, n, znum1, n, &info77); DTRTRS(LA_UP, LA_NOTRANS, LA_NONUNIT, n, nrhs, DPO, n, znum1, n, &info77); cblas_dcopy(n, znum1, incx, reaction, incy); alpha = -1.; beta = 1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kfinv, n, reaction, incx, beta, wc, incy); cblas_dcopy(n, wc, incx, velocity, incy); /* Local stage (z,w)->(zc,wc) */ for (i = 0; i < n; i++) { zc[i] = 0.; wc[i] = 0.0; } /* Normal party */ for (i = 0; i < nc; i++) { knz0 = 0.; for (kk = 0; kk < nc; kk++) { knz[i] = kt[i + nc * kk] * velocity[ddlt[kk]] + knz0; knz0 = knz[i]; } zt[i] = reaction[ddlt[i]] - knz[i]; if (zt[i] > 0.0) { zc[ddlt[i]] = zt[i]; maxzt[i] = 0.0; } else { zc[ddlt[i]] = 0.0; maxzt[i] = -zt[i]; } } for (i = 0; i < nc; i++) { zc0 = 0.; ktz0 = 0.; for (j = 0; j < nc; j++) { wc[ddlt[i]] = kninv[i + nc * j] * maxzt[j] + zc0; zc0 = wc[ddlt[i]]; ktz[i] = kn[i + nc * j] * velocity[ddln[j]] + ktz0; ktz0 = ktz[i]; } wf[i] = reaction[ddln[i]] - ktz[i]; } /* Loop other nodes */ for (ino = 0; ino < nbno; ino++) { ddl = ddln[ino]; nt = fabs(wf[ino]); /* Tangential vector */ if (nt < 1.e-8) tc = 0.; else tc = wf[ino] / nt; /* Tangentiel component */ wn = zc[ddlt[ino]]; aa = nt - mu[ino] * wn; if (aa > 0.0) { maxa = aa; } else { maxa = 0.0; } wc[ddl] = (maxa / (-1 * kn[ino + nc * ino])) * tc; aa = -nt + mu[ino] * wn; if (aa > 0.0) { maxa = aa; } else { maxa = 0.0; } zc[ddl] = (mu[ino] * wn - maxa) * tc; } /* Convergence criterium */ cblas_dcopy(n, reaction, incx, znum1, incy); alpha = -1.; cblas_daxpy(n, alpha, zc, incx, znum1, incy); cblas_dcopy(n, velocity, incx, wnum1, incy); cblas_daxpy(n, alpha, wc, incx, wnum1, incy); alpha = 1.; beta = 1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kf, n, wnum1, incx, beta, znum1, incy); num11 = 0.; alpha = 1.; beta = 0.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kfinv, n, znum1, incx, beta, wnum1, incy); num11 = cblas_ddot(n, wnum1, incx, znum1, incy); cblas_dcopy(n, reaction, incx, znum1, incy); alpha = 1.; beta = 1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kf, n, velocity, incx, beta, znum1, incy); alpha = 1.; beta = 0.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kfinv, n, znum1, incx, beta, wnum1, incy); den11 = cblas_ddot(n, wnum1, incx, znum1, incy); cblas_dcopy(n, zc, incx, znum1, incy); alpha = 1.; beta = 1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kf, n, wc, incx, beta, znum1, incy); alpha = 1.; beta = 0.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alpha, kfinv, n, znum1, incx, beta, wnum1, incy); den22 = cblas_ddot(n, znum1, incx, wnum1, incy); err0 = num11 / (den11 + den22); err1 = sqrt(err0); options->iparam[1] = iter1; options->dparam[1] = err1; iter1 = iter1 + 1; } if (err1 > errmax) { if (verbose > 0) printf("No convergence after %d iterations, the residue is %g\n", iter1, err1); *info = 1; } else { if (verbose > 0) printf("Convergence after %d iterations, the residue is %g \n", iter1, err1); *info = 0; } free(k); free(DPO); free(kf); free(kfinv); free(kninv); free(kn); free(kt); free(kinvwden1); free(kzden1); free(wc); free(zc); free(znum1); free(wnum1); free(wt); free(maxzt); free(knz); free(wtnc); free(ktz); free(wf); free(maxwt); free(zt); free(vectnt); free(ddln); free(ddlt); }
/* Ref: Weiss, Algorithm 11 CGS * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cgs (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_m1 = -1.0; double d_2 = 2.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *r0 = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *u = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *q = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "cgs"); CHECK_MALLOC (r0, "cgs"); CHECK_MALLOC (p, "cgs"); CHECK_MALLOC (u, "cgs"); CHECK_MALLOC (ap, "cgs"); CHECK_MALLOC (q, "cgs"); CHECK_MALLOC (t, "cgs"); double r0ap; double rho, rho1; double delta; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, r0, 1); // r0* = r cblas_dcopy (n, r, 1, p, 1); // p = r cblas_dcopy (n, r, 1, u, 1); // u = r rho = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = cblas_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; cblas_dcopy (n, u, 1, q, 1); // q = u cblas_dscal (n, 2.0, q, 1); // q = 2 u cblas_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q cblas_daxpy (n, delta, t, 1, r, 1); // r = r + delta t cblas_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; cblas_dcopy (n, q, 1, qu, 1); // qu = q cblas_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u cblas_dcopy (n, r, 1, u, 1); // u = r cblas_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) cblas_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p cblas_dcopy (n, u, 1, p, 1); // p = u cblas_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, r0, &i_1); // r0* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r dcopy_ (&n, r, &i_1, u, &i_1); // u = r rho = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = ddot_ (&n, r0, &i_1, ap, &i_1); // r0ap = (r0*, A.p) delta = - rho / r0ap; dcopy_ (&n, u, &i_1, q, &i_1); // q = u dscal_ (&n, &d_2, q, &i_1); // q = 2 u daxpy_ (&n, &delta, ap, &i_1, q, &i_1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q daxpy_ (&n, &delta, t, &i_1, r, &i_1); // r = r + delta t daxpy_ (&n, &delta, q, &i_1, x, &i_1); // x = x + delta q res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; dcopy_ (&n, q, &i_1, qu, &i_1); // qu = q daxpy_ (&n, &d_m1, u, &i_1, qu, &i_1); // qu = q - u dcopy_ (&n, r, &i_1, u, &i_1); // u = r daxpy_ (&n, &beta, qu, &i_1, u, &i_1); // u = r + beta (q - u) daxpy_ (&n, &beta, p, &i_1, qu, &i_1); // qu = q - u + beta * p dcopy_ (&n, u, &i_1, p, &i_1); // p = u daxpy_ (&n, &beta, qu, &i_1, p, &i_1); // p = u + beta (q - u + b * p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, r0, 1); // r0* = r my_dcopy (n, r, 1, p, 1); // p = r my_dcopy (n, r, 1, u, 1); // u = r rho = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = my_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; my_dcopy (n, u, 1, q, 1); // q = u my_dscal (n, 2.0, q, 1); // q = 2 u my_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q my_daxpy (n, delta, t, 1, r, 1); // r = r + delta t my_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; my_dcopy (n, q, 1, qu, 1); // qu = q my_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u my_dcopy (n, r, 1, u, 1); // u = r my_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) my_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p my_dcopy (n, u, 1, p, 1); // p = u my_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (r0); free (p); free (u); free (ap); free (q); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-cgs it= %d res^2= %e\n", i, res2); } it->niter = i; it->res2 = res2 / b2; return (ret); }