__cminpack_attr__ void __cminpack_func__(lmpar)(int n, real *r, int ldr, const int *ipvt, const real *diag, const real *qtb, real delta, real *par, real *x, real *sdiag, real *wa1, real *wa2) { /* Initialized data */ #define p1 .1 #define p001 .001 /* System generated locals */ real d1, d2; /* Local variables */ int j, l; real fp; real parc, parl; int iter; real temp, paru, dwarf; int nsing; real gnorm; real dxnorm; /* ********** */ /* subroutine lmpar */ /* given an m by n matrix a, an n by n nonsingular diagonal */ /* matrix d, an m-vector b, and a positive number delta, */ /* the problem is to determine a value for the parameter */ /* par such that if x solves the system */ /* a*x = b , sqrt(par)*d*x = 0 , */ /* in the least squares sense, and dxnorm is the euclidean */ /* norm of d*x, then either par is zero and */ /* (dxnorm-delta) .le. 0.1*delta , */ /* or par is positive and */ /* abs(dxnorm-delta) .le. 0.1*delta . */ /* this subroutine completes the solution of the problem */ /* if it is provided with the necessary information from the */ /* qr factorization, with column pivoting, of a. that is, if */ /* a*p = q*r, where p is a permutation matrix, q has orthogonal */ /* columns, and r is an upper triangular matrix with diagonal */ /* elements of nonincreasing magnitude, then lmpar expects */ /* the full upper triangle of r, the permutation matrix p, */ /* and the first n components of (q transpose)*b. on output */ /* lmpar also provides an upper triangular matrix s such that */ /* t t t */ /* p *(a *a + par*d*d)*p = s *s . */ /* s is employed within lmpar and may be of separate interest. */ /* only a few iterations are generally needed for convergence */ /* of the algorithm. if, however, the limit of 10 iterations */ /* is reached, then the output par will contain the best */ /* value obtained so far. */ /* the subroutine statement is */ /* subroutine lmpar(n,r,ldr,ipvt,diag,qtb,delta,par,x,sdiag, */ /* wa1,wa2) */ /* where */ /* n is a positive integer input variable set to the order of r. */ /* r is an n by n array. on input the full upper triangle */ /* must contain the full upper triangle of the matrix r. */ /* on output the full upper triangle is unaltered, and the */ /* strict lower triangle contains the strict upper triangle */ /* (transposed) of the upper triangular matrix s. */ /* ldr is a positive integer input variable not less than n */ /* which specifies the leading dimension of the array r. */ /* ipvt is an integer input array of length n which defines the */ /* permutation matrix p such that a*p = q*r. column j of p */ /* is column ipvt(j) of the identity matrix. */ /* diag is an input array of length n which must contain the */ /* diagonal elements of the matrix d. */ /* qtb is an input array of length n which must contain the first */ /* n elements of the vector (q transpose)*b. */ /* delta is a positive input variable which specifies an upper */ /* bound on the euclidean norm of d*x. */ /* par is a nonnegative variable. on input par contains an */ /* initial estimate of the levenberg-marquardt parameter. */ /* on output par contains the final estimate. */ /* x is an output array of length n which contains the least */ /* squares solution of the system a*x = b, sqrt(par)*d*x = 0, */ /* for the output par. */ /* sdiag is an output array of length n which contains the */ /* diagonal elements of the upper triangular matrix s. */ /* wa1 and wa2 are work arrays of length n. */ /* subprograms called */ /* minpack-supplied ... dpmpar,enorm,qrsolv */ /* fortran-supplied ... dabs,dmax1,dmin1,dsqrt */ /* argonne national laboratory. minpack project. march 1980. */ /* burton s. garbow, kenneth e. hillstrom, jorge j. more */ /* ********** */ /* dwarf is the smallest positive magnitude. */ dwarf = __cminpack_func__(dpmpar)(2); /* compute and store in x the gauss-newton direction. if the */ /* jacobian is rank-deficient, obtain a least squares solution. */ nsing = n; for (j = 0; j < n; ++j) { wa1[j] = qtb[j]; if (r[j + j * ldr] == 0. && nsing == n) { nsing = j; } if (nsing < n) { wa1[j] = 0.; } } # ifdef USE_CBLAS cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1); # else if (nsing >= 1) { int k; for (k = 1; k <= nsing; ++k) { j = nsing - k; wa1[j] /= r[j + j * ldr]; temp = wa1[j]; if (j >= 1) { int i; for (i = 0; i < j; ++i) { wa1[i] -= r[i + j * ldr] * temp; } } } } # endif for (j = 0; j < n; ++j) { l = ipvt[j]-1; x[l] = wa1[j]; } /* initialize the iteration counter. */ /* evaluate the function at the origin, and test */ /* for acceptance of the gauss-newton direction. */ iter = 0; for (j = 0; j < n; ++j) { wa2[j] = diag[j] * x[j]; } dxnorm = __cminpack_enorm__(n, wa2); fp = dxnorm - delta; if (fp <= p1 * delta) { goto TERMINATE; } /* if the jacobian is not rank deficient, the newton */ /* step provides a lower bound, parl, for the zero of */ /* the function. otherwise set this bound to zero. */ parl = 0.; if (nsing >= n) { for (j = 0; j < n; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } # ifdef USE_CBLAS cblas_dtrsv(CblasColMajor, CblasUpper, CblasTrans, CblasNonUnit, n, r, ldr, wa1, 1); # else for (j = 0; j < n; ++j) { real sum = 0.; if (j >= 1) { int i; for (i = 0; i < j; ++i) { sum += r[i + j * ldr] * wa1[i]; } } wa1[j] = (wa1[j] - sum) / r[j + j * ldr]; } # endif temp = __cminpack_enorm__(n, wa1); parl = fp / delta / temp / temp; } /* calculate an upper bound, paru, for the zero of the function. */ for (j = 0; j < n; ++j) { real sum; # ifdef USE_CBLAS sum = cblas_ddot(j+1, &r[j*ldr], 1, qtb, 1); # else sum = 0.; int i; for (i = 0; i <= j; ++i) { sum += r[i + j * ldr] * qtb[i]; } # endif l = ipvt[j]-1; wa1[j] = sum / diag[l]; } gnorm = __cminpack_enorm__(n, wa1); paru = gnorm / delta; if (paru == 0.) { paru = dwarf / min(delta,(real)p1) /* / p001 ??? */; } /* if the input par lies outside of the interval (parl,paru), */ /* set par to the closer endpoint. */ *par = max(*par,parl); *par = min(*par,paru); if (*par == 0.) { *par = gnorm / dxnorm; } /* beginning of an iteration. */ for (;;) { ++iter; /* evaluate the function at the current value of par. */ if (*par == 0.) { /* Computing MAX */ d1 = dwarf, d2 = p001 * paru; *par = max(d1,d2); } temp = sqrt(*par); for (j = 0; j < n; ++j) { wa1[j] = temp * diag[j]; } __cminpack_func__(qrsolv)(n, r, ldr, ipvt, wa1, qtb, x, sdiag, wa2); for (j = 0; j < n; ++j) { wa2[j] = diag[j] * x[j]; } dxnorm = __cminpack_enorm__(n, wa2); temp = fp; fp = dxnorm - delta; /* if the function is small enough, accept the current value */ /* of par. also test for the exceptional cases where parl */ /* is zero or the number of iterations has reached 10. */ if (fabs(fp) <= p1 * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) { goto TERMINATE; } /* compute the newton correction. */ # ifdef USE_CBLAS for (j = 0; j < nsing; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } for (j = nsing; j < n; ++j) { wa1[j] = 0.; } /* exchange the diagonal of r with sdiag */ cblas_dswap(n, r, ldr+1, sdiag, 1); /* solve lower(r).x = wa1, result id put in wa1 */ cblas_dtrsv(CblasColMajor, CblasLower, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1); /* exchange the diagonal of r with sdiag */ cblas_dswap( n, r, ldr+1, sdiag, 1); # else /* !USE_CBLAS */ for (j = 0; j < n; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } for (j = 0; j < n; ++j) { wa1[j] /= sdiag[j]; temp = wa1[j]; if (n > j+1) { int i; for (i = j+1; i < n; ++i) { wa1[i] -= r[i + j * ldr] * temp; } } } # endif /* !USE_CBLAS */ temp = __cminpack_enorm__(n, wa1); parc = fp / delta / temp / temp; /* depending on the sign of the function, update parl or paru. */ if (fp > 0.) { parl = max(parl,*par); } if (fp < 0.) { paru = min(paru,*par); } /* compute an improved estimate for par. */ /* Computing MAX */ d1 = parl, d2 = *par + parc; *par = max(d1,d2); /* end of an iteration. */ } TERMINATE: /* termination. */ if (iter == 0) { *par = 0.; } /* last card of subroutine lmpar. */ } /* lmpar_ */
double caffe_cpu_dot<double>(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); }
double wrapper_cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY) { return cblas_ddot(N, X, incX, Y, incY); }
int nonSmoothNewton(int n, double* z, NewtonFunctionPtr* phi, NewtonFunctionPtr* jacobianPhi, int* iparam, double* dparam) { if (phi == NULL || jacobianPhi == NULL) { fprintf(stderr, "NonSmoothNewton error: phi or its jacobian function = NULL pointer.\n"); exit(EXIT_FAILURE); } int itermax = iparam[0]; // maximum number of iterations allowed int niter = 0; // current iteration number double tolerance = dparam[0]; if (verbose > 0) { printf(" ============= Starting of Newton process =============\n"); printf(" - tolerance: %14.7e\n - maximum number of iterations: %i\n", tolerance, itermax); } int incx = 1; int n2 = n * n; int infoDGESV; /* Memory allocation for phi and its jacobian */ double * phiVector = (double*)malloc(n * sizeof(*phiVector)); double *jacobianPhiMatrix = (double*)malloc(n2 * sizeof(*jacobianPhiMatrix)); /** merit function and its jacobian */ double psi; double *jacobian_psi = (double*)malloc(n * sizeof(*jacobian_psi)); int* ipiv = (int *)malloc(n * sizeof(*ipiv)); if (phiVector == NULL || jacobianPhiMatrix == NULL || jacobian_psi == NULL || ipiv == NULL) { fprintf(stderr, "NonSmoothNewton, memory allocation failed.\n"); exit(EXIT_FAILURE); } /** The algorithm is alg 4.1 of the paper of Kanzow and Kleinmichel, "A new class of semismooth Newton-type methods for nonlinear complementarity problems", in Computational Optimization and Applications, 11, 227-251 (1998). We try to keep the same notations */ double rho = 1e-8; double descentCondition, criterion, norm_jacobian_psi, normPhi; double p = 2.1; double terminationCriterion = 1; if (jacobian_psi == NULL) { fprintf(stderr, "NonSmoothNewton, memory allocation failed for jacobian_psi.\n"); exit(EXIT_FAILURE); } /** Iterations ... */ while ((niter < itermax) && (terminationCriterion > tolerance)) { ++niter; /** Computes phi and its jacobian */ (*phi)(n, z, phiVector, 0); (*jacobianPhi)(n, z, jacobianPhiMatrix, 1); /* Computes the jacobian of the merit function, jacobian_psi = transpose(jacobianPhiMatrix).phiVector */ cblas_dgemv(CblasColMajor,CblasTrans, n, n, 1.0, jacobianPhiMatrix, n, phiVector, incx, 0.0, jacobian_psi, incx); norm_jacobian_psi = cblas_dnrm2(n, jacobian_psi, 1); /* Computes norm2(phi) */ normPhi = cblas_dnrm2(n, phiVector, 1); /* Computes merit function */ psi = 0.5 * normPhi * normPhi; /* Stops if the termination criterion is satisfied */ terminationCriterion = norm_jacobian_psi; if (terminationCriterion < tolerance) break; /* Search direction calculation Find a solution dk of jacobianPhiMatrix.d = -phiVector. dk is saved in phiVector. */ cblas_dscal(n , -1.0 , phiVector, incx); DGESV(n, 1, jacobianPhiMatrix, n, ipiv, phiVector, n, &infoDGESV); /* descentCondition = jacobian_psi.dk */ descentCondition = cblas_ddot(n, jacobian_psi, 1, phiVector, 1); /* Criterion to be satisfied: error < -rho*norm(dk)^p */ criterion = cblas_dnrm2(n, phiVector, 1); criterion = -rho * pow(criterion, p); if (infoDGESV != 0 || descentCondition > criterion) { /* dk = - jacobian_psi (remind that dk is saved in phiVector) */ cblas_dcopy(n, jacobian_psi, 1, phiVector, 1); cblas_dscal(n , -1.0 , phiVector, incx); } /* Step-3 Line search: computes z_k+1 */ linesearch_Armijo(n, z, phiVector, psi, descentCondition, phi); if (verbose > 0) { printf("Non Smooth Newton, iteration number %i, error equal to %14.7e .\n", niter, terminationCriterion); printf(" -----------------------------------------------------------------------\n"); } } /* Total number of iterations */ iparam[1] = niter; /* Final error */ dparam[1] = terminationCriterion; /** Free memory*/ free(phiVector); free(jacobianPhiMatrix); free(jacobian_psi); free(ipiv); if (verbose > 0) { if (dparam[1] > tolerance) printf("Non Smooth Newton warning: no convergence after %i iterations\n" , niter); else printf("Non Smooth Newton: convergence after %i iterations\n" , niter); printf(" The residue is : %e \n", dparam[1]); } if (dparam[1] > tolerance) return 1; else return 0; }
double caffe_cpu_strided_dot<double>(const int n, const double* x, const int incx, const double* y, const int incy) { return cblas_ddot(n, x, incx, y, incy); }
void fc3d_HyperplaneProjection(FrictionContactProblem* problem, double *reaction, double *velocity, int* info, SolverOptions* options) { /* int and double parameters */ int* iparam = options->iparam; double* dparam = options->dparam; /* Number of contacts */ int nc = problem->numberOfContacts; double* q = problem->q; NumericsMatrix* M = problem->M; double* mu = problem->mu; /* Dimension of the problem */ int n = 3 * nc; /* Maximum number of iterations */ int itermax = iparam[0]; /* Maximum number of iterations in Line--search */ int lsitermax = iparam[1]; /* Tolerance */ double tolerance = dparam[0]; double norm_q = cblas_dnrm2(nc*3 , problem->q , 1); /***** Fixed point iterations *****/ int iter = 0; /* Current iteration number */ double error = 1.; /* Current error */ int hasNotConverged = 1; int contact; /* Number of the current row of blocks in M */ int nLocal = 3; dparam[0] = dparam[2]; // set the tolerance for the local solver double * velocitytmp = (double *)calloc(n, sizeof(double)); double * reactiontmp = (double *)calloc(n, sizeof(double)); double * reactiontmp2 = (double *)calloc(n, sizeof(double)); double * reactiontmp3 = (double *)calloc(n, sizeof(double)); /* double tau = 1.0; */ double sigma = 0.99; /* if (dparam[3] > 0.0) */ /* { */ /* tau = dparam[3]; */ /* } */ /* else */ /* { */ /* printf("Hyperplane Projection method. tau <=0 is not well defined\n"); */ /* printf("Hyperplane Projection method. rho is set to 1.0\n"); */ /* } */ if (dparam[4] > 0.0 && dparam[4] < 1.0) { sigma = dparam[4]; } else { printf("Hyperplane Projection method. 0<sigma <1 is not well defined\n"); printf("Hyperplane Projection method. sigma is set to 0.99\n"); } /* double minusrho = -1.0*rho; */ while ((iter < itermax) && (hasNotConverged > 0)) { ++iter; cblas_dcopy(n , q , 1 , velocitytmp, 1); cblas_dcopy(n , reaction , 1 , reactiontmp, 1); NM_gemv(1.0, M, reactiontmp, 1.0, velocitytmp); // projection for each contact double rho = 1; for (contact = 0 ; contact < nc ; ++contact) { int pos = contact * nLocal; double normUT = sqrt(velocitytmp[pos + 1] * velocitytmp[pos + 1] + velocitytmp[pos + 2] * velocitytmp[pos + 2]); reactiontmp[pos] -= rho * (velocitytmp[pos] + mu[contact] * normUT); reactiontmp[pos + 1] -= rho * velocitytmp[pos + 1]; reactiontmp[pos + 2] -= rho * velocitytmp[pos + 2]; projectionOnCone(&reactiontmp[pos], mu[contact]); } // Armijo line search int stopingcriteria = 1; int i = -1; double alpha ; double lhs = NAN; double rhs; // z_k-y_k cblas_dcopy(n , reaction , 1 , reactiontmp3, 1); cblas_daxpy(n, -1.0, reactiontmp, 1, reactiontmp3, 1); while (stopingcriteria && (i < lsitermax)) { i++ ; cblas_dcopy(n , reactiontmp , 1 , reactiontmp2, 1); alpha = 1.0 / (pow(2.0, i)); #ifdef VERBOSE_DEBUG printf("alpha = %f\n", alpha); #endif cblas_dscal(n , alpha, reactiontmp2, 1); alpha = 1.0 - alpha; cblas_daxpy(n, alpha, reaction, 1, reactiontmp2, 1); cblas_dcopy(n , q , 1 , velocitytmp, 1); NM_gemv(1.0, M, reactiontmp2, 1.0, velocitytmp); /* #ifdef VERBOSE_DEBUG */ /* for (contact = 0 ; contact < nc ; ++contact) */ /* { */ /* for(int kk=0; kk<3;kk++) printf("reactiontmp2[%i]=%12.8e\t",contact*nLocal+kk, reactiontmp2[contact*nLocal+kk]); */ /* printf("\n"); */ /* } */ /* #endif */ lhs = cblas_ddot(n, velocitytmp, 1, reactiontmp3, 1); rhs = cblas_dnrm2(n, reactiontmp3, 1); rhs = sigma / rho * rhs * rhs; if (lhs >= rhs) stopingcriteria = 0; #ifdef VERBOSE_DEBUG printf("Number of iteration in Armijo line search = %i\n", i); printf("lhs = %f\n", lhs); printf("rhs = %f\n", rhs); printf("alpha = %f\n", alpha); printf("sigma = %f\n", sigma); printf("rho = %f\n", rho); #endif } double nonorm = cblas_dnrm2(n, velocitytmp, 1); double rhoequiv = lhs / (nonorm * nonorm); #ifdef VERBOSE_DEBUG printf("rho equiv = %f\n", rhoequiv); #endif cblas_daxpy(n, -rhoequiv, velocitytmp, 1, reaction , 1); // projection for each contact for (contact = 0 ; contact < nc ; ++contact) { int pos = contact * nLocal; projectionOnCone(&reaction[pos], mu[contact]); } /* **** Criterium convergence **** */ fc3d_compute_error(problem, reaction , velocity, tolerance, options, norm_q, &error); if (options->callback) { options->callback->collectStatsIteration(options->callback->env, nc * 3, reaction, velocity, error, NULL); } if (verbose > 0) printf("--------------- FC3D - Hyperplane Projection (HP) - Iteration %i rho = %14.7e \t rhoequiv = %14.7e \tError = %14.7e\n", iter, rho, rhoequiv, error); if (error < tolerance) hasNotConverged = 0; *info = hasNotConverged; } if (verbose > 0) printf("--------------- FC3D - Hyperplane Projection (HP) - #Iteration %i Final Residual = %14.7e\n", iter, error); dparam[0] = tolerance; dparam[1] = error; iparam[7] = iter; free(velocitytmp); free(reactiontmp); free(reactiontmp2); free(reactiontmp3); }
// Update inverse a1 after column in matrix a has changed // get new column out of array1 newCol // // a1= old inverse // newCol = new column lCol in the new matrix a_new // returns Det(a_old)/Det(a_new) doublevar InverseUpdateColumn(Array2 <doublevar> & a1, const Array1 <doublevar> & newCol, const int lCol, const int n) { Array1 <doublevar> & tmpColL(tmp11); tmpColL.Resize(n); Array1 <doublevar> & prod(tmp12); prod.Resize(n); doublevar f=0.0; #ifdef USE_BLAS int a1size=a1.GetDim(1); doublevar * a1col=a1.v+lCol*a1size; f=cblas_ddot(n,a1col, 1, newCol.v, 1); f=-1.0/f; cblas_dcopy(n,a1col,1,tmpColL.v,1); cblas_dgemv(CblasRowMajor,CblasNoTrans,n,n, 1.0,a1.v,a1size, newCol.v,1, 0.0,prod.v,1); cblas_dscal(n,f,prod.v,1); cblas_dger(CblasRowMajor, n,n,1.0, prod.v,1, tmpColL.v,1, a1.v,a1size); f=-f; cblas_dcopy(n,tmpColL.v,1,a1col,1); cblas_dscal(n,f,a1col,1); #else for(int i=0;i<n;++i) { f += a1(lCol,i)*newCol[i]; } f =-1.0/f; for(int j=0;j<n;++j) { tmpColL[j]=a1(lCol,j); prod[j] =0.0; for(int i=0;i<n;++i) { prod[j] += a1(j,i)*newCol[i]; } prod[j] *= f; } for(int i=0;i<n;++i) { doublevar & p(prod[i]); for(int j=0;j<n;++j) { a1(i,j) += tmpColL[j]*p; } } f = -f; for(int j=0;j<n;++j) { a1(lCol,j) = f*tmpColL[j]; } #endif return f; }
void ncp_pathsearch(NonlinearComplementarityProblem* problem, double* z, double* F, int *info , SolverOptions* options) { /* Main step of the algorithm: * - compute jacobians * - call modified lemke */ unsigned int n = problem->n; unsigned int preAlloc = options->iparam[SICONOS_IPARAM_PREALLOC]; int itermax = options->iparam[SICONOS_IPARAM_MAX_ITER]; double merit_norm = 1.0; double nn_tol = options->dparam[SICONOS_DPARAM_TOL]; int nbiter = 0; /* declare a LinearComplementarityProblem on the stack*/ LinearComplementarityProblem lcp_subproblem; lcp_subproblem.size = n; /* do some allocation if required * - nabla_F (used also as M for the LCP subproblem) * - q for the LCP subproblem * * Then fill the LCP subproblem */ if (!preAlloc || (preAlloc && !options->internalSolvers)) { options->internalSolvers = (SolverOptions *) malloc(sizeof(SolverOptions)); solver_options_set(options->internalSolvers, SICONOS_LCP_PIVOT); options->numberOfInternalSolvers = 1; SolverOptions * lcp_options = options->internalSolvers; /* We always allocation once and for all since we are supposed to solve * many LCPs */ lcp_options->iparam[SICONOS_IPARAM_PREALLOC] = 1; /* set the right pivot rule */ lcp_options->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH; /* set the right stacksize */ lcp_options->iparam[SICONOS_IPARAM_PATHSEARCH_STACKSIZE] = options->iparam[SICONOS_IPARAM_PATHSEARCH_STACKSIZE]; } assert(problem->nabla_F); lcp_subproblem.M = problem->nabla_F; if (!preAlloc || (preAlloc && !options->dWork)) { options->dWork = (double *) malloc(4*n*sizeof(double)); } lcp_subproblem.q = options->dWork; double* x = &options->dWork[n]; double* x_plus = &options->dWork[2*n]; double* r = &options->dWork[3*n]; NMS_data* data_NMS; functions_LSA* functions; if (!preAlloc || (preAlloc && !options->solverData)) { options->solverData = malloc(sizeof(pathsearch_data)); pathsearch_data* solverData = (pathsearch_data*) options->solverData; /* do all the allocation */ solverData->data_NMS = create_NMS_data(n, NM_DENSE, options->iparam, options->dparam); solverData->lsa_functions = (functions_LSA*) malloc(sizeof(functions_LSA)); solverData->data_NMS->set = malloc(sizeof(positive_orthant)); data_NMS = solverData->data_NMS; functions = solverData->lsa_functions; /* for use in NMS; only those 3 functions are called */ init_lsa_functions(functions, &FB_compute_F_ncp, &ncp_FB); functions->compute_H = &FB_compute_H_ncp; set_set_id(data_NMS->set, SICONOS_SET_POSITIVE_ORTHANT); /* fill ls_data */ data_NMS->ls_data->compute_F = functions->compute_F; data_NMS->ls_data->compute_F_merit = functions->compute_F_merit; data_NMS->ls_data->z = NULL; /* XXX to check -- xhub */ data_NMS->ls_data->zc = NMS_get_generic_workV(data_NMS->workspace, n); data_NMS->ls_data->F = NMS_get_F(data_NMS->workspace, n); data_NMS->ls_data->F_merit = NMS_get_F_merit(data_NMS->workspace, n); data_NMS->ls_data->desc_dir = NMS_get_dir(data_NMS->workspace, n); /** \todo this value should be settable by the user with a default value*/ data_NMS->ls_data->alpha_min = fmin(data_NMS->alpha_min_watchdog, data_NMS->alpha_min_pgrad); data_NMS->ls_data->data = (void*)problem; data_NMS->ls_data->set = data_NMS->set; data_NMS->ls_data->sigma = options->dparam[SICONOS_DPARAM_NMS_SIGMA]; /* data_NMS->ls_data->searchtype is set in the NMS code */ } else { pathsearch_data* solverData = (pathsearch_data*) options->solverData; data_NMS = solverData->data_NMS; functions = solverData->lsa_functions; } /* initial value for ref_merit */ problem->compute_F(problem->env, n, z, F); functions->compute_F_merit(problem, z, F, data_NMS->ls_data->F_merit); data_NMS->ref_merit = .5 * cblas_ddot(n, data_NMS->ls_data->F_merit, 1, data_NMS->ls_data->F_merit, 1); data_NMS->merit_bestpoint = data_NMS->ref_merit; cblas_dcopy(n, z, 1, NMS_checkpoint_0(data_NMS, n), 1); cblas_dcopy(n, z, 1, NMS_checkpoint_T(data_NMS, n), 1); cblas_dcopy(n, z, 1, NMS_bestpoint(data_NMS, n), 1); /* -------------------- end init ---------------------------*/ int nms_failed = 0; double err = 10*nn_tol; /* to check the solution */ LinearComplementarityProblem lcp_subproblem_check; int check_lcp_solution = 1; /* XXX add config for that */ double normal_norm2_newton_point; /* F is already computed here at z */ while ((err > nn_tol) && (nbiter < itermax) && !nms_failed) { int force_watchdog_step = 0; int force_d_step_merit_check = 0; double check_ratio = 0.0; nbiter++; /* update M, q and r */ /* First find x */ ncp_pathsearch_compute_x_from_z(n, z, F, x); pos_part(n, x, x_plus); /* update x_plus */ ncp_pathsearch_update_lcp_data(problem, &lcp_subproblem, n, x_plus, x, r); if (check_lcp_solution) { lcp_subproblem_check.size = n; lcp_subproblem_check.M = problem->nabla_F; lcp_subproblem_check.q = lcp_subproblem.q; //cblas_dcopy(n, x, 1, lcp_subproblem_check.q , 1); //prodNumericsMatrix(n, n, -1.0, problem->nabla_F, x_plus, 0.0, lcp_subproblem.q); } double norm_r2 = cblas_ddot(n, r, 1, r, 1); if (norm_r2 < DBL_EPSILON*DBL_EPSILON) /* ||r|| < 1e-15 */ { DEBUG_PRINTF("ncp_pathsearch :: ||r|| = %e < %e; path search procedure was successful!\n", norm_r2, DBL_EPSILON*DBL_EPSILON); (*info) = 0; ncp_compute_error(n, z, F, nn_tol, &err); /* XXX F should be up-to-date, we should check only CC*/ break; } /* end update M, q and r */ lcp_pivot_covering_vector(&lcp_subproblem, x_plus, x, info, options->internalSolvers, r); switch (*info) { case LCP_PIVOT_SUCCESS: DEBUG_PRINT("ncp_pathsearch :: path search procedure was successful!\n"); if (check_lcp_solution) { double err_lcp = 0.0; cblas_daxpy(n, 1.0, r, 1, lcp_subproblem_check.q, 1); lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp); double local_tol = fmax(1e-14, DBL_EPSILON*sqrt(norm_r2)); printf("ncp_pathsearch :: lcp solved with error = %e; local_tol = %e\n", err_lcp, local_tol); //assert(err_lcp < local_tol && "ncp_pathsearch :: lcp solved with very bad precision"); if (err_lcp > local_tol) { printf("ncp_pathsearch :: lcp solved with very bad precision\n"); NM_display(lcp_subproblem.M); printf("z r q x_plus\n"); for (unsigned i = 0; i < n; ++i) printf("%e %e %e %e\n", z[i], r[i], lcp_subproblem.q[i], x_plus[i]); options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = 0; lcp_pivot(&lcp_subproblem, x_plus, x, info, options->internalSolvers); options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH; lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp); printf("ncp_pathsearch :: lcp resolved with error = %e; local_tol = %e\n", err_lcp, local_tol); } /* XXX missing recompute x ?*/ /* recompute the normal norm */ problem->compute_F(problem->env, n, x_plus, r); cblas_daxpy(n, -1.0, x, 1, r, 1); normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1); if (normal_norm2_newton_point > norm_r2) { printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2); //assert(normal_norm2_newton_point <= norm_r2); } else { printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %e < %e\n", normal_norm2_newton_point, norm_r2); //check_ratio = norm_r2/normal_norm2_newton_point; } if (50*normal_norm2_newton_point < norm_r2) { force_d_step_merit_check = 1; } else if (10*normal_norm2_newton_point < norm_r2) { // check_ratio = sqrt(norm_r2/normal_norm2_newton_point); } } break; case LCP_PIVOT_RAY_TERMINATION: DEBUG_PRINT("ncp_pathsearch :: ray termination, let's fastened your seat belt!\n"); break; case LCP_PATHSEARCH_LEAVING_T: DEBUG_PRINT("ncp_pathsearch :: leaving t, fastened your seat belt!\n"); DEBUG_PRINTF("ncp_pathsearch :: max t value = %e\n", options->internalSolvers->dparam[2]); /* XXX fix 2 */ /* try to retry solving the problem */ /* XXX keep or not ? */ /* recompute the normal norm */ problem->compute_F(problem->env, n, x_plus, r); cblas_daxpy(n, -1.0, x, 1, r, 1); normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1); if (normal_norm2_newton_point > norm_r2) { printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2); //assert(normal_norm2_newton_point <= norm_r2); } else { printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %e < %e\n", normal_norm2_newton_point, norm_r2); check_ratio = 5.0*norm_r2/normal_norm2_newton_point; } if (options->internalSolvers->dparam[2] > 1e-5) break; memset(x_plus, 0, sizeof(double) * n); problem->compute_F(problem->env, n, x_plus, r); ncp_pathsearch_compute_x_from_z(n, x_plus, r, x); ncp_pathsearch_update_lcp_data(problem, &lcp_subproblem, n, x_plus, x, r); lcp_pivot_covering_vector(&lcp_subproblem, x_plus, x, info, options->internalSolvers, r); if (*info == LCP_PIVOT_SUCCESS) { DEBUG_PRINT("ncp_pathsearch :: Lemke start worked !\n"); double err_lcp = 0.0; cblas_daxpy(n, 1.0, r, 1, lcp_subproblem_check.q, 1); lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp); double local_tol = fmax(1e-14, DBL_EPSILON*sqrt(norm_r2)); printf("ncp_pathsearch :: lcp solved with error = %e; local_tol = %e\n", err_lcp, local_tol); assert(err_lcp < local_tol); } else { NM_display(lcp_subproblem.M); printf("z r q x_plus\n"); for (unsigned i = 0; i < n; ++i) printf("%e %e %e %e\n", z[i], r[i], lcp_subproblem.q[i], x_plus[i]); DEBUG_PRINT("ncp_pathsearch :: Lemke start did not succeeded !\n"); lcp_pivot_diagnose_info(*info); if (*info == LCP_PATHSEARCH_LEAVING_T) { DEBUG_PRINTF("ncp_pathsearch :: max t value after Lemke start = %e\n", options->internalSolvers->dparam[2]); } options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = 0; lcp_pivot(&lcp_subproblem, x_plus, x, info, options->internalSolvers); options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH; double err_lcp = 0.0; lcp_compute_error(&lcp_subproblem, x_plus, x, 1e-14, &err_lcp); printf("ncp_pathsearch :: lemke start resolved with info = %d; error = %e\n", *info, err_lcp); printf("x_plus x_minus\n"); for (unsigned i = 0; i < n; ++i) printf("%e %e\n", x_plus[i], x[i]); /* recompute the normal norm */ problem->compute_F(problem->env, n, x_plus, r); cblas_daxpy(n, -1.0, x, 1, r, 1); double normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1); if (normal_norm2_newton_point > norm_r2) { printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2); //assert(normal_norm2_newton_point <= norm_r2); } else { printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %.*e < %.*e\n", DECIMAL_DIG, normal_norm2_newton_point, DECIMAL_DIG, norm_r2); } if (100*normal_norm2_newton_point < norm_r2) { force_d_step_merit_check = 1; } } break; case LCP_PIVOT_NUL: printf("ncp_pathsearch :: kaboom, kaboom still more work needs to be done\n"); lcp_pivot_diagnose_info(*info); // exit(EXIT_FAILURE); force_watchdog_step = 1; break; case LCP_PATHSEARCH_NON_ENTERING_T: DEBUG_PRINT("ncp_pathsearch :: non entering t, something is wrong here. Fix the f****** code!\n"); assert(0 && "ncp_pathsearch :: non entering t, something is wrong here\n"); force_watchdog_step = 1; break; default: printf("ncp_pathsearch :: unknown code returned by the path search\n"); exit(EXIT_FAILURE); } nms_failed = NMS(data_NMS, problem, functions, z, x_plus, force_watchdog_step, force_d_step_merit_check, check_ratio); /* at this point z has been updated */ /* recompute the normal norm */ problem->compute_F(problem->env, n, z, F); functions->compute_F_merit(problem, z, F, data_NMS->ls_data->F_merit); /* XXX is this correct ? */ merit_norm = .5 * cblas_ddot(n, data_NMS->ls_data->F_merit, 1, data_NMS->ls_data->F_merit, 1); ncp_compute_error(n, z, F, nn_tol, &err); /* XXX F should be up-to-date, we should check only CC*/ DEBUG_PRINTF("ncp_pathsearch :: iter = %d, ncp_error = %e; merit_norm^2 = %e\n", nbiter, err, merit_norm); } options->iparam[1] = nbiter; options->dparam[1] = err; if (nbiter == itermax) { *info = 1; } else if (nms_failed) { *info = 2; } else { *info = 0; } DEBUG_PRINTF("ncp_pathsearch procedure finished :: info = %d; iter = %d; ncp_error = %e; merit_norm^2 = %e\n", *info, nbiter, err, merit_norm); if (!preAlloc) { freeNumericsMatrix(problem->nabla_F); free(problem->nabla_F); problem->nabla_F = NULL; free(options->dWork); options->dWork = NULL; solver_options_delete(options->internalSolvers); free(options->internalSolvers); options->internalSolvers = NULL; free_NMS_data(data_NMS); free(functions); free(options->solverData); options->solverData = NULL; } }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; double *A, *B; double diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; double x2_m, x2_c; // real x for magma, cblas/fortran blas respectively double x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); double total_diff = 0.; double total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_dmalloc_pinned( &A, size ); assert( A != NULL ); magma_dmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test DASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dasum( m, A(0,j), incx ); x_c = cblas_dasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dasum", diff, error ); total_diff += diff; total_error += error; } // ----- test DNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dnrm2( m, A(0,j), incx ); x_c = cblas_dnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test DDOT // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "ddot", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test DDOT // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "ddot", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_ddot and cblas_ddot disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_ddot and blasf77_ddot disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
int solve_bicgstab_block(csr_t* mat, csr_t** ilu, int nb, double* b, double* x) { int n = mat->n; int nnz = mat->nnz; int *offset_ilu = (int*) calloc(nb, sizeof(int)); for ( int i = 1; i < nb; i++ ) offset_ilu[i] = offset_ilu[i-1] + ilu[i-1]->n; double tol = 1e-6, floatone = 1.0; const int max_iter = 200; double *r, *p, *y, *zm1, *zm2, *rm2, *rm1, *rm3, nrm0, nrm; r = (double*) malloc (sizeof(double) * n); p = (double*) malloc (sizeof(double) * n); y = (double*) malloc (sizeof(double) * n); rm1 = (double*) malloc (sizeof(double) * n); rm2 = (double*) malloc (sizeof(double) * n); rm3 = (double*) malloc (sizeof(double) * n); zm1 = (double*) malloc (sizeof(double) * n); zm2 = (double*) malloc (sizeof(double) * n); double rho = 1.0, rho1, beta = 0.0, alpha = 0.0, omega, temp, temp1; char lower1 = 'L', lower = 'N', lower2 = 'U'; char upper1 = 'U', upper = 'N', upper2 = 'N'; #ifdef TIMER double timerLUSol = 0, timerLUSol1, timerSpMV = 0, timerSpMV1; double timerTotal = omp_get_wtime(); #endif cblas_dcopy (n, b, 1, r, 1); cblas_dcopy (n, r, 1, p, 1); cblas_dcopy (n, r, 1, zm1, 1); nrm0 = cblas_dnrm2 (n, r, 1); for (int k = 0; k < max_iter; k++) { rho1 = rho; rho = cblas_ddot(n, zm1, 1, r, 1); if ( k > 0 ) { beta = (rho / rho1) * (alpha / omega); cblas_daxpy (n, -omega, zm2, 1, p, 1); cblas_dscal (n, beta, p, 1); cblas_daxpy (n, floatone, r, 1, p, 1); } #ifdef TIMER timerLUSol1 = omp_get_wtime(); #endif #pragma omp parallel { #pragma omp for for (int i = 0; i < nb; i++) mkl_dcsrtrsv (&lower1, &lower, &lower2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &p[offset_ilu[i]], &y[offset_ilu[i]]); #pragma omp for for (int i = 0; i < nb; i++) mkl_dcsrtrsv (&upper1, &upper, &upper2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &y[offset_ilu[i]], &rm2[offset_ilu[i]]); } #ifdef TIMER timerLUSol += omp_get_wtime() - timerLUSol1; timerSpMV1 = omp_get_wtime(); #endif mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm2, zm2); #ifdef TIMER timerSpMV += omp_get_wtime() - timerSpMV1; #endif temp = cblas_ddot(n, zm1, 1, zm2, 1); alpha = rho / temp; cblas_daxpy (n, -alpha, zm2, 1, r, 1); cblas_daxpy (n, alpha, rm2, 1, x, 1); nrm = cblas_dnrm2 (n, x, 1); if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; } #ifdef TIMER timerLUSol1 = omp_get_wtime(); #endif #pragma omp parallel { #pragma omp for for (int i = 0; i < nb; i++) mkl_dcsrtrsv (&lower1, &lower, &lower2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &r[offset_ilu[i]], &y[offset_ilu[i]]); #pragma omp for for (int i = 0; i < nb; i++) mkl_dcsrtrsv (&upper1, &upper, &upper2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &y[offset_ilu[i]], &rm3[offset_ilu[i]]); } #ifdef TIMER timerLUSol += omp_get_wtime() - timerLUSol1; timerSpMV1 = omp_get_wtime(); #endif mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm3, y); #ifdef TIMER timerSpMV += omp_get_wtime() - timerSpMV1; #endif temp = cblas_ddot(n, y, 1, r, 1); temp1 = cblas_ddot(n, y, 1, y, 1); omega = temp / temp1; cblas_daxpy (n, omega, rm3, 1, x, 1); cblas_daxpy (n, -omega, y, 1, r, 1); nrm = cblas_dnrm2 (n, r, 1); if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; } printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); } #ifdef TIMER printf("time LUSol\t%lf\ntime SpMV\t%lf\n",timerLUSol,timerSpMV); printf("time total\t%lf\n",omp_get_wtime()-timerTotal); #endif free (r); free (rm1); free (rm2); free (rm3); free (zm1); free (zm2); free (p); free (y); free (offset_ilu); return 0; }
void mlcp_pgs(MixedLinearComplementarityProblem* problem, double *z, double *w, int *info, SolverOptions* options) { if (!problem->isStorageType2) { printf("Siconos/Numerics: mlcp_pgs: Wrong Storage (!isStorageType2) for PGS solver\n"); exit(EXIT_FAILURE); } double* A = problem->A; double* B = problem->B; double* C = problem->C; double* D = problem->D; double* a = problem->a; double* b = problem->b; int n = problem->n; int m = problem->m; double *u = &z[0]; double *v = &z[n]; double *Buf; int incx, incy, incAx, incAy, incBx, incBy; int i, iter; int itermax, verbose; int pgsExplicit; double err, vi; double tol; double prev; double *diagA, *diagB; verbose = 0; incx = 1; incy = 1; /* Recup input */ itermax = options->iparam[0]; pgsExplicit = options->iparam[2]; tol = options->dparam[0]; /* Initialize output */ options->iparam[1] = 0; options->dparam[1] = 0.0; /* Allocation */ diagA = (double*)malloc(n * sizeof(double)); diagB = (double*)malloc(m * sizeof(double)); incx = 1; incy = 1; /* Preparation of the diagonal of the inverse matrix */ for (i = 0 ; i < n ; ++i) { if ((fabs(A[i * n + i]) < DBL_EPSILON)) { if (verbose > 0) { printf(" Vanishing diagonal term \n"); printf(" The local problem cannot be solved \n"); } *info = 2; free(diagA); free(diagB); *info = 1; return; } else { diagA[i] = 1.0 / A[i * n + i]; } } for (i = 0 ; i < m ; ++i) { if ((fabs(B[i * m + i]) < DBL_EPSILON)) { if (verbose > 0) { printf(" Vanishing diagonal term \n"); printf(" The local problem cannot be solved \n"); } *info = 2; free(diagA); free(diagB); return; } else { diagB[i] = 1.0 / B[i * m + i]; } } /*start iterations*/ iter = 0; err = 1.; incx = 1; incy = 1; incAx = n; incAy = 1; incBx = m; incBy = 1; mlcp_compute_error(problem, z, w, tol, &err); while ((iter < itermax) && (err > tol)) { ++iter; incx = 1; incy = 1; if (pgsExplicit) { /*Use w like a buffer*/ cblas_dcopy(n , w , incx , u , incy); //w <- q Buf = w; for (i = 0 ; i < n ; ++i) { prev = Buf[i]; Buf[i] = 0; //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i]; u[i] = - (a[i] + cblas_ddot(n , &A[i] , incAx , Buf , incAy) + cblas_ddot(m , &C[i] , incAx , v , incBy)) * diagA[i]; Buf[i] = prev; } for (i = 0 ; i < m ; ++i) { v[i] = 0.0; //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i]; vi = -(b[i] + cblas_ddot(n , &D[i] , incBx , u , incAy) + cblas_ddot(m , &B[i] , incBx , v , incBy)) * diagB[i]; if (vi < 0) v[i] = 0.0; else v[i] = vi; } } else { for (i = 0 ; i < n ; ++i) { u[i] = 0.0; //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i]; u[i] = - (a[i] + cblas_ddot(n , &A[i] , incAx , u , incAy) + cblas_ddot(m , &C[i] , incAx , v , incBy)) * diagA[i]; } for (i = 0 ; i < m ; ++i) { v[i] = 0.0; //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i]; vi = -(b[i] + cblas_ddot(n , &D[i] , incBx , u , incAy) + cblas_ddot(m , &B[i] , incBx , v , incBy)) * diagB[i]; if (vi < 0) v[i] = 0.0; else v[i] = vi; } } /* **** Criterium convergence compliant with filter_result_MLCP **** */ mlcp_compute_error(problem, z, w, tol, &err); if (verbose == 2) { printf(" # i%d -- %g : ", iter, err); for (i = 0 ; i < n ; ++i) printf(" %g", u[i]); for (i = 0 ; i < m ; ++i) printf(" %g", v[i]); for (i = 0 ; i < m ; ++i) printf(" %g", w[i]); printf("\n"); } /* **** ********************* **** */ } options->iparam[1] = iter; options->dparam[1] = err; if (err > tol) { printf("Siconos/Numerics: mlcp_pgs: No convergence of PGS after %d iterations\n" , iter); printf("Siconos/Numerics: mlcp_pgs: The residue is : %g \n", err); *info = 1; } else { if (verbose > 0) { printf("Siconos/Numerics: mlcp_pgs: Convergence of PGS after %d iterations\n" , iter); printf("Siconos/Numerics: mlcp_pgs: The residue is : %g \n", err); } *info = 0; } free(diagA); free(diagB); return; }
int solve_bicgstab(csr_t* mat, csr_t* ilu, double* b, double* x) { double tol = 1e-6, floatone = 1.0; const int max_iter = 200; int n = mat->n; int nnz = mat->nnz; double *r, *p, *y, *zm1, *zm2, *rm2, *rm1, *rm3, nrm0, nrm; r = (double*) calloc (n, sizeof(double)); p = (double*) calloc (n, sizeof(double)); y = (double*) calloc (n, sizeof(double)); rm1 = (double*) calloc (n, sizeof(double)); rm2 = (double*) calloc (n, sizeof(double)); rm3 = (double*) calloc (n, sizeof(double)); zm1 = (double*) calloc (n, sizeof(double)); zm2 = (double*) calloc (n, sizeof(double)); double rho = 1.0, rho1, beta = 0.0, alpha = 0.0, omega, temp, temp1; char lower1 = 'L', lower = 'N', lower2 = 'U'; char upper1 = 'U', upper = 'N', upper2 = 'N'; #ifdef TIMER double timerLUSol = 0, timerLUSol1, timerSpMV = 0, timerSpMV1, timerVector = 0, timerVector1; double timerTotal = omp_get_wtime(); #endif cblas_dcopy (n, b, 1, r, 1); cblas_dcopy (n, r, 1, p, 1); cblas_dcopy (n, r, 1, zm1, 1); nrm0 = cblas_dnrm2 (n, r, 1); for (int k = 0; k < max_iter; k++) { rho1 = rho; #ifdef TIMER timerVector1 = omp_get_wtime(); #endif rho = cblas_ddot(n, zm1, 1, r, 1); if ( k > 0 ) { beta = (rho / rho1) * (alpha / omega); cblas_daxpy (n, -omega, zm2, 1, p, 1); cblas_dscal (n, beta, p, 1); cblas_daxpy (n, floatone, r, 1, p, 1); } #ifdef TIMER timerVector += omp_get_wtime() - timerVector1; timerLUSol1 = omp_get_wtime(); #endif mkl_dcsrtrsv (&lower1, &lower, &lower2, &n, ilu->val, ilu->ia, ilu->ja, p, y); mkl_dcsrtrsv (&upper1, &upper, &upper2, &n, ilu->val, ilu->ia, ilu->ja, y, rm2); #ifdef TIMER timerLUSol += omp_get_wtime() - timerLUSol1; timerSpMV1 = omp_get_wtime(); #endif mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm2, zm2); #ifdef TIMER timerSpMV += omp_get_wtime() - timerSpMV1; timerVector1 = omp_get_wtime(); #endif temp = cblas_ddot(n, zm1, 1, zm2, 1); alpha = rho / temp; cblas_daxpy (n, -alpha, zm2, 1, r, 1); cblas_daxpy (n, alpha, rm2, 1, x, 1); nrm = cblas_dnrm2 (n, r, 1); #ifdef TIMER timerVector += omp_get_wtime() - timerVector1; #endif if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; } #ifdef TIMER timerLUSol1 = omp_get_wtime(); #endif mkl_dcsrtrsv (&lower1, &lower, &lower2, &n, ilu->val, ilu->ia, ilu->ja, r, y); mkl_dcsrtrsv (&upper1, &upper, &upper2, &n, ilu->val, ilu->ia, ilu->ja, y, rm3); #ifdef TIMER timerLUSol += omp_get_wtime() - timerLUSol1; timerSpMV1 = omp_get_wtime(); #endif mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm3, y); #ifdef TIMER timerSpMV += omp_get_wtime() - timerSpMV1; timerVector1 = omp_get_wtime(); #endif temp = cblas_ddot(n, y, 1, r, 1); temp1 = cblas_ddot(n, y, 1, y, 1); omega = temp / temp1; cblas_daxpy (n, omega, rm3, 1, x, 1); cblas_daxpy (n, -omega, y, 1, r, 1); nrm = cblas_dnrm2 (n, r, 1); #ifdef TIMER timerVector += omp_get_wtime() - timerVector1; #endif if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; } printf(" iteration = %3d, residual = %le \n", k+1, nrm / nrm0); } #ifdef TIMER printf("time LUSol\t%lf\ntime SpMV\t%lf\ntime v-v oper\t%lf\n",timerLUSol,timerSpMV,timerVector); printf("time total\t%lf\n",omp_get_wtime()-timerTotal); #endif free (r); free (rm1); free (rm2); free (rm3); free (zm1); free (zm2); free (p); free (y); return 0; }
void variationalInequality_HyperplaneProjection(VariationalInequality* problem, double *x, double *w, int* info, SolverOptions* options) { /* /\* int and double parameters *\/ */ int* iparam = options->iparam; double* dparam = options->dparam; /* Number of contacts */ int n = problem->size; /* Maximum number of iterations */ int itermax = iparam[0]; /* Maximum number of iterations in Line--search */ int lsitermax = iparam[1]; assert(lsitermax >0); /* Tolerance */ double tolerance = dparam[0]; /***** Fixed point iterations *****/ int iter = 0; /* Current iteration number */ double error = 1.; /* Current error */ int hasNotConverged = 1; dparam[0] = dparam[2]; // set the tolerance for the local solver double * xtmp = (double *)malloc(n * sizeof(double)); double * wtmp = (double *)malloc(n * sizeof(double)); double * xtmp2 = (double *)malloc(n * sizeof(double)); double * xtmp3 = (double *)malloc(n * sizeof(double)); int isVariable = 0; double tau = 1.0; double sigma = 0.99; if (dparam[3] > 0.0) { tau = dparam[3]; } else { printf("Hyperplane Projection method. tau <=0 is not well defined\n"); printf("Hyperplane Projection method. tau is set to 1.0\n"); } if (dparam[4] > 0.0 && dparam[4] < 1.0) { sigma = dparam[4]; } else { printf("Hyperplane Projection method. 0<sigma <1 is not well defined\n"); printf("Hyperplane Projection method. sigma is set to %6.4e\n", sigma); } isVariable=0; if (!isVariable) { /* double minusrho = -1.0*rho; */ while ((iter < itermax) && (hasNotConverged > 0)) { ++iter; /** xtmp <-- x (x_k) */ cblas_dcopy(n , x , 1 , xtmp, 1); /* xtmp (y_k)= P_X(x_k-tau F(x_k)) */ problem->F(problem, n, xtmp, wtmp); cblas_daxpy(n, -tau, wtmp , 1, xtmp , 1) ; cblas_dcopy(n , xtmp, 1 , xtmp2, 1); problem->ProjectionOnX(problem, xtmp2,xtmp); // Armijo line search int stopingcriteria = 1; int ls_iter = -1; double alpha = 1.0; double lhs = NAN; double rhs; // xtmp3 = z_k-y_k cblas_dcopy(n , x , 1 , xtmp3, 1); cblas_daxpy(n, -1.0, xtmp, 1, xtmp3, 1); rhs = cblas_dnrm2(n,xtmp3, 1); rhs = sigma / tau * rhs * rhs; DEBUG_EXPR_WE( for (int i =0; i< n ; i++) { printf("(y_k) xtmp[%i]=%6.4e\t",i,xtmp[i]); printf("(x_k-y_k) xtmp3[%i]=%6.4e\n",i,xtmp3[i]); } ); while (stopingcriteria && (ls_iter < lsitermax)) { ls_iter++ ; /* xtmp2 = alpha * y_k + (1-alpha) x_k */ alpha = 1.0 / (pow(2.0, ls_iter)); DEBUG_PRINTF("alpha = %6.4e\n", alpha); cblas_dcopy(n ,xtmp , 1 , xtmp2, 1); cblas_dscal(n , alpha, xtmp2, 1); cblas_daxpy(n, 1.0-alpha, x, 1, xtmp2, 1); /* wtmp = */ problem->F(problem, n, xtmp2,wtmp); DEBUG_EXPR_WE( for (int i =0; i< n ; i++) { printf("(z_k) xtmp2[%i]=%6.4e\n",i,xtmp2[i]); printf("F(z_k) wtmp[%i]=%6.4e\n",i,wtmp[i]); } ); lhs = cblas_ddot(n, wtmp, 1, xtmp3, 1); if (lhs >= rhs) stopingcriteria = 0; DEBUG_PRINTF("ls_iter= %i, lsitermax =%i, stopingcriteria %i\n",ls_iter,lsitermax,stopingcriteria); DEBUG_PRINTF("Number of iteration in Armijo line search = %i\t, lhs = %6.4e\t, rhs = %6.4e\t, alpha = %6.4e\t, sigma = %6.4e\t, tau = %6.4e\n", ls_iter, lhs, rhs, alpha, sigma, tau); } cblas_dcopy(n , x , 1 , xtmp3, 1); cblas_daxpy(n, -1.0, xtmp2, 1, xtmp3, 1); DEBUG_PRINTF("norm(x-x_k) = %6.4e\n",cblas_dnrm2(n, xtmp3, 1) ); lhs=cblas_ddot(n, wtmp, 1, xtmp3, 1); double nonorm = cblas_dnrm2(n, wtmp, 1); double rhoequiv = lhs / (nonorm * nonorm); /* rhoequiv =1.0; */ DEBUG_PRINTF("nonorm = %6.4e\n", nonorm); DEBUG_PRINTF("lhs = %6.4e\n", lhs); DEBUG_PRINTF("rho equiv = %6.4e\n", rhoequiv); cblas_daxpy(n, -rhoequiv, wtmp, 1, x , 1); cblas_dcopy(n , x, 1 , xtmp, 1); problem->ProjectionOnX(problem, xtmp,x); /* **** Criterium convergence **** */ variationalInequality_computeError(problem, x , w, tolerance, options, &error); DEBUG_EXPR_WE( for (int i =0; i< n ; i++) { printf("x[%i]=%6.4e\t",i,x[i]); printf("w[%i]=F[%i]=%6.4e\n",i,i,w[i]); } );
/** Purpose ------- DLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e DOUBLE_PRECISION array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_aux ********************************************************************/ extern "C" double magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, double *A, magma_int_t lda, double *e, double *tau, double *W, magma_int_t ldw, double **dA, magma_int_t ldda, magma_int_t offset, double **dW, magma_int_t lddw, double *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, double *dx[MagmaMaxGPUs], double *dy[MagmaMaxGPUs], double *work, magma_queue_t stream[][10], double *times) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(id, i, j) (dA[(id)] + ((j)+loffset)*ldda + (i) + offset) #define dW(id, i, j) (dW[(id)] + (j) *lddw + (i)) #define dW1(id, i, j) (dW[(id)] + ((j)+nb) *lddw + (i)) double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_DSYMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *dx2[MagmaMaxGPUs]; double *f; magma_dmalloc_cpu( &f, n ); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ double wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_DSYMV_MGPU magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if ( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n + 1; lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if ( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); double wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_DSYMV_MGPU magma_setdevice(idw); magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_DSYMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if ( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i != 0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destory( start ); magma_event_destory( stop ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) magma_queue_sync(stream[id][1]); } magma_free_cpu(f); return mv_time; } /* magma_dlatrd_mgpu */
int frictionContactFBLSA( fc3d_nonsmooth_Newton_solvers* equation, double *reaction, double *velocity, double *mu, double *rho, double *F, double *A, double *B, NumericsMatrix *W, double *qfree, NumericsMatrix *blockAWpB, double *direction, double *tmp, double alpha[1], unsigned int maxiter_ls) { unsigned problemSize = W->size0; // notes : // - F contains FB or grad FB merit // - tmp contains direction, scal*direction, reaction+scal*direction // cf Newton_Methods.c, L59 double p = 2.1; double fblsa_rho = 1e-8; double gamma = 1e-4; double scal = 1.; // F <- compute fb fc3d_FischerBurmeisterFunction(problemSize, (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterFunctionGenerated, reaction, velocity, mu, rho, F, NULL, NULL); double thetafb0 = 0.5 * cblas_ddot(problemSize, F, 1, F, 1); // F <- compute gradient of fb merit function (ugly) fc3d_FischerBurmeisterFunction(problemSize, (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterGradMeritFunctionGenerated, reaction, velocity, mu, rho, F, NULL, NULL); double norm_dir_exp_p = pow(cblas_dnrm2(problemSize, direction, 1), p); double gradmeritfb_dir = cblas_ddot(problemSize, F, 1, direction, 1); if (!isnan(gradmeritfb_dir) && !isinf(gradmeritfb_dir) && gradmeritfb_dir > (-fblsa_rho * norm_dir_exp_p)) { if (verbose > 0) { printf("fc3d FBLSA: condition 9.1.6 unsatisfied, gradmeritfb_dir=%g, norm_r=%g\n", gradmeritfb_dir, norm_dir_exp_p); } // FIX: failure... if (verbose > 0) { printf("fc3d FBLSA: set d^k to - grad merit(fb)\n"); } cblas_dcopy(problemSize, F, 1, direction, 1); cblas_dscal(problemSize, -1, direction, 1); } for (unsigned int iter = 0; iter < maxiter_ls; ++iter) { scal /= 2.; // tmp <- 2^(-ik)*direction+reaction cblas_dcopy(problemSize, reaction, 1, tmp, 1); cblas_daxpy(problemSize, scal, direction, 1, tmp, 1); // velocity <- W*tmp + qfree cblas_dcopy(problemSize, qfree, 1, velocity, 1); NM_gemv(1., W, tmp, 1., velocity); // compute fb fc3d_FischerBurmeisterFunction(problemSize, (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterFunctionGenerated, tmp, velocity, mu, rho, F, NULL, NULL); double thetafb = 0.5 * cblas_ddot(problemSize, F, 1, F, 1); // compute grad merit fb (ugly) fc3d_FischerBurmeisterFunction(problemSize, (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterGradMeritFunctionGenerated, tmp, velocity, mu, rho, F, NULL, NULL); // tmp <- scal*direction cblas_dscal(problemSize, 0., tmp, 1); cblas_daxpy(problemSize, scal, direction, 1, tmp, 1); double grad_meritf_reaction = cblas_ddot(problemSize, F, 1, tmp, 1); if (!isinf(grad_meritf_reaction) && !isnan(grad_meritf_reaction) && thetafb < thetafb0 + gamma * scal * grad_meritf_reaction) { if (verbose > 0) { printf("fc3d FBLSA success. iteration = %i, thetafb=%g, thetafb0=%g, gradmeritf,reaction=%g\n", iter, thetafb, thetafb0, gamma*scal*grad_meritf_reaction); } // tmp <- reaction + tmp cblas_daxpy(problemSize, 1, reaction, 1, tmp, 1); return 0; } } if (verbose > 0) { printf("fc3d FBLSA reached the max number of iteration reached = %i\n", maxiter_ls); } return -1; }
double F77_ddot(const int *N, const double *X, const int *incX, const double *Y, const int *incY) { return cblas_ddot(*N, X, *incX, Y, *incY); }
int main(int argc, char* argv[]) { char dummy[L2_CACHE_SIZE]; // Tests de performances de ddot int size = 50; blas_t *matriceD, *matriceE; alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); printf("Tests de performance de la fonction ddot\n"); perf_t *t1, *t2,*t3, *t4,*t5, *t6,*t7, *t8, *t9, *t10; t1 = malloc(sizeof(perf_t)); t2 = malloc(sizeof(perf_t)); t3 = malloc(sizeof(perf_t)); t4 = malloc(sizeof(perf_t)); t5 = malloc(sizeof(perf_t)); t6 = malloc(sizeof(perf_t)); t7 = malloc(sizeof(perf_t)); t8 = malloc(sizeof(perf_t)); t9 = malloc(sizeof(perf_t)); t10 = malloc(sizeof(perf_t)); double mflops, mflops1,mflops2,mflops3,mflops4, mflops5; char command[200]; system("rm results/ddot_perf.txt"); for(size = 50; size < 100000000; size += size/4) { printf("M: %d ", size); if(size != 50) { free(matriceD); free(matriceE); alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); } memset(dummy, 0, sizeof(dummy)); perf(t1); blas_t res = cblas_ddot(size, matriceD, 1, matriceE, 1); perf(t2); perf_diff(t1, t2); mflops = perf_mflops(t2, 2 * size); printf("Mflops/s: %le\n", mflops); sprintf(command, "echo %d %lf >> results/ddot_perf.txt", size, mflops); system(command); } // Test de performance dgemm ////////////////////////////////////////// long m = 100; blas_t *matriceA, *matriceB, *matriceC; alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); matriceC = calloc(m*m,sizeof(blas_t)); system("rm results/dgemm_perf.txt"); for(; m< 1000; m+=20) { printf("M: %d ", m); if(m != 100) { free(matriceA); free(matriceB); free(matriceC); alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); alloc_matrice(&matriceC, m, m); } memset(dummy, 0, sizeof(dummy)); perf(t1); cblas_dgemm_scalaire( CblasNoTrans, CblasNoTrans ,m, m, m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t2); perf_diff(t1, t2); mflops1 = perf_mflops(t2, m * m * m * 3 + m * m ); perf(t3); cblas_dgemm_scalaire1(matriceC, m, matriceA, m, matriceB, m, m); perf(t4); perf_diff(t3, t4); mflops2 = perf_mflops(t4, m * m * m * 3); perf(t5); cblas_dgemm_scalaire2(matriceC, m, matriceA, m, matriceB, m, m); perf(t6); perf_diff(t5, t6); mflops3 = perf_mflops(t6, m * m * m * 3); perf(t7); cblas_dgemm_scalaire3(matriceC, m, matriceA, m, matriceB, m, m); perf(t8); perf_diff(t7, t8); mflops4 = perf_mflops(t8, m * m * m * 3); perf(t9); cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m,m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t10); perf_diff(t9, t10); mflops5 = perf_mflops(t10, m * m * m * 3); sprintf(command, "echo %d %lf %lf %lf %lf %lf >> results/dgemm_perf.txt", m * m, mflops1, mflops2, mflops3, mflops4, mflops5); system(command); printf("Mflops/s : %d %lf %lf %lf %lf %lf\n", m * m, mflops1, mflops2, mflops3, mflops4, mflops5 ); } free(matriceA); free(matriceB); free(matriceC); free(matriceD); free(matriceE); return EXIT_SUCCESS; }
int ConjugateGradientSolverMPI(unsigned long *II, unsigned long *J, double *A, unsigned long M,unsigned long local_M, unsigned long N, unsigned long long nz, double *b, unsigned long M_Vector, unsigned long N_Vector, unsigned long long nz_vector, int numIterations) { //A*x=b double *Ax=(double *) malloc(nz_vector * sizeof(double)); double *Ap=(double *) malloc(nz_vector * sizeof(double)); double *r=(double *) malloc(nz_vector * sizeof(double)); double *p=(double *) malloc(nz_vector * sizeof(double)); double *x=(double *) calloc(nz_vector,sizeof(double)); double *Ap_partial=(double *) malloc(local_M * sizeof(double)); //r = b-A*x //If we take x=0 the init multiplication is avoided and r=b memcpy(r, b, N*sizeof(double)); //p=r memcpy(p, r, N*sizeof(double)); //rsold = r*r double rsold = cblas_ddot(N,r,1,r,1); int stop = 0; double alphaCG = 0.0; double rsnew = 0.0; unsigned long k = 0; unsigned long maxIterations = M*2; if ( numIterations != 0) { maxIterations = numIterations; } while(!stop){ //Ap=A*p //cblas_dgemv(CblasColMajor, CblasNoTrans, M,N , 1.0, A, N, p, 1, 0.0, Ap, 1); cblas_dgemv(CblasRowMajor, CblasNoTrans, local_M,N , 1.0, A, N, p, 1, 0.0, Ap_partial, 1); MPI_Allgather (Ap_partial,local_M,MPI_DOUBLE,Ap,local_M,MPI_DOUBLE,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); //alphaCG=rsold/(p'*Ap) alphaCG = rsold/cblas_ddot(N,p,1,Ap,1); //x=x+alphaCG*p cblas_daxpy(N,alphaCG,p,1,x,1); //r=r-alphaCG*Ap cblas_daxpy(N,-alphaCG,Ap,1,r,1); //rsnew = r'*r rsnew = cblas_ddot(N,r,1,r,1); if((sqrt(rsnew)<=EPSILON)||(k == maxIterations)){ stop = 1; } //p=r+rsnew/rsold*p cblas_dscal(N, rsnew/rsold, p, 1); cblas_daxpy(N,1.0,r,1,p,1); rsold = rsnew; k++; } memcpy(b, x, N*sizeof(double)); free(Ax); free(Ap); free(r); free(p); free(x); fprintf(stderr, "[%s] Number of iterations %lu\n",__func__,k); return 1; }
void FrictionContact2D_cpg(FrictionContactProblem* problem , double *reaction , double *velocity , int *info, SolverOptions* options) { int nc = problem->numberOfContacts; assert(nc>0); double * vec = problem->M->matrix0; double * mu = problem->mu; int n = 2 * nc, i, iter; assert(n>0); int incx = 1, incy = 1; int *stat, *statusi, it_end; double eps = 1.e-12; double pAp, alpha, beta, wAp, rp, normr; double alphaf, betaf, den, num, res; double *p, *fric, *r; double *fric1, *v, *w, *Ap, *xi, *z; int maxit = options->iparam[0]; double tol = options->dparam[0]; options->iparam[1] = 0; options->dparam[1] = 0.0; r = (double*) malloc(n * sizeof(double)); p = (double*) malloc(n * sizeof(double)); v = (double*) malloc(n * sizeof(double)); w = (double*) malloc(n * sizeof(double)); Ap = (double*) malloc(n * sizeof(double)); xi = (double*) malloc(n * sizeof(double)); z = (double*) malloc(n * sizeof(double)); fric1 = (double*) malloc(n * sizeof(double)); fric = (double*) malloc(nc * sizeof(double)); stat = (int*) malloc(nc * sizeof(int)); statusi = (int*) malloc(nc * sizeof(int)); for (i = 0; i < n ; i++) { reaction[i] = 0.0; xi[i] = 0.0; r[i] = 0.0; v[i] = 0.0; p[i] = 0.0; w[i] = 0.0; Ap[i] = 0.0; z[i] = 0.0; fric1[i] = 1.0; if (i < nc) { fric[i] = mu[i] * fric1[i]; stat[i] = 0; statusi[i] = 0; } } cblas_dcopy(n, problem->q, incx, r, incy); alphaf = -1.; betaf = -1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, reaction, incx, betaf, r, incy); /* Check for initial status */ for (i = 0; i < nc; i++) { mu[i] = fric[i]; if (reaction[2 * i] <= eps) { /* No contact */ stat[i] = 0; } else if (reaction[2 * i + 1] <= -mu[i]*reaction[2 * i]) { /* Slide backward */ stat[i] = 1; } else if (reaction[2 * i + 1] >= mu[i]*reaction[2 * i]) { /* Slide forward */ stat[i] = 3; } else { /* Stick contact */ stat[i] = 2; } } iter = 0; normr = 1.0; while ((iter < maxit) && (normr > tol)) { for (i = 0 ; i < nc ; i++) statusi[i] = stat[i]; cblas_dcopy(n, r, incx, v, incy); if (iter == 0) { cblas_dcopy(n, r, incx, w, incy); cblas_dcopy(n, w, incx, p, incy); } alphaf = 1.0; betaf = 0.0; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, p, incx, betaf, Ap, incy); pAp = cblas_ddot(n, p, incx, Ap, incy); /*} else { alphaf = 1.0; betaf = 0.0; dgemv_( ¬rans, (integer *)&n, (integer *)&n, &alphaf, vec, (integer *)&n, p, &incx, &betaf, Ap, &incy ); pAp = ddot_( (integer *)&n, p, &incx, Ap, &incy );*/ if (pAp == 0) { if (verbose > 0) printf("\n Operation non conform alpha at the iteration %d \n", iter); free(r); free(fric); free(p); free(v); free(w); free(Ap); free(xi); free(z); free(fric1); free(stat); free(statusi); *info = 2; return; } /*} */ rp = cblas_ddot(n, r, incx, p, incy); alpha = rp / pAp; cblas_dcopy(n, reaction, incx, xi, incy); alphaf = alpha; cblas_daxpy(n, alphaf, p, incx, xi, incy); FrictionContact2D_projc(xi, &n, statusi, p, fric, reaction, stat); /* r(:)=b(:)-matmul(A,x) */ cblas_dcopy(n, problem->q, incx, r, incy); alphaf = -1.; betaf = -1.; cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, reaction, incx, betaf, r, incy); FrictionContact2D_projf(statusi, &n, r, fric, w); FrictionContact2D_projf(statusi, &n, p, fric, z); wAp = cblas_ddot(n, w, incx, Ap, incy); beta = - wAp / pAp; cblas_dcopy(n, w, incx, p, incy); alphaf = beta; cblas_daxpy(n, alphaf, z, incx, p, incy); /* alphaf = 1.; betaf = 0.; dgemv_( ¬rans, (integer *)&n, (integer *)&n, &alphaf, vec , (integer *)&n, p, &incx, &betaf, Ap, &incy ); pAp = ddot_( (integer *)&n, p, &incx, Ap, &incy );*/ cblas_dcopy(n, r, incx, xi, incy); alphaf = -1.; cblas_daxpy(n, alphaf, v, incx, xi, incy); num = cblas_ddot(n, xi, incx, xi, incy); den = cblas_ddot(n, v, incx, v, incy); normr = sqrt(num / den); it_end = iter; res = normr; options->iparam[1] = it_end; options->dparam[1] = res; iter = iter + 1; } if (normr < tol) { if (verbose > 0) printf("convergence after %d iterations with a residual %g\n", iter - 1, normr); *info = 0; } else { if (verbose > 0) printf("no convergence after %d iterations with a residual %g\n", iter - 1, normr); *info = 1; } alpha = -1.; cblas_dscal(n , alpha , r , incx); cblas_dcopy(n, r, incx, velocity, incy); free(fric); free(p); free(v); free(w); free(Ap); free(xi); free(z); free(fric1); free(stat); free(statusi); free(r); }
double dotprod2(int N,double *X,int incx,double *Y,int incy) { return cblas_ddot(N, X, incx, Y, incy); }
extern "C" magma_int_t magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *e, double *tau, double *w, magma_int_t ldw, double *da, magma_int_t ldda, double *dw, magma_int_t lddw, double *dwork, magma_int_t ldwork) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DLATRD2 reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) DOUBLE_PRECISION array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t i; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_dsymv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); lapackf77_dlacgv(&i, A(i ,0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_dsymv_work('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i!=0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* dlatrd */
int LineSearchGP(FrictionContactProblem* localproblem, computeNonsmoothFunction Function, double * t_opt, double R[3], double dR[3], double *rho, int LSitermax, double * F, double * A, double * B, double * velocity) { double alpha = *t_opt; double inf = 1e20; double alphamin = 0.0; double alphamax = inf; double m1 = 0.1, m2 = 0.9; /* // store the value fo the function */ /* double F[3]={0.,0.,0.}; */ /* // Store the (sub)-gradient of the function */ /* double A[9]={0.,0.,0.,0.,0.,0.,0.,0.,0.}; */ /* double B[9]={0.,0.,0.,0.,0.,0.,0.,0.,0.}; */ /* double velocity[3]={0.,0.,0.}; */ double mu = localproblem->mu[0]; double * qLocal = localproblem->q; double * MLocal = localproblem->M->matrix0; /* for (int i=0; i<3; i++) velocity[i] = MLocal[i+0*3]*R[0] + qLocal[i] */ /* + MLocal[i+1*3]*R[1] + */ /* + MLocal[i+2*3]*R[2] ; */ /* Function(R,velocity,mu,rho,F,A,B); */ // Computation of q(t) and q'(t) for t =0 double q0 = 0.5 * cblas_ddot(3 , F , 1 , F , 1); double tmp[3] = {0., 0., 0.}; // Value of AW+B double AWplusB[9] = {0., 0., 0., 0., 0., 0., 0., 0., 0.}; // compute A MLocal +B for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { AWplusB[i + 3 * j] = 0.0; for (int k = 0; k < 3; k++) { AWplusB[i + 3 * j] += A[i + 3 * k] * MLocal[k + j * 3]; } AWplusB[i + 3 * j] += B[i + 3 * j]; } } #ifdef VERBOSE_DEBUG for (int l = 0; l < 3; l++) { for (int k = 0; k < 3; k++) { printf("AWplusB[%i+3*%i] = %le\t", l, k, AWplusB[l + 3 * k]); } printf("\n"); } #endif for (int i = 0; i < 3; i++) { tmp[i] = 0.0; for (int j = 0; j < 3; j++) { tmp[i] += AWplusB[i + 3 * j] * dR[j] ; } } double dqdt0 = 0.0; for (int i = 0; i < 3; i++) { dqdt0 += F[i] * tmp[i]; } #ifdef VERBOSE_DEBUG printf("q0 = %12.8e \n", q0); printf("dqdt0 = %12.8e \n", dqdt0); for (int i = 0; i < 3; i++) { printf("tmp[%i] = %12.8e \t", i, tmp[i]); } printf("\n"); for (int i = 0; i < 3; i++) { printf("dR[%i] = %12.8e \t", i, dR[i]); } printf("\n"); #endif for (int iter = 0; iter < LSitermax; iter++) { for (int i = 0; i < 3; i++) tmp[i] = R[i] + alpha * dR[i]; for (int i = 0; i < 3; i++) velocity[i] = MLocal[i + 0 * 3] * tmp[0] + qLocal[i] + MLocal[i + 1 * 3] * tmp[1] + + MLocal[i + 2 * 3] * tmp[2] ; Function(tmp, velocity, mu, rho, F, NULL, NULL); double q = 0.5 * cblas_ddot(3 , F , 1 , F , 1); double slope = (q - q0) / alpha; #ifdef VERBOSE_DEBUG printf("q = %12.8e \n", q); printf("slope = %12.8e \n", slope); #endif int C1 = (slope >= m2 * dqdt0); int C2 = (slope <= m1 * dqdt0); if (C1 && C2) { #ifdef VERBOSE_DEBUG printf("Sucess in LS: alpha = %12.8e\n", alpha); #endif *t_opt = alpha; if (verbose > 1) { printf("----------------------------------------- LineSearchGP success number of iteration = %i alpha = %.10e \n", iter, alpha); } return 0; } else if (!C1) { #ifdef VERBOSE_DEBUG printf("LS: alpha too small = %12.8e\t, slope =%12.8e\n", alpha, slope); printf(" m1*dqdt0 =%12.8e\t, m2*dqdt0 =%12.8e\n ", m1 * dqdt0 , m2 * dqdt0); #endif //std::cout << "t = " << t << " is too small : slope = " << slope << ", m2*qp0 = " << m2*qp0 << std::endl; alphamin = alpha; } else // not(C2) { #ifdef VERBOSE_DEBUG printf("LS: alpha too big = %12.8e\t, slope =%12.8e\n", alpha, slope); printf(" m1*dqdt0 =%12.8e\t, m2*dqdt0 =%12.8e\n ", m1 * dqdt0 , m2 * dqdt0); #endif //std::cout << "t = " << t << " is too big : slope = " << slope << ", m1*qp0 = " << m1*qp0 << std::endl; alphamax = alpha; } if (alpha < inf) { alpha = 0.5 * (alphamin + alphamax); } else { alpha = 10 * alpha; } } if (verbose > 1) { printf("----------------------------------------- LineSearchGP failed max number of iteration reached = %i with alpha = %.10e \n", LSitermax, alpha); } *t_opt = alpha; return -1; }
double jcho::Matrix<double>::dot(const Matrix<double> &v) const { AssertSameDimensions(v); AssertIsVector(); return cblas_ddot(m() == 1 ? n() : m(), _storage, 1, v._storage, 1); }
void computeFGlobal_AC(double* reaction, double* FGlobal) { int numberOfContacts = globalFC3D->numberOfContacts; int n = 3 * numberOfContacts; NumericsMatrix * MGlobal = globalFC3D->M; double * MLocal = localFC3D->M->matrix0; double * qLocal = localFC3D->q; double *mu = globalFC3D->mu; int contact, diagPos = 0, position; int in, it, is, inc, incx; double * reactionLocal; double alpha, det, beta, num, coef2, mrn; for (contact = 0; contact < numberOfContacts; ++contact) { position = 3 * contact; if (MGlobal->storageType == 1) /* Sparse storage */ { /* The part of MGlobal which corresponds to the current block is copied into MLocal */ diagPos = numberOfContacts * contact + contact; MLocal = MGlobal->matrix1->block[diagPos]; } else if (MGlobal->storageType == 0) { in = 3 * contact; it = in + 1; is = it + 1; inc = n * in; double *MM = MGlobal->matrix0; /* The part of MM which corresponds to the current block is copied into MLocal */ MLocal[0] = MM[inc + in]; MLocal[1] = MM[inc + it]; MLocal[2] = MM[inc + is]; inc += n; MLocal[3] = MM[inc + in]; MLocal[4] = MM[inc + it]; MLocal[5] = MM[inc + is]; inc += n; MLocal[6] = MM[inc + in]; MLocal[7] = MM[inc + it]; MLocal[8] = MM[inc + is]; } reactionLocal = &reaction[3 * contact]; incx = 3; velocityLocal[0] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[0]; velocityLocal[1] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[1]; velocityLocal[2] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[2]; an = 1. / MLocal[0]; alpha = MLocal[4] + MLocal[8]; det = MLocal[4] * MLocal[8] - MLocal[7] + MLocal[5]; beta = alpha * alpha - 4 * det; at = 2 * (alpha - beta) / ((alpha + beta) * (alpha + beta)); projN = reactionLocal[0] - an * velocityLocal[0]; projT = reactionLocal[1] - at * velocityLocal[1]; projS = reactionLocal[2] - at * velocityLocal[2]; coef2 = mu[contact] * mu[contact]; if (projN > 0) { FGlobal[position] = velocityLocal[0]; } else { FGlobal[position] = reactionLocal[0] / an; } mrn = projT * projT + projS * projS; if (mrn <= coef2 * reactionLocal[0]*reactionLocal[0]) { FGlobal[position + 1] = velocityLocal[1]; FGlobal[position + 2] = velocityLocal[2]; } else { num = mu[contact] / sqrt(mrn); FGlobal[position + 1] = (reactionLocal[1] - projT * reactionLocal[0] * num) / at; FGlobal[position + 2] = (reactionLocal[2] - projS * reactionLocal[0] * num) / at; } } }
/** * Backtracking linesearch for the aproximate tangent direction */ int linesearch_atd(state_t* state , parameters_t pars, problem_t prob) { double a0 = 1.; //Initial step length double a = a0; state->nbacktrack = 0; // Calculate the largest step before either tau or kappa reach the boundary if(state->dkappa < 0) a = fmin(a,-state->kappa/state->dkappa); if(state->dtau < 0) a = fmin(a,-state->tau/state->dtau); //If with the full step either reaches the boundary make sure that after //the step the new trial tau or kappa is at most pars.eta of the way to the boundary. if(a<a0) a = a*pars.eta; // allocate work vectors for the trial steps double* xa = (double*)calloc(prob.A.n,sizeof(double)); double* sa = (double*)calloc(prob.A.n,sizeof(double)); //Allocate work vectors to calculate the centrality double* psi = (double*)calloc(prob.A.n,sizeof(double)); double* hpsi = (double*)calloc(prob.A.n,sizeof(double)); double kappaa; double taua; double dga; double mua; //TODO: add clean up before return? if(xa == NULL) return OUT_OF_MEMORY; if(sa == NULL) return OUT_OF_MEMORY; if(psi == NULL) return OUT_OF_MEMORY; if(hpsi == NULL) return OUT_OF_MEMORY; //Counter for the number of backtracks int nsect = 0; int j = 0; //Define some variables //used in the loop. bool dFeas, pFeas; //Flags to indicate feasiblity bool dosect; //Flag to indicate if there should be a backtrack double centmeas; //Present value of centrality measure for(j=0;j<pars.max_backtrack;j++) { dosect = false; //Take an a sized step in x, tau and s, kappa, and store in xa, sa, taua, kappaa cblas_dcopy(prob.A.n,state->x,1,xa,1); cblas_dcopy(prob.A.n,state->s,1,sa,1); cblas_daxpy(prob.A.n,a,state->dx,1,xa,1); cblas_daxpy(prob.A.n,a,state->ds,1,sa,1); taua = state->tau + a*state->dtau; kappaa = state->kappa + a*state->dkappa; //Calculate the duality gap at the new trial point dga = cblas_ddot(prob.A.n,xa,1,sa,1) + taua*kappaa; mua = dga / (prob.nu + 1); //Check if x,s are feasible wrt the cones dFeas = dual_feas(prob,sa); pFeas = primal_feas(prob,xa); //If not primal or dual feasible backtrack if(!pFeas) { dosect = true; //printf("C: Not primal feasible\n"); } else if(!dFeas) { dosect = true; //printf("C: Not dual feasible\n"); } else //If both primal and dual feasible measure the centrality { centmeas = eval_cent_meas(prob,xa,sa,*state,mua,psi,hpsi); //Decide if we need to backtrack if(centmeas > mua*pars.theta) { dosect = true; } //printf("Evaluating dist: %g, %g, %i:\n",centmeas,mua*pars.theta,dosect); } //If we must backtrack do so if(dosect) { a = a*pars.lscaff; state->nbacktrack += 1; } else { break; } } //Check if the backtrack reached the maximum number of iterates if(j==pars.max_backtrack) { //If the backtrack fails do not update the vectors and release //the local work vectors, and test vectors free(xa); free(sa); free(psi); free(hpsi); state->a = a; return BACKTRACK_FAIL; } else { //If the backtrack succedded accept the iterate //store it in state and take the step in y //Copy the successeful xa and sa to state.x state.s //just switching the pointers does not work because //we should not free a matlab allocated vector cblas_dcopy(prob.A.n,xa,1,state->x,1); cblas_dcopy(prob.A.n,sa,1,state->s,1); state->tau = taua; state->kappa = kappaa; //Take the step in y cblas_daxpy(prob.A.m,a,state->dy,1,state->y,1); state->a = a; //Free the work vectors free(xa); free(sa); free(psi); free(hpsi); return OK; } //state->a = a; //return OK; }
// Initialises the CG solver void cg_solver_init( const int x, const int y, const int z, const int halo_depth, const int coefficient, double rx, double ry, double rz, double* rro, double* density, double* energy, double* vec_u, double* vec_p, double* vec_r, double* vec_w, double* vec_kx, double* vec_ky, double* vec_kz, int* a_row_index, int* a_col_index, double* a_non_zeros) { if(coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); } #pragma omp parallel for for(int ii = 0; ii < z; ++ii) { for(int jj = 0; jj < y; ++jj) { for(int kk = 0; kk < x; ++kk) { const int index = ii*y*x+jj*x+kk; vec_p[index] = 0.0; vec_r[index] = 0.0; vec_u[index] = energy[index]*density[index]; } } } #pragma omp parallel for for(int ii = 1; ii < z-1; ++ii) { for(int jj = 1; jj < y-1; ++jj) { for(int kk = 1; kk < x-1; ++kk) { const int index = ii*y*x+jj*x+kk; vec_w[index] = (coefficient == CONDUCTIVITY) ? density[index] : 1.0/density[index]; } } } #pragma omp parallel for for(int ii = halo_depth; ii < z-1; ++ii) { for(int jj = halo_depth; jj < y-1; ++jj) { for(int kk = halo_depth; kk < x-1; ++kk) { const int index = ii*x*y + jj*x + kk; vec_kx[index] = rx*(vec_w[index-1]+vec_w[index]) / (2.0*vec_w[index-1]*vec_w[index]); vec_ky[index] = ry*(vec_w[index-x]+vec_w[index]) / (2.0*vec_w[index-x]*vec_w[index]); vec_kz[index] = rz*(vec_w[index-x*y]+vec_w[index]) / (2.0*vec_w[index-x*y]*vec_w[index]); } } } // Initialise the CSR sparse coefficient matrix for(int ii = halo_depth; ii < z-1; ++ii) { for(int jj = halo_depth; jj < y-1; ++jj) { for(int kk = halo_depth; kk < x-1; ++kk) { const int index = ii*x*y + jj*x + kk; int coef_index = a_row_index[index]; if(ii >= halo_depth) { a_non_zeros[coef_index] = -vec_kz[index]; a_col_index[coef_index++] = index-x*y; } if(jj >= halo_depth) { a_non_zeros[coef_index] = -vec_ky[index]; a_col_index[coef_index++] = index-x; } if(kk >= halo_depth) { a_non_zeros[coef_index] = -vec_kx[index]; a_col_index[coef_index++] = index-1; } a_non_zeros[coef_index] = (1.0 + vec_kx[index+1] + vec_kx[index] + vec_ky[index+x] + vec_ky[index] + vec_kz[index+x*y] + vec_kz[index]); a_col_index[coef_index++] = index; if(ii < z-halo_depth) { a_non_zeros[coef_index] = -vec_kz[index+x*y]; a_col_index[coef_index++] = index+x*y; } if(jj < y-halo_depth) { a_non_zeros[coef_index] = -vec_ky[index+x]; a_col_index[coef_index++] = index+x; } if(kk < x-halo_depth) { a_non_zeros[coef_index] = -vec_kx[index+1]; a_col_index[coef_index] = index+1; } } } } double rro_temp = 0.0; int m = x*y*z; mkl_cspblas_dcsrgemv( "n", &m, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w); int x_inner = x-2*halo_depth; #pragma omp parallel for reduction(+:rro_temp) for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { const int offset = ii*y*x + jj*x + halo_depth; cblas_dcopy(x_inner, vec_u + offset, 1, vec_r + offset, 1); cblas_daxpy(x_inner, -1.0, vec_w + offset, 1, vec_r + offset, 1); cblas_dcopy(x_inner, vec_r + offset, 1, vec_p + offset, 1); rro_temp += cblas_ddot(x_inner, vec_r + offset, 1, vec_p + offset, 1); } } // Sum locally *rro += rro_temp; }
void fc3d_projectionWithDiagonalization_update(int contact, FrictionContactProblem* problem, FrictionContactProblem* localproblem, double* reaction, SolverOptions* options) { /* Build a local problem for a specific contact reaction corresponds to the global vector (size n) of the global problem. */ /* Call the update function which depends on the storage for MGlobal/MBGlobal */ /* Build a local problem for a specific contact reaction corresponds to the global vector (size n) of the global problem. */ /* The part of MGlobal which corresponds to the current block is copied into MLocal */ fc3d_nsgs_fillMLocal(problem, localproblem, contact); /**** Computation of qLocal = qBlock + sum over a row of blocks in MGlobal of the products MLocal.reactionBlock, excluding the block corresponding to the current contact. ****/ NumericsMatrix * MGlobal = problem->M; double * MLocal = localproblem->M->matrix0; double *qLocal = localproblem->q; double * qGlobal = problem->q; int n = 3 * problem->numberOfContacts; int in = 3 * contact, it = in + 1, is = it + 1; /* reaction current block set to zero, to exclude current contact block */ /* double rin= reaction[in] ; double rit= reaction[it] ; double ris= reaction[is] ; */ /* qLocal computation*/ qLocal[0] = qGlobal[in]; qLocal[1] = qGlobal[it]; qLocal[2] = qGlobal[is]; if (MGlobal->storageType == 0) { double * MM = MGlobal->matrix0; int incx = n, incy = 1; qLocal[0] += cblas_ddot(n , &MM[in] , incx , reaction , incy); qLocal[1] += cblas_ddot(n , &MM[it] , incx , reaction , incy); qLocal[2] += cblas_ddot(n , &MM[is] , incx , reaction , incy); // Substract diagonal term qLocal[0] -= MM[in + n * in] * reaction[in]; qLocal[1] -= MM[it + n * it] * reaction[it]; qLocal[2] -= MM[is + n * is] * reaction[is]; } else if (MGlobal->storageType == 1) { /* qLocal += rowMB * reaction with rowMB the row of blocks of MGlobal which corresponds to the current contact */ subRowProdSBM(n, 3, contact, MGlobal->matrix1, reaction, qLocal, 0); // Substract diagonal term qLocal[0] -= MLocal[0] * reaction[in]; qLocal[1] -= MLocal[4] * reaction[it]; qLocal[2] -= MLocal[8] * reaction[is]; } /* reaction[in] = rin; reaction[it] = rit; reaction[is] = ris; */ /* Friction coefficient for current block*/ localproblem->mu[0] = problem->mu[contact]; }
int globalLineSearchGP( fc3d_nonsmooth_Newton_solvers* equation, double *reaction, double *velocity, double *mu, double *rho, double *F, double *A, double *B, NumericsMatrix *W, double *qfree, NumericsMatrix *AWpB, double *direction, double *tmp, double alpha[1], unsigned int maxiter_ls) { unsigned problemSize = W->size0; double inf = 1e10; double alphamin = 0.0; double alphamax = inf; double m1 = 0.01, m2 = 0.99; // Computation of q(t) and q'(t) for t =0 double q0 = 0.5 * cblas_ddot(problemSize, F, 1, F, 1); if (isnan(q0) || isinf(q0)) { if (verbose > 0) { fprintf(stderr, "global line search warning. q0 is not a finite number.\n"); } return -1; } // tmp <- AWpB * direction NM_gemv(1., AWpB, direction, 0., tmp); double dqdt0 = cblas_ddot(problemSize, F, 1, tmp, 1); for (unsigned int iter = 0; iter < maxiter_ls; ++iter) { // tmp <- alpha*direction+reaction cblas_dcopy(problemSize, reaction, 1, tmp, 1); cblas_daxpy(problemSize, alpha[0], direction, 1, tmp, 1); // velocity <- W*tmp + qfree cblas_dcopy(problemSize, qfree, 1, velocity, 1); NM_gemv(1., W, tmp, 1., velocity); equation->function(equation->data, problemSize, tmp, velocity, mu, rho, F, NULL, NULL); double q = 0.5 * cblas_ddot(problemSize, F, 1, F, 1); if (isnan(q) || isinf(q)) { printf("global line search warning. q is not a finite number.\n"); return -1; } assert(q >= 0); double slope = (q - q0) / alpha[0]; int C1 = (slope >= m2 * dqdt0); int C2 = (slope <= m1 * dqdt0); if (C1 && C2) { if (verbose > 0) { printf(" globalLineSearchGP. success. ls_iter = %i alpha = %.10e, q = %.10e\n", iter, alpha[0], q); } return 0; } else if (!C1) { alphamin = alpha[0]; } else { // not(C2) alphamax = alpha[0]; } if (alpha[0] < inf) { alpha[0] = 0.5 * (alphamin + alphamax); } else { alpha[0] = alphamin; } } if (verbose > 0) { printf("global line search reached the max number of iteration = %i with alpha = %.10e \n", maxiter_ls, alpha[0]); } return -1; }
int test_subRowprodNonSquare(NumericsMatrix* M3, NumericsMatrix* M4) { printf("== Numerics tests: subRowProdNonSquare(NumericsMatrix,vector) == \n"); int i , n = M3->size0, m = M3->size1; double * x = (double *)malloc(m * sizeof(double)); for (i = 0; i < m; i++) { x[i] = i + 1; } int min = 1; int max = 3; int sizeY = max - min; int sizeX = m; /* Computes yRef = subA*x, subA = A limited to row min to max*/ double * y = (double *)malloc(sizeY * sizeof(double)); double yref[2]; int incx = n, incy = 1; for (i = 0; i < sizeY; i++) yref[i] = cblas_ddot(m, &(M3->matrix0[min + i]), incx, x, incy); subRowProd(sizeX, sizeY, min, M3, x, y, 1); double tol = 1e-12; int info = 0; for (i = 0; i < sizeY; i++) { if (fabs(y[i] - yref[i]) > tol) info = 1; /* printf("%lf\n", fabs(y[i]-yref[i])); */ /* printf("%lf\n", y[i]); */ /* printf("%lf\n", yref[i]); */ } if (info == 0) printf("Step 0 ( y = subA*x, double* storage NonSquare) ok ...\n"); else printf("Step 0 ( y = subA*x, double* storage NonSquare) failed ...\n"); /* += */ subRowProd(sizeX, sizeY, min, M3, x, y, 0); for (i = 0; i < sizeY; i++) { if (fabs(y[i] - 2 * yref[i]) > tol) info = 1; /* printf("%lf\n", fabs(y[i]-2*yref[i])); */ /* printf("%lf\n", y[i]); */ /* printf("%lf\n", 2*yref[i]); */ } if (info == 0) printf("Step 1 ( y += subA*x, double* storage NonSquare) ok ...\n"); else printf("Step 1 ( y += subA*x, double* storage NonSquare) failed ...\n"); free(y); sizeY = 2; int pos = 1; // pos of the required row of blocks y = (double *)malloc(sizeY * sizeof(double)); for (i = 0; i < sizeY; i++) { y[i] = 0.0; yref[i] = cblas_ddot(m, &(M3->matrix0[4 + i]), incx, x, incy); } /* Sparse ... */ subRowProd(sizeX, sizeY, pos, M4, x, y, 1); for (i = 0; i < sizeY; i++) { if (fabs(y[i] - yref[i]) > tol) info = 1; // printf("%lf\n", fabs(y[i]-yref[i])); } for (i = 0; i < sizeY; i++) yref[i] = cblas_ddot(m, &(M3->matrix0[6 + i]), incx, x, incy); subRowProd(sizeX, sizeY, pos + 1, M4, x, y, 1); for (i = 0; i < sizeY; i++) { if (fabs(y[i] - yref[i]) > tol) info = 1; // printf("%lf\n", fabs(y[i]-yref[i])); } if (info == 0) printf("Step 2 ( y = subA*x, sparse storage NonSquare) ok ...\n"); else printf("Step 2 ( y = subA*x, sparse storage NonSquare) failed ...\n"); /* Sparse, += ... */ subRowProd(sizeX, sizeY, pos + 1, M4, x, y, 0); for (i = 0; i < sizeY; i++) { if (fabs(y[i] - 2 * yref[i]) > tol) info = 1; /* printf("%lf\n", fabs(y[i]-yref[i])); */ } if (info == 0) printf("Step 3 ( y += subA*x, sparse storage) ok ...\n"); else printf("Step 3 ( y += subA*x, sparse storage) failed ...\n"); free(x); free(y); printf("== End of test subRowProd(NumericsMatrix,vector), result = %d\n", info); return info; }
double vector_t::dot(vector_t const &b) const { stack_assert(len == b.len); stack::fe_asserter dummy{}; return cblas_ddot(len, data.get(), inc, b.data.get(), b.inc); }