void MLGP_AXPY(unsigned N, FLOAT a, FLOAT* X, unsigned incX, FLOAT* Y, unsigned incY) { #ifdef DOUBLE #define AXPY(...) daxpy_(__VA_ARGS__) #else #define AXPY(...) saxpy_(__VA_ARGS__) #endif return AXPY(&N, &a, X, &incX, Y, &incY); }
void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { TYPE alpha = *((TYPE *)arg); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]); AXPY((int)n, alpha, block_x, 1, block_y, 1); }
void minimize_dual(DOUBLE *Xopt, DOUBLE *Xorig, INT length, DOUBLE *SSt, DOUBLE *SXt, DOUBLE *SXtXSt, DOUBLE trXXt, \ DOUBLE c, INT N, INT K) { DOUBLE INTERV = 0.1; DOUBLE EXT = 3.0; INT MAX = 20; DOUBLE RATIO = (DOUBLE) 10; DOUBLE SIG = 0.1; DOUBLE RHO = SIG / (DOUBLE) 2; INT MN = K * 1; CHAR lamch_opt = 'U'; DOUBLE realmin = LAMCH(&lamch_opt); DOUBLE red = 1; INT i = 0; INT ls_failed = 0; DOUBLE f0; DOUBLE *df0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE *dftemp = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE *df3 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE *s = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE d0; INT derivFlag = 1; DOUBLE *X = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); datacpy(X, Xorig, MN); INT maxNK = IMAX(N, K); DOUBLE *SStLambda = (DOUBLE *) MALLOC(maxNK * K * sizeof(DOUBLE)); DOUBLE *tempMatrix = (DOUBLE *) MALLOC(maxNK * K * sizeof(DOUBLE)); dual_obj_grad(&f0, df0, X, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix); INT incx = 1; INT incy = 1; datacpy(s, df0, MN); DOUBLE alpha = -1; SCAL(&MN, &alpha, s, &incx); d0 = - DOT(&MN, s, &incx, s, &incy); DOUBLE x1; DOUBLE x2; DOUBLE x3; DOUBLE x4; DOUBLE *X0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE *X3 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); DOUBLE F0; DOUBLE *dF0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE)); INT Mmin; DOUBLE f1; DOUBLE f2; DOUBLE f3; DOUBLE f4; DOUBLE d1; DOUBLE d2; DOUBLE d3; DOUBLE d4; INT success; DOUBLE A; DOUBLE B; DOUBLE sqrtquantity; DOUBLE tempnorm; DOUBLE tempinprod1; DOUBLE tempinprod2; DOUBLE tempscalefactor; x3 = red / (1 - d0); while (i++ < length) { datacpy(X0, X, MN); datacpy(dF0, df0, MN); F0 = f0; Mmin = MAX; while (1) { x2 = 0; f2 = f0; d2 = d0; f3 = f0; datacpy(df3, df0, MN); success = 0; while ((!success) && (Mmin > 0)) { Mmin = Mmin - 1; datacpy(X3, X, MN); alpha = x3; AXPY(&MN, &alpha, s, &incx, X3, &incy); dual_obj_grad(&f3, df3, X3, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix); if (ISNAN(f3) || ISINF(f3)) { /* any(isnan(df3)+isinf(df3)) */ x3 = (x2 + x3) * 0.5; } else { success = 1; } } if (f3 < F0) { datacpy(X0, X, MN); alpha = x3; AXPY(&MN, &alpha, s, &incx, X0, &incy); datacpy(dF0, df3, MN); F0 = f3; } d3 = DOT(&MN, df3, &incx, s, &incy); if ((d3 > SIG * d0) || (f3 > f0 + x3 * RHO * d0) || (Mmin == 0)) { break; } x1 = x2; f1 = f2; d1 = d2; x2 = x3; f2 = f3; d2 = d3; A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1); B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1); sqrtquantity = B * B - A * d1 * (x2 - x1); if (sqrtquantity < 0) { x3 = x2 * EXT; } else { x3 = x1 - d1 * SQR(x2 - x1) / (B + SQRT(sqrtquantity)); if (ISNAN(x3) || ISINF(x3) || (x3 < 0)) { x3 = x2 * EXT; } else if (x3 > x2 * EXT) { x3 = x2 * EXT; } else if (x3 < x2 + INTERV * (x2 - x1)) { x3 = x2 + INTERV * (x2 - x1); } } } while (((ABS(d3) > - SIG * d0) || (f3 > f0 + x3 * RHO * d0)) && (Mmin > 0)) { if ((d3 > 0) || (f3 > f0 + x3 * RHO * d0)) { x4 = x3; f4 = f3; d4 = d3; } else { x2 = x3; f2 = f3; d2 = d3; } if (f4 > f0) { x3 = x2 - (0.5 * d2 * SQR(x4 - x2)) / (f4 - f2 - d2 * (x4 - x2)); } else { A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2); B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2); x3 = x2 + (SQRT(B * B - A * d2 * SQR(x4 - x2)) - B) / A; } if (ISNAN(x3) || ISINF(x3)) { x3 = (x2 + x4) * 0.5; } x3 = IMAX(IMIN(x3, x4 - INTERV * (x4 - x2)), x2 + INTERV * (x4 - x2)); datacpy(X3, X, MN); alpha = x3; AXPY(&MN, &alpha, s, &incx, X3, &incy); dual_obj_grad(&f3, df3, X3, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix); if (f3 < F0) { datacpy(X0, X, MN); alpha = x3; AXPY(&MN, &alpha, s, &incx, X0, &incy); datacpy(dF0, df3, MN); F0 = f3; } Mmin = Mmin - 1; d3 = DOT(&MN, df3, &incx, s, &incy); } if ((ABS(d3) < - SIG * d0) && (f3 < f0 + x3 * RHO * d0)) { alpha = x3; AXPY(&MN, &alpha, s, &incx, X, &incy); f0 = f3; datacpy(dftemp, df3, MN); alpha = -1; AXPY(&MN, &alpha, df0, &incx, dftemp, &incy); tempinprod1 = DOT(&MN, dftemp, &incx, df3, &incy); tempnorm = NRM2(&MN, df0, &incx); tempinprod2 = SQR(tempnorm); tempscalefactor = tempinprod1 / tempinprod2; alpha = tempscalefactor; SCAL(&MN, &alpha, s, &incx); alpha = -1; AXPY(&MN, &alpha, df3, &incx, s, &incy); datacpy(df0, df3, MN); d3 = d0; d0 = DOT(&MN, df0, &incx, s, &incy); if (d0 > 0) { datacpy(s, df0, MN); alpha = -1; SCAL(&MN, &alpha, s, &incx); tempnorm = NRM2(&MN, s, &incx); d0 = - SQR(tempnorm); } x3 = x3 * IMIN(RATIO, d3 / (d0 - realmin)); ls_failed = 0; } else { datacpy(X, X0, MN); datacpy(df0, dF0, MN); f0 = F0; if ((ls_failed == 1) || (i > length)) { break; } datacpy(s, df0, MN); alpha = -1; SCAL(&MN, &alpha, s, &incx); tempnorm = NRM2(&MN, s, &incx); d0 = - SQR(tempnorm); x3 = 1 / (1 - d0); ls_failed = 1; } } datacpy(Xopt, X, MN); FREE(SStLambda); FREE(tempMatrix); FREE(df0); FREE(dftemp); FREE(df3); FREE(s); FREE(X); FREE(X0); FREE(X3); FREE(dF0); }
void OPK_PLCG(const opk_index_t n, real_t p[], real_t q[], real_t r[], real_t x[], real_t z[], real_t rho[4], opk_cg_state_t *state) { const real_t zero = OPK_REALCONST(0.0); real_t pq, alpha, beta; opk_index_t i; switch (*state) { case OPK_CG_START: /* Start with no initial guess: fill x with zeros, there is no needs to compute q = A.x0 since it is 0. */ ZERO(n, x); rho[0] = rho[1] = rho[2] = rho[3] = zero; *state = OPK_CG_NEWX; return; case OPK_CG_RESTART: /* Start or restart with initial x given: copy initial x into p and request caller to compte: q = A.p */ COPY(n, x, p); rho[0] = rho[1] = rho[2] = rho[3] = zero; *state = OPK_CG_AP; return; case OPK_CG_NEWX: if (z == NULL) { /* No preconditioning. Take z = r and jump to next case to compute conjugate gradient direction. */ z = r; } else { /* Preconditioned version of the algorithm. Request caller to compute preconditioned residuals. */ *state = OPK_CG_PRECOND; return; } case OPK_CG_PRECOND: /* Caller has been requested to compute preconditioned residuals. Use conjugate gradients recurrence to compute next search direption p. */ rho[1] = DOT(n, r, z); if (rho[1] <= zero) { /* If r'.z too small, then algorithm has converged or A is not positive definite. */ *state = (rho[1] < zero ? OPK_CG_NON_CONVEX : OPK_CG_FINISH); return; } if (rho[0] > zero) { beta = rho[1]/rho[0]; for (i = 0; i < n; ++i) { p[i] = z[i] + beta*p[i]; } } else { beta = zero; COPY(n, z, p); } rho[3] = beta; /* Request caller to compute: q = A.p */ *state = OPK_CG_AP; return; case OPK_CG_AP: if (rho[1] > zero) { /* Caller has been requested to compute q = A.p. Compute optimal step size and update the variables and the residuals. */ pq = DOT(n, p, q); if (pq <= zero) { /* Operator A is not positive definite. */ *state = OPK_CG_NON_CONVEX; return; } alpha = rho[1]/pq; /* optimal step size */ rho[2] = alpha; /* memorize optimal step size */ if (alpha == zero) { /* If alpha too small, then algorithm has converged. */ *state = OPK_CG_FINISH; return; } AXPY(n, alpha, p, x); AXPY(n, -alpha, q, r); rho[0] = rho[1]; } else { /* Caller has been requested to compute q = A.x0 Update the residuals. */ AXPY(n, -1.0, q, r); } /* Variables X and residuals R available for inspection. */ *state = OPK_CG_NEWX; return; case OPK_CG_FINISH: case OPK_CG_NON_CONVEX: /* If state is OPK_CG_FINISH, the caller can restart the algorithm with final x, state set to OPK_CG_RESTART and r set to b. */ return; default: /* There must be something wrong... */ *state = OPK_CG_ERROR; return; } }
/* * ... RHO[0] = RHO_PREV * ... RHO[1] = RHO * ... RHO[2] = ALPHA * ... RHO[3] = BETA * ... RHO[4] = |X| */ void OPK_TRCG(const opk_index_t n, real_t p[], const real_t q[], real_t r[], real_t x[], const real_t z[], const real_t delta, real_t rho[5], opk_cg_state_t *state) { const real_t zero = OPK_REALCONST(0.0); const real_t one = OPK_REALCONST(1.0); real_t a, b, c, d, e, xn, sum, tmp; real_t pq, alpha, beta; long i; if (delta <= zero) { *state = OPK_CG_ERROR; return; } switch (*state) { case OPK_CG_START: /* Start with no initial guess: fill x with zeros, there is no needs to compute q = A.x0 since it is 0. */ ZERO(n, x); rho[0] = rho[1] = rho[2] = rho[3] = rho[4] = zero; *state = OPK_CG_NEWX; return; case OPK_CG_RESTART: /* Start or restart with initial x given: copy initial x into p and request caller to compte: q = A.p */ rho[0] = rho[1] = rho[2] = rho[3] = zero; xn = NRM2(n, x); if (xn >= delta) { if (xn > delta) { SCAL(n, delta/xn, x); } rho[4] = delta; /* |X| */ *state = OPK_CG_TRUNCATED; } else { rho[4] = xn; /* |X| */ COPY(n, x, p); *state = OPK_CG_AP; } return; case OPK_CG_NEWX: if (z == NULL) { /* No preconditioning. Take z = r and jump to next case to compute conjugate gradient direction. */ z = r; } else { /* Preconditioned version of the algorithm. Request caller to compute preconditioned residuals. */ *state = OPK_CG_PRECOND; return; } case OPK_CG_PRECOND: /* Caller has been requested to compute preconditioned residuals. Use conjugate gradients recurrence to compute next search direption p. */ rho[1] = DOT(n, r, z); if (rho[1] <= zero) { /* If r'.z too small, then algorithm has converged or preconditioner is not positive definite. */ *state = (rho[1] < zero ? OPK_CG_NON_CONVEX : OPK_CG_FINISH); return; } if (rho[0] > zero) { beta = rho[1]/rho[0]; for (i = 0; i < n; ++i) { p[i] = z[i] + beta*p[i]; } } else { beta = zero; COPY(n, z, p); } rho[3] = beta; /* Request caller to compute: q = A.p */ *state = OPK_CG_AP; return; case OPK_CG_AP: if (rho[1] > zero) { /* Caller has been requested to compute q = A.p. Compute optimal step size and update the variables and the residuals. */ pq = DOT(n, p, q); if (pq > zero) { alpha = rho[1]/pq; /* optimal step size */ rho[2] = alpha; /* memorize optimal step size */ if (alpha == zero) { /* If alpha too small, then algorithm has converged. */ *state = OPK_CG_FINISH; return; } sum = zero; for (i = 0; i < n; ++i) { tmp = x[i] + alpha*p[i]; sum += tmp*tmp; } xn = SQRT(sum); if (xn <= delta) { /* Optimal step not too long, take it. */ AXPY(n, alpha, p, x); AXPY(n, -alpha, q, r); rho[0] = rho[1]; rho[4] = xn; *state = (xn < delta ? OPK_CG_NEWX : OPK_CG_TRUNCATED); return; } } /* Operator A is not positive definite (P'.A.P < 0) or optimal step leads us ouside the trust region. In these cases, we take a truncated step X + ALPHA*P along P so that |X + ALPHA*P| = DELTA with ALPHA >= 0. This amounts to find the positive root of: A*ALPHA^2 + 2*B*ALPHA + C with: A = |P|^2, B = X'.P, and C = |X|^2 - DELTA^2. Note that the reduced discriminant is D = B*B - A*C which expands to D = (DELTA^2 - |X|^2*sin(THETA)^2)*|P|^2 with THETA the angle between X and P, as DELTA > |X|, then D > 0 must hold. */ a = DOT(n, p, p); if (a <= zero) { /* This can only occurs if P = 0 (and thus A = 0). It is probably due to rounding errors and the algorithm should be restarted. */ *state = OPK_CG_FINISH; return; } b = DOT(n, x, p); c = (rho[4] + delta)*(rho[4] - delta); /* RHO[4] = |X| */ if (c >= zero) { /* This can only occurs if the caller has modified DELTA or RHO[4], which stores the norm of X, and thus is considered as an error. */ *state = OPK_CG_ERROR; return; } /* Normalize the coefficients to avoid overflows. */ d = FABS(a); if ((e = FABS(b)) > d) d = e; if ((e = FABS(c)) > d) d = e; d = one/d; a *= d; b *= d; c *= d; /* Compute the reduced discriminant. */ d = b*b - a*c; if (d > zero) { /* The polynomial has two real roots of opposite signs (because C/A < 0 by construction), ALPHA is the positive one. Compute this root avoiding numerical errors (by adding numbers of same sign). */ e = SQRT(d); if (b >= zero) { alpha = -c/(e + b); } else { alpha = (e - b)/a; } } else { /* D > 0 must hold. There must be something wrong... */ *state = OPK_CG_ERROR; return; } if (alpha > zero) { AXPY(n, alpha, p, x); AXPY(n, -alpha, q, r); } rho[0] = rho[1]; rho[2] = alpha; /* memorize optimal step size */ rho[4] = delta; *state = OPK_CG_TRUNCATED; return; } else { /* Caller has been requested to compute q = A.x0 Update the residuals. */ for (i = 0; i < n; ++i) { r[i] -= q[i]; } rho[2] = zero; /* ALPHA = 0 (no step has been taken yet) */ rho[3] = zero; /* BETA = 0 (ditto) */ } /* Variables X and residuals R available for inspection. */ *state = OPK_CG_NEWX; return; case OPK_CG_FINISH: case OPK_CG_NON_CONVEX: case OPK_CG_TRUNCATED: /* The caller can restart the algorithm with final 'x', 'state' set to OPK_CG_RESTART and 'r' set to 'b' and, maybe, a larger 'delta'. */ return; default: /* There must be something wrong... */ *state = OPK_CG_ERROR; return; } }
int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l<loops; l++) { for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += time1; } timeg /= loops; fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } return 0; }