Example #1
0
File: blas.c Project: ngjw/mlgp
void MLGP_AXPY(unsigned N, FLOAT a, FLOAT* X, 
               unsigned incX, FLOAT* Y, unsigned incY)
{
  #ifdef DOUBLE
  #define AXPY(...) daxpy_(__VA_ARGS__)
  #else
  #define AXPY(...) saxpy_(__VA_ARGS__)
  #endif
  return AXPY(&N, &a, X, &incX, Y, &incY);
  
}
Example #2
0
void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE alpha = *((TYPE *)arg);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	AXPY((int)n, alpha, block_x, 1, block_y, 1);
}
void minimize_dual(DOUBLE *Xopt, DOUBLE *Xorig, INT length, DOUBLE *SSt, DOUBLE *SXt, DOUBLE *SXtXSt, DOUBLE trXXt, \
					DOUBLE c, INT N, INT K) {

	DOUBLE INTERV = 0.1;
	DOUBLE EXT = 3.0;   
	INT MAX = 20;       
	DOUBLE RATIO = (DOUBLE) 10;  
	DOUBLE SIG = 0.1; 
	DOUBLE RHO = SIG / (DOUBLE) 2;
	INT MN = K * 1;
	
	CHAR lamch_opt = 'U';
	DOUBLE realmin = LAMCH(&lamch_opt);

	DOUBLE red = 1;

	INT i = 0;
	INT ls_failed = 0;
	DOUBLE f0;
	DOUBLE *df0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE *dftemp = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE *df3 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE *s = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE d0;
	INT derivFlag = 1;

	DOUBLE *X = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	datacpy(X, Xorig, MN);
	
	INT maxNK = IMAX(N, K);
	DOUBLE *SStLambda = (DOUBLE *) MALLOC(maxNK * K * sizeof(DOUBLE));
	DOUBLE *tempMatrix = (DOUBLE *) MALLOC(maxNK * K * sizeof(DOUBLE));
	
	dual_obj_grad(&f0, df0, X, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix);
	
	INT incx = 1;
	INT incy = 1;
		
	datacpy(s, df0, MN);
	DOUBLE alpha = -1;
	SCAL(&MN, &alpha, s, &incx);
	
	d0 = - DOT(&MN, s, &incx, s, &incy);
	
	DOUBLE x1;
	DOUBLE x2;
	DOUBLE x3;
	DOUBLE x4;
	DOUBLE *X0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE *X3 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	DOUBLE F0;
	DOUBLE *dF0 = (DOUBLE *) MALLOC(MN * sizeof(DOUBLE));
	INT Mmin;
	DOUBLE f1;
	DOUBLE f2;
	DOUBLE f3;
	DOUBLE f4;
	DOUBLE d1;
	DOUBLE d2;
	DOUBLE d3;
	DOUBLE d4;
	INT success;
	DOUBLE A;
	DOUBLE B;
	DOUBLE sqrtquantity;
	DOUBLE tempnorm;
	DOUBLE tempinprod1;
	DOUBLE tempinprod2;
	DOUBLE tempscalefactor;
	
	x3 = red / (1 - d0);            

	while (i++ < length) {
		datacpy(X0, X, MN);
		datacpy(dF0, df0, MN);
		F0 = f0;
		Mmin = MAX;
		
		while (1) {
			x2 = 0;
			f2 = f0;
			d2 = d0;
			f3 = f0;
			
			datacpy(df3, df0, MN);
			
			success = 0;
			while ((!success) && (Mmin > 0)) {
				Mmin = Mmin - 1;
				
				datacpy(X3, X, MN);
				alpha = x3;
				AXPY(&MN, &alpha, s, &incx, X3, &incy);
				
				dual_obj_grad(&f3, df3, X3, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix);	

				if (ISNAN(f3) || ISINF(f3)) {  /* any(isnan(df3)+isinf(df3)) */
					x3 = (x2 + x3) * 0.5;
				} else {
					success = 1;
				}
			}
			
			if (f3 < F0) {
				datacpy(X0, X, MN);
				alpha = x3;
				AXPY(&MN, &alpha, s, &incx, X0, &incy);
				datacpy(dF0, df3, MN);
				F0 = f3;
			}	
			
			d3 = DOT(&MN, df3, &incx, s, &incy);

			if ((d3 > SIG * d0) || (f3 > f0 + x3 * RHO * d0) || (Mmin == 0)) {
				break;
			}
			
			x1 = x2; 
			f1 = f2; 
			d1 = d2;
			x2 = x3; 
			f2 = f3; 
			d2 = d3;
			A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1);
			B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1);
			sqrtquantity = B * B - A * d1 * (x2 - x1);

			if (sqrtquantity < 0) {
				x3 = x2 * EXT;
			} else {
				x3 = x1 - d1 * SQR(x2 - x1) / (B + SQRT(sqrtquantity));
				if (ISNAN(x3) || ISINF(x3) || (x3 < 0)) {
					x3 = x2 * EXT;
				} else if (x3 > x2 * EXT) {
					x3 = x2 * EXT;
				} else if (x3 < x2 + INTERV * (x2 - x1)) {
					x3 = x2 + INTERV * (x2 - x1);
				}
			}		
		}                
	
		while (((ABS(d3) > - SIG * d0) || (f3 > f0 + x3 * RHO * d0)) && (Mmin > 0)) {
			if ((d3 > 0) || (f3 > f0 + x3 * RHO * d0)) {
				x4 = x3;
				f4 = f3;
				d4 = d3;
			} else {
				x2 = x3;
				f2 = f3;
				d2 = d3;
			}

			if (f4 > f0) {
				x3 = x2 - (0.5 * d2 * SQR(x4 - x2)) / (f4 - f2 - d2 * (x4 - x2));
			} else {
				A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2);
				B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2);
				x3 = x2 + (SQRT(B * B - A * d2 * SQR(x4 - x2)) - B) / A;
			}

			if (ISNAN(x3) || ISINF(x3)) {
				x3 = (x2 + x4) * 0.5;
			}
			x3 = IMAX(IMIN(x3, x4 - INTERV * (x4 - x2)), x2 + INTERV * (x4 - x2));

			datacpy(X3, X, MN);
			alpha = x3;
			AXPY(&MN, &alpha, s, &incx, X3, &incy);			

			dual_obj_grad(&f3, df3, X3, SSt, SXt, SXtXSt, trXXt, c, N, K, derivFlag, SStLambda, tempMatrix);

			if (f3 < F0) {
				datacpy(X0, X, MN);
				alpha = x3;
				AXPY(&MN, &alpha, s, &incx, X0, &incy);
				datacpy(dF0, df3, MN);
				F0 = f3;
			}

			Mmin = Mmin - 1;
			d3 = DOT(&MN, df3, &incx, s, &incy);
			
		}
		
		if ((ABS(d3) < - SIG * d0) && (f3 < f0 + x3 * RHO * d0)) {
			alpha = x3;
			AXPY(&MN, &alpha, s, &incx, X, &incy);
			f0 = f3;
			datacpy(dftemp, df3, MN);
			alpha = -1;
			AXPY(&MN, &alpha, df0, &incx, dftemp, &incy);
			tempinprod1 = DOT(&MN, dftemp, &incx, df3, &incy);
			tempnorm = NRM2(&MN, df0, &incx);
			tempinprod2 = SQR(tempnorm);
			tempscalefactor = tempinprod1 / tempinprod2;

			alpha = tempscalefactor;
			SCAL(&MN, &alpha, s, &incx);
			alpha = -1;
			AXPY(&MN, &alpha, df3, &incx, s, &incy);
			datacpy(df0, df3, MN);
			d3 = d0;
			d0 = DOT(&MN, df0, &incx, s, &incy);

			if (d0 > 0) {
				datacpy(s, df0, MN);
				alpha = -1;
				SCAL(&MN, &alpha, s, &incx);
				tempnorm = NRM2(&MN, s, &incx);
				d0 = - SQR(tempnorm);
			}
			x3 = x3 * IMIN(RATIO, d3 / (d0 - realmin));
			ls_failed = 0;
		} else {
			datacpy(X, X0, MN);
			datacpy(df0, dF0, MN);
			f0 = F0;
			
			if ((ls_failed == 1) || (i > length)) {
				break;
			}
			
			datacpy(s, df0, MN);
			alpha = -1;
			SCAL(&MN, &alpha, s, &incx);
			tempnorm = NRM2(&MN, s, &incx);
			d0 = - SQR(tempnorm);
			x3 = 1 / (1 - d0);
			
			ls_failed = 1;
		}
	}

	datacpy(Xopt, X, MN);
	
	FREE(SStLambda);
	FREE(tempMatrix);
	FREE(df0);
	FREE(dftemp);
	FREE(df3);
	FREE(s);
	FREE(X);
	FREE(X0);
	FREE(X3);
	FREE(dF0);
}
Example #4
0
void
OPK_PLCG(const opk_index_t n, real_t p[], real_t q[], real_t r[],
         real_t x[], real_t z[], real_t rho[4], opk_cg_state_t *state)
{
  const real_t zero = OPK_REALCONST(0.0);
  real_t pq, alpha, beta;
  opk_index_t i;

  switch (*state) {

  case OPK_CG_START:

    /* Start with no initial guess: fill x with zeros,
       there is no needs to compute q = A.x0 since it is 0. */
    ZERO(n, x);
    rho[0] = rho[1] = rho[2] = rho[3] = zero;
    *state = OPK_CG_NEWX;
    return;

  case OPK_CG_RESTART:

    /* Start or restart with initial x given: copy initial x into p and
       request caller to compte: q = A.p */
    COPY(n, x, p);
    rho[0] = rho[1] = rho[2] = rho[3] = zero;
    *state = OPK_CG_AP;
    return;

  case OPK_CG_NEWX:

    if (z == NULL) {
      /* No preconditioning.  Take z = r and jump to next case to compute
         conjugate gradient direction. */
      z = r;
    } else {
      /* Preconditioned version of the algorithm.  Request caller to compute
         preconditioned residuals. */
      *state = OPK_CG_PRECOND;
      return;
    }

  case OPK_CG_PRECOND:

    /* Caller has been requested to compute preconditioned residuals.  Use
       conjugate gradients recurrence to compute next search direption p. */
    rho[1] = DOT(n, r, z);
    if (rho[1] <= zero) {
      /* If r'.z too small, then algorithm has converged
         or A is not positive definite. */
      *state = (rho[1] < zero ? OPK_CG_NON_CONVEX : OPK_CG_FINISH);
      return;
    }
    if (rho[0] > zero) {
      beta = rho[1]/rho[0];
      for (i = 0; i < n; ++i) {
        p[i] = z[i] + beta*p[i];
      }
    } else {
      beta = zero;
      COPY(n, z, p);
    }
    rho[3] = beta;

    /* Request caller to compute: q = A.p */
    *state = OPK_CG_AP;
    return;

  case OPK_CG_AP:

    if (rho[1] > zero) {
      /* Caller has been requested to compute q = A.p.  Compute optimal step
         size and update the variables and the residuals. */
      pq = DOT(n, p, q);
      if (pq <= zero) {
        /* Operator A is not positive definite. */
        *state = OPK_CG_NON_CONVEX;
        return;
      }
      alpha = rho[1]/pq; /* optimal step size */
      rho[2] = alpha; /* memorize optimal step size */
      if (alpha == zero) {
        /* If alpha too small, then algorithm has converged. */
        *state = OPK_CG_FINISH;
        return;
      }
      AXPY(n,  alpha, p, x);
      AXPY(n, -alpha, q, r);
      rho[0] = rho[1];
    } else {
      /* Caller has been requested to compute q = A.x0
         Update the residuals. */
      AXPY(n, -1.0, q, r);
    }

    /* Variables X and residuals R available for inspection. */
    *state = OPK_CG_NEWX;
    return;

  case OPK_CG_FINISH:
  case OPK_CG_NON_CONVEX:

    /* If state is OPK_CG_FINISH, the caller can restart the algorithm with
       final x, state set to OPK_CG_RESTART and r set to b. */
    return;

  default:

    /* There must be something wrong... */
    *state = OPK_CG_ERROR;
    return;
  }

}
Example #5
0
/*
 *  ... RHO[0] = RHO_PREV
 *  ... RHO[1] = RHO
 *  ... RHO[2] = ALPHA
 *  ... RHO[3] = BETA
 *  ... RHO[4] = |X|
 */
void
OPK_TRCG(const opk_index_t n, real_t p[], const real_t q[], real_t r[],
         real_t x[], const real_t z[], const real_t delta,
         real_t rho[5], opk_cg_state_t *state)
{
  const real_t zero = OPK_REALCONST(0.0);
  const real_t one  = OPK_REALCONST(1.0);
  real_t a, b, c, d, e, xn, sum, tmp;
  real_t pq, alpha, beta;
  long i;

  if (delta <= zero) {
    *state = OPK_CG_ERROR;
    return;
  }

  switch (*state) {

  case OPK_CG_START:

    /* Start with no initial guess: fill x with zeros,
       there is no needs to compute q = A.x0 since it is 0. */
    ZERO(n, x);
    rho[0] = rho[1] = rho[2] = rho[3] = rho[4] = zero;
    *state = OPK_CG_NEWX;
    return;

  case OPK_CG_RESTART:

    /* Start or restart with initial x given: copy initial x into p and
       request caller to compte: q = A.p */
    rho[0] = rho[1] = rho[2] = rho[3] = zero;
    xn = NRM2(n, x);
    if (xn >= delta) {
      if (xn > delta) {
        SCAL(n, delta/xn, x);
      }
      rho[4] = delta; /* |X| */
      *state = OPK_CG_TRUNCATED;
    } else {
      rho[4] = xn; /* |X| */
      COPY(n, x, p);
      *state = OPK_CG_AP;
    }
    return;

  case OPK_CG_NEWX:

    if (z == NULL) {
      /* No preconditioning.  Take z = r and jump to next case to compute
         conjugate gradient direction. */
      z = r;
    } else {
      /* Preconditioned version of the algorithm.  Request caller to compute
         preconditioned residuals. */
      *state = OPK_CG_PRECOND;
      return;
    }

  case OPK_CG_PRECOND:

    /* Caller has been requested to compute preconditioned residuals.  Use
       conjugate gradients recurrence to compute next search direption p. */
    rho[1] = DOT(n, r, z);
    if (rho[1] <= zero) {
      /* If r'.z too small, then algorithm has converged
         or preconditioner is not positive definite. */
      *state = (rho[1] < zero ? OPK_CG_NON_CONVEX : OPK_CG_FINISH);
      return;
    }
    if (rho[0] > zero) {
      beta = rho[1]/rho[0];
      for (i = 0; i < n; ++i) {
        p[i] = z[i] + beta*p[i];
      }
    } else {
      beta = zero;
      COPY(n, z, p);
    }
    rho[3] = beta;

    /* Request caller to compute: q = A.p */
    *state = OPK_CG_AP;
    return;

  case OPK_CG_AP:

    if (rho[1] > zero) {
      /* Caller has been requested to compute q = A.p.  Compute optimal step
         size and update the variables and the residuals. */
      pq = DOT(n, p, q);
      if (pq > zero) {
        alpha = rho[1]/pq; /* optimal step size */
        rho[2] = alpha; /* memorize optimal step size */
        if (alpha == zero) {
          /* If alpha too small, then algorithm has converged. */
          *state = OPK_CG_FINISH;
          return;
        }
        sum = zero;
        for (i = 0; i < n; ++i) {
          tmp = x[i] + alpha*p[i];
          sum += tmp*tmp;
        }
        xn = SQRT(sum);
        if (xn <= delta) {
          /* Optimal step not too long, take it. */
          AXPY(n,  alpha, p, x);
          AXPY(n, -alpha, q, r);
          rho[0] = rho[1];
          rho[4] = xn;
          *state = (xn < delta ? OPK_CG_NEWX : OPK_CG_TRUNCATED);
          return;
        }
      }

      /* Operator A is not positive definite (P'.A.P < 0) or optimal step
         leads us ouside the trust region.  In these cases, we take a
         truncated step X + ALPHA*P along P so that |X + ALPHA*P| = DELTA with
         ALPHA >= 0.  This amounts to find the positive root of: A*ALPHA^2 +
         2*B*ALPHA + C with: A = |P|^2, B = X'.P, and C = |X|^2 - DELTA^2.
         Note that the reduced discriminant is D = B*B - A*C which expands to
         D = (DELTA^2 - |X|^2*sin(THETA)^2)*|P|^2 with THETA the angle between
         X and P, as DELTA > |X|, then D > 0 must hold. */
      a = DOT(n, p, p);
      if (a <= zero) {
        /* This can only occurs if P = 0 (and thus A = 0).  It is probably due
           to rounding errors and the algorithm should be restarted. */
        *state = OPK_CG_FINISH;
        return;
      }
      b = DOT(n, x, p);
      c = (rho[4] + delta)*(rho[4] - delta); /* RHO[4] = |X| */
      if (c >= zero) {
        /* This can only occurs if the caller has modified DELTA or RHO[4],
           which stores the norm of X, and thus is considered as an error. */
        *state = OPK_CG_ERROR;
        return;
      }
      /* Normalize the coefficients to avoid overflows. */
      d = FABS(a);
      if ((e = FABS(b)) > d) d = e;
      if ((e = FABS(c)) > d) d = e;
      d = one/d;
      a *= d;
      b *= d;
      c *= d;
      /* Compute the reduced discriminant. */
      d = b*b - a*c;
      if (d > zero) {
        /* The polynomial has two real roots of opposite signs (because C/A <
           0 by construction), ALPHA is the positive one.  Compute this root
           avoiding numerical errors (by adding numbers of same sign). */
        e = SQRT(d);
        if (b >= zero) {
          alpha = -c/(e + b);
        } else {
          alpha = (e - b)/a;
        }
      } else {
        /* D > 0 must hold.  There must be something wrong... */
        *state = OPK_CG_ERROR;
        return;
      }
      if (alpha > zero) {
        AXPY(n,  alpha, p, x);
        AXPY(n, -alpha, q, r);
      }
      rho[0] = rho[1];
      rho[2] = alpha; /* memorize optimal step size */
      rho[4] = delta;
      *state = OPK_CG_TRUNCATED;
      return;

    } else {

      /* Caller has been requested to compute q = A.x0
         Update the residuals. */
      for (i = 0; i < n; ++i) {
        r[i] -= q[i];
      }
      rho[2] = zero; /* ALPHA = 0 (no step has been taken yet) */
      rho[3] = zero; /* BETA = 0 (ditto) */

    }

    /* Variables X and residuals R available for inspection. */
    *state = OPK_CG_NEWX;
    return;


  case OPK_CG_FINISH:
  case OPK_CG_NON_CONVEX:
  case OPK_CG_TRUNCATED:

    /* The caller can restart the algorithm with final 'x', 'state' set to
       OPK_CG_RESTART and 'r' set to 'b' and, maybe, a larger 'delta'. */
    return;

  default:

    /* There must be something wrong... */
    *state = OPK_CG_ERROR;
    return;
  }

}
Example #6
0
int main(int argc, char *argv[]){

  FLOAT *x, *y;
  FLOAT alpha[2] = { 2.0, 2.0 };
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;

  int from =   1;
  int to   = 200;
  int step =   1;

  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;

  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY")))   inc_y = atoi(p);

  fprintf(stderr, "From : %3d  To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, "   SIZE       Flops\n");

  for(m = from; m <= to; m += step)
  {

   timeg=0;

   fprintf(stderr, " %6d : ", (int)m);


   for (l=0; l<loops; l++)
   {

   	for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
			x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
   	}

   	for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
   	}
    	gettimeofday( &start, (struct timezone *)0);

    	AXPY (&m, alpha, x, &inc_x, y, &inc_y );

    	gettimeofday( &stop, (struct timezone *)0);

    	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

	timeg += time1;

    }

    timeg /= loops;

    fprintf(stderr,
	    " %10.2f MFlops %10.6f sec\n",
	    COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);

  }

  return 0;
}