示例#1
0
__cminpack_attr__
void __cminpack_func__(lmpar)(int n, real *r, int ldr, 
	const int *ipvt, const real *diag, const real *qtb, real delta, 
	real *par, real *x, real *sdiag, real *wa1, 
	real *wa2)
{
    /* Initialized data */

#define p1 .1
#define p001 .001

    /* System generated locals */
    real d1, d2;

    /* Local variables */
    int j, l;
    real fp;
    real parc, parl;
    int iter;
    real temp, paru, dwarf;
    int nsing;
    real gnorm;
    real dxnorm;

/*     ********** */

/*     subroutine lmpar */

/*     given an m by n matrix a, an n by n nonsingular diagonal */
/*     matrix d, an m-vector b, and a positive number delta, */
/*     the problem is to determine a value for the parameter */
/*     par such that if x solves the system */

/*           a*x = b ,     sqrt(par)*d*x = 0 , */

/*     in the least squares sense, and dxnorm is the euclidean */
/*     norm of d*x, then either par is zero and */

/*           (dxnorm-delta) .le. 0.1*delta , */

/*     or par is positive and */

/*           abs(dxnorm-delta) .le. 0.1*delta . */

/*     this subroutine completes the solution of the problem */
/*     if it is provided with the necessary information from the */
/*     qr factorization, with column pivoting, of a. that is, if */
/*     a*p = q*r, where p is a permutation matrix, q has orthogonal */
/*     columns, and r is an upper triangular matrix with diagonal */
/*     elements of nonincreasing magnitude, then lmpar expects */
/*     the full upper triangle of r, the permutation matrix p, */
/*     and the first n components of (q transpose)*b. on output */
/*     lmpar also provides an upper triangular matrix s such that */

/*            t   t                   t */
/*           p *(a *a + par*d*d)*p = s *s . */

/*     s is employed within lmpar and may be of separate interest. */

/*     only a few iterations are generally needed for convergence */
/*     of the algorithm. if, however, the limit of 10 iterations */
/*     is reached, then the output par will contain the best */
/*     value obtained so far. */

/*     the subroutine statement is */

/*       subroutine lmpar(n,r,ldr,ipvt,diag,qtb,delta,par,x,sdiag, */
/*                        wa1,wa2) */

/*     where */

/*       n is a positive integer input variable set to the order of r. */

/*       r is an n by n array. on input the full upper triangle */
/*         must contain the full upper triangle of the matrix r. */
/*         on output the full upper triangle is unaltered, and the */
/*         strict lower triangle contains the strict upper triangle */
/*         (transposed) of the upper triangular matrix s. */

/*       ldr is a positive integer input variable not less than n */
/*         which specifies the leading dimension of the array r. */

/*       ipvt is an integer input array of length n which defines the */
/*         permutation matrix p such that a*p = q*r. column j of p */
/*         is column ipvt(j) of the identity matrix. */

/*       diag is an input array of length n which must contain the */
/*         diagonal elements of the matrix d. */

/*       qtb is an input array of length n which must contain the first */
/*         n elements of the vector (q transpose)*b. */

/*       delta is a positive input variable which specifies an upper */
/*         bound on the euclidean norm of d*x. */

/*       par is a nonnegative variable. on input par contains an */
/*         initial estimate of the levenberg-marquardt parameter. */
/*         on output par contains the final estimate. */

/*       x is an output array of length n which contains the least */
/*         squares solution of the system a*x = b, sqrt(par)*d*x = 0, */
/*         for the output par. */

/*       sdiag is an output array of length n which contains the */
/*         diagonal elements of the upper triangular matrix s. */

/*       wa1 and wa2 are work arrays of length n. */

/*     subprograms called */

/*       minpack-supplied ... dpmpar,enorm,qrsolv */

/*       fortran-supplied ... dabs,dmax1,dmin1,dsqrt */

/*     argonne national laboratory. minpack project. march 1980. */
/*     burton s. garbow, kenneth e. hillstrom, jorge j. more */

/*     ********** */

/*     dwarf is the smallest positive magnitude. */

    dwarf = __cminpack_func__(dpmpar)(2);

/*     compute and store in x the gauss-newton direction. if the */
/*     jacobian is rank-deficient, obtain a least squares solution. */

    nsing = n;
    for (j = 0; j < n; ++j) {
	wa1[j] = qtb[j];
	if (r[j + j * ldr] == 0. && nsing == n) {
	    nsing = j;
	}
	if (nsing < n) {
	    wa1[j] = 0.;
	}
    }
# ifdef USE_CBLAS
    cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1);
# else
    if (nsing >= 1) {
        int k;
        for (k = 1; k <= nsing; ++k) {
            j = nsing - k;
            wa1[j] /= r[j + j * ldr];
            temp = wa1[j];
            if (j >= 1) {
                int i;
                for (i = 0; i < j; ++i) {
                    wa1[i] -= r[i + j * ldr] * temp;
                }
            }
        }
    }
# endif
    for (j = 0; j < n; ++j) {
	l = ipvt[j]-1;
	x[l] = wa1[j];
    }

/*     initialize the iteration counter. */
/*     evaluate the function at the origin, and test */
/*     for acceptance of the gauss-newton direction. */

    iter = 0;
    for (j = 0; j < n; ++j) {
	wa2[j] = diag[j] * x[j];
    }
    dxnorm = __cminpack_enorm__(n, wa2);
    fp = dxnorm - delta;
    if (fp <= p1 * delta) {
	goto TERMINATE;
    }

/*     if the jacobian is not rank deficient, the newton */
/*     step provides a lower bound, parl, for the zero of */
/*     the function. otherwise set this bound to zero. */

    parl = 0.;
    if (nsing >= n) {
        for (j = 0; j < n; ++j) {
            l = ipvt[j]-1;
            wa1[j] = diag[l] * (wa2[l] / dxnorm);
        }
#     ifdef USE_CBLAS
        cblas_dtrsv(CblasColMajor, CblasUpper, CblasTrans, CblasNonUnit, n, r, ldr, wa1, 1);
#     else
        for (j = 0; j < n; ++j) {
            real sum = 0.;
            if (j >= 1) {
                int i;
                for (i = 0; i < j; ++i) {
                    sum += r[i + j * ldr] * wa1[i];
                }
            }
            wa1[j] = (wa1[j] - sum) / r[j + j * ldr];
        }
#     endif
        temp = __cminpack_enorm__(n, wa1);
        parl = fp / delta / temp / temp;
    }

/*     calculate an upper bound, paru, for the zero of the function. */

    for (j = 0; j < n; ++j) {
        real sum;
#     ifdef USE_CBLAS
        sum = cblas_ddot(j+1, &r[j*ldr], 1, qtb, 1);
#     else
	sum = 0.;
        int i;
	for (i = 0; i <= j; ++i) {
	    sum += r[i + j * ldr] * qtb[i];
	}
#     endif
	l = ipvt[j]-1;
	wa1[j] = sum / diag[l];
    }
    gnorm = __cminpack_enorm__(n, wa1);
    paru = gnorm / delta;
    if (paru == 0.) {
	paru = dwarf / min(delta,(real)p1) /* / p001 ??? */;
    }

/*     if the input par lies outside of the interval (parl,paru), */
/*     set par to the closer endpoint. */

    *par = max(*par,parl);
    *par = min(*par,paru);
    if (*par == 0.) {
	*par = gnorm / dxnorm;
    }

/*     beginning of an iteration. */

    for (;;) {
        ++iter;

/*        evaluate the function at the current value of par. */

        if (*par == 0.) {
            /* Computing MAX */
            d1 = dwarf, d2 = p001 * paru;
            *par = max(d1,d2);
        }
        temp = sqrt(*par);
        for (j = 0; j < n; ++j) {
            wa1[j] = temp * diag[j];
        }
        __cminpack_func__(qrsolv)(n, r, ldr, ipvt, wa1, qtb, x, sdiag, wa2);
        for (j = 0; j < n; ++j) {
            wa2[j] = diag[j] * x[j];
        }
        dxnorm = __cminpack_enorm__(n, wa2);
        temp = fp;
        fp = dxnorm - delta;

/*        if the function is small enough, accept the current value */
/*        of par. also test for the exceptional cases where parl */
/*        is zero or the number of iterations has reached 10. */

        if (fabs(fp) <= p1 * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) {
            goto TERMINATE;
        }

/*        compute the newton correction. */

#     ifdef USE_CBLAS
        for (j = 0; j < nsing; ++j) {
            l = ipvt[j]-1;
            wa1[j] = diag[l] * (wa2[l] / dxnorm);
        }
        for (j = nsing; j < n; ++j) {
            wa1[j] = 0.;
        }
        /* exchange the diagonal of r with sdiag */
        cblas_dswap(n, r, ldr+1, sdiag, 1);
        /* solve lower(r).x = wa1, result id put in wa1 */
        cblas_dtrsv(CblasColMajor, CblasLower, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1);
        /* exchange the diagonal of r with sdiag */
        cblas_dswap( n, r, ldr+1, sdiag, 1);
#     else /* !USE_CBLAS */
        for (j = 0; j < n; ++j) {
            l = ipvt[j]-1;
            wa1[j] = diag[l] * (wa2[l] / dxnorm);
        }
        for (j = 0; j < n; ++j) {
            wa1[j] /= sdiag[j];
            temp = wa1[j];
            if (n > j+1) {
                int i;
                for (i = j+1; i < n; ++i) {
                    wa1[i] -= r[i + j * ldr] * temp;
                }
            }
        }
#     endif /* !USE_CBLAS */
        temp = __cminpack_enorm__(n, wa1);
        parc = fp / delta / temp / temp;

/*        depending on the sign of the function, update parl or paru. */

        if (fp > 0.) {
            parl = max(parl,*par);
        }
        if (fp < 0.) {
            paru = min(paru,*par);
        }

/*        compute an improved estimate for par. */

        /* Computing MAX */
        d1 = parl, d2 = *par + parc;
        *par = max(d1,d2);

/*        end of an iteration. */

    }
TERMINATE:

/*     termination. */

    if (iter == 0) {
	*par = 0.;
    }

/*     last card of subroutine lmpar. */

} /* lmpar_ */
double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
  return cblas_ddot(n, x, 1, y, 1);
}
示例#3
0
 double wrapper_cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY)
   {
   return       cblas_ddot(N, X, incX, Y, incY);
   }
示例#4
0
int nonSmoothNewton(int n, double* z, NewtonFunctionPtr* phi, NewtonFunctionPtr* jacobianPhi, int* iparam, double* dparam)
{
    if (phi == NULL || jacobianPhi == NULL)
    {
        fprintf(stderr, "NonSmoothNewton error: phi or its jacobian function = NULL pointer.\n");
        exit(EXIT_FAILURE);
    }

    int itermax = iparam[0]; // maximum number of iterations allowed
    int niter = 0; // current iteration number
    double tolerance = dparam[0];
    if (verbose > 0)
    {
        printf(" ============= Starting of Newton process =============\n");
        printf(" - tolerance: %14.7e\n - maximum number of iterations: %i\n", tolerance, itermax);
    }

    int incx = 1;
    int n2 = n * n;
    int infoDGESV;

    /* Memory allocation for phi and its jacobian */
    double * phiVector = (double*)malloc(n * sizeof(*phiVector));
    double *jacobianPhiMatrix = (double*)malloc(n2 * sizeof(*jacobianPhiMatrix));
    /** merit function and its jacobian */
    double psi;
    double *jacobian_psi = (double*)malloc(n * sizeof(*jacobian_psi));
    int* ipiv = (int *)malloc(n * sizeof(*ipiv));
    if (phiVector == NULL || jacobianPhiMatrix == NULL ||  jacobian_psi == NULL || ipiv == NULL)
    {
        fprintf(stderr, "NonSmoothNewton, memory allocation failed.\n");
        exit(EXIT_FAILURE);
    }

    /** The algorithm is alg 4.1 of the paper of Kanzow and Kleinmichel, "A new class of semismooth Newton-type methods
        for nonlinear complementarity problems", in Computational Optimization and Applications, 11, 227-251 (1998).

        We try to keep the same notations
    */

    double rho = 1e-8;
    double descentCondition, criterion, norm_jacobian_psi, normPhi;
    double p = 2.1;
    double terminationCriterion = 1;
    if (jacobian_psi == NULL)
    {
        fprintf(stderr, "NonSmoothNewton, memory allocation failed for jacobian_psi.\n");
        exit(EXIT_FAILURE);
    }

    /** Iterations ... */
    while ((niter < itermax) && (terminationCriterion > tolerance))
    {
        ++niter;
        /** Computes phi and its jacobian */
        (*phi)(n, z, phiVector, 0);
        (*jacobianPhi)(n, z, jacobianPhiMatrix, 1);
        /* Computes the jacobian of the merit function, jacobian_psi = transpose(jacobianPhiMatrix).phiVector */
        cblas_dgemv(CblasColMajor,CblasTrans, n, n, 1.0, jacobianPhiMatrix, n, phiVector, incx, 0.0, jacobian_psi, incx);
        norm_jacobian_psi = cblas_dnrm2(n, jacobian_psi, 1);

        /* Computes norm2(phi) */
        normPhi = cblas_dnrm2(n, phiVector, 1);
        /* Computes merit function */
        psi = 0.5 * normPhi * normPhi;

        /* Stops if the termination criterion is satisfied */
        terminationCriterion = norm_jacobian_psi;
        if (terminationCriterion < tolerance)
            break;

        /* Search direction calculation
        Find a solution dk of jacobianPhiMatrix.d = -phiVector.
        dk is saved in phiVector.
        */
        cblas_dscal(n , -1.0 , phiVector, incx);
        DGESV(n, 1, jacobianPhiMatrix, n, ipiv, phiVector, n, &infoDGESV);

        /* descentCondition = jacobian_psi.dk */
        descentCondition = cblas_ddot(n, jacobian_psi,  1,  phiVector, 1);

        /* Criterion to be satisfied: error < -rho*norm(dk)^p */
        criterion = cblas_dnrm2(n, phiVector, 1);
        criterion = -rho * pow(criterion, p);

        if (infoDGESV != 0 || descentCondition > criterion)
        {
            /* dk = - jacobian_psi (remind that dk is saved in phiVector) */
            cblas_dcopy(n, jacobian_psi, 1, phiVector, 1);
            cblas_dscal(n , -1.0 , phiVector, incx);
        }

        /* Step-3 Line search: computes z_k+1 */
        linesearch_Armijo(n, z, phiVector, psi, descentCondition, phi);

        if (verbose > 0)
        {
            printf("Non Smooth Newton, iteration number %i, error equal to %14.7e .\n", niter, terminationCriterion);
            printf(" -----------------------------------------------------------------------\n");
        }
    }

    /* Total number of iterations */
    iparam[1] = niter;
    /* Final error */
    dparam[1] = terminationCriterion;

    /** Free memory*/
    free(phiVector);
    free(jacobianPhiMatrix);
    free(jacobian_psi);
    free(ipiv);

    if (verbose > 0)
    {
        if (dparam[1] > tolerance)
            printf("Non Smooth Newton warning: no convergence after %i iterations\n" , niter);

        else
            printf("Non Smooth Newton: convergence after %i iterations\n" , niter);
        printf(" The residue is : %e \n", dparam[1]);
    }

    if (dparam[1] > tolerance)
        return 1;
    else return 0;
}
double caffe_cpu_strided_dot<double>(const int n, const double* x,
    const int incx, const double* y, const int incy) {
  return cblas_ddot(n, x, incx, y, incy);
}
void fc3d_HyperplaneProjection(FrictionContactProblem* problem, double *reaction, double *velocity, int* info, SolverOptions* options)
{
  /* int and double parameters */
  int* iparam = options->iparam;
  double* dparam = options->dparam;
  /* Number of contacts */
  int nc = problem->numberOfContacts;
  double* q = problem->q;
  NumericsMatrix* M = problem->M;
  double* mu = problem->mu;
  /* Dimension of the problem */
  int n = 3 * nc;
  /* Maximum number of iterations */
  int itermax = iparam[0];
  /* Maximum number of iterations in Line--search */
  int lsitermax = iparam[1];
  /* Tolerance */
  double tolerance = dparam[0];
  double norm_q = cblas_dnrm2(nc*3 , problem->q , 1);
 




  /*****  Fixed point iterations *****/
  int iter = 0; /* Current iteration number */
  double error = 1.; /* Current error */
  int hasNotConverged = 1;
  int contact; /* Number of the current row of blocks in M */
  int nLocal = 3;
  dparam[0] = dparam[2]; // set the tolerance for the local solver
  double * velocitytmp = (double *)calloc(n, sizeof(double));
  double * reactiontmp = (double *)calloc(n, sizeof(double));
  double * reactiontmp2 = (double *)calloc(n, sizeof(double));
  double * reactiontmp3 = (double *)calloc(n, sizeof(double));

  /* double tau = 1.0; */
  double sigma = 0.99;

  /* if (dparam[3] > 0.0) */
  /* { */
  /*   tau = dparam[3]; */
  /* } */
  /* else */
  /* { */
  /*   printf("Hyperplane Projection method. tau <=0  is not well defined\n"); */
  /*   printf("Hyperplane Projection method. rho is set to 1.0\n"); */

  /* } */
  if (dparam[4] > 0.0 && dparam[4] < 1.0)
  {
    sigma = dparam[4];
  }
  else
  {
    printf("Hyperplane Projection method. 0<sigma <1  is not well defined\n");
    printf("Hyperplane Projection method. sigma is set to 0.99\n");
  }
 
  /*   double minusrho  = -1.0*rho; */
  while ((iter < itermax) && (hasNotConverged > 0))
  {
    ++iter;

    cblas_dcopy(n , q , 1 , velocitytmp, 1);
    cblas_dcopy(n , reaction , 1 , reactiontmp, 1);

    NM_gemv(1.0, M, reactiontmp, 1.0, velocitytmp);


    // projection for each contact

    double rho = 1;

    for (contact = 0 ; contact < nc ; ++contact)
    {
      int pos = contact * nLocal;
      double  normUT = sqrt(velocitytmp[pos + 1] * velocitytmp[pos + 1] + velocitytmp[pos + 2] * velocitytmp[pos + 2]);
      reactiontmp[pos] -= rho * (velocitytmp[pos] + mu[contact] * normUT);
      reactiontmp[pos + 1] -= rho * velocitytmp[pos + 1];
      reactiontmp[pos + 2] -= rho * velocitytmp[pos + 2];
      projectionOnCone(&reactiontmp[pos], mu[contact]);
    }

    // Armijo line search

    int stopingcriteria = 1;
    int i = -1;
    double alpha ;
    double lhs = NAN;
    double rhs;
    // z_k-y_k
    cblas_dcopy(n , reaction , 1 , reactiontmp3, 1);
    cblas_daxpy(n, -1.0, reactiontmp, 1, reactiontmp3, 1);


    while (stopingcriteria && (i < lsitermax))
    {
      i++ ;
      cblas_dcopy(n , reactiontmp , 1 , reactiontmp2, 1);
      alpha = 1.0 / (pow(2.0, i));
#ifdef VERBOSE_DEBUG
      printf("alpha = %f\n", alpha);
#endif
      cblas_dscal(n , alpha, reactiontmp2, 1);
      alpha  = 1.0 - alpha;

      cblas_daxpy(n, alpha, reaction, 1, reactiontmp2, 1);

      cblas_dcopy(n , q , 1 , velocitytmp, 1);

      NM_gemv(1.0, M, reactiontmp2, 1.0, velocitytmp);



      /* #ifdef VERBOSE_DEBUG */
      /*     for (contact = 0 ; contact < nc ; ++contact) */
      /*     { */
      /*       for(int kk=0; kk<3;kk++) printf("reactiontmp2[%i]=%12.8e\t",contact*nLocal+kk,  reactiontmp2[contact*nLocal+kk]); */
      /*       printf("\n"); */
      /*     } */
      /* #endif   */
      lhs = cblas_ddot(n, velocitytmp, 1, reactiontmp3, 1);
      rhs = cblas_dnrm2(n, reactiontmp3, 1);
      rhs = sigma / rho * rhs * rhs;
      if (lhs >= rhs)  stopingcriteria = 0;
#ifdef VERBOSE_DEBUG
      printf("Number of iteration in Armijo line search = %i\n", i);
      printf("lhs = %f\n", lhs);
      printf("rhs = %f\n", rhs);
      printf("alpha = %f\n", alpha);
      printf("sigma = %f\n", sigma);
      printf("rho = %f\n", rho);
#endif
    }

    double nonorm = cblas_dnrm2(n, velocitytmp, 1);
    double rhoequiv = lhs / (nonorm * nonorm);
#ifdef VERBOSE_DEBUG
    printf("rho equiv = %f\n", rhoequiv);
#endif
    cblas_daxpy(n, -rhoequiv, velocitytmp, 1, reaction  , 1);


    // projection for each contact
    for (contact = 0 ; contact < nc ; ++contact)
    {
      int pos = contact * nLocal;
      projectionOnCone(&reaction[pos], mu[contact]);
    }

    /* **** Criterium convergence **** */
    fc3d_compute_error(problem, reaction , velocity, tolerance, options, norm_q, &error);

    if (options->callback)
    {
      options->callback->collectStatsIteration(options->callback->env, nc * 3, 
                                      reaction, velocity, 
                                      error, NULL);
    }

    if (verbose > 0)
      printf("--------------- FC3D - Hyperplane Projection (HP) - Iteration %i rho = %14.7e \t rhoequiv = %14.7e \tError = %14.7e\n", iter, rho, rhoequiv, error);

    if (error < tolerance) hasNotConverged = 0;
    *info = hasNotConverged;
  }
  if (verbose > 0)
    printf("--------------- FC3D - Hyperplane Projection (HP) - #Iteration %i Final Residual = %14.7e\n", iter, error);
  dparam[0] = tolerance;
  dparam[1] = error;
  iparam[7] = iter;
  free(velocitytmp);
  free(reactiontmp);
  free(reactiontmp2);
  free(reactiontmp3);

}
// Update inverse a1 after column in matrix a has changed
// get new column out of array1 newCol
//
// a1= old inverse
// newCol = new column lCol in the new matrix a_new
// returns Det(a_old)/Det(a_new)
doublevar InverseUpdateColumn(Array2 <doublevar> & a1, const Array1 <doublevar> & newCol,
                              const int lCol, const int n)
{
  Array1 <doublevar> & tmpColL(tmp11);
  tmpColL.Resize(n);
  Array1 <doublevar> & prod(tmp12);
  prod.Resize(n);

  doublevar f=0.0;

#ifdef USE_BLAS
  int a1size=a1.GetDim(1);

  doublevar * a1col=a1.v+lCol*a1size;

  f=cblas_ddot(n,a1col, 1, newCol.v, 1);
  f=-1.0/f;

  cblas_dcopy(n,a1col,1,tmpColL.v,1);
  
  cblas_dgemv(CblasRowMajor,CblasNoTrans,n,n,
              1.0,a1.v,a1size,
              newCol.v,1,
              0.0,prod.v,1);

  cblas_dscal(n,f,prod.v,1);

  cblas_dger(CblasRowMajor, n,n,1.0,
             prod.v,1,
             tmpColL.v,1,
             a1.v,a1size);
  f=-f;
  cblas_dcopy(n,tmpColL.v,1,a1col,1);
  cblas_dscal(n,f,a1col,1);

#else 

  for(int i=0;i<n;++i)
  {
    f += a1(lCol,i)*newCol[i];
  }
  f =-1.0/f;

  for(int j=0;j<n;++j)
  {
    tmpColL[j]=a1(lCol,j);
    prod[j]   =0.0;
    for(int i=0;i<n;++i)
    {
      prod[j] += a1(j,i)*newCol[i];
    }
    prod[j] *= f;
  }

  for(int i=0;i<n;++i)
  {
    doublevar & p(prod[i]);
    for(int j=0;j<n;++j)
    {
      a1(i,j) += tmpColL[j]*p;
    }
  }

  f = -f;
  for(int j=0;j<n;++j)
  {
    a1(lCol,j) = f*tmpColL[j];
  }

#endif

  return f;
}
示例#8
0
void ncp_pathsearch(NonlinearComplementarityProblem* problem, double* z, double* F, int *info , SolverOptions* options)
{
/* Main step of the algorithm:
 * - compute jacobians
 * - call modified lemke
*/

  unsigned int n = problem->n;
  unsigned int preAlloc = options->iparam[SICONOS_IPARAM_PREALLOC];
  int itermax = options->iparam[SICONOS_IPARAM_MAX_ITER];

  double merit_norm = 1.0;
  double nn_tol = options->dparam[SICONOS_DPARAM_TOL];
  int nbiter = 0;

  /* declare a LinearComplementarityProblem on the stack*/
  LinearComplementarityProblem lcp_subproblem;
  lcp_subproblem.size = n;


  /* do some allocation if required
   * - nabla_F (used also as M for the LCP subproblem)
   * - q for the LCP subproblem
   *
   * Then fill the LCP subproblem
   */
  if (!preAlloc || (preAlloc && !options->internalSolvers))
  {
    options->internalSolvers = (SolverOptions *) malloc(sizeof(SolverOptions));
    solver_options_set(options->internalSolvers, SICONOS_LCP_PIVOT);
    options->numberOfInternalSolvers = 1;

    SolverOptions * lcp_options = options->internalSolvers;

    /* We always allocation once and for all since we are supposed to solve
     * many LCPs */
    lcp_options->iparam[SICONOS_IPARAM_PREALLOC] = 1;
    /* set the right pivot rule */
    lcp_options->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH;
    /* set the right stacksize */
    lcp_options->iparam[SICONOS_IPARAM_PATHSEARCH_STACKSIZE] = options->iparam[SICONOS_IPARAM_PATHSEARCH_STACKSIZE];
  }


  assert(problem->nabla_F);
  lcp_subproblem.M = problem->nabla_F;


  if (!preAlloc || (preAlloc && !options->dWork))
  {
    options->dWork = (double *) malloc(4*n*sizeof(double));
  }
  lcp_subproblem.q = options->dWork;
  double* x = &options->dWork[n];
  double* x_plus = &options->dWork[2*n];
  double* r = &options->dWork[3*n];

  NMS_data* data_NMS;
  functions_LSA* functions;

  if (!preAlloc || (preAlloc && !options->solverData))
  {
    options->solverData = malloc(sizeof(pathsearch_data));
    pathsearch_data* solverData = (pathsearch_data*) options->solverData;

    /* do all the allocation */
    solverData->data_NMS = create_NMS_data(n, NM_DENSE, options->iparam, options->dparam);
    solverData->lsa_functions = (functions_LSA*) malloc(sizeof(functions_LSA));
    solverData->data_NMS->set = malloc(sizeof(positive_orthant));

    data_NMS = solverData->data_NMS;
    functions = solverData->lsa_functions;
    /* for use in NMS;  only those 3 functions are called */
    init_lsa_functions(functions, &FB_compute_F_ncp, &ncp_FB);
    functions->compute_H = &FB_compute_H_ncp;

    set_set_id(data_NMS->set, SICONOS_SET_POSITIVE_ORTHANT);

    /* fill ls_data */
    data_NMS->ls_data->compute_F = functions->compute_F;
    data_NMS->ls_data->compute_F_merit = functions->compute_F_merit;
    data_NMS->ls_data->z = NULL; /* XXX to check -- xhub */
    data_NMS->ls_data->zc = NMS_get_generic_workV(data_NMS->workspace, n);
    data_NMS->ls_data->F = NMS_get_F(data_NMS->workspace, n);
    data_NMS->ls_data->F_merit = NMS_get_F_merit(data_NMS->workspace, n);
    data_NMS->ls_data->desc_dir = NMS_get_dir(data_NMS->workspace, n);
    /** \todo this value should be settable by the user with a default value*/
    data_NMS->ls_data->alpha_min = fmin(data_NMS->alpha_min_watchdog, data_NMS->alpha_min_pgrad);
    data_NMS->ls_data->data = (void*)problem;
    data_NMS->ls_data->set = data_NMS->set;
    data_NMS->ls_data->sigma = options->dparam[SICONOS_DPARAM_NMS_SIGMA];
    /* data_NMS->ls_data->searchtype is set in the NMS code */
  }
  else
  {
    pathsearch_data* solverData = (pathsearch_data*) options->solverData;
    data_NMS = solverData->data_NMS;
    functions = solverData->lsa_functions;
  }

  /* initial value for ref_merit */
  problem->compute_F(problem->env, n, z, F);
  functions->compute_F_merit(problem, z, F, data_NMS->ls_data->F_merit);

  data_NMS->ref_merit = .5 * cblas_ddot(n, data_NMS->ls_data->F_merit, 1, data_NMS->ls_data->F_merit, 1);
  data_NMS->merit_bestpoint = data_NMS->ref_merit;
  cblas_dcopy(n, z, 1, NMS_checkpoint_0(data_NMS, n), 1);
  cblas_dcopy(n, z, 1, NMS_checkpoint_T(data_NMS, n), 1);
  cblas_dcopy(n, z, 1, NMS_bestpoint(data_NMS, n), 1);
  /* -------------------- end init ---------------------------*/

  int nms_failed = 0;
  double err = 10*nn_tol;

  /* to check the solution */
  LinearComplementarityProblem lcp_subproblem_check;
  int check_lcp_solution = 1; /* XXX add config for that */

  double normal_norm2_newton_point;

  /* F is already computed here at z */

  while ((err > nn_tol) && (nbiter < itermax) && !nms_failed)
  {
    int force_watchdog_step = 0;
    int force_d_step_merit_check = 0;
    double check_ratio = 0.0;
    nbiter++;
    /* update M, q and r */

    /* First find x */
    ncp_pathsearch_compute_x_from_z(n, z, F, x);
    pos_part(n, x, x_plus); /* update x_plus */

    ncp_pathsearch_update_lcp_data(problem, &lcp_subproblem, n, x_plus, x, r);

    if (check_lcp_solution)
    {
      lcp_subproblem_check.size = n;
      lcp_subproblem_check.M = problem->nabla_F;
      lcp_subproblem_check.q = lcp_subproblem.q;
      //cblas_dcopy(n, x, 1, lcp_subproblem_check.q , 1);
      //prodNumericsMatrix(n, n, -1.0, problem->nabla_F, x_plus, 0.0, lcp_subproblem.q);
    }

    double norm_r2 = cblas_ddot(n, r, 1, r, 1);
    if (norm_r2 < DBL_EPSILON*DBL_EPSILON) /* ||r|| < 1e-15 */
    {
      DEBUG_PRINTF("ncp_pathsearch :: ||r||  = %e < %e; path search procedure was successful!\n", norm_r2, DBL_EPSILON*DBL_EPSILON);
      (*info) = 0;
      ncp_compute_error(n, z, F, nn_tol, &err); /* XXX F should be up-to-date, we should check only CC*/
      break;
    }

    /* end update M, q and r */

    lcp_pivot_covering_vector(&lcp_subproblem, x_plus, x, info, options->internalSolvers, r);

    switch (*info)
    {
      case LCP_PIVOT_SUCCESS:
        DEBUG_PRINT("ncp_pathsearch :: path search procedure was successful!\n");
        if (check_lcp_solution)
        {
          double err_lcp = 0.0;
          cblas_daxpy(n, 1.0, r, 1, lcp_subproblem_check.q, 1);
          lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp);
          double local_tol = fmax(1e-14, DBL_EPSILON*sqrt(norm_r2));
          printf("ncp_pathsearch :: lcp solved with error = %e; local_tol = %e\n", err_lcp, local_tol);
          //assert(err_lcp < local_tol && "ncp_pathsearch :: lcp solved with very bad precision");
          if (err_lcp > local_tol)
          {
            printf("ncp_pathsearch :: lcp solved with very bad precision\n");
            NM_display(lcp_subproblem.M);
            printf("z r q x_plus\n");
            for (unsigned i = 0; i < n; ++i) printf("%e %e %e %e\n", z[i], r[i], lcp_subproblem.q[i], x_plus[i]);
            options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = 0;
            lcp_pivot(&lcp_subproblem, x_plus, x, info, options->internalSolvers);
            options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH;
            lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp);
            printf("ncp_pathsearch :: lcp resolved with error = %e; local_tol = %e\n", err_lcp, local_tol);
          }


          /* XXX missing recompute x ?*/
          /* recompute the normal norm */
          problem->compute_F(problem->env, n, x_plus, r);
          cblas_daxpy(n, -1.0, x, 1, r, 1);
          normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1);
          if (normal_norm2_newton_point > norm_r2)
          {
            printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2);
            //assert(normal_norm2_newton_point <= norm_r2);
          }
          else
          {
            printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %e < %e\n", normal_norm2_newton_point, norm_r2);
            //check_ratio = norm_r2/normal_norm2_newton_point;
          }
          if (50*normal_norm2_newton_point < norm_r2)
          {
            force_d_step_merit_check = 1;
          }
          else if (10*normal_norm2_newton_point < norm_r2)
          {
//            check_ratio = sqrt(norm_r2/normal_norm2_newton_point);
          }
        }
        break;
      case LCP_PIVOT_RAY_TERMINATION:
        DEBUG_PRINT("ncp_pathsearch :: ray termination, let's fastened your seat belt!\n");
        break;
      case LCP_PATHSEARCH_LEAVING_T:
        DEBUG_PRINT("ncp_pathsearch :: leaving t, fastened your seat belt!\n");
        DEBUG_PRINTF("ncp_pathsearch :: max t value = %e\n", options->internalSolvers->dparam[2]); /* XXX fix 2 */
        /* try to retry solving the problem */
        /* XXX keep or not ? */
        /* recompute the normal norm */
        problem->compute_F(problem->env, n, x_plus, r);
        cblas_daxpy(n, -1.0, x, 1, r, 1);
        normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1);
        if (normal_norm2_newton_point > norm_r2)
        {
          printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2);
          //assert(normal_norm2_newton_point <= norm_r2);
        }
        else
        {
          printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %e < %e\n", normal_norm2_newton_point, norm_r2);
          check_ratio = 5.0*norm_r2/normal_norm2_newton_point;
        }
        if (options->internalSolvers->dparam[2] > 1e-5) break;
        memset(x_plus, 0, sizeof(double) * n);
        problem->compute_F(problem->env, n, x_plus, r);
        ncp_pathsearch_compute_x_from_z(n, x_plus, r, x);
        ncp_pathsearch_update_lcp_data(problem, &lcp_subproblem, n, x_plus, x, r);
        lcp_pivot_covering_vector(&lcp_subproblem, x_plus, x, info, options->internalSolvers, r);
        if (*info == LCP_PIVOT_SUCCESS)
        {
           DEBUG_PRINT("ncp_pathsearch :: Lemke start worked !\n");
           double err_lcp = 0.0;
           cblas_daxpy(n, 1.0, r, 1, lcp_subproblem_check.q, 1);
           lcp_compute_error(&lcp_subproblem_check, x_plus, x, 1e-14, &err_lcp);
           double local_tol = fmax(1e-14, DBL_EPSILON*sqrt(norm_r2));
           printf("ncp_pathsearch :: lcp solved with error = %e; local_tol = %e\n", err_lcp, local_tol);
           assert(err_lcp < local_tol);
        }
        else
        {
          NM_display(lcp_subproblem.M);
          printf("z r q x_plus\n");
          for (unsigned i = 0; i < n; ++i) printf("%e %e %e %e\n", z[i], r[i], lcp_subproblem.q[i], x_plus[i]);
          DEBUG_PRINT("ncp_pathsearch :: Lemke start did not succeeded !\n");
          lcp_pivot_diagnose_info(*info);
          if (*info == LCP_PATHSEARCH_LEAVING_T)
          {
            DEBUG_PRINTF("ncp_pathsearch :: max t value after Lemke start = %e\n", options->internalSolvers->dparam[2]);
          }
          options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = 0;
          lcp_pivot(&lcp_subproblem, x_plus, x, info, options->internalSolvers);
          options->internalSolvers->iparam[SICONOS_IPARAM_PIVOT_RULE] = SICONOS_LCP_PIVOT_PATHSEARCH;
          double err_lcp = 0.0;
          lcp_compute_error(&lcp_subproblem, x_plus, x, 1e-14, &err_lcp);
          printf("ncp_pathsearch :: lemke start resolved with info = %d; error = %e\n", *info, err_lcp);
          printf("x_plus x_minus\n");
          for (unsigned i = 0; i < n; ++i) printf("%e %e\n", x_plus[i], x[i]);
          /* recompute the normal norm */
          problem->compute_F(problem->env, n, x_plus, r);
          cblas_daxpy(n, -1.0, x, 1, r, 1);
          double normal_norm2_newton_point = cblas_ddot(n, r, 1, r, 1);
          if (normal_norm2_newton_point > norm_r2)
          {
            printf("ncp_pathsearch :: lcp successfully solved, but the norm of the normal map increased! %e > %e\n", normal_norm2_newton_point, norm_r2);
            //assert(normal_norm2_newton_point <= norm_r2);
          }
          else
          {
             printf("ncp_pathsearch :: lcp successfully solved, norm of the normal map decreased! %.*e < %.*e\n", DECIMAL_DIG, normal_norm2_newton_point, DECIMAL_DIG, norm_r2);
          }
          if (100*normal_norm2_newton_point < norm_r2)
          {
            force_d_step_merit_check = 1;
          }
        }
        break;
      case LCP_PIVOT_NUL:
        printf("ncp_pathsearch :: kaboom, kaboom still more work needs to be done\n");
        lcp_pivot_diagnose_info(*info);
//        exit(EXIT_FAILURE);
        force_watchdog_step = 1;
        break;
      case LCP_PATHSEARCH_NON_ENTERING_T:
        DEBUG_PRINT("ncp_pathsearch :: non entering t, something is wrong here. Fix the f****** code!\n");
        assert(0 && "ncp_pathsearch :: non entering t, something is wrong here\n"); 
        force_watchdog_step = 1;
        break;
      default:
        printf("ncp_pathsearch :: unknown code returned by the path search\n");
        exit(EXIT_FAILURE);
    }

    nms_failed = NMS(data_NMS, problem, functions, z, x_plus, force_watchdog_step, force_d_step_merit_check, check_ratio);
    /* at this point z has been updated */

    /* recompute the normal norm */
    problem->compute_F(problem->env, n, z, F);
    functions->compute_F_merit(problem, z, F, data_NMS->ls_data->F_merit);

    /* XXX is this correct ? */
    merit_norm = .5 * cblas_ddot(n, data_NMS->ls_data->F_merit, 1, data_NMS->ls_data->F_merit, 1);

    ncp_compute_error(n, z, F, nn_tol, &err); /* XXX F should be up-to-date, we should check only CC*/
    DEBUG_PRINTF("ncp_pathsearch :: iter = %d, ncp_error = %e; merit_norm^2 = %e\n", nbiter, err, merit_norm);

  }

  options->iparam[1] = nbiter;
  options->dparam[1] = err;
  if (nbiter == itermax)
  {
    *info = 1;
  }
  else if (nms_failed)
  {
    *info = 2;
  }
  else
  {
    *info = 0;
  }

  DEBUG_PRINTF("ncp_pathsearch procedure finished :: info = %d; iter = %d; ncp_error = %e; merit_norm^2 = %e\n", *info, nbiter, err, merit_norm);

  if (!preAlloc)
  {
    freeNumericsMatrix(problem->nabla_F);
    free(problem->nabla_F);
    problem->nabla_F = NULL;
    free(options->dWork);
    options->dWork = NULL;
    solver_options_delete(options->internalSolvers);
    free(options->internalSolvers);
    options->internalSolvers = NULL;
    free_NMS_data(data_NMS);
    free(functions);
    free(options->solverData);
    options->solverData = NULL;
  }
}
示例#9
0
// ----------------------------------------
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    //real_Double_t   t_m, t_c, t_f;
    magma_int_t ione = 1;
    
    double  *A, *B;
    double diff, error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld;
    double x2_m, x2_c;  // real x for magma, cblas/fortran blas respectively
    double x_m, x_c;  // x for magma, cblas/fortran blas respectively
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    opts.tolerance = max( 100., opts.tolerance );
    double tol = opts.tolerance * lapackf77_dlamch("E");
    gTol = tol;
    
    printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n"
            "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA.  !!\n"
            "\n"
            "Diff  compares MAGMA wrapper        to CBLAS and BLAS function; should be exactly 0.\n"
            "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n"
            "\n" );
    
    double total_diff  = 0.;
    double total_error = 0.;
    int inc[] = { 1 };  //{ -2, -1, 1, 2 };  //{ 1 };  //{ -1, 1 };
    int ninc = sizeof(inc)/sizeof(*inc);
    
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        m = opts.msize[itest];
        n = opts.nsize[itest];
        k = opts.ksize[itest];
        
    for( int iincx = 0; iincx < ninc; ++iincx ) {
        magma_int_t incx = inc[iincx];
        
    for( int iincy = 0; iincy < ninc; ++iincy ) {
        magma_int_t incy = inc[iincy];
        
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n",
                (int) m, (int) n, (int) k, (int) incx, (int) incy );
        printf( "Function              MAGMA     CBLAS     BLAS        Diff      Error\n"
                "                      msec      msec      msec\n" );
        
        // allocate matrices
        // over-allocate so they can be any combination of
        // {m,n,k} * {abs(incx), abs(incy)} by
        // {m,n,k} * {abs(incx), abs(incy)}
        maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) );
        ld = max( 1, maxn );
        size = ld*maxn;
        magma_dmalloc_pinned( &A,  size );  assert( A   != NULL );
        magma_dmalloc_pinned( &B,  size );  assert( B   != NULL );
        
        // initialize matrices
        lapackf77_dlarnv( &ione, ISEED, &size, A );
        lapackf77_dlarnv( &ione, ISEED, &size, B );
        
        printf( "Level 1 BLAS ----------------------------------------------------------\n" );
        
        
        // ----- test DASUM
        // get one-norm of column j of A
        if ( incx > 0 && incx == incy ) {  // positive, no incy
            diff  = 0;
            error = 0;
            for( int j = 0; j < k; ++j ) {
                x_m = magma_cblas_dasum( m, A(0,j), incx );
                
                x_c = cblas_dasum( m, A(0,j), incx );
                diff += fabs( x_m - x_c );
                
                x_c = blasf77_dasum( &m, A(0,j), &incx );
                error += fabs( (x_m - x_c) / (m*x_c) );
            }
            output( "dasum", diff, error );
            total_diff  += diff;
            total_error += error;
        }
        
        // ----- test DNRM2
        // get two-norm of column j of A
        if ( incx > 0 && incx == incy ) {  // positive, no incy
            diff  = 0;
            error = 0;
            for( int j = 0; j < k; ++j ) {
                x_m = magma_cblas_dnrm2( m, A(0,j), incx );
                
                x_c = cblas_dnrm2( m, A(0,j), incx );
                diff += fabs( x_m - x_c );
                
                x_c = blasf77_dnrm2( &m, A(0,j), &incx );
                error += fabs( (x_m - x_c) / (m*x_c) );
            }
            output( "dnrm2", diff, error );
            total_diff  += diff;
            total_error += error;
        }
        
        // ----- test DDOT
        // dot columns, Aj^H Bj
        diff  = 0;
        error = 0;
        for( int j = 0; j < k; ++j ) {
            // MAGMA implementation, not just wrapper
            x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy );
            
            // crashes on MKL 11.1.2, ILP64
            #if ! defined( MAGMA_WITH_MKL )
                #ifdef COMPLEX
                cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c );
                #else
                x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy );
                #endif
                error += fabs( x2_m - x2_c ) / fabs( m*x2_c );
            #endif
            
            // crashes on MacOS 10.9
            #if ! defined( __APPLE__ )
                x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy );
                error += fabs( x2_m - x2_c ) / fabs( m*x2_c );
            #endif
        }
        output( "ddot", diff, error );
        total_diff  += diff;
        total_error += error;
        total_error += error;
        
        // ----- test DDOT
        // dot columns, Aj^T * Bj
        diff  = 0;
        error = 0;
        for( int j = 0; j < k; ++j ) {
            // MAGMA implementation, not just wrapper
            x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy );
            
            // crashes on MKL 11.1.2, ILP64
            #if ! defined( MAGMA_WITH_MKL )
                #ifdef COMPLEX
                cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c );
                #else
                x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy );
                #endif
                error += fabs( x2_m - x2_c ) / fabs( m*x2_c );
            #endif
            
            // crashes on MacOS 10.9
            #if ! defined( __APPLE__ )
                x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy );
                error += fabs( x2_m - x2_c ) / fabs( m*x2_c );
            #endif
        }
        output( "ddot", diff, error );
        total_diff  += diff;
        total_error += error;
        
        // tell user about disabled functions
        #if defined( MAGMA_WITH_MKL )
            printf( "cblas_ddot and cblas_ddot disabled with MKL (segfaults)\n" );
        #endif
        
        #if defined( __APPLE__ )
            printf( "blasf77_ddot and blasf77_ddot disabled on MacOS (segfaults)\n" );
        #endif
            
        // cleanup
        magma_free_pinned( A );
        magma_free_pinned( B );
        fflush( stdout );
    }}}  // itest, incx, incy
    
    // TODO use average error?
    printf( "sum diffs  = %8.2g, MAGMA wrapper        compared to CBLAS and Fortran BLAS; should be exactly 0.\n"
            "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n",
            total_diff, total_error );
    if ( total_diff != 0. ) {
        printf( "some tests failed diff == 0.; see above.\n" );
    }
    else {
        printf( "all tests passed diff == 0.\n" );
    }
    
    TESTING_FINALIZE();
    
    int status = (total_diff != 0.);
    return status;
}
示例#10
0
int solve_bicgstab_block(csr_t* mat, csr_t** ilu, int nb, double* b, double* x)
{
	int n = mat->n;
	int nnz = mat->nnz;
	int *offset_ilu = (int*) calloc(nb, sizeof(int));
	for ( int i = 1; i < nb; i++ )
		offset_ilu[i] = offset_ilu[i-1] + ilu[i-1]->n;
	
	double tol = 1e-6, floatone = 1.0;
	const int max_iter = 200;
    
	double *r, *p, *y, *zm1, *zm2, *rm2, *rm1, *rm3, nrm0, nrm;
	r = (double*) malloc (sizeof(double) * n);
	p = (double*) malloc (sizeof(double) * n);
	y = (double*) malloc (sizeof(double) * n);
	rm1 = (double*) malloc (sizeof(double) * n);
	rm2 = (double*) malloc (sizeof(double) * n);
	rm3 = (double*) malloc (sizeof(double) * n);
	zm1 = (double*) malloc (sizeof(double) * n);
	zm2 = (double*) malloc (sizeof(double) * n);

	double rho = 1.0, rho1, beta = 0.0, alpha = 0.0, omega, temp, temp1;

	char lower1 = 'L', lower = 'N', lower2 = 'U';
	char upper1 = 'U', upper = 'N', upper2 = 'N';
	
	#ifdef TIMER
	double timerLUSol = 0, timerLUSol1, timerSpMV = 0, timerSpMV1;
	double timerTotal = omp_get_wtime();
	#endif

	cblas_dcopy (n, b, 1, r, 1);
	cblas_dcopy (n, r, 1, p, 1);
	cblas_dcopy (n, r, 1, zm1, 1);
	nrm0 = cblas_dnrm2 (n, r, 1);

	for (int k = 0; k < max_iter; k++)
	{
		rho1 = rho;
		rho = cblas_ddot(n, zm1, 1, r, 1);
		if ( k > 0 )
		{
			beta = (rho / rho1) * (alpha / omega);
			cblas_daxpy (n, -omega, zm2, 1, p, 1);
			cblas_dscal (n, beta, p, 1);
			cblas_daxpy (n, floatone, r, 1, p, 1);
		}
		
		#ifdef TIMER
		timerLUSol1 = omp_get_wtime();
		#endif
		
		#pragma omp parallel
		{
		#pragma omp for 
		for (int i = 0; i < nb; i++)
			mkl_dcsrtrsv (&lower1, &lower, &lower2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &p[offset_ilu[i]], &y[offset_ilu[i]]);

		#pragma omp for 
		for (int i = 0; i < nb; i++)
			mkl_dcsrtrsv (&upper1, &upper, &upper2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &y[offset_ilu[i]], &rm2[offset_ilu[i]]);
		}
		#ifdef TIMER
		timerLUSol += omp_get_wtime() - timerLUSol1;
		timerSpMV1 = omp_get_wtime();
		#endif
		
		mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm2, zm2);
		
		#ifdef TIMER
		timerSpMV += omp_get_wtime() - timerSpMV1;
		#endif
		
		temp = cblas_ddot(n, zm1, 1, zm2, 1);
		
		alpha = rho / temp;
		cblas_daxpy (n, -alpha, zm2, 1, r, 1);
		cblas_daxpy (n, alpha, rm2, 1, x, 1);
		nrm = cblas_dnrm2 (n, x, 1);

		if  ((nrm < tol) && ( nrm / nrm0 < tol )) {printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; }
		
		#ifdef TIMER
		timerLUSol1 = omp_get_wtime();
		#endif
		
		#pragma omp parallel
		{
		#pragma omp for
		for (int i = 0; i < nb; i++)
			mkl_dcsrtrsv (&lower1, &lower, &lower2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &r[offset_ilu[i]], &y[offset_ilu[i]]);
		#pragma omp for 
		for (int i = 0; i < nb; i++)
			mkl_dcsrtrsv (&upper1, &upper, &upper2, &ilu[i]->n, ilu[i]->val, ilu[i]->ia, ilu[i]->ja, &y[offset_ilu[i]], &rm3[offset_ilu[i]]);
		}
		#ifdef TIMER
		timerLUSol += omp_get_wtime() - timerLUSol1;
		timerSpMV1 = omp_get_wtime();
		#endif
		mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm3, y);
		
		#ifdef TIMER
		timerSpMV += omp_get_wtime() - timerSpMV1;
		#endif
		
		temp = cblas_ddot(n, y, 1, r, 1);
		temp1 = cblas_ddot(n, y, 1, y, 1);
		omega = temp / temp1;

		cblas_daxpy (n, omega, rm3, 1, x, 1);
		cblas_daxpy (n, -omega, y, 1, r, 1);
		nrm = cblas_dnrm2 (n, r, 1);
		if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; }
		printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0);
	}
	#ifdef TIMER
	printf("time LUSol\t%lf\ntime SpMV\t%lf\n",timerLUSol,timerSpMV);
	printf("time total\t%lf\n",omp_get_wtime()-timerTotal);
	#endif
	
	free (r);
	free (rm1);
	free (rm2);
	free (rm3);
	free (zm1);
	free (zm2);
	free (p);
	free (y);
	free (offset_ilu);

	return 0;
}
示例#11
0
void mlcp_pgs(MixedLinearComplementarityProblem* problem, double *z, double *w, int *info, SolverOptions* options)
{

  if (!problem->isStorageType2)
  {
    printf("Siconos/Numerics: mlcp_pgs: Wrong Storage (!isStorageType2) for PGS solver\n");
    exit(EXIT_FAILURE);
  }


  double* A = problem->A;
  double* B = problem->B;
  double* C = problem->C;
  double* D = problem->D;
  double* a = problem->a;
  double* b = problem->b;
  int n = problem->n;
  int m = problem->m;
  double *u = &z[0];
  double *v = &z[n];
  double *Buf;

  int incx, incy, incAx, incAy, incBx, incBy;
  int i, iter;
  int itermax, verbose;
  int pgsExplicit;
  double err, vi;
  double tol;
  double prev;
  double *diagA, *diagB;
  verbose = 0;

  incx = 1;
  incy = 1;
  /* Recup input */

  itermax = options->iparam[0];
  pgsExplicit = options->iparam[2];
  tol   = options->dparam[0];

  /* Initialize output */

  options->iparam[1] = 0;
  options->dparam[1] = 0.0;

  /* Allocation */

  diagA = (double*)malloc(n * sizeof(double));
  diagB = (double*)malloc(m * sizeof(double));



  incx = 1;
  incy = 1;

  /* Preparation of the diagonal of the inverse matrix */

  for (i = 0 ; i < n ; ++i)
  {
    if ((fabs(A[i * n + i]) < DBL_EPSILON))
    {

      if (verbose > 0)
      {
        printf(" Vanishing diagonal term \n");
        printf(" The local problem cannot be solved \n");
      }

      *info = 2;
      free(diagA);
      free(diagB);
      *info = 1;
      return;
    }
    else
    {
      diagA[i] = 1.0 / A[i * n + i];

    }
  }
  for (i = 0 ; i < m ; ++i)
  {
    if ((fabs(B[i * m + i]) < DBL_EPSILON))
    {

      if (verbose > 0)
      {
        printf(" Vanishing diagonal term \n");
        printf(" The local problem cannot be solved \n");
      }

      *info = 2;
      free(diagA);
      free(diagB);

      return;
    }
    else
    {
      diagB[i] = 1.0 / B[i * m + i];

    }
  }
  /*start iterations*/

  iter = 0;
  err  = 1.;

  incx = 1;
  incy = 1;
  incAx = n;
  incAy = 1;
  incBx = m;
  incBy = 1;


  mlcp_compute_error(problem, z, w, tol, &err);

  while ((iter < itermax) && (err > tol))
  {

    ++iter;

    incx = 1;
    incy = 1;

    if (pgsExplicit)
    {
      /*Use w like a buffer*/
      cblas_dcopy(n , w , incx , u , incy);  //w <- q
      Buf = w;

      for (i = 0 ; i < n ; ++i)
      {
        prev = Buf[i];
        Buf[i] = 0;
        //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i];
        u[i] =  - (a[i] + cblas_ddot(n , &A[i] , incAx , Buf , incAy)   + cblas_ddot(m , &C[i] , incAx , v , incBy)) * diagA[i];
        Buf[i] = prev;
      }
      for (i = 0 ; i < m ; ++i)
      {
        v[i] = 0.0;
        //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i];
        vi = -(b[i] + cblas_ddot(n , &D[i] , incBx , u , incAy)   + cblas_ddot(m , &B[i] , incBx , v , incBy)) * diagB[i];

        if (vi < 0) v[i] = 0.0;
        else v[i] = vi;
      }
    }
    else
    {

      for (i = 0 ; i < n ; ++i)
      {
        u[i] = 0.0;

        //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i];
        u[i] =  - (a[i] + cblas_ddot(n , &A[i] , incAx , u , incAy)   + cblas_ddot(m , &C[i] , incAx , v , incBy)) * diagA[i];
      }

      for (i = 0 ; i < m ; ++i)
      {
        v[i] = 0.0;
        //zi = -( q[i] + cblas_ddot( n , &vec[i] , incx , z , incy ))*diag[i];
        vi = -(b[i] + cblas_ddot(n , &D[i] , incBx , u , incAy)   + cblas_ddot(m , &B[i] , incBx , v , incBy)) * diagB[i];

        if (vi < 0) v[i] = 0.0;
        else v[i] = vi;
      }
    }

    /* **** Criterium convergence compliant with filter_result_MLCP **** */

    mlcp_compute_error(problem, z, w, tol, &err);

    if (verbose == 2)
    {
      printf(" # i%d -- %g : ", iter, err);
      for (i = 0 ; i < n ; ++i) printf(" %g", u[i]);
      for (i = 0 ; i < m ; ++i) printf(" %g", v[i]);
      for (i = 0 ; i < m ; ++i) printf(" %g", w[i]);
      printf("\n");
    }

    /* **** ********************* **** */

  }

  options->iparam[1] = iter;
  options->dparam[1] = err;

  if (err > tol)
  {
    printf("Siconos/Numerics: mlcp_pgs: No convergence of PGS after %d iterations\n" , iter);
    printf("Siconos/Numerics: mlcp_pgs: The residue is : %g \n", err);
    *info = 1;
  }
  else
  {
    if (verbose > 0)
    {
      printf("Siconos/Numerics: mlcp_pgs: Convergence of PGS after %d iterations\n" , iter);
      printf("Siconos/Numerics: mlcp_pgs: The residue is : %g \n", err);
    }
    *info = 0;
  }

  free(diagA);
  free(diagB);
  return;
}
示例#12
0
int solve_bicgstab(csr_t* mat, csr_t* ilu, double* b, double* x)
{
	double tol = 1e-6, floatone = 1.0;
	const int max_iter = 200;
	int n = mat->n;
	int nnz = mat->nnz;
    
	double *r, *p, *y, *zm1, *zm2, *rm2, *rm1, *rm3, nrm0, nrm;
	r = (double*) calloc (n, sizeof(double));
	p = (double*) calloc (n, sizeof(double));
	y = (double*) calloc (n, sizeof(double));
	rm1 = (double*) calloc (n, sizeof(double));
	rm2 = (double*) calloc (n, sizeof(double));
	rm3 = (double*) calloc (n, sizeof(double));
	zm1 = (double*) calloc (n, sizeof(double));
	zm2 = (double*) calloc (n, sizeof(double));
	

	double rho = 1.0, rho1, beta = 0.0, alpha = 0.0, omega, temp, temp1;

	char lower1 = 'L', lower = 'N', lower2 = 'U';
	char upper1 = 'U', upper = 'N', upper2 = 'N';
	
	#ifdef TIMER
	double timerLUSol = 0, timerLUSol1, timerSpMV = 0, timerSpMV1, timerVector = 0, timerVector1;
	double timerTotal = omp_get_wtime();
	#endif
	
	cblas_dcopy (n, b, 1, r, 1);
	cblas_dcopy (n, r, 1, p, 1);
	cblas_dcopy (n, r, 1, zm1, 1);

	nrm0 = cblas_dnrm2 (n, r, 1);
	for (int k = 0; k < max_iter; k++)
	{
		rho1 = rho;
		#ifdef TIMER
		timerVector1 = omp_get_wtime();
		#endif
		rho = cblas_ddot(n, zm1, 1, r, 1);
		if ( k > 0 )
		{
			
			beta = (rho / rho1) * (alpha / omega);
			cblas_daxpy (n, -omega, zm2, 1, p, 1);
			cblas_dscal (n, beta, p, 1);
			cblas_daxpy (n, floatone, r, 1, p, 1);
		}
		
		#ifdef TIMER
		timerVector += omp_get_wtime() - timerVector1;
		timerLUSol1 = omp_get_wtime();
		#endif
		
		mkl_dcsrtrsv (&lower1, &lower, &lower2, &n, ilu->val, ilu->ia, ilu->ja, p, y);
		mkl_dcsrtrsv (&upper1, &upper, &upper2, &n, ilu->val, ilu->ia, ilu->ja, y, rm2);
		
		#ifdef TIMER
		timerLUSol += omp_get_wtime() - timerLUSol1;
		timerSpMV1 = omp_get_wtime();
		#endif
		
		mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm2, zm2);
		
		#ifdef TIMER
		timerSpMV += omp_get_wtime() - timerSpMV1;
		timerVector1 = omp_get_wtime();
		#endif
		
		temp = cblas_ddot(n, zm1, 1, zm2, 1);
		alpha = rho / temp;
		cblas_daxpy (n, -alpha, zm2, 1, r, 1);
		cblas_daxpy (n, alpha, rm2, 1, x, 1);

		nrm = cblas_dnrm2 (n, r, 1);
		#ifdef TIMER
		timerVector += omp_get_wtime() - timerVector1;
		#endif
		if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; }
		
		#ifdef TIMER
		timerLUSol1 = omp_get_wtime();
		#endif
		
		mkl_dcsrtrsv (&lower1, &lower, &lower2, &n, ilu->val, ilu->ia, ilu->ja, r, y);
		mkl_dcsrtrsv (&upper1, &upper, &upper2, &n, ilu->val, ilu->ia, ilu->ja, y, rm3);
		
		#ifdef TIMER
		timerLUSol += omp_get_wtime() - timerLUSol1;
		timerSpMV1 = omp_get_wtime();
		#endif
		
		mkl_dcsrgemv (&lower, &n, mat->val, mat->ia, mat->ja, rm3, y);
		
		#ifdef TIMER
		timerSpMV += omp_get_wtime() - timerSpMV1;
		timerVector1 = omp_get_wtime();
		#endif
		
		temp = cblas_ddot(n, y, 1, r, 1);
		temp1 = cblas_ddot(n, y, 1, y, 1);
		omega = temp / temp1;
		
		cblas_daxpy (n, omega, rm3, 1, x, 1);
		cblas_daxpy (n, -omega, y, 1, r, 1);
		nrm = cblas_dnrm2 (n, r, 1);
		#ifdef TIMER
		timerVector += omp_get_wtime() - timerVector1;
		#endif
		if ((nrm < tol) && ( nrm / nrm0 < tol )) {printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0); break; }
		printf("  iteration = %3d, residual = %le \n", k+1, nrm / nrm0);
	}

	#ifdef TIMER
	printf("time LUSol\t%lf\ntime SpMV\t%lf\ntime v-v oper\t%lf\n",timerLUSol,timerSpMV,timerVector);
	printf("time total\t%lf\n",omp_get_wtime()-timerTotal);
	#endif
	
	free (r);
	free (rm1);
	free (rm2);
	free (rm3);
	free (zm1);
	free (zm2);
	free (p);
	free (y);
	
	return 0;
}
void variationalInequality_HyperplaneProjection(VariationalInequality* problem, double *x, double *w, int* info, SolverOptions* options)
{
  /* /\* int and double parameters *\/ */
  int* iparam = options->iparam;
  double* dparam = options->dparam;
  /* Number of contacts */
  int n = problem->size;
  /* Maximum number of iterations */
  int itermax = iparam[0];
  /* Maximum number of iterations in Line--search */
  int lsitermax = iparam[1];
  assert(lsitermax >0);
  /* Tolerance */
  double tolerance = dparam[0];


  /*****  Fixed point iterations *****/
  int iter = 0; /* Current iteration number */
  double error = 1.; /* Current error */
  int hasNotConverged = 1;
  dparam[0] = dparam[2]; // set the tolerance for the local solver


  double * xtmp = (double *)malloc(n * sizeof(double));
  double * wtmp = (double *)malloc(n * sizeof(double));
  double * xtmp2 = (double *)malloc(n * sizeof(double));
  double * xtmp3 = (double *)malloc(n * sizeof(double));

  int isVariable = 0;

  double tau = 1.0;
  double sigma = 0.99;

  if (dparam[3] > 0.0)
  {
    tau = dparam[3];
  }
  else
  {
    printf("Hyperplane Projection method. tau <=0  is not well defined\n");
    printf("Hyperplane Projection method. tau is set to 1.0\n");
  }

  if (dparam[4] > 0.0 && dparam[4] < 1.0)
  {
    sigma = dparam[4];
  }
  else
  {
    printf("Hyperplane Projection method. 0<sigma <1  is not well defined\n");
    printf("Hyperplane Projection method. sigma is set to %6.4e\n", sigma);
  }


  isVariable=0;


  if (!isVariable)
  {
    /*   double minusrho  = -1.0*rho; */
    while ((iter < itermax) && (hasNotConverged > 0))
    {
      ++iter;
      /** xtmp <-- x (x_k) */
      cblas_dcopy(n , x , 1 , xtmp, 1);



      
      


      

      /* xtmp (y_k)= P_X(x_k-tau F(x_k)) */
      problem->F(problem, n, xtmp, wtmp);
      cblas_daxpy(n, -tau, wtmp , 1, xtmp , 1) ;
      cblas_dcopy(n , xtmp, 1 , xtmp2, 1);
      problem->ProjectionOnX(problem, xtmp2,xtmp);

      // Armijo line search

      int stopingcriteria = 1;
      int ls_iter = -1;
      double alpha = 1.0;
      double lhs = NAN;
      double rhs;
      // xtmp3 = z_k-y_k
      cblas_dcopy(n , x , 1 , xtmp3, 1);
      cblas_daxpy(n, -1.0, xtmp, 1, xtmp3, 1);
      rhs = cblas_dnrm2(n,xtmp3, 1);
      rhs = sigma / tau * rhs * rhs;
      DEBUG_EXPR_WE(
        for (int i =0; i< n ; i++)
        {
          printf("(y_k) xtmp[%i]=%6.4e\t",i,xtmp[i]);    printf("(x_k-y_k) xtmp3[%i]=%6.4e\n",i,xtmp3[i]);
        }
        );
      while (stopingcriteria && (ls_iter < lsitermax))
      {
        ls_iter++ ;
        /* xtmp2 = alpha * y_k + (1-alpha) x_k */
        alpha = 1.0 / (pow(2.0, ls_iter));
        DEBUG_PRINTF("alpha = %6.4e\n", alpha);
        cblas_dcopy(n ,xtmp , 1 , xtmp2, 1);
        cblas_dscal(n , alpha, xtmp2, 1);
        cblas_daxpy(n, 1.0-alpha, x, 1, xtmp2, 1);



        /* wtmp =  */

        problem->F(problem, n, xtmp2,wtmp);
        DEBUG_EXPR_WE(
          for (int i =0; i< n ; i++)
          {
            printf("(z_k) xtmp2[%i]=%6.4e\n",i,xtmp2[i]);    printf("F(z_k) wtmp[%i]=%6.4e\n",i,wtmp[i]);
          }
          );
        lhs = cblas_ddot(n, wtmp, 1, xtmp3, 1);

        if (lhs >= rhs)  stopingcriteria = 0;

        DEBUG_PRINTF("ls_iter= %i, lsitermax =%i, stopingcriteria  %i\n",ls_iter,lsitermax,stopingcriteria);
        DEBUG_PRINTF("Number of iteration in Armijo line search = %i\t, lhs = %6.4e\t, rhs = %6.4e\t, alpha = %6.4e\t, sigma = %6.4e\t, tau = %6.4e\n", ls_iter, lhs, rhs, alpha, sigma, tau);

      }



      cblas_dcopy(n , x , 1 , xtmp3, 1);
      cblas_daxpy(n, -1.0, xtmp2, 1, xtmp3, 1);
      DEBUG_PRINTF("norm(x-x_k) = %6.4e\n",cblas_dnrm2(n, xtmp3, 1) );
      lhs=cblas_ddot(n, wtmp, 1, xtmp3, 1);

      double nonorm = cblas_dnrm2(n, wtmp, 1);
      double rhoequiv = lhs / (nonorm * nonorm);

      /* rhoequiv =1.0;  */
      DEBUG_PRINTF("nonorm = %6.4e\n", nonorm);
      DEBUG_PRINTF("lhs = %6.4e\n", lhs);
      DEBUG_PRINTF("rho equiv = %6.4e\n", rhoequiv);

      cblas_daxpy(n, -rhoequiv, wtmp, 1, x  , 1);

      cblas_dcopy(n , x, 1 , xtmp, 1);
      problem->ProjectionOnX(problem, xtmp,x);



      /* **** Criterium convergence **** */
      variationalInequality_computeError(problem, x , w, tolerance, options, &error);
      DEBUG_EXPR_WE(
         for (int i =0; i< n ; i++)
         {
           printf("x[%i]=%6.4e\t",i,x[i]);    printf("w[%i]=F[%i]=%6.4e\n",i,i,w[i]);
         }
        );
示例#14
0
/**
    Purpose
    -------
    DLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" double
magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo,
                  magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0,
                  double *A,  magma_int_t lda,
                  double *e, double *tau,
                  double *W,   magma_int_t ldw,
                  double **dA, magma_int_t ldda, magma_int_t offset,
                  double **dW, magma_int_t lddw,
                  double *dwork[MagmaMaxGPUs], magma_int_t ldwork,
                  magma_int_t k,
                  double *dx[MagmaMaxGPUs],
                  double *dy[MagmaMaxGPUs],
                  double *work,
                  magma_queue_t stream[][10],
                  double *times)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(id, i, j)  (dA[(id)] + ((j)+loffset)*ldda + (i) + offset)
#define dW(id, i, j)  (dW[(id)] + (j)          *lddw + (i))
#define dW1(id, i, j) (dW[(id)] + ((j)+nb)     *lddw + (i))

    double mv_time = 0.0;
    magma_int_t i;
#ifndef MAGMABLAS_DSYMV_MGPU
    magma_int_t loffset = nb0*((offset/nb0)/num_gpus);
#endif

    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    double value     = MAGMA_D_ZERO;
    magma_int_t id, idw, i_one = 1;

    //magma_int_t kk;
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;

    double alpha;

    double *dx2[MagmaMaxGPUs];
    double *f;
    magma_dmalloc_cpu( &f, n );

    if (n <= 0) {
        return 0;
    }

//#define PROFILE_SYMV
#ifdef PROFILE_SYMV
    magma_event_t start, stop;
    float etime;
    magma_timestr_t cpu_start, cpu_end;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif

    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                double wii = *W(i, iw+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione);

                wii = *A(i, i+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione);
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);

                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_MAKE( 1, 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    dx2[id] = dW1(id, 0, iw);
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]);
#ifndef  MAGMABLAS_DSYMV_MGPU
                    magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] );
#endif
                }
                magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0,
                                     dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                     work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }

                /* overlap update */
                if ( i < n-1 && i-1 >= n - nb ) {
                    magma_int_t im1_1 = i_1 - 1;
                    magma_int_t im1   = i-1;
                    /* Update A(1:i,i) */
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        magma_int_t im1_n = i_n + 1;
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                        lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  A(im1, i+1), &lda, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda);
                    #endif
                }

                // 3. Here is where we need it // TODO find the right place
                magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }

                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione);

                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] );
                    }
                }
            }
        }
    } else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            idw = ((offset+i)/nb)%num_gpus;
            if ( i > 0 ) {
                trace_cpu_start( 0, "gemv", "gemv" );
                double wii = *W(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione);

                wii = *A(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &lda);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione);
            }

            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                trace_cpu_start( 0, "larfg", "larfg" );
                alpha = *A(i+1, i);
#ifdef PROFILE_SYMV
                cpu_start = get_current_time();
#endif
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
#ifdef PROFILE_SYMV
                cpu_end = get_current_time();
                times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0;
#endif
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_MAKE( 1, 0 );
                trace_cpu_end( 0 );

                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                //trace_gpu_start(  idw, 0, "comm", "comm1" );
#ifndef  MAGMABLAS_DSYMV_MGPU
                magma_setdevice(idw);
                magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 );
#endif
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    trace_gpu_start( id, 0, "comm", "comm" );
#ifdef MAGMABLAS_DSYMV_MGPU
                    dx2[id] = dW1(id, 0, i)-offset;
#else
                    dx2[id] = dx[id];
                    magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 );
#endif
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] );
                    trace_gpu_end( id, 0 );
                }
                /* mat-vec on multiple GPUs */
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(start, stream[0][0]);
#endif
                magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1,
                                       dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                       work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(stop, stream[0][0]);
#endif
                trace_cpu_start( 0, "gemv", "gemv" );
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                trace_cpu_end( 0 );

                /* overlap update */
                if ( i > 0 && i+1 < n ) {
                    magma_int_t ip1 = i+1;
                    trace_cpu_start( 0, "gemv", "gemv" );
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda,
                                  W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw,
                                  A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    trace_cpu_end( 0 );
                }

                /* synchronize */
                magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                cudaEventElapsedTime(&etime, start, stop);
                mv_time += (etime/1000.0);
                times[1+(i_n/(n0/10))] += (etime/1000.0);
#endif
                trace_cpu_start( 0, "axpy", "axpy" );
                if (i != 0)
                    blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);

                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                    cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                    value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i]* -.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
                trace_cpu_end( 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] );
                    }
                }
            }
        }
    }

#ifdef PROFILE_SYMV
    magma_setdevice(0);
    magma_event_destory( start );
    magma_event_destory( stop  );
#endif
    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        if ( k > 1 )
            magma_queue_sync(stream[id][1]);
    }
    magma_free_cpu(f);

    return mv_time;
} /* magma_dlatrd_mgpu */
int frictionContactFBLSA(
  fc3d_nonsmooth_Newton_solvers* equation,
  double *reaction,
  double *velocity,
  double *mu,
  double *rho,
  double *F,
  double *A,
  double *B,
  NumericsMatrix *W,
  double *qfree,
  NumericsMatrix *blockAWpB,
  double *direction,
  double *tmp,
  double alpha[1],
  unsigned int maxiter_ls)
{
  unsigned problemSize = W->size0;

  // notes :
  // - F contains FB or grad FB merit
  // - tmp contains direction, scal*direction, reaction+scal*direction

  // cf Newton_Methods.c, L59
  double p = 2.1;
  double fblsa_rho = 1e-8;
  double gamma = 1e-4;
  double scal = 1.;

  // F <- compute fb
  fc3d_FischerBurmeisterFunction(problemSize,
                                              (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterFunctionGenerated,
                                              reaction,
                                              velocity,
                                              mu,
                                              rho,
                                              F,
                                              NULL,
                                              NULL);

  double thetafb0 = 0.5 * cblas_ddot(problemSize, F, 1, F, 1);

  // F <- compute gradient of fb merit function (ugly)
  fc3d_FischerBurmeisterFunction(problemSize,
                                              (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterGradMeritFunctionGenerated,
                                              reaction,
                                              velocity,
                                              mu,
                                              rho,
                                              F,
                                              NULL,
                                              NULL);
  double norm_dir_exp_p = pow(cblas_dnrm2(problemSize, direction, 1), p);
  double gradmeritfb_dir = cblas_ddot(problemSize, F, 1, direction, 1);

  if (!isnan(gradmeritfb_dir) && !isinf(gradmeritfb_dir) && gradmeritfb_dir > (-fblsa_rho * norm_dir_exp_p))
  {
    if (verbose > 0)
    {
      printf("fc3d FBLSA: condition 9.1.6 unsatisfied, gradmeritfb_dir=%g, norm_r=%g\n", gradmeritfb_dir, norm_dir_exp_p);
    }

    // FIX: failure...
    if (verbose > 0)
    {
      printf("fc3d FBLSA: set d^k to - grad merit(fb)\n");
    }

    cblas_dcopy(problemSize, F, 1, direction, 1);
    cblas_dscal(problemSize, -1, direction, 1);
  }

  for (unsigned int iter = 0; iter < maxiter_ls; ++iter)
  {

    scal /= 2.;

    // tmp <- 2^(-ik)*direction+reaction
    cblas_dcopy(problemSize, reaction, 1, tmp, 1);
    cblas_daxpy(problemSize, scal, direction, 1, tmp, 1);

    // velocity <- W*tmp + qfree
    cblas_dcopy(problemSize, qfree, 1, velocity, 1);
    NM_gemv(1., W, tmp, 1., velocity);

    // compute fb
    fc3d_FischerBurmeisterFunction(problemSize,
                                                (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterFunctionGenerated,
                                                tmp,
                                                velocity,
                                                mu,
                                                rho,
                                                F,
                                                NULL,
                                                NULL);

    double thetafb  = 0.5 * cblas_ddot(problemSize, F, 1, F, 1);

    // compute grad merit fb (ugly)
    fc3d_FischerBurmeisterFunction(problemSize,
                                                (FischerBurmeisterFun3x3Ptr) fc3d_FischerBurmeisterGradMeritFunctionGenerated,
                                                tmp,
                                                velocity,
                                                mu,
                                                rho,
                                                F,
                                                NULL,
                                                NULL);

    // tmp <- scal*direction
    cblas_dscal(problemSize, 0., tmp, 1);
    cblas_daxpy(problemSize, scal, direction, 1, tmp, 1);
    double grad_meritf_reaction = cblas_ddot(problemSize, F, 1, tmp, 1);

    if (!isinf(grad_meritf_reaction) && !isnan(grad_meritf_reaction) &&
        thetafb < thetafb0 + gamma * scal * grad_meritf_reaction)
    {
      if (verbose > 0)
      {
        printf("fc3d FBLSA success. iteration  = %i, thetafb=%g, thetafb0=%g, gradmeritf,reaction=%g\n", iter, thetafb, thetafb0, gamma*scal*grad_meritf_reaction);
      }
      // tmp <- reaction + tmp
      cblas_daxpy(problemSize, 1, reaction, 1, tmp, 1);

      return 0;
    }
  }

  if (verbose > 0)
  {
    printf("fc3d FBLSA reached the max number of iteration reached  = %i\n", maxiter_ls);
  }

  return -1;
}
示例#16
0
double F77_ddot(const int *N, const double *X, const int *incX,
                const double *Y, const int *incY)
{
   return cblas_ddot(*N, X, *incX, Y, *incY);
}
示例#17
0
文件: driver.c 项目: lysbleu/pvm-bss
int main(int argc, char* argv[])
{
	char dummy[L2_CACHE_SIZE];
	
// Tests de performances de ddot	
	int size = 50;

	blas_t *matriceD, *matriceE;
	alloc_vecteur(&matriceD, size);
	alloc_vecteur(&matriceE, size);

	printf("Tests de performance de la fonction ddot\n");
	perf_t *t1, *t2,*t3, *t4,*t5, *t6,*t7, *t8, *t9, *t10;
	t1 = malloc(sizeof(perf_t));
	t2 = malloc(sizeof(perf_t));
        t3 = malloc(sizeof(perf_t));
	t4 = malloc(sizeof(perf_t));
        t5 = malloc(sizeof(perf_t));
	t6 = malloc(sizeof(perf_t));
        t7 = malloc(sizeof(perf_t));
	t8 = malloc(sizeof(perf_t));
        t9 = malloc(sizeof(perf_t));
	t10 = malloc(sizeof(perf_t));
        

	double mflops, mflops1,mflops2,mflops3,mflops4, mflops5;
	char command[200];

        system("rm results/ddot_perf.txt");
	for(size = 50; size < 100000000; size += size/4)
	{
		printf("M: %d ", size);
		if(size != 50)
		{
			free(matriceD);
			free(matriceE);
			alloc_vecteur(&matriceD, size);
			alloc_vecteur(&matriceE, size);
		}
		memset(dummy, 0, sizeof(dummy));
		perf(t1);
		blas_t res = cblas_ddot(size, matriceD, 1, matriceE, 1);
		perf(t2);
		perf_diff(t1, t2);
		mflops = perf_mflops(t2, 2 * size);
                printf("Mflops/s: %le\n", mflops);

                sprintf(command, "echo %d %lf >> results/ddot_perf.txt", size, mflops);	
                system(command);
				
	}


// Test de performance dgemm
//////////////////////////////////////////

	long m = 100;

	
	blas_t *matriceA, *matriceB, *matriceC;
	
	alloc_matrice(&matriceA, m, m);
	alloc_matrice(&matriceB, m, m);
        matriceC = calloc(m*m,sizeof(blas_t));
        system("rm results/dgemm_perf.txt");

	for(; m< 1000; m+=20)
	{
            printf("M: %d ", m);
        
		if(m != 100)
		{
			free(matriceA);
			free(matriceB);
			free(matriceC);
			alloc_matrice(&matriceA, m, m);
			alloc_matrice(&matriceB, m, m);
			alloc_matrice(&matriceC, m, m);
		}
        
        memset(dummy, 0, sizeof(dummy));
		perf(t1);
        cblas_dgemm_scalaire( CblasNoTrans, CblasNoTrans ,m, m, m, 1, matriceA, m, matriceB, m, 1, matriceC, m);
		perf(t2);
		perf_diff(t1, t2);
                mflops1 = perf_mflops(t2, m * m * m * 3 + m * m );

                
        
		perf(t3);
		cblas_dgemm_scalaire1(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t4);
		perf_diff(t3, t4);
        
        
                
                mflops2 = perf_mflops(t4, m * m * m * 3);
              	perf(t5);
		cblas_dgemm_scalaire2(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t6);
		perf_diff(t5, t6);
                mflops3 = perf_mflops(t6, m * m * m * 3);
                perf(t7);
		cblas_dgemm_scalaire3(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t8);
		perf_diff(t7, t8);
                mflops4 = perf_mflops(t8, m * m * m * 3);
                
                perf(t9);
		cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m,m, 1, matriceA, m, matriceB, m, 1, matriceC, m);
		perf(t10);
		perf_diff(t9, t10);
                mflops5 = perf_mflops(t10, m * m * m * 3);
                
                sprintf(command, "echo %d %lf %lf %lf %lf %lf >> results/dgemm_perf.txt", m * m, mflops1, mflops2, mflops3, mflops4, mflops5);	
                system(command);
                printf("Mflops/s : %d %lf %lf %lf %lf %lf\n", m * m, mflops1, mflops2, mflops3, mflops4, mflops5 );
	}


	free(matriceA);
	free(matriceB);
	free(matriceC);
	free(matriceD);
	free(matriceE);
	return EXIT_SUCCESS;
}
int ConjugateGradientSolverMPI(unsigned long *II, unsigned long *J, double *A, unsigned long M,unsigned long local_M, unsigned long N, unsigned long long nz, double *b, unsigned long M_Vector, unsigned long N_Vector, unsigned long long nz_vector, int numIterations) {

    //A*x=b

    double *Ax=(double *) malloc(nz_vector * sizeof(double));
    double *Ap=(double *) malloc(nz_vector * sizeof(double));
    double *r=(double *) malloc(nz_vector * sizeof(double));
    double *p=(double *) malloc(nz_vector * sizeof(double));
    double *x=(double *) calloc(nz_vector,sizeof(double));

    double *Ap_partial=(double *) malloc(local_M * sizeof(double));

    //r = b-A*x
    //If we take x=0 the init multiplication is avoided and r=b

    memcpy(r, b, N*sizeof(double));

    //p=r

    memcpy(p, r, N*sizeof(double));

    //rsold = r*r
    double rsold = cblas_ddot(N,r,1,r,1);

    int stop = 0;

    double alphaCG = 0.0;

    double rsnew = 0.0;
    unsigned long k = 0;

    unsigned long maxIterations = M*2;
    if ( numIterations != 0) {
        maxIterations = numIterations;
    }


    while(!stop){

        //Ap=A*p
        //cblas_dgemv(CblasColMajor, CblasNoTrans, M,N , 1.0, A, N, p, 1, 0.0, Ap, 1);
        cblas_dgemv(CblasRowMajor, CblasNoTrans, local_M,N , 1.0, A, N, p, 1, 0.0, Ap_partial, 1);

        MPI_Allgather (Ap_partial,local_M,MPI_DOUBLE,Ap,local_M,MPI_DOUBLE,MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);

        //alphaCG=rsold/(p'*Ap)
        alphaCG = rsold/cblas_ddot(N,p,1,Ap,1);

        //x=x+alphaCG*p
        cblas_daxpy(N,alphaCG,p,1,x,1);

        //r=r-alphaCG*Ap
        cblas_daxpy(N,-alphaCG,Ap,1,r,1);

        //rsnew = r'*r
        rsnew = cblas_ddot(N,r,1,r,1);

        if((sqrt(rsnew)<=EPSILON)||(k == maxIterations)){
            stop = 1;
        }

        //p=r+rsnew/rsold*p
        cblas_dscal(N, rsnew/rsold, p, 1);
        cblas_daxpy(N,1.0,r,1,p,1);


        rsold = rsnew;

        k++;
    }

    memcpy(b, x, N*sizeof(double));

    free(Ax);
    free(Ap);
    free(r);
    free(p);
    free(x);

    fprintf(stderr, "[%s] Number of iterations %lu\n",__func__,k);

    return 1;
}
示例#19
0
void FrictionContact2D_cpg(FrictionContactProblem* problem , double *reaction , double *velocity , int *info, SolverOptions* options)
{
  int nc = problem->numberOfContacts;
  assert(nc>0);
  double * vec = problem->M->matrix0;
  double * mu = problem->mu;

  int       n = 2 * nc, i, iter;
  assert(n>0);
  int       incx = 1, incy = 1;
  int       *stat, *statusi, it_end;


  double    eps = 1.e-12;
  double    pAp, alpha, beta, wAp, rp, normr;
  double    alphaf, betaf, den, num, res;

  double    *p, *fric, *r;
  double    *fric1, *v, *w, *Ap, *xi, *z;


  int maxit = options->iparam[0];
  double tol = options->dparam[0];
  options->iparam[1]  = 0;
  options->dparam[1]  = 0.0;


  r       = (double*) malloc(n * sizeof(double));
  p       = (double*) malloc(n * sizeof(double));
  v       = (double*) malloc(n * sizeof(double));
  w       = (double*) malloc(n * sizeof(double));
  Ap      = (double*) malloc(n * sizeof(double));
  xi      = (double*) malloc(n * sizeof(double));
  z       = (double*) malloc(n * sizeof(double));
  fric1   = (double*) malloc(n * sizeof(double));

  fric    = (double*) malloc(nc * sizeof(double));
  stat    = (int*)    malloc(nc * sizeof(int));
  statusi = (int*)    malloc(nc * sizeof(int));





  for (i = 0; i < n ; i++)
  {
    reaction[i]     = 0.0;
    xi[i]    = 0.0;
    r[i]     = 0.0;
    v[i]     = 0.0;
    p[i]     = 0.0;
    w[i]     = 0.0;
    Ap[i]    = 0.0;
    z[i]     = 0.0;
    fric1[i] = 1.0;

    if (i < nc)
    {
      fric[i]  = mu[i] * fric1[i];
      stat[i]    = 0;
      statusi[i] = 0;

    }

  }



  cblas_dcopy(n, problem->q, incx, r, incy);

  alphaf = -1.;
  betaf  = -1.;

  cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, reaction, incx, betaf, r, incy);





  /*             Check for initial status             */


  for (i = 0; i < nc; i++)
  {
    mu[i] = fric[i];
    if (reaction[2 * i] <= eps)
    {
      /*       No contact            */
      stat[i] = 0;
    }
    else if (reaction[2 * i + 1] <=  -mu[i]*reaction[2 * i])
    {
      /*     Slide backward         */
      stat[i] = 1;
    }
    else if (reaction[2 * i + 1] >=  mu[i]*reaction[2 * i])
    {
      /*   Slide forward          */
      stat[i] = 3;
    }
    else
    {
      /*     Stick contact        */
      stat[i] = 2;
    }
  }


  iter  = 0;
  normr = 1.0;



  while ((iter < maxit) && (normr > tol))
  {



    for (i = 0 ; i < nc ; i++)
      statusi[i] = stat[i];


    cblas_dcopy(n, r, incx, v, incy);

    if (iter == 0)
    {
      cblas_dcopy(n, r, incx, w, incy);

      cblas_dcopy(n, w, incx, p, incy);
    }

    alphaf = 1.0;
    betaf  = 0.0;
    cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, p, incx, betaf, Ap, incy);

    pAp    = cblas_ddot(n, p, incx, Ap, incy);

    /*}
    else
    {
    alphaf = 1.0;
    betaf  = 0.0;
    dgemv_( &notrans, (integer *)&n, (integer *)&n, &alphaf, vec, (integer *)&n, p, &incx, &betaf, Ap, &incy );

    pAp    = ddot_( (integer *)&n, p, &incx, Ap, &incy );*/

    if (pAp == 0)
    {
      if (verbose > 0)
        printf("\n Operation non conform alpha at the iteration %d \n", iter);

      free(r);
      free(fric);
      free(p);
      free(v);
      free(w);
      free(Ap);
      free(xi);
      free(z);
      free(fric1);
      free(stat);
      free(statusi);

      *info = 2;

      return;
    }

    /*} */

    rp     = cblas_ddot(n, r, incx, p, incy);

    alpha  = rp / pAp;

    cblas_dcopy(n, reaction, incx, xi, incy);

    alphaf = alpha;
    cblas_daxpy(n, alphaf, p, incx, xi, incy);

    FrictionContact2D_projc(xi, &n, statusi, p, fric, reaction, stat);


    /*         r(:)=b(:)-matmul(A,x)          */

    cblas_dcopy(n, problem->q, incx, r, incy);

    alphaf = -1.;
    betaf  = -1.;
    cblas_dgemv(CblasColMajor,CblasNoTrans, n, n, alphaf, vec, n, reaction, incx, betaf, r, incy);

    FrictionContact2D_projf(statusi, &n, r, fric, w);

    FrictionContact2D_projf(statusi, &n, p, fric, z);


    wAp    = cblas_ddot(n, w, incx, Ap, incy);

    beta   = - wAp / pAp;

    cblas_dcopy(n, w, incx, p, incy);

    alphaf  = beta;
    cblas_daxpy(n, alphaf, z, incx, p, incy);


    /*  alphaf  = 1.;
    betaf   = 0.;
    dgemv_( &notrans, (integer *)&n, (integer *)&n, &alphaf, vec , (integer *)&n, p, &incx, &betaf, Ap, &incy );

    pAp     = ddot_( (integer *)&n, p, &incx, Ap, &incy );*/

    cblas_dcopy(n, r, incx, xi, incy);

    alphaf  = -1.;
    cblas_daxpy(n, alphaf, v, incx, xi, incy);

    num     = cblas_ddot(n, xi, incx, xi, incy);

    den     = cblas_ddot(n, v, incx, v, incy);

    normr   = sqrt(num / den);

    it_end  = iter;
    res     = normr;


    options->iparam[1] = it_end;
    options->dparam[1] = res;


    iter = iter + 1;

  }




  if (normr < tol)
  {

    if (verbose > 0)
      printf("convergence after %d iterations with a residual %g\n", iter - 1, normr);

    *info = 0;


  }
  else
  {
    if (verbose > 0)
      printf("no convergence after %d iterations with a residual %g\n", iter - 1, normr);

    *info = 1;
  }


  alpha = -1.;
  cblas_dscal(n , alpha , r , incx);

  cblas_dcopy(n, r, incx, velocity, incy);



  free(fric);
  free(p);
  free(v);
  free(w);
  free(Ap);
  free(xi);
  free(z);
  free(fric1);
  free(stat);
  free(statusi);
  free(r);




}
示例#20
0
double dotprod2(int N,double *X,int incx,double *Y,int incy) {
    return cblas_ddot(N, X, incx, Y, incy);
}
示例#21
0
extern "C" magma_int_t
magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb,
              double *a,  magma_int_t lda,
              double *e, double *tau,
              double *w,  magma_int_t ldw,
              double *da, magma_int_t ldda,
              double *dw, magma_int_t lddw,
              double *dwork, magma_int_t ldwork)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    DLATRD2 reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD2_GPU. It uses an
    accelerated HEMV that needs extra memory.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    E       (output) DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */
    
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
    
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;

    double value = MAGMA_D_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    double alpha;
    double *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside dlatrd
    
    if (lapackf77_lsame(uplo_, "U")) {

        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                
                alpha = *A(i-1, i);
                
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_MAKE( 1, 0 );
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                //#if (GPUSHMEM < 200)
                //magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                //            dA(0, i), ione, c_zero, dW(0, iw), ione);
                //#else
                magmablas_dsymv_work(MagmaUpper, i, c_one, dA(0, 0), ldda,
                                     dA(0, i), ione, c_zero, dW(0, iw), ione,
                                     dwork, ldwork);
                //#endif
                
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            lapackf77_dlacgv(&i, A(i ,0), &lda);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_MAKE( 1, 0 );
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                //#if (GPUSHMEM < 200)
                //magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                //            dW(i+1, i), ione);
                //#else
                magmablas_dsymv_work('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                                     dW(i+1, i), ione,
                                     dwork, ldwork);
                //#endif
        
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
            
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i!=0)
                  blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);
    magma_queue_destroy( stream );

    return 0;
} /* dlatrd */
int  LineSearchGP(FrictionContactProblem* localproblem,
                  computeNonsmoothFunction  Function,
                  double * t_opt,
                  double R[3],
                  double dR[3],
                  double *rho,
                  int LSitermax,
                  double * F,
                  double * A,
                  double * B,
                  double * velocity)
{
  double alpha = *t_opt;

  double inf = 1e20;

  double alphamin = 0.0;
  double alphamax = inf;

  double m1 = 0.1, m2 = 0.9;


  /*     // store the value fo the function */
  /*     double F[3]={0.,0.,0.}; */

  /*     // Store the (sub)-gradient of the function */
  /*     double A[9]={0.,0.,0.,0.,0.,0.,0.,0.,0.}; */
  /*     double B[9]={0.,0.,0.,0.,0.,0.,0.,0.,0.}; */

  /*     double velocity[3]={0.,0.,0.}; */

  double mu = localproblem->mu[0];
  double * qLocal = localproblem->q;
  double * MLocal = localproblem->M->matrix0;

  /*     for (int i=0; i<3; i++) velocity[i] = MLocal[i+0*3]*R[0] + qLocal[i] */
  /*          + MLocal[i+1*3]*R[1] + */
  /*          + MLocal[i+2*3]*R[2] ; */

  /*     Function(R,velocity,mu,rho,F,A,B); */


  // Computation of q(t) and q'(t) for t =0

  double q0 = 0.5 * cblas_ddot(3 , F , 1 , F , 1);

  double tmp[3] = {0., 0., 0.};

  // Value of AW+B
  double AWplusB[9] = {0., 0., 0., 0., 0., 0., 0., 0., 0.};

  // compute A MLocal +B
  for (int i = 0; i < 3; i++)
  {
    for (int j = 0; j < 3; j++)
    {
      AWplusB[i + 3 * j] = 0.0;
      for (int k = 0; k < 3; k++)
      {
        AWplusB[i + 3 * j] += A[i + 3 * k] * MLocal[k + j * 3];
      }
      AWplusB[i + 3 * j] += B[i + 3 * j];
    }
  }

#ifdef VERBOSE_DEBUG
  for (int l = 0; l < 3; l++)
  {
    for (int k = 0; k < 3; k++)
    {
      printf("AWplusB[%i+3*%i] = %le\t", l, k, AWplusB[l + 3 * k]);
    }
    printf("\n");
  }
#endif

  for (int i = 0; i < 3; i++)
  {
    tmp[i] = 0.0;
    for (int j = 0; j < 3; j++)
    {
      tmp[i] += AWplusB[i + 3 * j] * dR[j]  ;
    }
  }




  double dqdt0 = 0.0;
  for (int i = 0; i < 3; i++)
  {
    dqdt0 += F[i] * tmp[i];
  }
#ifdef VERBOSE_DEBUG
  printf("q0 = %12.8e \n", q0);
  printf("dqdt0 = %12.8e \n", dqdt0);
  for (int i = 0; i < 3; i++)
  {
    printf("tmp[%i] = %12.8e \t", i, tmp[i]);
  }
  printf("\n");
  for (int i = 0; i < 3; i++)
  {
    printf("dR[%i] = %12.8e \t", i, dR[i]);
  }
  printf("\n");
#endif

  for (int iter = 0; iter < LSitermax; iter++)
  {

    for (int i = 0; i < 3; i++)  tmp[i] = R[i] + alpha * dR[i];

    for (int i = 0; i < 3; i++) velocity[i] = MLocal[i + 0 * 3] * tmp[0] + qLocal[i]
          + MLocal[i + 1 * 3] * tmp[1] +
          + MLocal[i + 2 * 3] * tmp[2] ;

    Function(tmp, velocity, mu, rho, F, NULL, NULL);

    double q  = 0.5 * cblas_ddot(3 , F , 1 , F , 1);

    double slope = (q - q0) / alpha;

#ifdef VERBOSE_DEBUG
    printf("q = %12.8e \n", q);
    printf("slope = %12.8e \n", slope);
#endif


    int C1 = (slope >= m2 * dqdt0);
    int C2 = (slope <= m1 * dqdt0);

    if (C1 && C2)
    {
#ifdef VERBOSE_DEBUG
      printf("Sucess in LS: alpha = %12.8e\n", alpha);
#endif
      *t_opt = alpha;
      if (verbose > 1)
      {
        printf("-----------------------------------------    LineSearchGP success number of iteration = %i  alpha = %.10e \n", iter, alpha);
      }
      return 0;

    }
    else if (!C1)
    {
#ifdef VERBOSE_DEBUG
      printf("LS: alpha too small = %12.8e\t, slope =%12.8e\n", alpha, slope);
      printf(" m1*dqdt0 =%12.8e\t, m2*dqdt0 =%12.8e\n ", m1 * dqdt0 , m2 * dqdt0);
#endif
      //std::cout << "t = " << t << " is too small : slope = " << slope << ", m2*qp0 = " << m2*qp0 << std::endl;
      alphamin = alpha;
    }
    else   // not(C2)
    {
#ifdef VERBOSE_DEBUG
      printf("LS: alpha too big = %12.8e\t, slope =%12.8e\n", alpha, slope);
      printf(" m1*dqdt0 =%12.8e\t, m2*dqdt0 =%12.8e\n ", m1 * dqdt0 , m2 * dqdt0);
#endif
      //std::cout << "t = " << t << " is too big : slope = " << slope << ", m1*qp0 = " << m1*qp0 << std::endl;
      alphamax = alpha;
    }
    if (alpha < inf)
    {
      alpha = 0.5 * (alphamin + alphamax);
    }
    else
    {
      alpha = 10 * alpha;
    }


  }
  if (verbose > 1)
  {
    printf("-----------------------------------------    LineSearchGP failed max number of iteration reached  = %i  with alpha = %.10e \n", LSitermax, alpha);
  }
  *t_opt = alpha;
  return -1;
}
 double jcho::Matrix<double>::dot(const Matrix<double> &v) const {
   AssertSameDimensions(v);
   AssertIsVector();
   return cblas_ddot(m() == 1 ? n() : m(), _storage, 1, v._storage, 1);
 }  
void computeFGlobal_AC(double* reaction, double* FGlobal)
{

  int numberOfContacts =  globalFC3D->numberOfContacts;

  int n = 3 * numberOfContacts;

  NumericsMatrix * MGlobal = globalFC3D->M;
  double * MLocal = localFC3D->M->matrix0;
  double * qLocal = localFC3D->q;
  double *mu = globalFC3D->mu;


  int contact, diagPos = 0, position;
  int in, it, is, inc, incx;
  double * reactionLocal;
  double alpha, det, beta, num, coef2, mrn;
  for (contact = 0; contact < numberOfContacts; ++contact)
  {
    position = 3 * contact;
    if (MGlobal->storageType == 1) /* Sparse storage */
    {
      /* The part of MGlobal which corresponds to the current block is copied into MLocal */
      diagPos = numberOfContacts * contact + contact;
      MLocal = MGlobal->matrix1->block[diagPos];
    }
    else if (MGlobal->storageType == 0)
    {
      in = 3 * contact;
      it = in + 1;
      is = it + 1;
      inc = n * in;
      double *MM = MGlobal->matrix0;
      /* The part of MM which corresponds to the current block is copied into MLocal */
      MLocal[0] = MM[inc + in];
      MLocal[1] = MM[inc + it];
      MLocal[2] = MM[inc + is];
      inc += n;
      MLocal[3] = MM[inc + in];
      MLocal[4] = MM[inc + it];
      MLocal[5] = MM[inc + is];
      inc += n;
      MLocal[6] = MM[inc + in];
      MLocal[7] = MM[inc + it];
      MLocal[8] = MM[inc + is];
    }

    reactionLocal = &reaction[3 * contact];
    incx = 3;
    velocityLocal[0] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[0];
    velocityLocal[1] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[1];
    velocityLocal[2] = cblas_ddot(3 , MLocal , incx , reactionLocal , 1) + qLocal[2];
    an = 1. / MLocal[0];
    alpha = MLocal[4] + MLocal[8];
    det = MLocal[4] * MLocal[8] - MLocal[7] + MLocal[5];
    beta = alpha * alpha - 4 * det;
    at = 2 * (alpha - beta) / ((alpha + beta) * (alpha + beta));
    projN = reactionLocal[0] - an * velocityLocal[0];
    projT = reactionLocal[1] - at * velocityLocal[1];
    projS = reactionLocal[2] - at * velocityLocal[2];
    coef2 = mu[contact] * mu[contact];
    if (projN > 0)
    {
      FGlobal[position] = velocityLocal[0];
    }
    else
    {
      FGlobal[position] = reactionLocal[0] / an;
    }

    mrn = projT * projT + projS * projS;
    if (mrn <= coef2 * reactionLocal[0]*reactionLocal[0])
    {
      FGlobal[position + 1] = velocityLocal[1];
      FGlobal[position + 2] = velocityLocal[2];
    }
    else
    {
      num  = mu[contact] / sqrt(mrn);
      FGlobal[position + 1] = (reactionLocal[1] - projT * reactionLocal[0] * num) / at;
      FGlobal[position + 2] = (reactionLocal[2] - projS * reactionLocal[0] * num) / at;
    }
  }
}
示例#25
0
/**
 * Backtracking linesearch for the aproximate tangent direction
 */
int linesearch_atd(state_t* state , parameters_t  pars, problem_t prob)
{
    double a0 = 1.; //Initial step length
    double a  = a0;
    state->nbacktrack = 0;
    // Calculate the largest step before either tau or kappa reach the boundary
    if(state->dkappa < 0) a = fmin(a,-state->kappa/state->dkappa);
    if(state->dtau < 0) a = fmin(a,-state->tau/state->dtau);
    //If with the full step either reaches the boundary make sure that after
    //the step the new trial tau or kappa is at most pars.eta of the way to the boundary.
    if(a<a0) a = a*pars.eta;

    //  allocate work vectors for the trial steps
    double* xa = (double*)calloc(prob.A.n,sizeof(double));
    double* sa = (double*)calloc(prob.A.n,sizeof(double));
    //Allocate work vectors to calculate the centrality
    double* psi  = (double*)calloc(prob.A.n,sizeof(double));
    double* hpsi = (double*)calloc(prob.A.n,sizeof(double));
    double kappaa;
    double taua;
    double dga;
    double mua;

    //TODO: add clean up before return?
    if(xa == NULL)   return OUT_OF_MEMORY;
    if(sa == NULL)   return OUT_OF_MEMORY;
    if(psi == NULL)  return OUT_OF_MEMORY;
    if(hpsi == NULL) return OUT_OF_MEMORY;
    
    //Counter for the number of backtracks
    int nsect = 0;
    int j = 0;

    //Define some variables
    //used in the loop.
    bool dFeas, pFeas; //Flags to indicate feasiblity
    bool dosect;       //Flag to indicate if there should be a backtrack

    double centmeas;   //Present value of centrality measure

    for(j=0;j<pars.max_backtrack;j++)
    {
          dosect = false;
    
          //Take an a sized step in x, tau and s, kappa, and store in xa, sa, taua, kappaa
          cblas_dcopy(prob.A.n,state->x,1,xa,1);
          cblas_dcopy(prob.A.n,state->s,1,sa,1);
          cblas_daxpy(prob.A.n,a,state->dx,1,xa,1);
          cblas_daxpy(prob.A.n,a,state->ds,1,sa,1);
          taua      = state->tau + a*state->dtau;
          kappaa    = state->kappa + a*state->dkappa;
            
          //Calculate the duality gap at the new trial point
          dga    = cblas_ddot(prob.A.n,xa,1,sa,1) + taua*kappaa;
          mua    = dga / (prob.nu + 1); 
         
          //Check if x,s are feasible wrt the cones
          dFeas = dual_feas(prob,sa);
          pFeas = primal_feas(prob,xa);
    
         //If not primal or dual feasible backtrack
        if(!pFeas)
        {
            dosect = true; 
            //printf("C: Not primal feasible\n");
        }
        else if(!dFeas)
        {
            dosect = true; 
            //printf("C: Not dual feasible\n");
        }
        else //If both primal and dual feasible measure the centrality
        {
            centmeas = eval_cent_meas(prob,xa,sa,*state,mua,psi,hpsi);
            //Decide if we need to backtrack
            if(centmeas > mua*pars.theta)
            {
                dosect = true;
            }
            
            //printf("Evaluating dist: %g, %g, %i:\n",centmeas,mua*pars.theta,dosect);
        }
        //If we must backtrack do so
        if(dosect)
        {
            a = a*pars.lscaff;
            state->nbacktrack += 1; 
        }
        else
        {
            break;
        }
         
    }  
 
    //Check if the backtrack reached the maximum number of iterates
    if(j==pars.max_backtrack)
    {
       //If the backtrack fails do not update the vectors and release 
       //the local work vectors, and test vectors 
       free(xa);
       free(sa);
       free(psi);
       free(hpsi);
       state->a = a;   
       return BACKTRACK_FAIL; 
    }
    else
    {
        //If the backtrack succedded accept the iterate
        //store it in state and take the step in y

        //Copy the successeful xa and sa to state.x state.s
        //just switching the pointers does not work because
        //we should not free a matlab allocated vector
        cblas_dcopy(prob.A.n,xa,1,state->x,1);
        cblas_dcopy(prob.A.n,sa,1,state->s,1);
        state->tau    = taua;
        state->kappa  = kappaa;
       
        //Take the step in y
        cblas_daxpy(prob.A.m,a,state->dy,1,state->y,1);
        state->a = a;   

        //Free the work vectors
        free(xa);
        free(sa);
        free(psi);
        free(hpsi);

        return OK;
    }

    //state->a = a;   
    //return OK;
}
示例#26
0
// Initialises the CG solver
void cg_solver_init(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        const int coefficient,
        double rx,
        double ry,
        double rz,
        double* rro,
        double* density,
        double* energy,
        double* vec_u,
        double* vec_p,
        double* vec_r,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    if(coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY)
    {
        die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient);
    }

#pragma omp parallel for
    for(int ii = 0; ii < z; ++ii)
    {
        for(int jj = 0; jj < y; ++jj)
        {
            for(int kk = 0; kk < x; ++kk)
            {
                const int index = ii*y*x+jj*x+kk;
                vec_p[index] = 0.0;
                vec_r[index] = 0.0;
                vec_u[index] = energy[index]*density[index];
            }
        }
    }

#pragma omp parallel for
    for(int ii = 1; ii < z-1; ++ii)
    {
        for(int jj = 1; jj < y-1; ++jj)
        {
            for(int kk = 1; kk < x-1; ++kk)
            {
                const int index = ii*y*x+jj*x+kk;
                vec_w[index] = (coefficient == CONDUCTIVITY) 
                    ? density[index] : 1.0/density[index];
            }
        }
    }

#pragma omp parallel for
    for(int ii = halo_depth; ii < z-1; ++ii)
    {
        for(int jj = halo_depth; jj < y-1; ++jj)
        {
            for(int kk = halo_depth; kk < x-1; ++kk)
            {
                const int index = ii*x*y + jj*x + kk;
                vec_kx[index] = rx*(vec_w[index-1]+vec_w[index]) /
                    (2.0*vec_w[index-1]*vec_w[index]);
                vec_ky[index] = ry*(vec_w[index-x]+vec_w[index]) /
                    (2.0*vec_w[index-x]*vec_w[index]);
                vec_kz[index] = rz*(vec_w[index-x*y]+vec_w[index]) /
                    (2.0*vec_w[index-x*y]*vec_w[index]);
            }
        }
    }

    // Initialise the CSR sparse coefficient matrix
    for(int ii = halo_depth; ii < z-1; ++ii)
    {
        for(int jj = halo_depth; jj < y-1; ++jj)
        {
            for(int kk = halo_depth; kk < x-1; ++kk)
            {
                const int index = ii*x*y + jj*x + kk;
                int coef_index = a_row_index[index];

                if(ii >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kz[index];
                    a_col_index[coef_index++] = index-x*y;
                }

                if(jj >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_ky[index];
                    a_col_index[coef_index++] = index-x;
                }

                if(kk >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kx[index];
                    a_col_index[coef_index++] = index-1;
                }

                a_non_zeros[coef_index] = (1.0 + 
                        vec_kx[index+1] + vec_kx[index] + 
                        vec_ky[index+x] + vec_ky[index] + 
                        vec_kz[index+x*y] + vec_kz[index]);
                a_col_index[coef_index++] = index;

                if(ii < z-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kz[index+x*y];
                    a_col_index[coef_index++] = index+x*y;
                }

                if(jj < y-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_ky[index+x];
                    a_col_index[coef_index++] = index+x;
                }

                if(kk < x-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kx[index+1];
                    a_col_index[coef_index] = index+1;
                }
            }
        }
    }

    double rro_temp = 0.0;

    int m = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &m, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w);

    int x_inner = x-2*halo_depth;

#pragma omp parallel for reduction(+:rro_temp)
    for(int ii = halo_depth; ii < z-halo_depth; ++ii)
    {
        for(int jj = halo_depth; jj < y-halo_depth; ++jj)
        {
            const int offset = ii*y*x + jj*x + halo_depth;
            cblas_dcopy(x_inner, vec_u + offset, 1, vec_r + offset, 1);
            cblas_daxpy(x_inner, -1.0, vec_w + offset, 1, vec_r + offset, 1);
            cblas_dcopy(x_inner, vec_r + offset, 1, vec_p + offset, 1);
            rro_temp += cblas_ddot(x_inner, vec_r + offset, 1, vec_p + offset, 1);
        }
    }

    // Sum locally
    *rro += rro_temp;
}
示例#27
0
void fc3d_projectionWithDiagonalization_update(int contact, FrictionContactProblem* problem, FrictionContactProblem* localproblem,  double* reaction, SolverOptions* options)
{
  /* Build a local problem for a specific contact
     reaction corresponds to the global vector (size n) of the global problem.
  */

  /* Call the update function which depends on the storage for MGlobal/MBGlobal */
  /* Build a local problem for a specific contact
   reaction corresponds to the global vector (size n) of the global problem.
  */

  /* The part of MGlobal which corresponds to the current block is copied into MLocal */
  fc3d_nsgs_fillMLocal(problem, localproblem, contact);

  /****  Computation of qLocal = qBlock + sum over a row of blocks in MGlobal of the products MLocal.reactionBlock,
     excluding the block corresponding to the current contact. ****/

  NumericsMatrix * MGlobal = problem->M;
  double * MLocal =  localproblem->M->matrix0;


  double *qLocal = localproblem->q;
  double * qGlobal = problem->q;
  int n = 3 * problem->numberOfContacts;


  int in = 3 * contact, it = in + 1, is = it + 1;
  /* reaction current block set to zero, to exclude current contact block */
  /*   double rin= reaction[in] ; double rit= reaction[it] ; double ris= reaction[is] ;  */
  /* qLocal computation*/
  qLocal[0] = qGlobal[in];
  qLocal[1] = qGlobal[it];
  qLocal[2] = qGlobal[is];

  if (MGlobal->storageType == 0)
  {
    double * MM = MGlobal->matrix0;
    int incx = n, incy = 1;
    qLocal[0] += cblas_ddot(n , &MM[in] , incx , reaction , incy);
    qLocal[1] += cblas_ddot(n , &MM[it] , incx , reaction , incy);
    qLocal[2] += cblas_ddot(n , &MM[is] , incx , reaction , incy);
    // Substract diagonal term
    qLocal[0] -= MM[in + n * in] * reaction[in];
    qLocal[1] -= MM[it + n * it] * reaction[it];
    qLocal[2] -= MM[is + n * is] * reaction[is];
  }
  else if (MGlobal->storageType == 1)
  {
    /* qLocal += rowMB * reaction
       with rowMB the row of blocks of MGlobal which corresponds to the current contact
    */
    subRowProdSBM(n, 3, contact, MGlobal->matrix1, reaction, qLocal, 0);
    // Substract diagonal term
    qLocal[0] -= MLocal[0] * reaction[in];
    qLocal[1] -= MLocal[4] * reaction[it];
    qLocal[2] -= MLocal[8] * reaction[is];

  }
  /*   reaction[in] = rin; reaction[it] = rit; reaction[is] = ris; */

  /* Friction coefficient for current block*/
  localproblem->mu[0] = problem->mu[contact];
}
int globalLineSearchGP(
  fc3d_nonsmooth_Newton_solvers* equation,
  double *reaction,
  double *velocity,
  double *mu,
  double *rho,
  double *F,
  double *A,
  double *B,
  NumericsMatrix *W,
  double *qfree,
  NumericsMatrix *AWpB,
  double *direction,
  double *tmp,
  double alpha[1],
  unsigned int maxiter_ls)
{
  unsigned problemSize = W->size0;

  double inf = 1e10;
  double alphamin = 0.0;
  double alphamax = inf;

  double m1 = 0.01, m2 = 0.99;

  // Computation of q(t) and q'(t) for t =0

  double q0 = 0.5 * cblas_ddot(problemSize, F, 1, F, 1);

  if (isnan(q0) || isinf(q0))
  {
    if (verbose > 0)
    {
      fprintf(stderr, "global line search warning. q0 is not a finite number.\n");
    }
    return -1;
  }

  //  tmp <- AWpB * direction
  NM_gemv(1., AWpB, direction, 0., tmp);

  double dqdt0 = cblas_ddot(problemSize, F, 1, tmp, 1);

  for (unsigned int iter = 0; iter < maxiter_ls; ++iter)
  {

    // tmp <- alpha*direction+reaction
    cblas_dcopy(problemSize, reaction, 1, tmp, 1);
    cblas_daxpy(problemSize, alpha[0], direction, 1, tmp, 1);

    // velocity <- W*tmp + qfree
    cblas_dcopy(problemSize, qfree, 1, velocity, 1);
    NM_gemv(1., W, tmp, 1., velocity);

    equation->function(equation->data, problemSize, tmp,
                       velocity, mu, rho, F,
                       NULL, NULL);

    double q  = 0.5 * cblas_ddot(problemSize, F, 1, F, 1);

    if (isnan(q) || isinf(q))
    {
      printf("global line search warning. q is not a finite number.\n");
      return -1;
    }

    assert(q >= 0);

    double slope = (q - q0) / alpha[0];

    int C1 = (slope >= m2 * dqdt0);
    int C2 = (slope <= m1 * dqdt0);

    if (C1 && C2)
    {
      if (verbose > 0)
      {
        printf("             globalLineSearchGP. success. ls_iter = %i  alpha = %.10e, q = %.10e\n", iter, alpha[0], q);
      }

      return 0;

    }
    else if (!C1)
    {
      alphamin = alpha[0];
    }
    else
    {
      // not(C2)
      alphamax = alpha[0];
    }

    if (alpha[0] < inf)
    {
      alpha[0] = 0.5 * (alphamin + alphamax);
    }
    else
    {
      alpha[0] = alphamin;
    }

  }
  if (verbose > 0)
  {
    printf("global line search reached the  max number of iteration  = %i  with alpha = %.10e \n", maxiter_ls, alpha[0]);
  }

  return -1;
}
int test_subRowprodNonSquare(NumericsMatrix* M3, NumericsMatrix* M4)
{

  printf("== Numerics tests: subRowProdNonSquare(NumericsMatrix,vector) == \n");
  int i , n = M3->size0, m = M3->size1;
  double * x = (double *)malloc(m * sizeof(double));

  for (i = 0; i < m; i++)
  {
    x[i] = i + 1;
  }

  int min = 1;
  int max = 3;
  int sizeY = max - min;
  int sizeX = m;
  /* Computes yRef = subA*x, subA = A limited to row min to max*/
  double * y = (double *)malloc(sizeY * sizeof(double));
  double yref[2];
  int incx = n, incy = 1;
  for (i = 0; i < sizeY; i++)
    yref[i] = cblas_ddot(m, &(M3->matrix0[min + i]), incx, x, incy);

  subRowProd(sizeX, sizeY, min, M3, x, y, 1);
  double tol = 1e-12;
  int info = 0;
  for (i = 0; i < sizeY; i++)
  {
    if (fabs(y[i] - yref[i]) > tol) info = 1;
    /*       printf("%lf\n", fabs(y[i]-yref[i]));  */
    /*           printf("%lf\n", y[i]); */
    /*           printf("%lf\n", yref[i]); */
  }
  if (info == 0)
    printf("Step 0 ( y = subA*x, double* storage NonSquare) ok ...\n");
  else
    printf("Step 0 ( y = subA*x, double* storage NonSquare) failed ...\n");

  /* += */
  subRowProd(sizeX, sizeY, min, M3, x, y, 0);
  for (i = 0; i < sizeY; i++)
  {
    if (fabs(y[i] - 2 * yref[i]) > tol) info = 1;
    /*         printf("%lf\n", fabs(y[i]-2*yref[i]));  */
    /*           printf("%lf\n", y[i]); */
    /*           printf("%lf\n", 2*yref[i]); */
  }
  if (info == 0)
    printf("Step 1 ( y += subA*x, double* storage NonSquare) ok ...\n");
  else
    printf("Step 1 ( y += subA*x, double* storage NonSquare) failed ...\n");


  free(y);


  sizeY = 2;
  int pos = 1; // pos of the required row of blocks
  y = (double *)malloc(sizeY * sizeof(double));

  for (i = 0; i < sizeY; i++)
  {
    y[i] = 0.0;
    yref[i] = cblas_ddot(m, &(M3->matrix0[4 + i]), incx, x, incy);
  }
  /* Sparse ... */
  subRowProd(sizeX, sizeY, pos, M4, x, y, 1);
  for (i = 0; i < sizeY; i++)
  {
    if (fabs(y[i] - yref[i]) > tol) info = 1;
    //  printf("%lf\n", fabs(y[i]-yref[i]));
  }
  for (i = 0; i < sizeY; i++)
    yref[i] = cblas_ddot(m, &(M3->matrix0[6 + i]), incx, x, incy);
  subRowProd(sizeX, sizeY, pos + 1, M4, x, y, 1);
  for (i = 0; i < sizeY; i++)
  {
    if (fabs(y[i] - yref[i]) > tol) info = 1;
    //  printf("%lf\n", fabs(y[i]-yref[i]));
  }


  if (info == 0)
    printf("Step 2 ( y = subA*x, sparse storage NonSquare) ok ...\n");
  else
    printf("Step 2 ( y = subA*x,  sparse storage NonSquare) failed ...\n");

  /* Sparse, += ... */
  subRowProd(sizeX, sizeY, pos + 1, M4, x, y, 0);
  for (i = 0; i < sizeY; i++)
  {
    if (fabs(y[i] - 2 * yref[i]) > tol) info = 1;
    /*       printf("%lf\n", fabs(y[i]-yref[i])); */
  }
  if (info == 0)
    printf("Step 3 ( y += subA*x, sparse storage) ok ...\n");
  else
    printf("Step 3 ( y += subA*x,  sparse storage) failed ...\n");


  free(x);
  free(y);
  printf("== End of test subRowProd(NumericsMatrix,vector), result = %d\n", info);

  return info;
}
示例#30
0
double vector_t::dot(vector_t const &b) const
{
	stack_assert(len == b.len);
	stack::fe_asserter dummy{};
	return cblas_ddot(len, data.get(), inc, b.data.get(), b.inc);
}