예제 #1
0
파일: linalg.c 프로젝트: zauberkraut/acmi
/* Computes the Frobenius norm of a matrix. */
double nrm2(Mat mA) {
  const int n2 = MatN2(mA);
  const void* a = MatElems(mA);
  const bool dev = MatDev(mA);
  double norm;

  switch (MatElemSize(mA)) {
  case 4:
    if (dev) {
      float norm32;
      cublasSnrm2(g_cublasHandle, n2, a, 1, (float*)&norm32);
      norm = norm32;
    } else {
      norm = cblas_snrm2(n2, a, 1);
    }
    break;

  case 8:
    if (dev) {
      cublasDnrm2(g_cublasHandle, n2, a, 1, (double*)&norm);
    } else {
      norm = cblas_dnrm2(n2, a, 1);
    }
    break;
  }

  return norm;
}
예제 #2
0
bool nrm2(REAL &norm, const RealVector &x) {
  bool flag = true;
  UINT N, incX;
  N = x.size;
  incX = 1;
  if (NULL == &x) {
    flag = false;
	  goto end;
  }

  norm = cblas_snrm2(N, x.M, incX);

end:
  return flag;
}
예제 #3
0
파일: c.c 프로젝트: kod3r/cpp-cheat
int main(void)
{
    int info, ipiv2[2];
    float err = 1e-6;
    float x2[2], b2[2], c2[2];
    float a2x2[2][2];

    //#cblas

        x2[0] =  1.0;
        x2[1] = -2.0;
        assert_eqd( cblas_snrm2(2, x2, 1), sqrt(5.0), err );

    //#lapacke

        //1 2 x = 5
        //3 4 y   11

        //x = 1
        //y = 1

        a2x2[0][0] = 1.0;
        a2x2[1][0] = 2.0;
        a2x2[0][1] = 3.0;
        a2x2[1][1] = 4.0;
        b2[0] = 5.;
        b2[1] = 11.;

        info = LAPACKE_sgesv(
            LAPACK_COL_MAJOR, //COL or ROW
            2,                //n
            1,                //nrhs
            &a2x2[0][0],
            2,                //lda
            &ipiv2[0],
            &b2[0],
            2                 //ldb
        );
        c2[0] = 1.0;
        c2[1] = 2.0;
        assert_eqi( info, 0 );
        assert_eqd( b2[0], c2[0], err );
        assert_eqd( b2[1], c2[1], err );

    return EXIT_SUCCESS;
}
예제 #4
0
VALUE rb_blas_xnrm2(int argc, VALUE *argv, VALUE self)
{
  Matrix *dx;
  int incx;
  int incy;
  int n;
  //char error_msg[64];
  VALUE n_value,  incx_value;
  
  rb_scan_args(argc, argv, "02",   &incx_value, &n_value);
  
  Data_Get_Struct(self, Matrix, dx);

  if(incx_value == Qnil)
    incx = 1;
  else
    incx = NUM2INT(incx_value);
  
  if(n_value == Qnil)
    n = dx->nrows;
  else
    n = NUM2INT(n_value);

  if(dx == NULL || dx->ncols != 1)
  { //sprintf(error_msg, "Self is not a Vector");
    rb_raise(rb_eRuntimeError, "Self is not a Vector");
  }
  
  switch(dx->data_type)
  {
  case Single_t: //s
    return rb_float_new(cblas_snrm2(n , (float *)dx->data, incx)); 
  case Double_t: //d
    return rb_float_new(cblas_dnrm2(n , (double *)dx->data, incx)); 
  case Complex_t: //c
    return rb_float_new(cblas_scnrm2(n , dx->data, incx)); 
  case Double_Complex_t: //z
    return rb_float_new(cblas_dznrm2(n , dx->data, incx)); 
  default:
    //sprintf(error_msg, "Invalid data_type (%d) in Matrix", dx->data_type);
    rb_raise(rb_eRuntimeError, "Invalid data_type (%d) in Matrix", dx->data_type);
    return Qnil; //Never reaches here.
  }
}
예제 #5
0
int opt_sgd(const int N,
						sgradfunc gradient,
						sfunc f,
						float *x,
						const sgdparams params)
{
	static int    max_iters ;
	static scalar eps ;
	static scalar delta ;
	static scalar fnow, fprev ;
	static int    i, cond ;
	static int    init = 0 ;

	if (!init)
	{
		atexit(&at_exit) ;
	}
	
	max_iters = params.max_iters ;
	eps       = params.tol_grad ;
	delta     = params.tol_func ;
	
	g = realloc(g, N*sizeof(scalar)) ;
	if(!params.initialized)
		for(i=0;i<N;++i)
			x[i] = 0;
	fnow = f(N, x) ;
	
	i = 0 ;
	do
	{
		fprev = fnow ;
		gradient(N, x, g) ;
		cblas_daxpy(N,-1,g,1,x,1) ;
		fnow = f(N, x) ;
		cond = fabs(fnow-fprev) > delta * fabs(fnow) ;
		cond = cond && (cblas_snrm2(N, g, 1) > eps) ;
		cond = cond && (++i < max_iters) ;
	} while (cond) ;
	if (i >= max_iters) return 0 ;
	return 1 ;
}
예제 #6
0
float HostVector<float>::Norm(void) const {

  return cblas_snrm2(this->size_, this->vec_, 1);

}
예제 #7
0
extern "C" magma_int_t
magma_sgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
            float *a, magma_int_t lda,
            float *WR, float *WI,
            float *vl, magma_int_t ldvl,
            float *vr, magma_int_t ldvr,
            float *work, magma_int_t lwork,
            magma_int_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       September 2012

    Purpose   
    =======   
    SGEEV computes for an N-by-N real nonsymmetric matrix A, the   
    eigenvalues and, optionally, the left and/or right eigenvectors.   

    The right eigenvector v(j) of A satisfies   
                     A * v(j) = lambda(j) * v(j)   
    where lambda(j) is its eigenvalue.   
    The left eigenvector u(j) of A satisfies   
                  u(j)**T * A = lambda(j) * u(j)**T   
    where u(j)**T denotes the transpose of u(j).   

    The computed eigenvectors are normalized to have Euclidean norm   
    equal to 1 and largest component real.   

    Arguments   
    =========   
    JOBVL   (input) CHARACTER*1   
            = 'N': left eigenvectors of A are not computed;   
            = 'V': left eigenvectors of are computed.   

    JOBVR   (input) CHARACTER*1   
            = 'N': right eigenvectors of A are not computed;   
            = 'V': right eigenvectors of A are computed.   

    N       (input) INTEGER   
            The order of the matrix A. N >= 0.   

    A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)   
            On entry, the N-by-N matrix A.   
            On exit, A has been overwritten.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    WR      (output) DOUBLE PRECISION array, dimension (N)   
    WI      (output) DOUBLE PRECISION array, dimension (N)   
            WR and WI contain the real and imaginary parts,
            respectively, of the computed eigenvalues.  Complex
            conjugate pairs of eigenvalues appear consecutively
            with the eigenvalue having the positive imaginary part
            first.

    VL      (output) DOUBLE PRECISION array, dimension (LDVL,N)   
            If JOBVL = 'V', the left eigenvectors u(j) are stored one   
            after another in the columns of VL, in the same order   
            as their eigenvalues.   
            If JOBVL = 'N', VL is not referenced.   
            u(j) = VL(:,j), the j-th column of VL.   

    LDVL    (input) INTEGER   
            The leading dimension of the array VL.  LDVL >= 1; if   
            JOBVL = 'V', LDVL >= N.   

    VR      (output) DOUBLE PRECISION array, dimension (LDVR,N)   
            If JOBVR = 'V', the right eigenvectors v(j) are stored one   
            after another in the columns of VR, in the same order   
            as their eigenvalues.   
            If JOBVR = 'N', VR is not referenced.   
            v(j) = VR(:,j), the j-th column of VR.   

    LDVR    (input) INTEGER   
            The leading dimension of the array VR.  LDVR >= 1; if   
            JOBVR = 'V', LDVR >= N.   

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= (1+nb)*N.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   
            > 0:  if INFO = i, the QR algorithm failed to compute all the   
                  eigenvalues, and no eigenvectors have been computed;   
                  elements and i+1:N of W contain eigenvalues which have   
                  converged.   
    =====================================================================    */

    magma_int_t c__1 = 1;
    magma_int_t c__0 = 0;
    magma_int_t c_n1 = -1;
    
    magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, 
            i__2, i__3;
    float d__1, d__2;

    magma_int_t i__, k, ihi, ilo;
    float      r__, cs, sn, scl;
    float dum[1], eps;
    magma_int_t ibal;
    float anrm;
    magma_int_t ierr, itau, iwrk, nout;
    magma_int_t scalea;
    float cscale;
    float bignum;
    magma_int_t minwrk;
    magma_int_t wantvl;
    float smlnum;
    magma_int_t lquery, wantvr, select[1];

    magma_int_t nb = 0;
    magmaFloat_ptr dT;
    //magma_timestr_t start, end;

    char side[2] = {0, 0};
    magma_vec_t jobvl_ = jobvl;
    magma_vec_t jobvr_ = jobvr;

    *info = 0;
    lquery = lwork == -1;
    wantvl = lapackf77_lsame(lapack_const(jobvl_), "V");
    wantvr = lapackf77_lsame(lapack_const(jobvr_), "V");
    if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) {
        *info = -1;
    } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -9;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -11;
    }

    /*  Compute workspace   */
    if (*info == 0) {

        nb = magma_get_sgehrd_nb(n);
        minwrk = (2+nb)*n;
        work[0] = (float) minwrk;
        
        if (lwork < minwrk && ! lquery) {
            *info = -13;
        }

    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
   
    // if eigenvectors are needed
#if defined(VERSION3)
    if (MAGMA_SUCCESS != magma_malloc( &dT, nb*n*sizeof(float) )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
#endif

    // subtract row and col for 1-based indexing
    a_dim1   = lda;
    a_offset = 1 + a_dim1;
    a       -= a_offset;
    vl_dim1   = ldvl;
    vl_offset = 1 + vl_dim1;
    vl       -= vl_offset;
    vr_dim1   = ldvr;
    vr_offset = 1 + vr_dim1;
    vr       -= vr_offset;
    --work;

    /* Get machine constants */
    eps    = lapackf77_slamch("P");
    smlnum = lapackf77_slamch("S");
    bignum = 1. / smlnum;
    lapackf77_slabad(&smlnum, &bignum);
    smlnum = magma_ssqrt(smlnum) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_slange("M", &n, &n, &a[a_offset], &lda, dum);
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_slascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, 
                &a[a_offset], &lda, &ierr);
    }

    /* Balance the matrix   
       (Workspace: need N) */
    ibal = 1;
    lapackf77_sgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr);

    /* Reduce to upper Hessenberg form   
       (Workspace: need 3*N, prefer 2*N+N*NB) */
    itau = ibal + n;
    iwrk = itau + n;
    i__1 = lwork - iwrk + 1;

    //start = get_current_time();
#if defined(VERSION1)
    /*
     * Version 1 - LAPACK
     */
    lapackf77_sgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
                     &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
    /*
     *  Version 2 - LAPACK consistent HRD
     */
    magma_sgehrd2(n, ilo, ihi, &a[a_offset], lda,
                  &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
    /*  
     * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, 
     */
    magma_sgehrd(n, ilo, ihi, &a[a_offset], lda,
                 &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue);
#endif
    //end = get_current_time();
    //printf("    Time for sgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);

    if (wantvl) {
      /*        Want left eigenvectors   
                Copy Householder vectors to VL */
        side[0] = 'L';
        lapackf77_slacpy(MagmaLowerStr, &n, &n, 
                         &a[a_offset], &lda, &vl[vl_offset], &ldvl);

        /* 
         * Generate orthogonal matrix in VL 
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) 
         */
        i__1 = lwork - iwrk + 1;

        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_sorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, 
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_sorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], 
                     dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /*
         * Perform QR iteration, accumulating Schur vectors in VL
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, 
                         &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);

        if (wantvr) {
          /* Want left and right eigenvectors   
             Copy Schur vectors to VR */
            side[0] = 'B';
            lapackf77_slacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
        }

    } else if (wantvr) {
        /*  Want right eigenvectors   
            Copy Householder vectors to VR */
        side[0] = 'R';
        lapackf77_slacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);

        /*
         * Generate orthogonal matrix in VR
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) 
         */
        i__1 = lwork - iwrk + 1;
        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_sorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, 
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_sorghr(n, ilo, ihi, &vr[vr_offset], ldvr, 
                     &work[itau], dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /* 
         * Perform QR iteration, accumulating Schur vectors in VR   
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) ) 
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    } else {
        /*  
         * Compute eigenvalues only   
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) ) 
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    }

    /* If INFO > 0 from SHSEQR, then quit */
    if (*info > 0) {
        fprintf(stderr, "SHSEQR returned with info = %d\n", (int) *info);
        goto L50;
    }

    if (wantvl || wantvr) {
        /*  
         * Compute left and/or right eigenvectors   
         *   (Workspace: need 4*N) 
         */
        lapackf77_strevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl,
                &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr);
    }

    if (wantvl) {
        /*  
         * Undo balancing of left eigenvectors   
         *   (Workspace: need N) 
         */
        lapackf77_sgebak("B", "L", &n, &ilo, &ihi, 
                         &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr);

        /* Normalize left eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if ( WI[i__-1] == 0.) {
                scl = cblas_snrm2(n, &vl[i__ * vl_dim1 + 1], 1);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vl[i__ * vl_dim1 + 1], 1);
            } else if (WI[i__-1] > 0.) {
                d__1 = cblas_snrm2(n, &vl[ i__      * vl_dim1 + 1], 1);
                d__2 = cblas_snrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1);
                scl = lapackf77_slapy2(&d__1, &d__2);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vl[ i__      * vl_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vl[(i__ + 1) * vl_dim1 + 1], 1);
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vl[k + i__ * vl_dim1];
                    /* Computing 2nd power */
                    d__2 = vl[k + (i__ + 1) * vl_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_isamax */ 
                k = cblas_isamax(n, &work[iwrk], 1)+1;
                lapackf77_slartg(&vl[k +  i__      * vl_dim1], 
                                 &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__);
                cblas_srot(n, &vl[ i__      * vl_dim1 + 1], 1, 
                           &vl[(i__ + 1) * vl_dim1 + 1], 1, cs, (sn));
                vl[k + (i__ + 1) * vl_dim1] = 0.;
            }
        }
    }

    if (wantvr) {
        /*  
         * Undo balancing of right eigenvectors   
         *   (Workspace: need N) 
         */
        lapackf77_sgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n, 
                         &vr[vr_offset], &ldvr, &ierr);

        /* Normalize right eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if (WI[i__-1] == 0.) {
                scl = 1. / cblas_snrm2(n, &vr[i__ * vr_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vr[i__ * vr_dim1 + 1], 1);
            } else if (WI[i__-1] > 0.) {
                d__1 = cblas_snrm2(n, &vr[ i__      * vr_dim1 + 1], 1);
                d__2 = cblas_snrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1);
                scl = lapackf77_slapy2(&d__1, &d__2);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vr[ i__      * vr_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vr[(i__ + 1) * vr_dim1 + 1], 1);
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vr[k + i__ * vr_dim1];
                    /* Computing 2nd power */
                    d__2 = vr[k + (i__ + 1) * vr_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_isamax */
                k = cblas_isamax(n, &work[iwrk], 1)+1;
                lapackf77_slartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1], 
                        &cs, &sn, &r__);
                cblas_srot(n, &vr[ i__      * vr_dim1 + 1], 1, 
                              &vr[(i__ + 1) * vr_dim1 + 1], 1, cs, (sn));
                vr[k + (i__ + 1) * vr_dim1] = 0.;
            }
        }
    }

    /*  Undo scaling if necessary */
L50:
    if (scalea) {
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, 
                         WR + (*info), &i__2, &ierr);
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, 
                WI + (*info), &i__2, &ierr);
        if (*info > 0) {
            i__1 = ilo - 1;
            lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, 
                    WR, &n, &ierr);
            i__1 = ilo - 1;
            lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                    WI, &n, &ierr);
        }
    }

#if defined(VERSION3)
    magma_free( dT );
#endif
    return *info;
} /* magma_sgeev */
예제 #8
0
JNIEXPORT jfloat JNICALL Java_uncomplicate_neanderthal_CBLAS_snrm2
(JNIEnv *env, jclass clazz, jint N, jobject X, jint offsetX, jint incX) {

    float *cX = (float *) (*env)->GetDirectBufferAddress(env, X);
    return cblas_snrm2(N, cX + offsetX, incX);
};
예제 #9
0
	int GaussNewton(
		void (*func)(T *x, T *r, int m, int n, void *adata),
		void (*jacf)(T *x, T *J, int m, int n, void *adata),
		T *x, T *r, T* J, int m, int n, int itmax, 
		T *opts,	/* delta,  r_threshold, diff_threshold */
		void *adata)
	{
		PhGUtils::debug("m", m, "n", n);

		float delta, R_THRES, DIFF_THRES;
		if( opts == NULL ) {
			// use default values
			delta = 1.0;	// step size, default to use standard Newton-Ralphson
			R_THRES = 1e-6;	DIFF_THRES = 1e-6;
		}
		else {
			delta = opts[0]; R_THRES = opts[1]; DIFF_THRES = opts[2];
		}

		bool allocateR = false, allocateJ = false;
		// residue
		if( r == NULL ) {
			// allocate space for residue
			allocateR = true;
			r = new T[n];
			memset(r, 0, sizeof(T)*n);
		}

		T* x0 = new T[m];
		memset(x0, 0, sizeof(T)*m);

		T* deltaX = new T[m];	// also for Jtr
		memset(deltaX, 0, sizeof(T)*m);
		cblas_scopy(m, x, 1, deltaX, 1);

		T* JtJ = new T[m * m];
		memset(JtJ, 0, sizeof(T)*m*m);

		// Jacobian
		if( J == NULL ) {
			allocateJ = true;
			J = new T[m * n];
			memset(J, 0, sizeof(T)*m*n);
		}

		// compute initial residue
		func(x, r, m, n, adata);

		//ofstream fout0("r.txt");
		//print2DArray(r, n, 1, fout0);
		//fout0.close();

		int iters = 0;

		//::system("pause");

		//printArray(x, m);
		//printArray(r, n);

		// do iteration
		while( (cblas_snrm2(m, deltaX, 1) > DIFF_THRES && cblas_snrm2(n, r, 1) > R_THRES && iters < itmax) || iters < 1 ) {
			// compute Jacobian
			jacf(x, J, m, n, adata);

			// store old value
			cblas_scopy(m, x, 1, x0, 1);

			//ofstream fout1("J.txt");
			//print2DArray(J, n, m, fout1);
			//fout1.close();

			//::system("pause");

			// compute JtJ
			cblas_ssyrk (CblasColMajor, CblasUpper, CblasNoTrans, m, n, 1.0, J, m, 0, JtJ, m);

			//ofstream fout("JtJ.txt");
			//print2DArray(JtJ, m, m, fout);
			//fout.close();

			// compute Jtr
			cblas_sgemv (CblasColMajor, CblasNoTrans, m, n, 1.0, J, m, r, 1, 0, deltaX, 1);
			
			// compute deltaX
			LAPACKE_spotrf( LAPACK_COL_MAJOR, 'U', m, JtJ, m );
			LAPACKE_spotrs( LAPACK_COL_MAJOR, 'U', m, 1, JtJ, m, deltaX, m );

			//ofstream fout2("deltaX.txt");
			//printArray(deltaX, m, fout2);
			//fout2.close();

			// update x
			cblas_saxpy(m, -delta, deltaX, 1, x, 1);

			// update residue
			func(x, r, m, n, adata);

			//printArray(x, m);
			//system("pause");
			iters++;
		}

		//::system("pause");

		// delete workspace
		delete[] x0;
		delete[] deltaX;
		delete[] JtJ;

		if( allocateR ){ delete[] r;}
		if( allocateJ ){ delete[] J;}

		return iters;
	}
예제 #10
0
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It compute only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    nrgpu   INTEGER
            Number of GPUs to use.

    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
    
    @param
    dwork   (devices workspaces) REAL array of arrays,
            dimension NRGPU.
            if NRGPU = 1 the dimension of the first workspace
            should be (3*N*N/2+3*N)
            otherwise the NRGPU workspaces should have the size
            ceil((N-N1) * (N-N1) / floor(nrgpu/2)) +
            NB * ((N-N1) + (N-N1) / floor(nrgpu/2))
    
    @param
    stream  (device stream) magma_queue_t array,
            dimension (MagmaMaxGPUs,2)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from slaex1.

    @param[in]
    vl      REAL
    @param[in]
    vu      REAL
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaex3_m(magma_int_t nrgpu,
               magma_int_t k, magma_int_t n, magma_int_t n1, float* d,
               float* Q, magma_int_t ldq, float rho,
               float* dlamda, float* Q2, magma_int_t* indx,
               magma_int_t* ctot, float* w, float* s, magma_int_t* indxq,
               float** dwork, magma_queue_t stream[MagmaMaxGPUs][2],
               magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu,
               magma_int_t* info )
{
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

#define dQ2(id)    (dwork[id])
#define dS(id, ii) (dwork[id] + n2*n2_loc + (ii)*(n2*nb))
#define dQ(id, ii) (dwork[id] + n2*n2_loc +    2*(n2*nb) + (ii)*(n2_loc*nb))

    if (nrgpu == 1) {
        magma_setdevice(0);
        magma_slaex3(k, n, n1, d, Q, ldq, rho,
                     dlamda, Q2, indx, ctot, w, s, indxq,
                     *dwork, range, vl, vu, il, iu, info );
        return MAGMA_SUCCESS;
    }
    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;
    magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu;
    magma_int_t ni_loc[MagmaMaxGPUs];

    magma_int_t i,ind,iq2,j,n12,n2,n23,tmp,lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if (k == 0)
        return MAGMA_SUCCESS;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

//#define CHECK_CPU
#ifdef CHECK_CPU
    float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
    #define hQ2(id) (hwQ2[id])
    #define hS(id, ii) (hwS[ii][id])
    #define hQ(id, ii) (hwQ[ii][id])
#endif
    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    n1_loc = (n1-1) / (nrgpu/2) + 1;
    n2_loc = (n2-1) / (nrgpu/2) + 1;

    nb = magma_get_slaex3_m_nb();

    if (n1 >= magma_get_slaex3_m_k()) {
#ifdef CHECK_CPU
        for (igpu = 0; igpu < nrgpu; ++igpu) {
            magma_smalloc_pinned( &(hwS[0][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwS[1][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwQ2[igpu]), n2*n2_loc );
            magma_smalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb );
            magma_smalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb );
        }
#endif
        for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
            ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu], &n12, Q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
            magma_setdevice(igpu);
            magma_ssetmatrix_async( ni_loc[igpu], n12,
                                    Q2+n1_loc*(igpu/2), n1,
                                    dQ2(igpu),          n1_loc, stream[igpu][0] );
            ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, Q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
            magma_setdevice(igpu+1);
            magma_ssetmatrix_async( ni_loc[igpu+1], n23,
                                    Q2+iq2+n2_loc*(igpu/2), n2,
                                    dQ2(igpu+1),            n2_loc, stream[igpu+1][0] );
        }
    }

    //

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if (*info == 0) {
#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
#pragma omp single
                {
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_snrm2( k, s+id*k, 1);
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return MAGMA_SUCCESS; //??????

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    if (*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_snrm2( k, s, 1);
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP

    // Compute the updated eigenvectors.

    timer_start( time );

    if (rk > 0) {
        if (n1 < magma_get_slaex3_m_k()) {
            // stay on the CPU
            if ( n23 != 0 ) {
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            }
            else
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 != 0 ) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            }
            else
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
        else {
            //use the gpus
            ib = min(nb, rk);
            for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                if (n23 != 0) {
                    magma_setdevice(igpu+1);
                    magma_ssetmatrix_async( n23, ib,
                                            Q(ctot[0],iil-1), ldq,
                                            dS(igpu+1,0),     n23, stream[igpu+1][0] );
                }
                if (n12 != 0) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( n12, ib,
                                            Q(0,iil-1), ldq,
                                            dS(igpu,0), n12, stream[igpu][0] );
                }
            }

            for (i = 0; i < rk; i += nb) {
                ib = min(nb, rk - i);
                ind = (i/nb)%2;
                if (i+nb < rk) {
                    ib2 = min(nb, rk - i - nb);
                    for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                        if (n23 != 0) {
                            magma_setdevice(igpu+1);
                            magma_ssetmatrix_async( n23, ib2,
                                                    Q(ctot[0],iil-1+i+nb), ldq,
                                                    dS(igpu+1,(ind+1)%2),  n23, stream[igpu+1][(ind+1)%2] );
                        }
                        if (n12 != 0) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( n12, ib2,
                                                    Q(0,iil-1+i+nb),    ldq,
                                                    dS(igpu,(ind+1)%2), n12, stream[igpu][(ind+1)%2] );
                        }
                    }
                }

                // Ensure that the data is copied on gpu since we will overwrite it.
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23);
#endif
                        magma_setdevice(igpu+1);
                        magma_queue_sync( stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12);
#endif
                        magma_setdevice(igpu);
                        magma_queue_sync( stream[igpu][ind] );
                    }
                }
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
                                      hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
#endif
                        magma_setdevice(igpu+1);
                        magmablasSetKernelStream(stream[igpu+1][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc,
                                    dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc));
#endif
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
                                      hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
#endif
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc,
                                    dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc));
#endif
                    }
                }
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
                        magma_setdevice(igpu+1);
                        magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
                                          Q(n1+n2_loc*(igpu/2),iil-1+i), ldq );
//                        magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
//                                                Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
                        magma_setdevice(igpu);
                        magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
                                          Q(n1_loc*(igpu/2),iil-1+i), ldq );
//                        magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
//                                                Q(n1_loc*(igpu/2),iil-1+i), ldq, stream[igpu][ind] );
                    }
                }
            }
            for (igpu = 0; igpu < nrgpu; ++igpu) {
#ifdef CHECK_CPU
                magma_free_pinned( hwS[1][igpu] );
                magma_free_pinned( hwS[0][igpu] );
                magma_free_pinned( hwQ2[igpu] );
                magma_free_pinned( hwQ[1][igpu] );
                magma_free_pinned( hwQ[0][igpu] );
#endif
                magma_setdevice(igpu);
                magmablasSetKernelStream(NULL);
                magma_queue_sync( stream[igpu][0] );
                magma_queue_sync( stream[igpu][1] );
            }
            if ( n23 == 0 )
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 == 0 )
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    return MAGMA_SUCCESS;
} /* magma_slaed3_m */
예제 #11
0
파일: Mtdconvert.c 프로젝트: 1014511134/src
int main(int argc, char* argv[])
{
    bool velocity, verb, shape;
    int dim, i, j, n[3], rect[3], it, nt, order, nt0, nx0;
    int iter, niter, iline, nline, cgiter, *f0, *m0=NULL;
    float d[3], o[3], dt0, dx0, ot0, ox0, eps, tol, *p=NULL, *p0=NULL, thres;
    float *vd, *vdt, *vdx, *s, *t0, *x0, *ds, *rhs, *rhs0, *rhs1=NULL, error0, error1, error, scale;
    char key[6];
    sf_file in, out, dix, t_0=NULL, x_0=NULL, f_0=NULL, grad=NULL, cost=NULL, mini=NULL, prec=NULL;

    sf_init(argc,argv);
    in = sf_input("in");
    out = sf_output("out");

    /* read input dimension */
    dim = sf_filedims(in,n);

    nt = 1;
    for (i=0; i < dim; i++) {
	sprintf(key,"d%d",i+1);
	if (!sf_histfloat(in,key,d+i)) sf_error("No %s= in input",key);
	sprintf(key,"o%d",i+1);
	if (!sf_histfloat(in,key,o+i)) o[i]=0.;
	nt *= n[i];
    }
    if (dim < 3) {
	n[2] = 1; d[2] = d[1]; o[2] = o[1];
    }

    /* read initial guess */
    s = sf_floatalloc(nt);
    sf_floatread(s,nt,in);

    if (!sf_getbool("velocity",&velocity)) velocity=true;
    /* y, input is velocity / n, slowness-squared */

    if (velocity) {
	for (it=0; it < nt; it++) {
	    s[it] = 1./s[it]*1./s[it];
	}
    }

    /* read Dix velocity */
    if (NULL == sf_getstring("dix")) sf_error("No Dix input dix=");
    dix = sf_input("dix");

    if(!sf_histint(dix,"n1",&nt0)) sf_error("No n1= in dix");
    if(!sf_histint(dix,"n2",&nx0)) sf_error("No n2= in dix");
    if(!sf_histfloat(dix,"d1",&dt0)) sf_error("No d1= in dix");
    if(!sf_histfloat(dix,"d2",&dx0)) sf_error("No d2= in dix");
    if(!sf_histfloat(dix,"o1",&ot0)) sf_error("No o1= in dix");
    if(!sf_histfloat(dix,"o2",&ox0)) sf_error("No o2= in dix");

    vd = sf_floatalloc(nt0*nx0);
    sf_floatread(vd,nt0*nx0,dix);
    sf_fileclose(dix);

    /* Dix velocity derivative in t0 (2nd order FD) */
    vdt = sf_floatalloc(nt0*nx0);
    for (i=0; i < nt0; i++) {
	for (j=0; j < nx0; j++) {
	    if (i == 0)
		vdt[j*nt0+i] = (-vd[j*nt0+i+2]+4.*vd[j*nt0+i+1]-3.*vd[j*nt0+i])/(2.*dt0);
	    else if (i == nt0-1)
		vdt[j*nt0+i] = (3.*vd[j*nt0+i]-4.*vd[j*nt0+i-1]+vd[j*nt0+i-2])/(2.*dt0);
	    else
		vdt[j*nt0+i] = (vd[j*nt0+i+1]-vd[j*nt0+i-1])/(2.*dt0);
	}
    }

    /* Dix velocity derivative in x0 (2nd order FD) */
    vdx = sf_floatalloc(nt0*nx0);
    for (j=0; j < nx0; j++) {
	for (i=0; i < nt0; i++) {
	    if (j == 0)
		vdx[j*nt0+i] = (-vd[(j+2)*nt0+i]+4.*vd[(j+1)*nt0+i]-3.*vd[j*nt0+i])/(2.*dx0);
	    else if (j == nx0-1)
		vdx[j*nt0+i] = (3.*vd[j*nt0+i]-4.*vd[(j-1)*nt0+i]+vd[(j-2)*nt0+i])/(2.*dx0);
	    else
		vdx[j*nt0+i] = (vd[(j+1)*nt0+i]-vd[(j-1)*nt0+i])/(2.*dx0);
	}
    }

    if (!sf_getint("order",&order)) order=1;
    /* fastmarch accuracy order */

    if (!sf_getfloat("thres",&thres)) thres=10.;
    /* thresholding for caustics */

    if (!sf_getint("niter",&niter)) niter=1;
    /* number of nonlinear updates */

    if (!sf_getint("cgiter",&cgiter)) cgiter=200;
    /* number of CG iterations */

    if (!sf_getbool("shape",&shape)) shape=false;
    /* regularization (default Tikhnov) */

    if (!sf_getfloat("eps",&eps)) eps=0.1;
    /* regularization parameter */

    if (!sf_getint("nline",&nline)) nline=0;
    /* maximum number of line search (default turned-off) */

    if (!sf_getbool("verb",&verb)) verb=false;
    /* verbosity flag */

    if (shape) {
	if (!sf_getfloat("tol",&tol)) tol=1.e-6;
	/* tolerance for shaping regularization */

	for (i=0; i < dim; i++) {
	    sprintf(key,"rect%d",i+1);
	    if (!sf_getint(key,rect+i)) rect[i]=1;
	    /*( rect#=(1,1,...) smoothing radius on #-th axis )*/
	}
	
	/* triangle smoothing operator */
	sf_trianglen_init(dim,rect,n);
	sf_repeat_init(nt,1,sf_trianglen_lop);
	
	sf_conjgrad_init(nt,nt,nt,nt,eps,tol,verb,false);
	p = sf_floatalloc(nt);
    } else {
	/* initialize 2D gradient operator */
	sf_igrad2_init(n[0],n[1]);
    }
    
    /* allocate memory for fastmarch */
    t0 = sf_floatalloc(nt);
    x0 = sf_floatalloc(nt);
    f0 = sf_intalloc(nt);

    /* allocate memory for update */
    ds  = sf_floatalloc(nt);
    rhs = sf_floatalloc(nt);

    /* output transformation matrix */
    if (NULL != sf_getstring("t0")) {
	t_0 = sf_output("t0");
	sf_putint(t_0,"n3",niter+1);
    }
    if (NULL != sf_getstring("x0")) {
	x_0 = sf_output("x0");
	sf_putint(x_0,"n3",niter+1);
    }

    /* output auxiliary label */
    if (NULL != sf_getstring("f0")) {
	f_0 = sf_output("f0");
	sf_settype(f_0,SF_INT);
	sf_putint(f_0,"n3",niter+1);
    }

    /* output gradient */
    if (NULL != sf_getstring("grad")) {
	grad = sf_output("grad");
	sf_putint(grad,"n3",niter);
    }

    /* output cost */
    if (NULL != sf_getstring("cost")) {
	cost = sf_output("cost");
	sf_putint(cost,"n3",niter+1);
    }

    /* read mask (desired minimum) */
    m0 = sf_intalloc(nt);

    if (NULL != sf_getstring("mask")) {
	mini = sf_input("mask");
	sf_intread(m0,nt,mini);
	sf_fileclose(mini);
    } else {
	for (it=0; it < nt; it++) m0[it] = -1;
    }

    /* read cost (desired minimum) */
    rhs0 = sf_floatalloc(nt);

    if (NULL != sf_getstring("mval")) {
	mini = sf_input("mval");
	sf_floatread(rhs0,nt,mini);
	sf_fileclose(mini);
    } else {
	for (it=0; it < nt; it++) rhs0[it] = 0.;
    }

    /* read preconditioner */
    if (NULL != sf_getstring("prec")) {
	prec = sf_input("prec");
	p0 = sf_floatalloc(nt);
	sf_floatread(p0,nt,prec);
	sf_fileclose(prec);

	rhs1 = sf_floatalloc(nt);
    }

    /* fastmarch initialization */
    fastmarch_init(n,o,d,order);

    /* update initialization */
    t2d_init(dim,n,d,nt0,dt0,ot0,nx0,dx0,ox0);

    /* fastmarch */
    fastmarch(t0,x0,f0,s);

    /* caustic region (2D) */
    t2d_caustic(x0,f0,n,d,thres);

    /* set up operator */
    t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0);

    /* evaluate cost */
    t2d_cost(rhs);

    for (it=0; it < nt; it++) {
	if (f0[it] >= 0 || m0[it] >= 0)
	    rhs[it] = 0.;
	else
	    rhs[it] -= rhs0[it];
    }

    if (p0 == NULL) {
	error0 = error1 = cblas_snrm2(nt,rhs,1);
    } else {
	for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it];
	error0 = error1 = cblas_snrm2(nt,rhs1,1);
    }

    /* write optional outputs */    
    if (NULL!=t_0)  sf_floatwrite(t0,nt,t_0);
    if (NULL!=x_0)  sf_floatwrite(x0,nt,x_0);
    if (NULL!=f_0)  sf_intwrite(f0,nt,f_0);
    if (NULL!=cost) sf_floatwrite(rhs,nt,cost);

    sf_warning("Start conversion, cost %g",1.);

    /* nonlinear loop */
    for (iter=0; iter < niter; iter++) {
	
	/* solve ds */
	if (shape) {
	    if (p0 == NULL)
		sf_conjgrad(NULL,t2d_oper,sf_repeat_lop,p,ds,rhs,cgiter);
	    else
		sf_conjgrad(t2d_prec,t2d_oper,sf_repeat_lop,p,ds,rhs,cgiter);
	} else {
	    sf_solver_reg(t2d_oper,sf_cgstep,sf_igrad2_lop,2*nt,nt,nt,ds,rhs,cgiter,eps,"verb",verb,"end");
	    sf_cgstep_close();
	}

	/* add ds */
	for (it=0; it < nt; it++) {
	    s[it] = s[it]+ds[it]+0.25*ds[it]*ds[it]/s[it];
	}

	/* fastmarch */
	fastmarch(t0,x0,f0,s);

	/* caustic region (2D) */
	t2d_caustic(x0,f0,n,d,thres);

	/* set up operator */
	t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0);

	/* evaluate cost */
	t2d_cost(rhs);

	for (it=0; it < nt; it++) {
	    if (f0[it] >= 0 || m0[it] >= 0)
		rhs[it] = 0.;
	    else
		rhs[it] -= rhs0[it];
	}
	
	if (p0 == NULL) {
	    error = cblas_snrm2(nt,rhs,1);
	} else {
	    for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it];
	    error = cblas_snrm2(nt,rhs1,1);
	}

	error = cblas_snrm2(nt,rhs,1);

	/* line search */
	if (nline > 0 && error >= error1) {

	    scale = 0.5;
	    for (iline=0; iline < nline; iline++) {

		for (it=0; it < nt; it++) {
		    s[it] = s[it]+(scale*ds[it])+0.25*(scale*ds[it])*(scale*ds[it])/s[it];
		}
		
		fastmarch(t0,x0,f0,s);
		t2d_caustic(x0,f0,n,d,thres);

		t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0);
		t2d_cost(rhs);

		for (it=0; it < nt; it++) {
		    if (f0[it] >= 0 || m0[it] >= 0)
			rhs[it] = 0.;
		    else
			rhs[it] -= rhs0[it];
		}
		
		if (p0 == NULL) {
		    error = cblas_snrm2(nt,rhs,1);
		} else {
		    for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it];
		    error = cblas_snrm2(nt,rhs1,1);
		}
		
		error = cblas_snrm2(nt,rhs,1);
		if (error < error1) {
		    sf_warning("Exist line search %d of %d",iline+1,nline);
		} else {
		    scale *= 0.5;
		}
	    }
	}

	error1 = error;

	/* write optional outputs */
	if (NULL!=t_0)  sf_floatwrite(t0,nt,t_0);
	if (NULL!=x_0)  sf_floatwrite(x0,nt,x_0);
	if (NULL!=f_0)  sf_intwrite(f0,nt,f_0);
	if (NULL!=cost) sf_floatwrite(rhs,nt,cost);
	if (NULL!=grad) sf_floatwrite(ds,nt,grad);

	sf_warning("Cost after iteration %d: %g",iter+1,error/error0);
    }

    /* write output */
    if (velocity) {
	for (it=0; it < nt; it++) {
	    s[it] = 1./sqrtf(s[it]);
	}
    }

    sf_floatwrite(s,nt,out);

    exit(0);
}
예제 #12
0
파일: gfortblas.c 프로젝트: BigCrunsh/julia
float snrm2_(int *N, float *X, int *INCX) {
    return cblas_snrm2(*N, X, *INCX);
}
예제 #13
0
파일: Mfatomo.c 프로젝트: 1014511134/src
int main(int argc, char* argv[])
{
    bool adj, velocity, l1norm, plane[3], verb;
    int dim, i, count, n[SF_MAX_DIM], it, nt, **m, nrhs, is, nshot=1, *flag, order, iter, niter, stiter, *k, nfreq, nmem;
    float o[SF_MAX_DIM], d[SF_MAX_DIM], **t, *t0, *s, *temps, **source, *rhs, *ds;
    float rhsnorm, rhsnorm0, rhsnorm1, rate, eps, gama;
    char key[4], *what;
    sf_file sinp, sout, shot, time, reco, rece, topo, grad, norm;
    sf_weight weight=NULL;

    sf_init(argc,argv);
    sinp = sf_input("in");
    sout = sf_output("out");

    if (NULL == (what = sf_getstring("what"))) what="tomo";
    /* what to compute (default tomography) */

    switch (what[0]) {
	case 'l': /* linear operator */

	    if (NULL == sf_getstring("time"))
		sf_error("Need time=");
	    time = sf_input("time");

	    /* read operator dimension from time table */
	    dim = sf_filedims(time,n);
	    
	    nt = 1;
	    for (i=0; i < 3; i++) {
		sprintf(key,"d%d",i+1);
		if (!sf_histfloat(time,key,d+i)) sf_error("No %s= in input",key);
		sprintf(key,"o%d",i+1);
		if (!sf_histfloat(time,key,o+i)) o[i]=0.;
		nt *= n[i]; plane[i] = false;
	    }
	    if (dim < 3) {
		n[2] = 1; o[2] = o[1]; d[2] = d[1]; plane[2] = false;
	    }

	    dim = 2;

	    /* read in shot file */
	    if (NULL == sf_getstring("shot"))
		sf_error("Need source shot=");
	    shot = sf_input("shot");
	    
	    if (!sf_histint(shot,"n2",&nshot)) nshot=1;
	    sf_fileclose(shot);

	    /* read in receiver file */
	    m = sf_intalloc2(nt,nshot);
	    if (NULL == sf_getstring("receiver")) {
		for (is=0; is < nshot; is++) {
		    for (it=0; it < nt; it++) {
			m[is][it] = 1;
		    }
		}
	    } else {
		rece = sf_input("receiver");
		sf_intread(m[0],nt*nshot,rece);
		sf_fileclose(rece);
	    }

	    /* number of right-hand side */
	    nrhs = 0;
	    for (is=0; is < nshot; is++) {
		for (it=0; it < nt; it++) {
		    if (m[is][it] == 1) nrhs++;
		}
	    }
	    rhs = sf_floatalloc(nrhs);

	    t = sf_floatalloc2(nt,nshot);
	    sf_floatread(t[0],nt*nshot,time);

	    if (!sf_getbool("adj",&adj)) adj=false;
	    /* adjoint flag (for what=linear) */

	    /* initialize fatomo */
	    fatomo_init(dim,n,d,nshot);

	    /* set operators */
	    fatomo_set(t,m);

	    t0 = sf_floatalloc(nt);

	    if (adj) {
		sf_floatread(rhs,nrhs,sinp);

		fatomo_lop(true,false,nt,nrhs,t0,rhs);

		sf_putint(sout,"n1",nt);
		sf_putint(sout,"n2",1);
		sf_putint(sout,"n3",1);
		sf_floatwrite(t0,nt,sout);
	    } else {
		sf_floatread(t0,nt,sinp);

		fatomo_lop(false,false,nt,nrhs,t0,rhs);
		
		sf_putint(sout,"n1",nrhs);
		sf_putint(sout,"n2",1);
		sf_putint(sout,"n3",1);
		sf_floatwrite(rhs,nrhs,sout);
	    }

	    break;
	    
	case 't': /* tomography */

	    /* read input dimension */
	    dim = sf_filedims(sinp,n);
	    
	    nt = 1;
	    for (i=0; i < dim; i++) {
		sprintf(key,"d%d",i+1);
		if (!sf_histfloat(sinp,key,d+i)) sf_error("No %s= in input",key);
		sprintf(key,"o%d",i+1);
		if (!sf_histfloat(sinp,key,o+i)) o[i]=0.;
		nt *= n[i]; plane[i] = false;
	    }
	    if (dim < 3) {
		n[2] = 1; o[2] = o[1]; d[2] = d[1]; plane[2] = false;
	    }
	    
	    /* read initial guess */
	    s = sf_floatalloc(nt);
	    sf_floatread(s,nt,sinp);
	    
	    if (!sf_getbool("velocity",&velocity)) velocity=true;
	    /* if y, the input is velocity; n, slowness squared */

	    if (velocity) {
		for (it=0; it < nt; it++) {
		    s[it] = 1./s[it]*1./s[it];
		}
	    }

	    /* allocate memory for temporary data */
	    ds    = sf_floatalloc(nt);
	    flag  = sf_intalloc(nt);
	    
	    temps = sf_floatalloc(nt);
	    for (it=0; it < nt; it++) {
		temps[it] = s[it];
	    }
	    
	    if (!sf_getbool("l1norm",&l1norm)) l1norm=false;
	    /* norm for minimization (default L2 norm) */

	    if (!sf_getbool("verb",&verb)) verb=false;
	    /* verbosity flag */

	    /* read in shot file */
	    if (NULL == sf_getstring("shot"))
		sf_error("Need source shot=");
	    shot = sf_input("shot");

	    if (!sf_histint(shot,"n2",&nshot)) nshot=1;

	    source = sf_floatalloc2(3,nshot);
	    sf_floatread(source[0],3*nshot,shot);
	    sf_fileclose(shot);

	    /* allocate memory for time table */
	    t = sf_floatalloc2(nt,nshot);

	    /* read in receiver file */
	    m = sf_intalloc2(nt,nshot);
	    if (NULL == sf_getstring("receiver")) {
		for (is=0; is < nshot; is++) {
		    for (it=0; it < nt; it++) {
			m[is][it] = 1;
		    }
		}
	    } else {
		rece = sf_input("receiver");
		sf_intread(m[0],nt*nshot,rece);
		sf_fileclose(rece);
	    }
	    
	    /* number of right-hand side */
	    nrhs = 0;
	    for (is=0; is < nshot; is++) {
		for (it=0; it < nt; it++) {
		    if (m[is][it] == 1) nrhs++;
		}
	    }
	    rhs = sf_floatalloc(nrhs);
	    
	    /* read in record file */
	    if (NULL == sf_getstring("record"))
		sf_error("Need data record=");
	    reco = sf_input("record");

	    t0 = sf_floatalloc(nrhs);
	    sf_floatread(t0,nrhs,reco);
	    sf_fileclose(reco);

	    /* read in topography file */
	    if (NULL != sf_getstring("topo")) {
		topo = sf_input("topo");
		k = sf_intalloc(nt);
		sf_intread(k,nt,topo);
		sf_fileclose(topo);
	    } else {
		k = NULL;
	    }
	    
	    if (!sf_getint("order",&order)) order=2;
	    /* fast marching accuracy order */

	    if (!sf_getint("niter",&niter)) niter=10;
	    /* number of slowness inversion iterations */

	    if (!sf_getint("stiter",&stiter)) stiter=200;
	    /* number of step iterations */

	    if (!sf_getfloat("eps",&eps)) eps=0.;
	    /* regularization parameter */

	    /* output gradient at each iteration */
	    if (NULL != sf_getstring("gradient")) {
		grad = sf_output("gradient");
		sf_putint(grad,"n3",n[2]);
		sf_putfloat(grad,"d3",d[2]);
		sf_putfloat(grad,"o3",o[2]);
		sf_putint(grad,"n4",niter);
	    } else {
		grad = NULL;
	    }

	    /* output misfit L2 norm at each iteration */
	    if (NULL != sf_getstring("misnorm")) {
		norm = sf_output("misnorm");
		sf_putint(norm,"n1",niter+1);
		sf_putfloat(norm,"d1",1.);
		sf_putfloat(norm,"o1",0.);
		sf_putint(norm,"n2",1);
		sf_putint(norm,"n3",1);
	    } else {
		norm = NULL;
	    }

	    /* initialize fatomo */
	    fatomo_init(dim,n,d,nshot);

	    /* initialize 2D gradient operator */
	    sf_igrad2_init(n[0],n[1]);

	    if (l1norm) {
		/*
		if (!sf_getfloat("perc",&perc)) perc=90.;

		l1_init(nt,stiter,perc,false);
		*/
		if (!sf_getint("nfreq",&nfreq)) nfreq=1;
		/* l1-norm weighting nfreq */

		if (!sf_getint("nmem",&nmem)) nmem=1;
		/* l1-norm weighting nmem */

		weight = sf_l1;
		sf_irls_init(nt);
	    }

	    /* initial misfit */
	    fastmarch_init(n[2],n[1],n[0]);
	    
	    i = 0;
	    for (is=0; is < nshot; is++) {
		fastmarch(t[is],s,flag,plane,
			  n[2],n[1],n[0],o[2],o[1],o[0],d[2],d[1],d[0],
			  source[is][2],source[is][1],source[is][0],1,1,1,order);

		for (it=0; it < nt; it++) {
		    if (m[is][it] == 1) {
			rhs[i] = t0[i]-t[is][it];
			i++;
		    }
		}
	    }
	    
	    fastmarch_close();
	    
	    /* calculate L2 data-misfit */
	    rhsnorm0 = cblas_snrm2(nrhs,rhs,1);
	    rhsnorm = rhsnorm0;
	    rhsnorm1 = rhsnorm;
	    rate = rhsnorm1/rhsnorm0;

	    if (l1norm)
		sf_warning("L1 misfit after iteration 0 of %d: %g",niter,rate);
	    else
		sf_warning("L2 misfit after iteration 0 of %d: %g",niter,rate);

	    if (norm != NULL) sf_floatwrite(&rate,1,norm);

	    /* iterations over inversion */
	    for (iter=0; iter < niter; iter++) {
		
		/* clean-up */
		for (it=0; it < nt; it++) {
		    ds[it] = 0.;
		}

		/* prepare for CG */
		fatomo_set(t,m);

		/* solve ds */
		if (l1norm) {
		    /*
		      sf_solver_reg(fatomo_lop,l1step,sf_igrad2_lop,2*nt, nt,nrhs,ds,rhs,stiter,eps,"verb",verb,"end");
		    */
		    sf_solver_reg(fatomo_lop,sf_cgstep,sf_igrad2_lop,2*nt,nt,nrhs,ds,rhs,stiter,eps,"wght",weight,"nfreq",nfreq,"nmem",nmem,"verb",verb,"end");
		    
		    /*
		      l1step_close();
		    */
		    
		    sf_cgstep_close();
		} else {
		    sf_solver_reg(fatomo_lop,sf_cgstep,sf_igrad2_lop,2*nt,nt,nrhs,ds,rhs,stiter,eps,"verb",verb,"end");
			
		    sf_cgstep_close();
		}
		
		/* line search */
		gama = 1.;
		for (count=0; count < 10; count++) {
		    
		    /* update slowness */
		    for (it=0; it < nt; it++) {
			if (k == NULL || k[it] != 1)
			    temps[it] = (s[it]+gama*ds[it])*(s[it]+gama*ds[it])/s[it];
		    }

		    /* forward fast-marching for stencil time */
		    fastmarch_init(n[2],n[1],n[0]);
		    
		    i = 0;
		    for (is=0; is < nshot; is++) {
			fastmarch(t[is],temps,flag,plane,
				  n[2],n[1],n[0],o[2],o[1],o[0],d[2],d[1],d[0],
				  source[is][2],source[is][1],source[is][0],1,1,1,order);
			
			for (it=0; it < nt; it++) {
			    if (m[is][it] == 1) {
				rhs[i] = t0[i]-t[is][it];
				i++;
			    }
			}
		    }
		    
		    fastmarch_close();
		    
		    rhsnorm = cblas_snrm2(nrhs,rhs,1);
		    rate = rhsnorm/rhsnorm1;
		    
		    if (rate < 1.) {
			for (it=0; it < nt; it++) {
			    s[it] = temps[it];
			}
			rhsnorm1 = rhsnorm;
			rate = rhsnorm1/rhsnorm0;
			break;
		    }
		    
		    gama *= 0.5;
		}
		
		if (count == 10) {
		    sf_warning("Line-search Failure. Iteration terminated at %d of %d.",iter+1,niter);
		    sf_warning("Dimensions for GRAD and NORM need to be fixed before read.");
		    break;
		}

		if (l1norm)
		    sf_warning("L1 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count);
		else
		    sf_warning("L2 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count);
		
		if (grad != NULL) sf_floatwrite(ds,nt,grad);
		if (norm != NULL) sf_floatwrite(&rate,1,norm);
	    }
	    
	    /* convert to velocity */
	    if (velocity) {
		for (it=0; it < nt; it++) {
		    s[it] = 1./sqrtf(s[it]);
		}
	    }
	    
	    sf_floatwrite(s,nt,sout);

       	    break;
    }
    
    exit(0);
}
예제 #14
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeev
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gpu_time, cpu_time;
    float *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2;
    float *w1i, *w2i;
    magmaFloatComplex *w1copy, *w2copy;
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    float tnrm, result[9];
    magma_int_t N, n2, lda, nb, lwork, info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    float ulp, ulpinv, error;
    magma_int_t status = 0;
    
    ulp = lapackf77_slamch( "P" );
    ulpinv = 1./ulp;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    // need slightly looser bound (60*eps instead of 30*eps) for some tests
    opts.tolerance = max( 60., opts.tolerance );
    float tol    = opts.tolerance * lapackf77_slamch("E");
    float tolulp = opts.tolerance * lapackf77_slamch("P");
    
    // enable at least some minimal checks, if requested
    if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) {
        fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n"
                "      set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n"
                "      Some checks require running lapack (-l); setting lapack.\n\n");
        opts.lapack = true;
    }
    
    printf("    N   CPU Time (sec)   GPU Time (sec)   |W_magma - W_lapack| / |W_lapack|\n");
    printf("===========================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[i];
            lda   = N;
            n2    = lda*N;
            nb    = magma_get_sgehrd_nb(N);
            lwork = N*(2 + nb);
            // generous workspace - required by sget22
            lwork = max( lwork, N*(5 + 2*N) );
            
            TESTING_MALLOC_CPU( w1copy, magmaFloatComplex, N );
            TESTING_MALLOC_CPU( w2copy, magmaFloatComplex, N );
            TESTING_MALLOC_CPU( w1,  float, N  );
            TESTING_MALLOC_CPU( w2,  float, N  );
            TESTING_MALLOC_CPU( w1i, float, N  );
            TESTING_MALLOC_CPU( w2i, float, N  );
            TESTING_MALLOC_CPU( h_A, float, n2 );
            
            TESTING_MALLOC_PIN( h_R, float, n2 );
            TESTING_MALLOC_PIN( VL,  float, n2 );
            TESTING_MALLOC_PIN( VR,  float, n2 );
            TESTING_MALLOC_PIN( h_work, float, lwork );
            
            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &n2, h_A );
            lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_sgeev( opts.jobvl, opts.jobvr,
                         N, h_R, lda, w1, w1i,
                         VL, lda, VR, lda,
                         h_work, lwork, &info );
            gpu_time = magma_wtime() - gpu_time;
            if (info != 0)
                printf("magma_sgeev returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                /* ===================================================================
                 * Check the result following LAPACK's [zcds]drvev routine.
                 * The following tests are performed:
                 * (1)   | A * VR - VR * W | / ( n |A| )
                 *
                 *       Here VR is the matrix of unit right eigenvectors.
                 *       W is a diagonal matrix with diagonal entries W(j).
                 *
                 * (2)   | |VR(i)| - 1 |   and whether largest component real
                 *
                 *       VR(i) denotes the i-th column of VR.
                 *
                 * (3)   | A**T * VL - VL * W**T | / ( n |A| )
                 *
                 *       Here VL is the matrix of unit left eigenvectors, A**T is the
                 *       transpose of A, and W is as above.
                 *
                 * (4)   | |VL(i)| - 1 |   and whether largest component real
                 *
                 *       VL(i) denotes the i-th column of VL.
                 *
                 * (5)   W(full) = W(partial, W only) -- currently skipped
                 * (6)   W(full) = W(partial, W and VR)
                 * (7)   W(full) = W(partial, W and VL)
                 *
                 *       W(full) denotes the eigenvalues computed when both VR and VL
                 *       are also computed, and W(partial) denotes the eigenvalues
                 *       computed when only W, only W and VR, or only W and VL are
                 *       computed.
                 *
                 * (8)   VR(full) = VR(partial, W and VR)
                 *
                 *       VR(full) denotes the right eigenvectors computed when both VR
                 *       and VL are computed, and VR(partial) denotes the result
                 *       when only VR is computed.
                 *
                 * (9)   VL(full) = VL(partial, W and VL)
                 *
                 *       VL(full) denotes the left eigenvectors computed when both VR
                 *       and VL are also computed, and VL(partial) denotes the result
                 *       when only VL is computed.
                 *
                 * (1, 2) only if jobvr = V
                 * (3, 4) only if jobvl = V
                 * (5-9)  only if check = 2 (option -c2)
                 ================================================================= */
                float vmx, vrmx, vtst;
                
                // Initialize result. -1 indicates test was not run.
                for( int j = 0; j < 9; ++j )
                    result[j] = -1.;
                
                if ( opts.jobvr == MagmaVec ) {
                    // Do test 1: | A * VR - VR * W | / ( n |A| )
                    // Note this writes result[1] also
                    lapackf77_sget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr,
                                      &N, h_A, &lda, VR, &lda, w1, w1i,
                                      h_work, &result[0] );
                    result[0] *= ulp;
                    
                    // Do test 2: | |VR(i)| - 1 |   and whether largest component real
                    result[1] = -1.;
                    for( int j = 0; j < N; ++j ) {
                        tnrm = 1.;
                        if (w1i[j] == 0.)
                            tnrm = cblas_snrm2(N, &VR[j*lda], ione);
                        else if (w1i[j] > 0.)
                            tnrm = magma_slapy2( cblas_snrm2(N, &VR[j    *lda], ione),
                                                 cblas_snrm2(N, &VR[(j+1)*lda], ione) );
                        
                        result[1] = max( result[1], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp ));
                        
                        if (w1i[j] > 0.) {
                            vmx  = vrmx = 0.;
                            for( int jj = 0; jj < N; ++jj ) {
                                vtst = magma_slapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]);
                                if (vtst > vmx)
                                    vmx = vtst;
                                
                                if ( (VR[jj + (j+1)*lda])==0. &&
                                     MAGMA_S_ABS( VR[jj+j*lda] ) > vrmx)
                                {
                                    vrmx = MAGMA_S_ABS( VR[jj+j*lda] );
                                }
                            }
                            if (vrmx / vmx < 1. - ulp*2.)
                                result[1] = ulpinv;
                        }
                    }
                    result[1] *= ulp;
                }
                
                if ( opts.jobvl == MagmaVec ) {
                    // Do test 3: | A**T * VL - VL * W**T | / ( n |A| )
                    // Note this writes result[3] also
                    lapackf77_sget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr,
                                      &N, h_A, &lda, VL, &lda, w1, w1i,
                                      h_work, &result[2] );
                    result[2] *= ulp;
                
                    // Do test 4: | |VL(i)| - 1 |   and whether largest component real
                    result[3] = -1.;
                    for( int j = 0; j < N; ++j ) {
                        tnrm = 1.;
                        if (w1i[j] == 0.)
                            tnrm = cblas_snrm2(N, &VL[j*lda], ione);
                        else if (w1i[j] > 0.)
                            tnrm = magma_slapy2( cblas_snrm2(N, &VL[j    *lda], ione),
                                                 cblas_snrm2(N, &VL[(j+1)*lda], ione) );
                        
                        result[3] = max( result[3], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp ));
                        
                        if (w1i[j] > 0.) {
                            vmx  = vrmx = 0.;
                            for( int jj = 0; jj < N; ++jj ) {
                                vtst = magma_slapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]);
                                if (vtst > vmx)
                                    vmx = vtst;
                                
                                if ( (VL[jj + (j+1)*lda])==0. &&
                                     MAGMA_S_ABS( VL[jj+j*lda]) > vrmx)
                                {
                                    vrmx = MAGMA_S_ABS( VL[jj+j*lda] );
                                }
                            }
                            if (vrmx / vmx < 1. - ulp*2.)
                                result[3] = ulpinv;
                        }
                    }
                    result[3] *= ulp;
                }
            }
            if ( opts.check == 2 ) {
                // more extensive tests
                // this is really slow because it calls magma_zgeev multiple times
                float *LRE, DUM;
                TESTING_MALLOC_PIN( LRE, float, n2 );
                
                lapackf77_slarnv( &ione, ISEED, &n2, h_A );
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                
                // ----------
                // Compute eigenvalues, left and right eigenvectors
                magma_sgeev( MagmaVec, MagmaVec,
                             N, h_R, lda, w1, w1i,
                             VL, lda, VR, lda,
                             h_work, lwork, &info );
                if (info != 0)
                    printf("magma_zgeev (case V, V) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                // ----------
                // Compute eigenvalues only
                // These are not exactly equal, and not in the same order, so skip for now.
                //lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                //magma_sgeev( MagmaNoVec, MagmaNoVec,
                //             N, h_R, lda, w2, w2i,
                //             &DUM, 1, &DUM, 1,
                //             h_work, lwork, &info );
                //if (info != 0)
                //    printf("magma_sgeev (case N, N) returned error %d: %s.\n",
                //           (int) info, magma_strerror( info ));
                //
                //// Do test 5: W(full) = W(partial, W only)
                //result[4] = 1;
                //for( int j = 0; j < N; ++j )
                //    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                //        result[4] = 0;
                
                // ----------
                // Compute eigenvalues and right eigenvectors
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                magma_sgeev( MagmaNoVec, MagmaVec,
                             N, h_R, lda, w2, w2i,
                             &DUM, 1, LRE, lda,
                             h_work, lwork, &info );
                if (info != 0)
                    printf("magma_sgeev (case N, V) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                // Do test 6: W(full) = W(partial, W and VR)
                result[5] = 1;
                for( int j = 0; j < N; ++j )
                    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                        result[5] = 0;
                
                // Do test 8: VR(full) = VR(partial, W and VR)
                result[7] = 1;
                for( int j = 0; j < N; ++j )
                    for( int jj = 0; jj < N; ++jj )
                        if ( ! MAGMA_S_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] ))
                            result[7] = 0;
                
                // ----------
                // Compute eigenvalues and left eigenvectors
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                magma_sgeev( MagmaVec, MagmaNoVec,
                             N, h_R, lda, w2, w2i,
                             LRE, lda, &DUM, 1,
                             h_work, lwork, &info );
                if (info != 0)
                    printf("magma_sgeev (case V, N) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                // Do test 7: W(full) = W(partial, W and VL)
                result[6] = 1;
                for( int j = 0; j < N; ++j )
                    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                        result[6] = 0;
                
                // Do test 9: VL(full) = VL(partial, W and VL)
                result[8] = 1;
                for( int j = 0; j < N; ++j )
                    for( int jj = 0; jj < N; ++jj )
                        if ( ! MAGMA_S_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] ))
                            result[8] = 0;
                
                TESTING_FREE_PIN( LRE );
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               Do this after checks, because it overwrites VL and VR.
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_sgeev( &opts.jobvl, &opts.jobvr,
                                 &N, h_A, &lda, w2, w2i,
                                 VL, &lda, VR, &lda,
                                 h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                if (info != 0)
                    printf("lapackf77_sgeev returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                // check | W_magma - W_lapack | / | W |
                // need to sort eigenvalues first
                // copy them into complex vectors for ease
                for( int j=0; j < N; ++j ) {
                    w1copy[j] = MAGMA_C_MAKE( w1[j], w1i[j] );
                    w2copy[j] = MAGMA_C_MAKE( w2[j], w2i[j] );
                }
                std::sort( w1copy, &w1copy[N], compare );
                std::sort( w2copy, &w2copy[N], compare );
                
                // adjust sorting to deal with numerical inaccuracy
                // search down w2 for eigenvalue that matches w1's eigenvalue
                for( int j=0; j < N; ++j ) {
                    for( int j2=j; j2 < N; ++j2 ) {
                        magmaFloatComplex diff = MAGMA_C_SUB( w1copy[j], w2copy[j2] );
                        float diff2 = magma_szlapy2( diff ) / max( magma_szlapy2( w1copy[j] ), tol );
                        if ( diff2 < 100*tol ) {
                            if ( j != j2 ) {
                                std::swap( w2copy[j], w2copy[j2] );
                            }
                            break;
                        }
                    }
                }
                
                blasf77_caxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione );
                error  = cblas_scnrm2( N, w1copy, 1 );
                error /= cblas_scnrm2( N, w2copy, 1 );
                
                printf("%5d   %7.2f          %7.2f          %.2e %s\n",
                       (int) N, cpu_time, gpu_time,
                       error, (error < tolulp ? "  ok" : "  failed"));
                status |= ! (error < tolulp);
            }
            else {
                printf("%5d     ---            %7.2f\n",
                       (int) N, gpu_time);
            }
            if ( opts.check ) {
                // -1 indicates test was not run
                if ( result[0] != -1 ) { printf("        | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? "  ok" : "  failed")); }
                if ( result[1] != -1 ) { printf("        |  |VR(i)| - 1    |             = %8.2e %s\n", result[1], (result[1] < tol ? "  ok" : "  failed")); }
                if ( result[2] != -1 ) { printf("        | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? "  ok" : "  failed")); }
                if ( result[3] != -1 ) { printf("        |  |VL(i)| - 1    |             = %8.2e %s\n", result[3], (result[3] < tol ? "  ok" : "  failed")); }
                if ( result[4] != -1 ) { printf("        W  (full) == W  (partial, W only)          %s\n",         (result[4] == 1. ? "  ok" : "  failed")); }
                if ( result[5] != -1 ) { printf("        W  (full) == W  (partial, W and VR)        %s\n",         (result[5] == 1. ? "  ok" : "  failed")); }
                if ( result[6] != -1 ) { printf("        W  (full) == W  (partial, W and VL)        %s\n",         (result[6] == 1. ? "  ok" : "  failed")); }
                if ( result[7] != -1 ) { printf("        VR (full) == VR (partial, W and VR)        %s\n",         (result[7] == 1. ? "  ok" : "  failed")); }
                if ( result[8] != -1 ) { printf("        VL (full) == VL (partial, W and VL)        %s\n",         (result[8] == 1. ? "  ok" : "  failed")); }
                
                int newline = 0;
                if ( result[0] != -1 ) { status |= ! (result[0] < tol);  newline = 1; }
                if ( result[1] != -1 ) { status |= ! (result[1] < tol);  newline = 1; }
                if ( result[2] != -1 ) { status |= ! (result[2] < tol);  newline = 1; }
                if ( result[3] != -1 ) { status |= ! (result[3] < tol);  newline = 1; }
                if ( result[4] != -1 ) { status |= ! (result[4] == 1.);  newline = 1; }
                if ( result[5] != -1 ) { status |= ! (result[5] == 1.);  newline = 1; }
                if ( result[6] != -1 ) { status |= ! (result[6] == 1.);  newline = 1; }
                if ( result[7] != -1 ) { status |= ! (result[7] == 1.);  newline = 1; }
                if ( result[8] != -1 ) { status |= ! (result[8] == 1.);  newline = 1; }
                if ( newline ) {
                    printf( "\n" );
                }
            }
            
            TESTING_FREE_CPU( w1copy );
            TESTING_FREE_CPU( w2copy );
            TESTING_FREE_CPU( w1  );
            TESTING_FREE_CPU( w2  );
            TESTING_FREE_CPU( w1i );
            TESTING_FREE_CPU( w2i );
            TESTING_FREE_CPU( h_A );
            
            TESTING_FREE_PIN( h_R );
            TESTING_FREE_PIN( VL  );
            TESTING_FREE_PIN( VR  );
            TESTING_FREE_PIN( h_work );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
예제 #15
0
//
// Overloaded function for dispatching to
// * CBLAS backend, and
// * float value-type.
//
inline float nrm2( const int n, const float* x, const int incx ) {
    return cblas_snrm2( n, x, incx );
}
예제 #16
0
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It compute only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.
            TODO what is LDQ2?

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (workspace) REAL array, dimension (3*N*N/2+3*N)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from slaex1.

    @param[in]
    vl      REAL
    @param[in]
    vu      REAL
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d,
             float* Q, magma_int_t ldq, float rho,
             float* dlamda, float* Q2, magma_int_t* indx,
             magma_int_t* ctot, float* w, float* s, magma_int_t* indxq,
             float* dwork,
             magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu,
             magma_int_t* info )
{
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;

    float* dq2= dwork;
    float* ds = dq2  + n*(n/2+1);
    float* dq = ds   + n*(n/2+1);
    magma_int_t lddq = n/2 + 1;

    magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if (k == 0)
        return MAGMA_SUCCESS;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    magma_ssetvector_async( lq2, Q2, 1, dq2, 1, NULL );

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if (*info == 0) {
#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
#pragma omp single
                {
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_snrm2( k, s+id*k, 1);
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return MAGMA_SUCCESS; //??????

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    if (*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_snrm2( k, s, 1);
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP
    // Compute the updated eigenvectors.

    timer_start( time );
    magma_queue_sync( NULL );

    if (rk != 0) {
        if ( n23 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            } else {
                magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq);
                magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq );
            }
        } else
            lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

        if ( n12 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            } else {
                magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq);
                magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq );
            }
        } else
            lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    return MAGMA_SUCCESS;
} /* magma_slaex3 */
예제 #17
0
extern "C" magma_int_t
magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d,
             float* q, magma_int_t ldq, float rho,
             float* dlamda, float* q2, magma_int_t* indx,
             magma_int_t* ctot, float* w, float* s, magma_int_t* indxq,
             float* dwork,
             char range, float vl, float vu, magma_int_t il, magma_int_t iu,
             magma_int_t* info )
{
/*
    Purpose
    =======
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It compute only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    K       (input) INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    N       (input) INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N>K).

    N1      (input) INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    D       (output) REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    Q       (output) REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    LDQ     (input) INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    RHO     (input) REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    DLAMDA  (input/output) REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    Q2      (input) REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    INDX    (input) INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    CTOT    (input) INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    W       (input/output) REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    S       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    INDXQ   (output) INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    DWORK   (device workspace) REAL array, dimension (3*N*N/2+3*N)

    INFO    (output) INTEGER
            = 0:  successful exit.
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ===============
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    ===================================================================== */

    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;
    char range_[] = {range, 0};

    magma_int_t iil, iiu, rk;

    float* dq2= dwork;
    float* ds = dq2  + n*(n/2+1);
    float* dq = ds   + n*(n/2+1);
    magma_int_t lddq = n/2 + 1;

    magma_int_t i,iq2,j,n12,n2,n23,tmp,lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");

    *info = 0;

    if(k < 0)
        *info=-1;
    else if(n < k)
        *info=-2;
    else if(ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if(*info != 0){
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if(k == 0)
        return MAGMA_SUCCESS;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    magma_ssetvector_async( lq2, q2, 1, dq2, 1, NULL );

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for(i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for(j = ib; j < ie; ++j){
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if(iinfo != 0){
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if(*info == 0){

#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2){
#pragma omp single
                {
                    for(j = 0; j < k; ++j){
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }

            }
            else if(k != 1){

                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for(j = 0; j < k; ++j){
                    magma_int_t i_tmp = min(j, ie);
                    for(i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for(i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for(i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if(id < tot){
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else{
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for(j = ib; j < ie; ++j){
                    for(i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_snrm2( k, s+id*k, 1);
                    for(i = 0; i < k; ++i){
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return MAGMA_SUCCESS; //??????

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

    for(i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for(j = 0; j < k; ++j){
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if(iinfo != 0)
            *info=iinfo;
    }
    if(*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2){

        for(j = 0; j < k; ++j){
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }

    }
    else if(k != 1){

        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, q, &tmp, w, &ione);

        for(j = 0; j < k; ++j){
            for(i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for(i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for(i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for(j = iil-1; j < iiu; ++j){
            for(i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_snrm2( k, s, 1);
            for(i = 0; i < k; ++i){
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#endif //_OPENMP
    // Compute the updated eigenvectors.

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    start = get_current_time();
#endif
    magma_queue_sync( NULL );

    if (rk != 0){
        if( n23 != 0 ){
            if (rk < magma_get_slaed3_k()){
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            } else {
                magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 );
                magma_sgemm('N', 'N', n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq);
                magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq );
            }
        } else
            lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

        if( n12 != 0 ) {
            if (rk < magma_get_slaed3_k()){
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            } else {
                magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 );
                magma_sgemm('N', 'N', n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq);
                magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq );
            }
        } else
            lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
    }
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    return MAGMA_SUCCESS;
} /*magma_slaed3*/
예제 #18
0
파일: slaqps.cpp 프로젝트: soulsheng/magma
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0

    OFFSET  (input) INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    NB      (input) INTEGER
            The number of columns to factorize.

    KB      (output) INTEGER
            The number of columns actually factorized.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    JPVT    (input/output) INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    TAU     (output) REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    VN1     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    VN2     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    AUXV    (input/output) REAL array, dimension (NB)
            Auxiliar vector.

    F       (input/output) REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    LDF     (input) INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    =====================================================================    */
    
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    float d__1;
    float z__1;
    
    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k.
        // C:       pvt, k, isamax are all 0-based; don't subtract 1.
        pvt = k + cblas_isamax( n-k, &vn1[k], ione );
        
        if (pvt != k) {

            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb){
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );
        
            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );
            
            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );
            
            blasf77_sgemv( MagmaTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione);
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;
            
            r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione);
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
예제 #19
0
파일: Mdsrtomo0.c 프로젝트: 1014511134/src
int main(int argc, char* argv[])
{
    bool velocity, causal, limit, verb, shape;
    int dimw, dimt, i, n[SF_MAX_DIM], rect[SF_MAX_DIM], iw, nw, ir, nr;
    long nt, *order;
    int iter, niter, cgiter, count;
    int *ff, *dp, *mp, nloop;
    float o[SF_MAX_DIM], d[SF_MAX_DIM], *dt, *dw, *dv, *t, *w, *t0, *w1, *p=NULL;
    float eps, tol, thres, rhsnorm, rhsnorm0, rhsnorm1, rate, gama;
    char key[6];
    sf_file in, out, reco, grad, mask, prec;

    sf_init(argc,argv);
    in  = sf_input("in");
    out = sf_output("out");
   
    /* read dimension */
    dimw = sf_filedims(in,n);
	    
    nw = 1;
    for (i=0; i < dimw; i++) {
	sprintf(key,"d%d",i+1);
	if (!sf_histfloat(in,key,d+i)) sf_error("No %s= in input.",key);
	sprintf(key,"o%d",i+1);
	if (!sf_histfloat(in,key,o+i)) o[i]=0.;
	nw *= n[i];
    }
	    
    if (dimw > 2) sf_error("Only works for 2D now.");
	    
    n[2] = n[1]; d[2] = d[1]; o[2] = o[1];
    dimt = 3; nr = n[1]*n[2]; nt = nw*n[2];

    /* read initial velocity */
    w = sf_floatalloc(nw);
    sf_floatread(w,nw,in);
	    
    if (!sf_getbool("velocity",&velocity)) velocity=true;
    /* if y, the input is velocity; n, slowness-squared */
	    	    
    /* convert to slowness-squared */
    if (velocity) {
	for (iw=0; iw < nw; iw++)
	    w[iw] = 1./w[iw]*1./w[iw];
	
	dv = sf_floatalloc(nw);
    } else {
	dv = NULL;
    }
	    
    if (!sf_getbool("limit",&limit)) limit=false;
    /* if y, limit computation within receiver coverage */

    if (!sf_getbool("shape",&shape)) shape=false;
    /* shaping regularization (default no) */
	    
    /* read record */
    if (NULL == sf_getstring("reco"))
	sf_error("Need record reco=");
    reco = sf_input("reco");
    
    t0 = sf_floatalloc(nr);
    sf_floatread(t0,nr,reco);
    sf_fileclose(reco);
	    
    /* read receiver mask */	    
    if (NULL == sf_getstring("mask")) {
	mask = NULL;
	dp = NULL;
    } else {
	mask = sf_input("mask");
	dp = sf_intalloc(nr);
	sf_intread(dp,nr,mask);
	sf_fileclose(mask);
    }
	    
    /* read model mask */
    if (NULL == sf_getstring("prec")) {
	prec = NULL;
	mp = NULL;
    } else {
	prec = sf_input("prec");
	mp = sf_intalloc(nw);
	sf_intread(mp,nw,prec);
	sf_fileclose(prec);
    }

    if (!sf_getbool("verb",&verb)) verb=false;
    /* verbosity flag */
	    
    if (!sf_getint("niter",&niter)) niter=5;
    /* number of inversion iterations */
	    
    if (!sf_getint("cgiter",&cgiter)) cgiter=10;
    /* number of conjugate-gradient iterations */
	    
    if (!sf_getfloat("thres",&thres)) thres=5.e-5;
    /* threshold (percentage) */
	    
    if (!sf_getfloat("tol",&tol)) tol=1.e-3;
    /* tolerance for bisection root-search */

    if (!sf_getint("nloop",&nloop)) nloop=10;
    /* number of bisection root-search */

    /* output gradient at each iteration */
    if (NULL != sf_getstring("grad")) {
	grad = sf_output("grad");
	sf_putint(grad,"n3",niter);
    } else {
	grad = NULL;
    }
	    
    if (!sf_getfloat("eps",&eps)) eps=0.;
    /* regularization parameter */

    if (shape) {
	for (i=0; i < dimw; i++) {
	    sprintf(key,"rect%d",i+1);
	    if (!sf_getint(key,rect+i)) rect[i]=1;
	    /*( rect#=(1,1,...) smoothing radius on #-th axis )*/
	}
	
	/* triangle smoothing operator */
	sf_trianglen_init(dimw,rect,n);
	sf_repeat_init(nw,1,sf_trianglen_lop);
	
	sf_conjgrad_init(nw,nw,nr,nr,eps,1.e-6,verb,false);
	p = sf_floatalloc(nw);
    } else {
	/* initialize 2D gradient operator */
	sf_igrad2_init(n[0],n[1]);
    }

    /* allocate temporary array */
    t  = sf_floatalloc(nt);
    dw = sf_floatalloc(nw);
    dt = sf_floatalloc(nr);
    w1 = sf_floatalloc(nw);
    ff = sf_intalloc(nt);

    if (!sf_getbool("causal",&causal)) causal=true;
    /* if y, neglect non-causal branches of DSR */

    /* initialize eikonal */
    dsreiko_init(n,o,d,
		 thres,tol,nloop,
		 causal,limit,dp);
	    
    /* initialize operator */
    dsrtomo_init(dimt,n,d);

    /* upwind order */
    order = dsrtomo_order();

    /* initial misfit */
    dsreiko_fastmarch(t,w,ff,order);
    dsreiko_mirror(t);

    /* calculate L2 data-misfit */
    for (ir=0; ir < nr; ir++) {
	if (dp == NULL || dp[ir] == 1) {
	    dt[ir] = t0[ir]-t[(long) ir*n[0]];
	} else {
	    dt[ir] = 0.;
	}
    }

    rhsnorm0 = cblas_snrm2(nr,dt,1);
    rhsnorm = rhsnorm0;
    rhsnorm1 = rhsnorm;
    rate = rhsnorm1/rhsnorm0;
	    
    sf_warning("L2 misfit after iteration 0 of %d: %g",niter,rate);
    
    /* iterations over inversion */
    for (iter=0; iter < niter; iter++) {
	
	/* clean-up */
	for (iw=0; iw < nw; iw++) dw[iw] = 0.;
	
	/* set operator */
	dsrtomo_set(t,w,ff,dp,mp);
	
	/* solve dw */
	if (shape) {
	    sf_conjgrad(NULL,dsrtomo_oper,sf_repeat_lop,p,dw,dt,cgiter);
	} else {
	    sf_solver_reg(dsrtomo_oper,sf_cgstep,sf_igrad2_lop,2*nw,nw,nr,dw,dt,cgiter,eps,"verb",verb,"end");
	    sf_cgstep_close();
	}
	
	/* output gradient */
	if (grad != NULL) {
	    if (velocity) {
		for (iw=0; iw < nw; iw++) {
		    dv[iw] = -dw[iw]/(2.*sqrtf(w[iw])*(w[iw]+dw[iw]/2.));
		}
		sf_floatwrite(dv,nw,grad);
	    } else {
		sf_floatwrite(dw,nw,grad);
	    }
	}
	
	/* line search */
	gama = 0.5;
	for (count=0; count < 5; count++) {
	    
	    /* update slowness */
	    for (iw=0; iw < nw; iw++) 
		w1[iw] = (w[iw]+gama*dw[iw])*(w[iw]+gama*dw[iw])/w[iw];
	    
	    /* compute new misfit */
	    dsreiko_fastmarch(t,w1,ff,order);
	    dsreiko_mirror(t);
	    
	    for (ir=0; ir < nr; ir++) {
		if (dp == NULL || dp[ir] == 1) {
		    dt[ir] = t0[ir]-t[(long) ir*n[0]];
		} else {
		    dt[ir] = 0.;
		}
	    }

	    rhsnorm = cblas_snrm2(nr,dt,1);
	    rate = rhsnorm/rhsnorm1;
	    
	    if (rate < 1.) {
		for (iw=0; iw < nw; iw++) w[iw] = w1[iw];
		
		rhsnorm1 = rhsnorm;
		rate = rhsnorm1/rhsnorm0;
		break;
	    }
	    
	    gama *= 0.5;
	}
	
	if (count == 5) {
	    sf_warning("Line-search failure at iteration %d of %d.",iter+1,niter);
	    break;
	}
	
	sf_warning("L2 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count);
    }
    
    /* convert to velocity */
    if (velocity) {
	for (iw=0; iw < nw; iw++) {
	    w[iw] = 1./sqrtf(w[iw]);
	}
    }
    
    sf_floatwrite(w,nw,out);
       
    exit(0);
}