/*
 * Computes the Frobenius norm of a matrix.
 *
 * The matrix storage is handed to a BLAS nrm2 routine as a flat vector of
 * MatN2(mA) elements with unit stride.  The element size selects single
 * (4 bytes) or double (8 bytes) precision, and MatDev(mA) selects between
 * the cuBLAS and the host CBLAS implementation.
 *
 * Returns the norm as a double.  For an element size other than 4 or 8 the
 * function returns 0.0 (previously the result was read from an
 * uninitialized variable).
 */
double nrm2(Mat mA) {
    const int n2 = MatN2(mA);
    const void *a = MatElems(mA);
    const bool dev = MatDev(mA);
    double norm = 0.0;

    switch (MatElemSize(mA)) {
    case 4:
        if (dev) {
            /* Initialized so a failed cuBLAS call cannot leak garbage. */
            float norm32 = 0.0f;
            cublasSnrm2(g_cublasHandle, n2, a, 1, &norm32);
            norm = norm32;
        } else {
            norm = cblas_snrm2(n2, a, 1);
        }
        break;
    case 8:
        if (dev) {
            cublasDnrm2(g_cublasHandle, n2, a, 1, &norm);
        } else {
            norm = cblas_dnrm2(n2, a, 1);
        }
        break;
    default:
        /* Unsupported element size: keep the well-defined 0.0 result. */
        break;
    }

    return norm;
}
/*
 * Computes the Euclidean norm of the single-precision vector x and stores
 * it in norm.
 *
 * Returns true on success.  Returns false (leaving norm untouched) when the
 * reference x is bound to a null address.  The guard now runs before any
 * member of x is read; previously x.size was read first, so the guard could
 * never protect the access.
 */
bool nrm2(REAL &norm, const RealVector &x)
{
    if (NULL == &x) {
        return false;
    }

    const UINT N = x.size;
    const UINT incX = 1;

    norm = cblas_snrm2(N, x.M, incX);
    return true;
}
int main(void) { int info, ipiv2[2]; float err = 1e-6; float x2[2], b2[2], c2[2]; float a2x2[2][2]; //#cblas x2[0] = 1.0; x2[1] = -2.0; assert_eqd( cblas_snrm2(2, x2, 1), sqrt(5.0), err ); //#lapacke //1 2 x = 5 //3 4 y 11 //x = 1 //y = 1 a2x2[0][0] = 1.0; a2x2[1][0] = 2.0; a2x2[0][1] = 3.0; a2x2[1][1] = 4.0; b2[0] = 5.; b2[1] = 11.; info = LAPACKE_sgesv( LAPACK_COL_MAJOR, //COL or ROW 2, //n 1, //nrhs &a2x2[0][0], 2, //lda &ipiv2[0], &b2[0], 2 //ldb ); c2[0] = 1.0; c2[1] = 2.0; assert_eqi( info, 0 ); assert_eqd( b2[0], c2[0], err ); assert_eqd( b2[1], c2[1], err ); return EXIT_SUCCESS; }
/*
 * Ruby method: Euclidean norm of a vector (a Matrix with exactly one column).
 *
 * Optional Ruby arguments, in this order:
 *   incx - stride between consecutive elements (default 1)
 *   n    - number of elements to visit        (default: dx->nrows)
 *
 * Dispatches on dx->data_type to the matching CBLAS routine
 * (snrm2 / dnrm2 / scnrm2 / dznrm2) and returns the result as a Ruby Float.
 *
 * Raises RuntimeError when self is not a single-column Matrix or carries an
 * unknown data_type, and ArgumentError when n and incx would address
 * elements past the end of the vector.
 */
VALUE rb_blas_xnrm2(int argc, VALUE *argv, VALUE self)
{
    Matrix *dx;
    int incx;
    int n;
    VALUE n_value, incx_value;

    rb_scan_args(argc, argv, "02", &incx_value, &n_value);
    Data_Get_Struct(self, Matrix, dx);

    /* Validate the receiver before any field of dx is read. */
    if (dx == NULL || dx->ncols != 1) {
        rb_raise(rb_eRuntimeError, "Self is not a Vector");
    }

    incx = (incx_value == Qnil) ? 1 : NUM2INT(incx_value);
    n = (n_value == Qnil) ? dx->nrows : NUM2INT(n_value);

    /*
     * The last element touched is at index (n - 1) * incx; reject requests
     * that would read past the dx->nrows stored elements.  Non-positive n or
     * incx are left to the BLAS routine.
     */
    if (n > 0 && incx > 0 &&
        (long long)(n - 1) * (long long)incx >= (long long)dx->nrows) {
        rb_raise(rb_eArgError,
                 "n and incx address elements beyond the end of the Vector");
    }

    switch (dx->data_type) {
    case Single_t:          /* s */
        return rb_float_new(cblas_snrm2(n, (float *)dx->data, incx));
    case Double_t:          /* d */
        return rb_float_new(cblas_dnrm2(n, (double *)dx->data, incx));
    case Complex_t:         /* c */
        return rb_float_new(cblas_scnrm2(n, dx->data, incx));
    case Double_Complex_t:  /* z */
        return rb_float_new(cblas_dznrm2(n, dx->data, incx));
    default:
        rb_raise(rb_eRuntimeError, "Invalid data_type (%d) in Matrix", dx->data_type);
        return Qnil; /* Never reached: rb_raise does not return. */
    }
}
/*
 * Minimizes f by repeatedly stepping x along the negative gradient with a
 * fixed step of 1.
 *
 *   N        - problem dimension (length of x)
 *   gradient - writes the gradient at x into the module-level buffer g
 *   f        - objective function
 *   x        - in/out iterate; zero-filled first unless params.initialized
 *   params   - max_iters, tol_grad (gradient-norm threshold), tol_func
 *              (relative objective-change threshold), initialized
 *
 * Iteration stops as soon as the relative change of f is at most tol_func,
 * the gradient norm is at most tol_grad, or max_iters steps were counted.
 *
 * Returns 1 when the loop stopped before reaching max_iters, 0 when the
 * iteration budget was exhausted or the gradient buffer could not be
 * allocated.
 *
 * Fixes relative to the previous version:
 *   - the update used cblas_daxpy on float data, overrunning x and g; it now
 *     uses cblas_saxpy (g is already treated as float by cblas_snrm2);
 *   - the at_exit handler was registered again on every call because the
 *     `init` flag was never set;
 *   - a failed realloc no longer overwrites (and leaks) g.
 */
int opt_sgd(const int N, sgradfunc gradient, sfunc f, float *x, const sgdparams params)
{
    static int init = 0;

    if (!init) {
        /* Register the cleanup handler once; retry later if it failed. */
        if (atexit(&at_exit) == 0) {
            init = 1;
        }
    }

    const int max_iters = params.max_iters;
    const scalar eps = params.tol_grad;
    const scalar delta = params.tol_func;

    /* Never request a zero-sized (or negative-sized) block from realloc. */
    const size_t count = (N > 0) ? (size_t)N : 1;
    void *grown = realloc(g, count * sizeof(scalar));
    if (grown == NULL) {
        return 0; /* g is still valid and owned by the module */
    }
    g = grown;

    if (!params.initialized) {
        for (int k = 0; k < N; ++k) {
            x[k] = 0;
        }
    }

    scalar fnow = f(N, x);
    int iters = 0;
    int cond;

    do {
        const scalar fprev = fnow;

        gradient(N, x, g);
        cblas_saxpy(N, -1.0f, g, 1, x, 1); /* x <- x - g */
        fnow = f(N, x);

        /* The iteration counter only advances while the other tests pass. */
        cond = fabs(fnow - fprev) > delta * fabs(fnow);
        cond = cond && (cblas_snrm2(N, g, 1) > eps);
        cond = cond && (++iters < max_iters);
    } while (cond);

    return (iters >= max_iters) ? 0 : 1;
}
// Returns the Euclidean (2-)norm of the vector, computed by cblas_snrm2 over
// size_ elements of vec_ with unit stride.
float HostVector<float>::Norm(void) const {
    const float norm = cblas_snrm2(this->size_, this->vec_, 1);
    return norm;
}
extern "C" magma_int_t
magma_sgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
            float *a, magma_int_t lda,
            float *WR, float *WI,
            float *vl, magma_int_t ldvl,
            float *vr, magma_int_t ldvr,
            float *work, magma_int_t lwork,
            magma_int_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       September 2012

    Purpose
    =======
    SGEEV computes for an N-by-N real nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
                     A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
                  u(j)**T * A = lambda(j) * u(j)**T
    where u(j)**T denotes the transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    =========
    JOBVL   (input) CHARACTER*1
            = 'N': left eigenvectors of A are not computed;
            = 'V': left eigenvectors of A are computed.

    JOBVR   (input) CHARACTER*1
            = 'N': right eigenvectors of A are not computed;
            = 'V': right eigenvectors of A are computed.

    N       (input) INTEGER
            The order of the matrix A. N >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    WR      (output) REAL array, dimension (N)
    WI      (output) REAL array, dimension (N)
            WR and WI contain the real and imaginary parts,
            respectively, of the computed eigenvalues.  Complex
            conjugate pairs of eigenvalues appear consecutively
            with the eigenvalue having the positive imaginary part
            first.

    VL      (output) REAL array, dimension (LDVL,N)
            If JOBVL = 'V', the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = 'N', VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    LDVL    (input) INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = 'V', LDVL >= N.

    VR      (output) REAL array, dimension (LDVR,N)
            If JOBVR = 'V', the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = 'N', VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    LDVR    (input) INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = 'V', LDVR >= N.

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            WORK(1) is set to the required workspace size (2+nb)*N
            during argument checking, so a workspace query returns it
            there.
            NOTE(review): WORK(1) is also the first element handed to
            the balancing step as scratch, so its value after a full
            computation should not be relied on - confirm.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= (2+nb)*N, where
            nb is the block size returned by magma_get_sgehrd_nb(N).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements i+1:N of WR and WI contain eigenvalues which
                  have converged.

    QUEUE   (input) magma_queue_t
            Only used when built with VERSION3, where it is passed on to
            magma_sgehrd and magma_sorghr.

    Returns the same value that is stored in *INFO.

    Build note: the Hessenberg reduction and the generation of the
    orthogonal factor are selected by the VERSION1 / VERSION2 / VERSION3
    preprocessor symbols; if none is defined those calls are compiled out.
    =====================================================================    */

    magma_int_t c__1 = 1;
    magma_int_t c__0 = 0;
    magma_int_t c_n1 = -1;

    magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3;
    float d__1, d__2;

    magma_int_t i__, k, ihi, ilo;
    float r__, cs, sn, scl;
    float dum[1], eps;
    magma_int_t ibal;
    float anrm;
    magma_int_t ierr, itau, iwrk, nout;
    magma_int_t scalea;
    float cscale;
    float bignum;
    magma_int_t minwrk;
    magma_int_t wantvl;
    float smlnum;
    magma_int_t lquery, wantvr, select[1];

    magma_int_t nb = 0;
    magmaFloat_ptr dT;
    //magma_timestr_t start, end;

    /* `side` is filled in below with 'L', 'R' or 'B' and handed to STREVC. */
    char side[2] = {0, 0};
    magma_vec_t jobvl_ = jobvl;
    magma_vec_t jobvr_ = jobvr;

    /* Argument checking; negative INFO values are 1-based argument positions. */
    *info = 0;
    lquery = lwork == -1;
    wantvl = lapackf77_lsame(lapack_const(jobvl_), "V");
    wantvr = lapackf77_lsame(lapack_const(jobvr_), "V");
    if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) {
        *info = -1;
    } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -9;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -11;
    }

    /* Compute workspace */
    if (*info == 0) {
        nb = magma_get_sgehrd_nb(n);
        minwrk = (2+nb)*n;
        work[0] = (float) minwrk;

        if (lwork < minwrk && ! lquery) {
            *info = -13;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    // if eigenvectors are needed
#if defined(VERSION3)
    if (MAGMA_SUCCESS != magma_malloc( &dT, nb*n*sizeof(float) )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
#endif

    // subtract row and col for 1-based indexing
    // (from here on a, vl, vr and work are indexed Fortran-style)
    a_dim1   = lda;
    a_offset = 1 + a_dim1;
    a       -= a_offset;
    vl_dim1   = ldvl;
    vl_offset = 1 + vl_dim1;
    vl       -= vl_offset;
    vr_dim1   = ldvr;
    vr_offset = 1 + vr_dim1;
    vr       -= vr_offset;
    --work;

    /* Get machine constants */
    eps    = lapackf77_slamch("P");
    smlnum = lapackf77_slamch("S");
    bignum = 1. / smlnum;
    lapackf77_slabad(&smlnum, &bignum);
    smlnum = magma_ssqrt(smlnum) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_slange("M", &n, &n, &a[a_offset], &lda, dum);
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_slascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n,
                         &a[a_offset], &lda, &ierr);
    }

    /* Balance the matrix
       (Workspace: need N) */
    ibal = 1;
    lapackf77_sgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr);

    /* Reduce to upper Hessenberg form
       (Workspace: need 3*N, prefer 2*N+N*NB) */
    /* Workspace layout (1-based): [ibal .. ) scale factors, [itau .. ) tau,
       [iwrk .. lwork] scratch for the called routine. */
    itau = ibal + n;
    iwrk = itau + n;
    i__1 = lwork - iwrk + 1;

    //start = get_current_time();
#if defined(VERSION1)
    /*
     * Version 1 - LAPACK
     */
    lapackf77_sgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
                     &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
    /*
     *  Version 2 - LAPACK consistent HRD
     */
    magma_sgehrd2(n, ilo, ihi, &a[a_offset], lda,
                  &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
    /*
     * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored,
     */
    magma_sgehrd(n, ilo, ihi, &a[a_offset], lda,
                 &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue);
#endif
    //end = get_current_time();
    //printf("    Time for sgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);

    if (wantvl) {
        /* Want left eigenvectors
           Copy Householder vectors to VL */
        side[0] = 'L';
        lapackf77_slacpy(MagmaLowerStr, &n, &n,
                         &a[a_offset], &lda, &vl[vl_offset], &ldvl);

        /*
         * Generate orthogonal matrix in VL
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
         */
        i__1 = lwork - iwrk + 1;

        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_sorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl,
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_sorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau],
                     dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /*
         * Perform QR iteration, accumulating Schur vectors in VL
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        /* tau is no longer needed, so its space is reused as scratch. */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                         &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);

        if (wantvr) {
            /* Want left and right eigenvectors
               Copy Schur vectors to VR */
            side[0] = 'B';
            lapackf77_slacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
        }
    }
    else if (wantvr) {
        /* Want right eigenvectors
           Copy Householder vectors to VR */
        side[0] = 'R';
        lapackf77_slacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);

        /*
         * Generate orthogonal matrix in VR
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
         */
        i__1 = lwork - iwrk + 1;
        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_sorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr,
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_sorghr(n, ilo, ihi, &vr[vr_offset], ldvr,
                     &work[itau], dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /*
         * Perform QR iteration, accumulating Schur vectors in VR
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                         &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    }
    else {
        /*
         * Compute eigenvalues only
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_shseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                         &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    }

    /* If INFO > 0 from SHSEQR, then quit */
    if (*info > 0) {
        fprintf(stderr, "SHSEQR returned with info = %d\n", (int) *info);
        goto L50;
    }

    if (wantvl || wantvr) {
        /*
         * Compute left and/or right eigenvectors
         *   (Workspace: need 4*N)
         */
        lapackf77_strevc(side, "B", select, &n, &a[a_offset], &lda,
                         &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n,
                         &nout, &work[iwrk], &ierr);
    }

    if (wantvl) {
        /*
         * Undo balancing of left eigenvectors
         *   (Workspace: need N)
         */
        lapackf77_sgebak("B", "L", &n, &ilo, &ihi,
                         &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr);

        /* Normalize left eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if ( WI[i__-1] == 0.) {
                /* Real eigenvalue: scale the single column to unit norm. */
                scl = cblas_snrm2(n, &vl[i__ * vl_dim1 + 1], 1);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vl[i__ * vl_dim1 + 1], 1);
            }
            else if (WI[i__-1] > 0.) {
                /* First of a complex pair: columns i and i+1 hold the real
                   and imaginary parts and are normalized together. */
                d__1 = cblas_snrm2(n, &vl[ i__      * vl_dim1 + 1], 1);
                d__2 = cblas_snrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1);
                scl = lapackf77_slapy2(&d__1, &d__2);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vl[ i__      * vl_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vl[(i__ + 1) * vl_dim1 + 1], 1);
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vl[k + i__ * vl_dim1];
                    /* Computing 2nd power */
                    d__2 = vl[k + (i__ + 1) * vl_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_isamax */
                k = cblas_isamax(n, &work[iwrk], 1)+1;
                /* Rotate the pair so that the imaginary part of the
                   largest-magnitude component becomes zero. */
                lapackf77_slartg(&vl[k +  i__      * vl_dim1],
                                 &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__);
                cblas_srot(n, &vl[ i__      * vl_dim1 + 1], 1,
                           &vl[(i__ + 1) * vl_dim1 + 1], 1, cs, (sn));
                vl[k + (i__ + 1) * vl_dim1] = 0.;
            }
        }
    }

    if (wantvr) {
        /*
         * Undo balancing of right eigenvectors
         *   (Workspace: need N)
         */
        lapackf77_sgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n,
                         &vr[vr_offset], &ldvr, &ierr);

        /* Normalize right eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if (WI[i__-1] == 0.) {
                scl = 1. / cblas_snrm2(n, &vr[i__ * vr_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vr[i__ * vr_dim1 + 1], 1);
            }
            else if (WI[i__-1] > 0.) {
                d__1 = cblas_snrm2(n, &vr[ i__      * vr_dim1 + 1], 1);
                d__2 = cblas_snrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1);
                scl = lapackf77_slapy2(&d__1, &d__2);
                scl = 1. / scl;
                cblas_sscal(n, (scl), &vr[ i__      * vr_dim1 + 1], 1);
                cblas_sscal(n, (scl), &vr[(i__ + 1) * vr_dim1 + 1], 1);
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vr[k + i__ * vr_dim1];
                    /* Computing 2nd power */
                    d__2 = vr[k + (i__ + 1) * vr_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_isamax */
                k = cblas_isamax(n, &work[iwrk], 1)+1;
                lapackf77_slartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1],
                                 &cs, &sn, &r__);
                cblas_srot(n, &vr[ i__      * vr_dim1 + 1], 1,
                           &vr[(i__ + 1) * vr_dim1 + 1], 1, cs, (sn));
                vr[k + (i__ + 1) * vr_dim1] = 0.;
            }
        }
    }

    /* Undo scaling if necessary */
L50:
    if (scalea) {
        /* Rescale the converged eigenvalues, i.e. entries INFO+1..N. */
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                         WR + (*info), &i__2, &ierr);
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                         WI + (*info), &i__2, &ierr);
        if (*info > 0) {
            /* After a QR failure also rescale the first ILO-1 entries. */
            i__1 = ilo - 1;
            lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                             WR, &n, &ierr);
            i__1 = ilo - 1;
            lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                             WI, &n, &ierr);
        }
    }

#if defined(VERSION3)
    magma_free( dT );
#endif
    return *info;
} /* magma_sgeev */
/*
 * JNI binding for cblas_snrm2.
 *
 *   N       - number of elements to visit
 *   X       - direct buffer holding the float data
 *   offsetX - element offset of the first value inside X
 *   incX    - stride between consecutive elements
 *
 * Returns the value computed by cblas_snrm2.  If the address of X cannot be
 * obtained (GetDirectBufferAddress returns NULL), an
 * IllegalArgumentException is raised in the calling Java thread and 0 is
 * returned instead of dereferencing an invalid pointer.
 */
JNIEXPORT jfloat JNICALL Java_uncomplicate_neanderthal_CBLAS_snrm2
(JNIEnv *env, jclass clazz, jint N, jobject X, jint offsetX, jint incX) {
    float *cX = (float *) (*env)->GetDirectBufferAddress(env, X);

    if (cX == NULL) {
        jclass iae = (*env)->FindClass(env, "java/lang/IllegalArgumentException");
        /* If FindClass failed, an exception is already pending. */
        if (iae != NULL) {
            (*env)->ThrowNew(env, iae, "X must be a direct buffer");
        }
        return 0.0f;
    }

    return cblas_snrm2(N, cX + offsetX, incX);
}
int GaussNewton( void (*func)(T *x, T *r, int m, int n, void *adata), void (*jacf)(T *x, T *J, int m, int n, void *adata), T *x, T *r, T* J, int m, int n, int itmax, T *opts, /* delta, r_threshold, diff_threshold */ void *adata) { PhGUtils::debug("m", m, "n", n); float delta, R_THRES, DIFF_THRES; if( opts == NULL ) { // use default values delta = 1.0; // step size, default to use standard Newton-Ralphson R_THRES = 1e-6; DIFF_THRES = 1e-6; } else { delta = opts[0]; R_THRES = opts[1]; DIFF_THRES = opts[2]; } bool allocateR = false, allocateJ = false; // residue if( r == NULL ) { // allocate space for residue allocateR = true; r = new T[n]; memset(r, 0, sizeof(T)*n); } T* x0 = new T[m]; memset(x0, 0, sizeof(T)*m); T* deltaX = new T[m]; // also for Jtr memset(deltaX, 0, sizeof(T)*m); cblas_scopy(m, x, 1, deltaX, 1); T* JtJ = new T[m * m]; memset(JtJ, 0, sizeof(T)*m*m); // Jacobian if( J == NULL ) { allocateJ = true; J = new T[m * n]; memset(J, 0, sizeof(T)*m*n); } // compute initial residue func(x, r, m, n, adata); //ofstream fout0("r.txt"); //print2DArray(r, n, 1, fout0); //fout0.close(); int iters = 0; //::system("pause"); //printArray(x, m); //printArray(r, n); // do iteration while( (cblas_snrm2(m, deltaX, 1) > DIFF_THRES && cblas_snrm2(n, r, 1) > R_THRES && iters < itmax) || iters < 1 ) { // compute Jacobian jacf(x, J, m, n, adata); // store old value cblas_scopy(m, x, 1, x0, 1); //ofstream fout1("J.txt"); //print2DArray(J, n, m, fout1); //fout1.close(); //::system("pause"); // compute JtJ cblas_ssyrk (CblasColMajor, CblasUpper, CblasNoTrans, m, n, 1.0, J, m, 0, JtJ, m); //ofstream fout("JtJ.txt"); //print2DArray(JtJ, m, m, fout); //fout.close(); // compute Jtr cblas_sgemv (CblasColMajor, CblasNoTrans, m, n, 1.0, J, m, r, 1, 0, deltaX, 1); // compute deltaX LAPACKE_spotrf( LAPACK_COL_MAJOR, 'U', m, JtJ, m ); LAPACKE_spotrs( LAPACK_COL_MAJOR, 'U', m, 1, JtJ, m, deltaX, m ); //ofstream fout2("deltaX.txt"); //printArray(deltaX, m, fout2); //fout2.close(); // update x 
cblas_saxpy(m, -delta, deltaX, 1, x, 1); // update residue func(x, r, m, n, adata); //printArray(x, m); //system("pause"); iters++; } //::system("pause"); // delete workspace delete[] x0; delete[] deltaX; delete[] JtJ; if( allocateR ){ delete[] r;} if( allocateJ ){ delete[] J;} return iters; }
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It computes only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    nrgpu   INTEGER
            Number of GPUs to use.
            If nrgpu = 1 the call is forwarded to magma_slaex3 on device 0.

    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (devices workspaces) REAL array of arrays,
            dimension NRGPU.
            if NRGPU = 1 the dimension of the first workspace
            should be (3*N*N/2+3*N)
            otherwise the NRGPU workspaces should have the size
            ceil((N-N1) * (N-N1) / floor(nrgpu/2)) +
            NB * ((N-N1) + (N-N1) / floor(nrgpu/2))

    @param
    stream  (device stream) magma_queue_t array,
            dimension (MagmaMaxGPUs,2)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

            TODO verify range, vl, vu, il, iu -- copied from slaex1.

    @param[in]
    vl      REAL
    @param[in]
    vu      REAL
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    @return MAGMA_ERR_ILLEGAL_VALUE when argument checking fails,
            MAGMA_SUCCESS otherwise.  Note that MAGMA_SUCCESS is also
            returned when the root finder fails (INFO > 0) and on the
            single-GPU path, so callers must inspect INFO.

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaex3_m(magma_int_t nrgpu,
               magma_int_t k, magma_int_t n, magma_int_t n1, float* d,
               float* Q, magma_int_t ldq, float rho,
               float* dlamda, float* Q2, magma_int_t* indx,
               magma_int_t* ctot, float* w, float* s, magma_int_t* indxq,
               float** dwork, magma_queue_t stream[MagmaMaxGPUs][2],
               magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu,
               magma_int_t* info )
{
/* Q(i,j): address of element (i,j) of the host matrix Q (0-based).
   dQ2 / dS / dQ: sub-buffers carved out of each device workspace dwork[id];
   dS and dQ are double-buffered through the index ii. */
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

#define dQ2(id)    (dwork[id])
#define dS(id, ii) (dwork[id] + n2*n2_loc + (ii)*(n2*nb))
#define dQ(id, ii) (dwork[id] + n2*n2_loc + 2*(n2*nb) + (ii)*(n2_loc*nb))

    /* Single-GPU case: delegate to the one-device implementation. */
    if (nrgpu == 1) {
        magma_setdevice(0);
        magma_slaex3(k, n, n1, d, Q, ldq, rho,
                     dlamda, Q2, indx, ctot, w, s, indxq,
                     *dwork, range, vl, vu, il, iu, info );
        return MAGMA_SUCCESS;
    }
    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;
    magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu;
    magma_int_t ni_loc[MagmaMaxGPUs];

    magma_int_t i,ind,iq2,j,n12,n2,n23,tmp,lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    /* Argument checking; negative INFO values are 1-based argument positions
       of the CPU routine's argument list. */
    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }

    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if (k == 0)
        return MAGMA_SUCCESS;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

    /* CHECK_CPU (debug only): keep host copies of the GPU operands so every
       device GEMM can be compared against a CPU GEMM. */
//#define CHECK_CPU
#ifdef CHECK_CPU
    float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
    #define hQ2(id) (hwQ2[id])
    #define hS(id, ii) (hwS[ii][id])
    #define hQ(id, ii) (hwQ[ii][id])
#endif
    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    /* GPUs are used in pairs: even devices take row slices of the top block
       (n1 rows), odd devices take row slices of the bottom block (n2 rows). */
    n1_loc = (n1-1) / (nrgpu/2) + 1;
    n2_loc = (n2-1) / (nrgpu/2) + 1;

    nb = magma_get_slaex3_m_nb();

    /* Large enough problem: start uploading the Q2 slices now so the
       transfers can proceed while the CPU solves the secular equation. */
    if (n1 >= magma_get_slaex3_m_k()) {
#ifdef CHECK_CPU
        for (igpu = 0; igpu < nrgpu; ++igpu) {
            magma_smalloc_pinned( &(hwS[0][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwS[1][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwQ2[igpu]), n2*n2_loc );
            magma_smalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb );
            magma_smalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb );
        }
#endif
        for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
            ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu], &n12, Q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
            magma_setdevice(igpu);
            magma_ssetmatrix_async( ni_loc[igpu], n12,
                                    Q2+n1_loc*(igpu/2), n1,
                                    dQ2(igpu), n1_loc, stream[igpu][0] );
            ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, Q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
            magma_setdevice(igpu+1);
            magma_ssetmatrix_async( ni_loc[igpu+1], n23,
                                    Q2+iq2+n2_loc*(igpu/2), n2,
                                    dQ2(igpu+1), n2_loc, stream[igpu+1][0] );
        }
    }

    //

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        /* These locals shadow the function-level ib for the parallel region. */
        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if (*info == 0) {
#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
#pragma omp single
                {
                    /* 2x2 case: only the row permutation has to be applied. */
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                /* Each thread updates its own slice [ib, ie) of w, skipping
                   the diagonal term i == j. */
                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    /* Surplus threads get an empty range. */
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                // Each thread uses its own k-element slice of s (offset id*k).
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_snrm2( k, s+id*k, 1);
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    /* NOTE(review): the root-finder failure is reported only through *info;
       the function still returns MAGMA_SUCCESS, and any asynchronous uploads
       started above are not synchronized on this path - confirm intended. */
    if (*info != 0)
        return MAGMA_SUCCESS; //??????

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    /* Same reporting convention as the OpenMP branch: failure is signalled
       only through *info. */
    if (*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_snrm2( k, s, 1);
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP

    // Compute the updated eigenvectors.

    timer_start( time );

    if (rk > 0) {
        if (n1 < magma_get_slaex3_m_k()) {
            // stay on the CPU
            if ( n23 != 0 ) {
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            }
            else
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 != 0 ) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            }
            else
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
        else {
            //use the gpus
            /* Upload the first panel of nb columns into buffer 0. */
            ib = min(nb, rk);
            for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                if (n23 != 0) {
                    magma_setdevice(igpu+1);
                    magma_ssetmatrix_async( n23, ib,
                                            Q(ctot[0],iil-1), ldq,
                                            dS(igpu+1,0), n23, stream[igpu+1][0] );
                }
                if (n12 != 0) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( n12, ib,
                                            Q(0,iil-1), ldq,
                                            dS(igpu,0), n12, stream[igpu][0] );
                }
            }

            /* Panel loop: `ind` alternates between the two buffers/streams so
               the upload of the next panel is issued before this panel is
               multiplied. */
            for (i = 0; i < rk; i += nb) {
                ib = min(nb, rk - i);
                ind = (i/nb)%2;
                if (i+nb < rk) {
                    ib2 = min(nb, rk - i - nb);
                    for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                        if (n23 != 0) {
                            magma_setdevice(igpu+1);
                            magma_ssetmatrix_async( n23, ib2,
                                                    Q(ctot[0],iil-1+i+nb), ldq,
                                                    dS(igpu+1,(ind+1)%2), n23, stream[igpu+1][(ind+1)%2] );
                        }
                        if (n12 != 0) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( n12, ib2,
                                                    Q(0,iil-1+i+nb), ldq,
                                                    dS(igpu,(ind+1)%2), n12, stream[igpu][(ind+1)%2] );
                        }
                    }
                }

                // Ensure that the data is copied on gpu since we will overwrite it.
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23);
#endif
                        magma_setdevice(igpu+1);
                        magma_queue_sync( stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12);
#endif
                        magma_setdevice(igpu);
                        magma_queue_sync( stream[igpu][ind] );
                    }
                }
                /* Multiply each device's Q2 slice by the current panel. */
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
                                      hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
#endif
                        magma_setdevice(igpu+1);
                        magmablasSetKernelStream(stream[igpu+1][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc,
                                    dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc));
#endif
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
                                      hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
#endif
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc,
                                    dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc));
#endif
                    }
                }
                /* Download the result panels back into the host matrix Q. */
                for (igpu = 0; igpu < nrgpu-1; igpu += 2) {
                    if (n23 != 0) {
                        magma_setdevice(igpu+1);
                        magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
                                          Q(n1+n2_loc*(igpu/2),iil-1+i), ldq );
                        // magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
                        //                         Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
                        magma_setdevice(igpu);
                        magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
                                          Q(n1_loc*(igpu/2),iil-1+i), ldq );
                        // magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
                        //                         Q(n1_loc*(igpu/2),iil-1+i), ldq, stream[igpu][ind] );
                    }
                }
            }
            /* Restore the default kernel stream and drain both streams on
               every device. */
            for (igpu = 0; igpu < nrgpu; ++igpu) {
#ifdef CHECK_CPU
                magma_free_pinned( hwS[1][igpu] );
                magma_free_pinned( hwS[0][igpu] );
                magma_free_pinned( hwQ2[igpu] );
                magma_free_pinned( hwQ[1][igpu] );
                magma_free_pinned( hwQ[0][igpu] );
#endif
                magma_setdevice(igpu);
                magmablasSetKernelStream(NULL);
                magma_queue_sync( stream[igpu][0] );
                magma_queue_sync( stream[igpu][1] );
            }
            /* Blocks with no contributing columns are simply zeroed. */
            if ( n23 == 0 )
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 == 0 )
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    return MAGMA_SUCCESS;
} /* magma_slaex3_m */
int main(int argc, char* argv[]) { bool velocity, verb, shape; int dim, i, j, n[3], rect[3], it, nt, order, nt0, nx0; int iter, niter, iline, nline, cgiter, *f0, *m0=NULL; float d[3], o[3], dt0, dx0, ot0, ox0, eps, tol, *p=NULL, *p0=NULL, thres; float *vd, *vdt, *vdx, *s, *t0, *x0, *ds, *rhs, *rhs0, *rhs1=NULL, error0, error1, error, scale; char key[6]; sf_file in, out, dix, t_0=NULL, x_0=NULL, f_0=NULL, grad=NULL, cost=NULL, mini=NULL, prec=NULL; sf_init(argc,argv); in = sf_input("in"); out = sf_output("out"); /* read input dimension */ dim = sf_filedims(in,n); nt = 1; for (i=0; i < dim; i++) { sprintf(key,"d%d",i+1); if (!sf_histfloat(in,key,d+i)) sf_error("No %s= in input",key); sprintf(key,"o%d",i+1); if (!sf_histfloat(in,key,o+i)) o[i]=0.; nt *= n[i]; } if (dim < 3) { n[2] = 1; d[2] = d[1]; o[2] = o[1]; } /* read initial guess */ s = sf_floatalloc(nt); sf_floatread(s,nt,in); if (!sf_getbool("velocity",&velocity)) velocity=true; /* y, input is velocity / n, slowness-squared */ if (velocity) { for (it=0; it < nt; it++) { s[it] = 1./s[it]*1./s[it]; } } /* read Dix velocity */ if (NULL == sf_getstring("dix")) sf_error("No Dix input dix="); dix = sf_input("dix"); if(!sf_histint(dix,"n1",&nt0)) sf_error("No n1= in dix"); if(!sf_histint(dix,"n2",&nx0)) sf_error("No n2= in dix"); if(!sf_histfloat(dix,"d1",&dt0)) sf_error("No d1= in dix"); if(!sf_histfloat(dix,"d2",&dx0)) sf_error("No d2= in dix"); if(!sf_histfloat(dix,"o1",&ot0)) sf_error("No o1= in dix"); if(!sf_histfloat(dix,"o2",&ox0)) sf_error("No o2= in dix"); vd = sf_floatalloc(nt0*nx0); sf_floatread(vd,nt0*nx0,dix); sf_fileclose(dix); /* Dix velocity derivative in t0 (2nd order FD) */ vdt = sf_floatalloc(nt0*nx0); for (i=0; i < nt0; i++) { for (j=0; j < nx0; j++) { if (i == 0) vdt[j*nt0+i] = (-vd[j*nt0+i+2]+4.*vd[j*nt0+i+1]-3.*vd[j*nt0+i])/(2.*dt0); else if (i == nt0-1) vdt[j*nt0+i] = (3.*vd[j*nt0+i]-4.*vd[j*nt0+i-1]+vd[j*nt0+i-2])/(2.*dt0); else vdt[j*nt0+i] = (vd[j*nt0+i+1]-vd[j*nt0+i-1])/(2.*dt0); } } /* Dix 
velocity derivative in x0 (2nd order FD) */ vdx = sf_floatalloc(nt0*nx0); for (j=0; j < nx0; j++) { for (i=0; i < nt0; i++) { if (j == 0) vdx[j*nt0+i] = (-vd[(j+2)*nt0+i]+4.*vd[(j+1)*nt0+i]-3.*vd[j*nt0+i])/(2.*dx0); else if (j == nx0-1) vdx[j*nt0+i] = (3.*vd[j*nt0+i]-4.*vd[(j-1)*nt0+i]+vd[(j-2)*nt0+i])/(2.*dx0); else vdx[j*nt0+i] = (vd[(j+1)*nt0+i]-vd[(j-1)*nt0+i])/(2.*dx0); } } if (!sf_getint("order",&order)) order=1; /* fastmarch accuracy order */ if (!sf_getfloat("thres",&thres)) thres=10.; /* thresholding for caustics */ if (!sf_getint("niter",&niter)) niter=1; /* number of nonlinear updates */ if (!sf_getint("cgiter",&cgiter)) cgiter=200; /* number of CG iterations */ if (!sf_getbool("shape",&shape)) shape=false; /* regularization (default Tikhnov) */ if (!sf_getfloat("eps",&eps)) eps=0.1; /* regularization parameter */ if (!sf_getint("nline",&nline)) nline=0; /* maximum number of line search (default turned-off) */ if (!sf_getbool("verb",&verb)) verb=false; /* verbosity flag */ if (shape) { if (!sf_getfloat("tol",&tol)) tol=1.e-6; /* tolerance for shaping regularization */ for (i=0; i < dim; i++) { sprintf(key,"rect%d",i+1); if (!sf_getint(key,rect+i)) rect[i]=1; /*( rect#=(1,1,...) 
smoothing radius on #-th axis )*/ } /* triangle smoothing operator */ sf_trianglen_init(dim,rect,n); sf_repeat_init(nt,1,sf_trianglen_lop); sf_conjgrad_init(nt,nt,nt,nt,eps,tol,verb,false); p = sf_floatalloc(nt); } else { /* initialize 2D gradient operator */ sf_igrad2_init(n[0],n[1]); } /* allocate memory for fastmarch */ t0 = sf_floatalloc(nt); x0 = sf_floatalloc(nt); f0 = sf_intalloc(nt); /* allocate memory for update */ ds = sf_floatalloc(nt); rhs = sf_floatalloc(nt); /* output transformation matrix */ if (NULL != sf_getstring("t0")) { t_0 = sf_output("t0"); sf_putint(t_0,"n3",niter+1); } if (NULL != sf_getstring("x0")) { x_0 = sf_output("x0"); sf_putint(x_0,"n3",niter+1); } /* output auxiliary label */ if (NULL != sf_getstring("f0")) { f_0 = sf_output("f0"); sf_settype(f_0,SF_INT); sf_putint(f_0,"n3",niter+1); } /* output gradient */ if (NULL != sf_getstring("grad")) { grad = sf_output("grad"); sf_putint(grad,"n3",niter); } /* output cost */ if (NULL != sf_getstring("cost")) { cost = sf_output("cost"); sf_putint(cost,"n3",niter+1); } /* read mask (desired minimum) */ m0 = sf_intalloc(nt); if (NULL != sf_getstring("mask")) { mini = sf_input("mask"); sf_intread(m0,nt,mini); sf_fileclose(mini); } else { for (it=0; it < nt; it++) m0[it] = -1; } /* read cost (desired minimum) */ rhs0 = sf_floatalloc(nt); if (NULL != sf_getstring("mval")) { mini = sf_input("mval"); sf_floatread(rhs0,nt,mini); sf_fileclose(mini); } else { for (it=0; it < nt; it++) rhs0[it] = 0.; } /* read preconditioner */ if (NULL != sf_getstring("prec")) { prec = sf_input("prec"); p0 = sf_floatalloc(nt); sf_floatread(p0,nt,prec); sf_fileclose(prec); rhs1 = sf_floatalloc(nt); } /* fastmarch initialization */ fastmarch_init(n,o,d,order); /* update initialization */ t2d_init(dim,n,d,nt0,dt0,ot0,nx0,dx0,ox0); /* fastmarch */ fastmarch(t0,x0,f0,s); /* caustic region (2D) */ t2d_caustic(x0,f0,n,d,thres); /* set up operator */ t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0); /* evaluate cost */ t2d_cost(rhs); for 
(it=0; it < nt; it++) { if (f0[it] >= 0 || m0[it] >= 0) rhs[it] = 0.; else rhs[it] -= rhs0[it]; } if (p0 == NULL) { error0 = error1 = cblas_snrm2(nt,rhs,1); } else { for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it]; error0 = error1 = cblas_snrm2(nt,rhs1,1); } /* write optional outputs */ if (NULL!=t_0) sf_floatwrite(t0,nt,t_0); if (NULL!=x_0) sf_floatwrite(x0,nt,x_0); if (NULL!=f_0) sf_intwrite(f0,nt,f_0); if (NULL!=cost) sf_floatwrite(rhs,nt,cost); sf_warning("Start conversion, cost %g",1.); /* nonlinear loop */ for (iter=0; iter < niter; iter++) { /* solve ds */ if (shape) { if (p0 == NULL) sf_conjgrad(NULL,t2d_oper,sf_repeat_lop,p,ds,rhs,cgiter); else sf_conjgrad(t2d_prec,t2d_oper,sf_repeat_lop,p,ds,rhs,cgiter); } else { sf_solver_reg(t2d_oper,sf_cgstep,sf_igrad2_lop,2*nt,nt,nt,ds,rhs,cgiter,eps,"verb",verb,"end"); sf_cgstep_close(); } /* add ds */ for (it=0; it < nt; it++) { s[it] = s[it]+ds[it]+0.25*ds[it]*ds[it]/s[it]; } /* fastmarch */ fastmarch(t0,x0,f0,s); /* caustic region (2D) */ t2d_caustic(x0,f0,n,d,thres); /* set up operator */ t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0); /* evaluate cost */ t2d_cost(rhs); for (it=0; it < nt; it++) { if (f0[it] >= 0 || m0[it] >= 0) rhs[it] = 0.; else rhs[it] -= rhs0[it]; } if (p0 == NULL) { error = cblas_snrm2(nt,rhs,1); } else { for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it]; error = cblas_snrm2(nt,rhs1,1); } error = cblas_snrm2(nt,rhs,1); /* line search */ if (nline > 0 && error >= error1) { scale = 0.5; for (iline=0; iline < nline; iline++) { for (it=0; it < nt; it++) { s[it] = s[it]+(scale*ds[it])+0.25*(scale*ds[it])*(scale*ds[it])/s[it]; } fastmarch(t0,x0,f0,s); t2d_caustic(x0,f0,n,d,thres); t2d_set(t0,x0,f0,s,vd,vdt,vdx,m0,p0); t2d_cost(rhs); for (it=0; it < nt; it++) { if (f0[it] >= 0 || m0[it] >= 0) rhs[it] = 0.; else rhs[it] -= rhs0[it]; } if (p0 == NULL) { error = cblas_snrm2(nt,rhs,1); } else { for (it=0; it < nt; it++) rhs1[it] = p0[it]*rhs[it]; error = cblas_snrm2(nt,rhs1,1); } error = 
cblas_snrm2(nt,rhs,1); if (error < error1) { sf_warning("Exist line search %d of %d",iline+1,nline); } else { scale *= 0.5; } } } error1 = error; /* write optional outputs */ if (NULL!=t_0) sf_floatwrite(t0,nt,t_0); if (NULL!=x_0) sf_floatwrite(x0,nt,x_0); if (NULL!=f_0) sf_intwrite(f0,nt,f_0); if (NULL!=cost) sf_floatwrite(rhs,nt,cost); if (NULL!=grad) sf_floatwrite(ds,nt,grad); sf_warning("Cost after iteration %d: %g",iter+1,error/error0); } /* write output */ if (velocity) { for (it=0; it < nt; it++) { s[it] = 1./sqrtf(s[it]); } } sf_floatwrite(s,nt,out); exit(0); }
/* Pointer-argument entry point for the single-precision 2-norm: dereferences
   the element count and stride and forwards them, with X, to cblas_snrm2. */
float snrm2_(int *N, float *X, int *INCX)
{
    const int count  = *N;
    const int stride = *INCX;

    return cblas_snrm2(count, X, stride);
}
/* First-arrival tomography driver.
 *
 * what= selects the mode by its first letter:
 *   'l'  apply the linearized operator (fatomo_lop) forward or adjoint
 *        (adj=y), using the time table from time=;
 *   't'  (default "tomo") iterative slowness inversion: for each of niter
 *        iterations solve for an update ds, then backtrack with up to 10
 *        step halvings until the data misfit decreases.
 * Any other value of what= is reported as an error instead of silently
 * producing no output.
 */
int main(int argc, char* argv[])
{
    bool adj, velocity, l1norm, plane[3], verb;
    int dim, i, count, n[SF_MAX_DIM], it, nt, **m, nrhs, is, nshot=1, *flag, order, iter, niter, stiter, *k, nfreq, nmem;
    float o[SF_MAX_DIM], d[SF_MAX_DIM], **t, *t0, *s, *temps, **source, *rhs, *ds;
    float rhsnorm, rhsnorm0, rhsnorm1, rate, eps, gama;
    char key[4], *what;
    sf_file sinp, sout, shot, time, reco, rece, topo, grad, norm;
    sf_weight weight=NULL;

    sf_init(argc,argv);
    sinp = sf_input("in");
    sout = sf_output("out");

    if (NULL == (what = sf_getstring("what"))) what="tomo";
    /* what to compute (default tomography) */

    switch (what[0]) {
        case 'l': /* linear operator */

            if (NULL == sf_getstring("time"))
                sf_error("Need time=");
            time = sf_input("time");

            /* read operator dimension from time table */
            dim = sf_filedims(time,n);

            nt = 1;
            for (i=0; i < 3; i++) {
                snprintf(key,sizeof key,"d%d",i+1);
                if (!sf_histfloat(time,key,d+i)) sf_error("No %s= in input",key);
                snprintf(key,sizeof key,"o%d",i+1);
                if (!sf_histfloat(time,key,o+i)) o[i]=0.;
                nt *= n[i]; plane[i] = false;
            }
            if (dim < 3) {
                n[2] = 1; o[2] = o[1]; d[2] = d[1]; plane[2] = false;
            }

            dim = 2;

            /* read in shot file */
            if (NULL == sf_getstring("shot"))
                sf_error("Need source shot=");
            shot = sf_input("shot");

            if (!sf_histint(shot,"n2",&nshot)) nshot=1;
            sf_fileclose(shot);

            /* read in receiver file */
            m = sf_intalloc2(nt,nshot);
            if (NULL == sf_getstring("receiver")) {
                for (is=0; is < nshot; is++) {
                    for (it=0; it < nt; it++) {
                        m[is][it] = 1;
                    }
                }
            } else {
                rece = sf_input("receiver");
                sf_intread(m[0],nt*nshot,rece);
                sf_fileclose(rece);
            }

            /* number of right-hand side */
            nrhs = 0;
            for (is=0; is < nshot; is++) {
                for (it=0; it < nt; it++) {
                    if (m[is][it] == 1) nrhs++;
                }
            }
            rhs = sf_floatalloc(nrhs);

            t = sf_floatalloc2(nt,nshot);
            sf_floatread(t[0],nt*nshot,time);

            if (!sf_getbool("adj",&adj)) adj=false;
            /* adjoint flag (for what=linear) */

            /* initialize fatomo */
            fatomo_init(dim,n,d,nshot);

            /* set operators */
            fatomo_set(t,m);

            t0 = sf_floatalloc(nt);

            if (adj) {
                sf_floatread(rhs,nrhs,sinp);

                fatomo_lop(true,false,nt,nrhs,t0,rhs);

                sf_putint(sout,"n1",nt);
                sf_putint(sout,"n2",1);
                sf_putint(sout,"n3",1);
                sf_floatwrite(t0,nt,sout);
            } else {
                sf_floatread(t0,nt,sinp);

                fatomo_lop(false,false,nt,nrhs,t0,rhs);

                sf_putint(sout,"n1",nrhs);
                sf_putint(sout,"n2",1);
                sf_putint(sout,"n3",1);
                sf_floatwrite(rhs,nrhs,sout);
            }

            break;

        case 't': /* tomography */

            /* read input dimension */
            dim = sf_filedims(sinp,n);

            /* plane[] holds three flags and is written for every i < dim */
            if (dim > 3) sf_error("Input has %d dimensions, at most 3 are supported",dim);

            nt = 1;
            for (i=0; i < dim; i++) {
                snprintf(key,sizeof key,"d%d",i+1);
                if (!sf_histfloat(sinp,key,d+i)) sf_error("No %s= in input",key);
                snprintf(key,sizeof key,"o%d",i+1);
                if (!sf_histfloat(sinp,key,o+i)) o[i]=0.;
                nt *= n[i]; plane[i] = false;
            }
            if (dim < 3) {
                n[2] = 1; o[2] = o[1]; d[2] = d[1]; plane[2] = false;
            }

            /* read initial guess */
            s = sf_floatalloc(nt);
            sf_floatread(s,nt,sinp);

            if (!sf_getbool("velocity",&velocity)) velocity=true;
            /* if y, the input is velocity; n, slowness squared */

            if (velocity) {
                for (it=0; it < nt; it++) {
                    s[it] = 1./s[it]*1./s[it];
                }
            }

            /* allocate memory for temporary data */
            ds    = sf_floatalloc(nt);
            flag  = sf_intalloc(nt);

            temps = sf_floatalloc(nt);
            for (it=0; it < nt; it++) {
                temps[it] = s[it];
            }

            if (!sf_getbool("l1norm",&l1norm)) l1norm=false;
            /* norm for minimization (default L2 norm) */

            if (!sf_getbool("verb",&verb)) verb=false;
            /* verbosity flag */

            /* read in shot file */
            if (NULL == sf_getstring("shot"))
                sf_error("Need source shot=");
            shot = sf_input("shot");

            if (!sf_histint(shot,"n2",&nshot)) nshot=1;

            source = sf_floatalloc2(3,nshot);
            sf_floatread(source[0],3*nshot,shot);
            sf_fileclose(shot);

            /* allocate memory for time table */
            t = sf_floatalloc2(nt,nshot);

            /* read in receiver file */
            m = sf_intalloc2(nt,nshot);
            if (NULL == sf_getstring("receiver")) {
                for (is=0; is < nshot; is++) {
                    for (it=0; it < nt; it++) {
                        m[is][it] = 1;
                    }
                }
            } else {
                rece = sf_input("receiver");
                sf_intread(m[0],nt*nshot,rece);
                sf_fileclose(rece);
            }

            /* number of right-hand side */
            nrhs = 0;
            for (is=0; is < nshot; is++) {
                for (it=0; it < nt; it++) {
                    if (m[is][it] == 1) nrhs++;
                }
            }
            rhs = sf_floatalloc(nrhs);

            /* read in record file */
            if (NULL == sf_getstring("record"))
                sf_error("Need data record=");
            reco = sf_input("record");

            t0 = sf_floatalloc(nrhs);
            sf_floatread(t0,nrhs,reco);
            sf_fileclose(reco);

            /* read in topography file */
            if (NULL != sf_getstring("topo")) {
                topo = sf_input("topo");
                k = sf_intalloc(nt);
                sf_intread(k,nt,topo);
                sf_fileclose(topo);
            } else {
                k = NULL;
            }

            if (!sf_getint("order",&order)) order=2;
            /* fast marching accuracy order */

            if (!sf_getint("niter",&niter)) niter=10;
            /* number of slowness inversion iterations */

            if (!sf_getint("stiter",&stiter)) stiter=200;
            /* number of step iterations */

            if (!sf_getfloat("eps",&eps)) eps=0.;
            /* regularization parameter */

            /* output gradient at each iteration */
            if (NULL != sf_getstring("gradient")) {
                grad = sf_output("gradient");
                sf_putint(grad,"n3",n[2]);
                sf_putfloat(grad,"d3",d[2]);
                sf_putfloat(grad,"o3",o[2]);
                sf_putint(grad,"n4",niter);
            } else {
                grad = NULL;
            }

            /* output misfit L2 norm at each iteration */
            if (NULL != sf_getstring("misnorm")) {
                norm = sf_output("misnorm");
                sf_putint(norm,"n1",niter+1);
                sf_putfloat(norm,"d1",1.);
                sf_putfloat(norm,"o1",0.);
                sf_putint(norm,"n2",1);
                sf_putint(norm,"n3",1);
            } else {
                norm = NULL;
            }

            /* initialize fatomo */
            fatomo_init(dim,n,d,nshot);

            /* initialize 2D gradient operator */
            sf_igrad2_init(n[0],n[1]);

            if (l1norm) {
                /*
                  if (!sf_getfloat("perc",&perc)) perc=90.;

                  l1_init(nt,stiter,perc,false);
                */
                if (!sf_getint("nfreq",&nfreq)) nfreq=1;
                /* l1-norm weighting nfreq */

                if (!sf_getint("nmem",&nmem)) nmem=1;
                /* l1-norm weighting nmem */

                weight = sf_l1;
                sf_irls_init(nt);
            }

            /* initial misfit */
            fastmarch_init(n[2],n[1],n[0]);

            i = 0;
            for (is=0; is < nshot; is++) {
                fastmarch(t[is],s,flag,plane,
                          n[2],n[1],n[0],o[2],o[1],o[0],d[2],d[1],d[0],
                          source[is][2],source[is][1],source[is][0],1,1,1,order);

                for (it=0; it < nt; it++) {
                    if (m[is][it] == 1) {
                        rhs[i] = t0[i]-t[is][it];
                        i++;
                    }
                }
            }

            fastmarch_close();

            /* calculate L2 data-misfit */
            rhsnorm0 = cblas_snrm2(nrhs,rhs,1);
            rhsnorm = rhsnorm0;
            rhsnorm1 = rhsnorm;
            rate = rhsnorm1/rhsnorm0;

            if (l1norm)
                sf_warning("L1 misfit after iteration 0 of %d: %g",niter,rate);
            else
                sf_warning("L2 misfit after iteration 0 of %d: %g",niter,rate);

            if (norm != NULL) sf_floatwrite(&rate,1,norm);

            /* iterations over inversion */
            for (iter=0; iter < niter; iter++) {

                /* clean-up */
                for (it=0; it < nt; it++) {
                    ds[it] = 0.;
                }

                /* prepare for CG */
                fatomo_set(t,m);

                /* solve ds */
                if (l1norm) {
                    /*
                      sf_solver_reg(fatomo_lop,l1step,sf_igrad2_lop,2*nt, nt,nrhs,ds,rhs,stiter,eps,"verb",verb,"end");
                    */
                    sf_solver_reg(fatomo_lop,sf_cgstep,sf_igrad2_lop,2*nt,nt,nrhs,ds,rhs,stiter,eps,"wght",weight,"nfreq",nfreq,"nmem",nmem,"verb",verb,"end");

                    /*
                      l1step_close();
                    */

                    sf_cgstep_close();
                } else {
                    sf_solver_reg(fatomo_lop,sf_cgstep,sf_igrad2_lop,2*nt,nt,nrhs,ds,rhs,stiter,eps,"verb",verb,"end");

                    sf_cgstep_close();
                }

                /* line search: halve the step gama until the misfit drops */
                gama = 1.;
                for (count=0; count < 10; count++) {

                    /* update slowness (cells with k[it] == 1 keep their value) */
                    for (it=0; it < nt; it++) {
                        if (k == NULL || k[it] != 1)
                            temps[it] = (s[it]+gama*ds[it])*(s[it]+gama*ds[it])/s[it];
                    }

                    /* forward fast-marching for stencil time */
                    fastmarch_init(n[2],n[1],n[0]);

                    i = 0;
                    for (is=0; is < nshot; is++) {
                        fastmarch(t[is],temps,flag,plane,
                                  n[2],n[1],n[0],o[2],o[1],o[0],d[2],d[1],d[0],
                                  source[is][2],source[is][1],source[is][0],1,1,1,order);

                        for (it=0; it < nt; it++) {
                            if (m[is][it] == 1) {
                                rhs[i] = t0[i]-t[is][it];
                                i++;
                            }
                        }
                    }

                    fastmarch_close();

                    rhsnorm = cblas_snrm2(nrhs,rhs,1);
                    rate = rhsnorm/rhsnorm1;

                    if (rate < 1.) {
                        /* accept the trial model */
                        for (it=0; it < nt; it++) {
                            s[it] = temps[it];
                        }
                        rhsnorm1 = rhsnorm;
                        rate = rhsnorm1/rhsnorm0;
                        break;
                    }

                    gama *= 0.5;
                }

                /* count reaches 10 only when no trial step reduced the misfit */
                if (count == 10) {
                    sf_warning("Line-search Failure. Iteration terminated at %d of %d.",iter+1,niter);
                    sf_warning("Dimensions for GRAD and NORM need to be fixed before read.");
                    break;
                }

                if (l1norm)
                    sf_warning("L1 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count);
                else
                    sf_warning("L2 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count);

                if (grad != NULL) sf_floatwrite(ds,nt,grad);
                if (norm != NULL) sf_floatwrite(&rate,1,norm);
            }

            /* convert to velocity */
            if (velocity) {
                for (it=0; it < nt; it++) {
                    s[it] = 1./sqrtf(s[it]);
                }
            }

            sf_floatwrite(s,nt,sout);

            break;

        default:
            /* only values starting with 'l' or 't' are handled above */
            sf_error("Unknown what=%s",what);
            break;
    }

    exit(0);
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; float *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; float *w1i, *w2i; magmaFloatComplex *w1copy, *w2copy; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_slamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; nb = magma_get_sgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by sget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( w1i, float, N ); TESTING_MALLOC_CPU( w2i, float, N ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( VL, float, n2 ); 
TESTING_MALLOC_PIN( VR, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. 
* * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ float vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_sget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VR[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VR[j *lda], ione), cblas_snrm2(N, &VR[(j+1)*lda], ione) ); result[1] = max( result[1], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_S_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) 
result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_sget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VL[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VL[j *lda], ione), cblas_snrm2(N, &VL[(j+1)*lda], ione) ); result[3] = max( result[3], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_S_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times float *LRE, DUM; TESTING_MALLOC_PIN( LRE, float, n2 ); lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_sgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. 
//lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_sgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, &info ); //if (info != 0) // printf("magma_sgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! 
MAGMA_S_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeev( &opts.jobvl, &opts.jobvr, &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_C_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_C_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], compare ); std::sort( w2copy, &w2copy[N], compare ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaFloatComplex diff = MAGMA_C_SUB( w1copy[j], w2copy[j2] ); float diff2 = magma_szlapy2( diff ) / max( magma_szlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_caxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = cblas_scnrm2( N, w1copy, 1 ); error /= cblas_scnrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? " ok" : " failed")); status |= ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? " ok" : " failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? 
" ok" : " failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? " ok" : " failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? " ok" : " failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? " ok" : " failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? " ok" : " failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? " ok" : " failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? " ok" : " failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? " ok" : " failed")); } int newline = 0; if ( result[0] != -1 ) { status |= ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status |= ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status |= ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status |= ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status |= ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status |= ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status |= ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status |= ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status |= ! (result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// // Overloaded function for dispatching to // * CBLAS backend, and // * float value-type. // inline float nrm2( const int n, const float* x, const int incx ) { return cblas_snrm2( n, x, incx ); }
/** Purpose ------- SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It computes only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] k INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho REAL The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. 
@param[in] Q2 REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. TODO what is LDQ2? @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (workspace) REAL array, dimension (3*N*N/2+3*N) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from slaex1. @param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. 
@param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* Q, magma_int_t ldq, float rho, float* dlamda, float* Q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if (k == 0) return MAGMA_SUCCESS; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). 
The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, Q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s+id*k, 1); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return MAGMA_SUCCESS; //?????? 
timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return MAGMA_SUCCESS; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s, 1); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. 
timer_start( time ); magma_queue_sync( NULL ); if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); return MAGMA_SUCCESS; } /* magma_slaex3 */
extern "C" magma_int_t
magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d,
             float* q, magma_int_t ldq, float rho,
             float* dlamda, float* q2, magma_int_t* indx,
             magma_int_t* ctot, float* w, float* s, magma_int_t* indxq,
             float* dwork,
             char range, float vl, float vu, magma_int_t il, magma_int_t iu,
             magma_int_t* info )
{
/*
    Purpose
    =======
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required. It computes only the required part of the eigenvectors
    and the rest is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    K       (input) INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    N       (input) INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N>K).

    N1      (input) INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    D       (output) REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    Q       (output) REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output, the RK = IIU-IIL+1 columns starting at column IIL
            (1-based) hold the updated eigenvectors, where IIL..IIU is the
            index range selected by RANGE among the K non-deflated
            eigenvalues (IIL = 1, IIU = K for RANGE = 'A').

    LDQ     (input) INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    RHO     (input) REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    DLAMDA  (input/output) REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    Q2      (input) REAL array
            Non-deflated eigenvectors for the split problem, stored packed:
            an N1-by-(CTOT(1)+CTOT(2)) block with leading dimension N1,
            followed at offset N1*(CTOT(1)+CTOT(2)) by an
            (N-N1)-by-(CTOT(2)+CTOT(3)) block with leading dimension N-N1.
            The whole packed array is copied to the front of DWORK.

    INDX    (input) INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    CTOT    (input) INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    W       (input/output) REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    S       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    INDXQ   (output) INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    DWORK   (device workspace) REAL array, dimension (3*N*N/2+3*N)
            Used as three consecutive buffers (packed Q2, S, and the
            product), the first two of N*(N/2+1) elements each.

    RANGE   (input) CHARACTER*1
            Compared (via LSAME) against 'A', 'V' and 'I':
            'A' selects all K columns, 'V' the eigenvalues chosen by
            magma_svrange from VL/VU, 'I' those chosen by magma_sirange
            from IL/IU.

    VL, VU  (input) REAL
            Only checked/used when RANGE = 'V'; VL < VU is required
            when N > 0.

    IL, IU  (input) INTEGER
            Only checked/used when RANGE = 'I';
            1 <= IL <= max(1,N) and min(N,IL) <= IU <= N are required.

    INFO    (output) INTEGER
            = 0:  successful exit.
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  an eigenvalue did not converge; INFO is the value
                  reported by SLAED4.

            NOTE(review): the codes used for RANGE (-15), VL/VU (-17),
            IL (-18) and IU (-19) are one less than the positions of those
            arguments in this signature; they are kept as is so that
            existing callers see the same values.

    Return value: MAGMA_ERR_ILLEGAL_VALUE if argument checking failed,
    MAGMA_SUCCESS otherwise -- including when SLAED4 failed, in which case
    the failure is visible only through INFO.

    Further Details
    ===============
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    ===================================================================== */

    // NOTE(review): the Q(i,j) element-address macro used below is not
    // defined inside this function; it has to be provided earlier in the
    // translation unit.

    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;
    char range_[] = {range, 0};

    magma_int_t iil, iiu, rk;

    // dwork is carved into three consecutive buffers.
    float* dq2 = dwork;
    float* ds  = dq2 + n*(n/2+1);
    float* dq  = ds  + n*(n/2+1);
    magma_int_t lddq = n/2 + 1;

    magma_int_t i,iq2,j,n12,n2,n23,tmp,lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }

    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if (k == 0)
        return MAGMA_SUCCESS;

    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMDA(I) to prevent optimizing compilers from eliminating
     this code.*/

    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;           // offset of the second packed block in q2
    lq2 = iq2 + n2 * n23;     // total length of the packed q2

    // Start copying the packed q2 into dq2; completion is awaited by
    // magma_queue_sync( NULL ) just before the final multiplications.
    magma_ssetvector_async( lq2, q2, 1, dq2, 1, NULL );

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

    #pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id  = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;            //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        // slaed4 below is handed the whole dlamda array, so every thread
        // must have finished rewriting its own slice before any thread
        // starts solving; without this barrier the accesses race.
        #pragma omp barrier

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
                #pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

        // All roots (or a failure code) must be in place before *info is
        // inspected by every thread.
        #pragma omp barrier

        if (*info == 0) {
            #pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
                #pragma omp single
                {
                    // Only the row permutation INDX has to be applied.
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                // Each thread handles rows ib..ie-1; the original W is
                // saved in S so that its signs can be restored below.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    // Rows of the local range above the diagonal (i < j) ...
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    // ... and below it (i > j); the diagonal term is skipped.
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

                // W is read in full and S is reused as scratch below.
                #pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    // Surplus threads get an empty column range.
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                // Thread id uses s[id*k .. id*k+k-1] as its private scratch.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_snrm2( k, s+id*k, 1);
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    // NOTE(review): a SLAED4 failure is reported through *info only; the
    // return value stays MAGMA_SUCCESS, as in the original code.
    if (*info != 0)
        return MAGMA_SUCCESS;

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0) {
            *info=iinfo;
            break;
        }
    }
    // NOTE(review): a SLAED4 failure is reported through *info only; the
    // return value stays MAGMA_SUCCESS, as in the original code.
    if (*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        // Only the row permutation INDX has to be applied.
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        // The original W is saved in S so that its signs can be restored.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_snrm2( k, s, 1);
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#endif //_OPENMP

    // Compute the updated eigenvectors.

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    start = get_current_time();
#endif
    // Wait for the asynchronous copy of q2 into dq2 started above.
    magma_queue_sync( NULL );

    if (rk != 0) {
        // Lower part: rows n1..n-1 of the selected columns.
        if ( n23 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                // Few columns: multiply with the host BLAS.
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            }
            else {
                // Otherwise transfer the factor, multiply with magma_sgemm
                // and fetch the product back into Q.
                magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 );
                magma_sgemm('N', 'N', n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq);
                magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq );
            }
        }
        else
            lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

        // Upper part: rows 0..n1-1 of the selected columns.
        if ( n12 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            }
            else {
                magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 );
                magma_sgemm('N', 'N', n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq);
                magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq );
            }
        }
        else
            lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
    }
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    return MAGMA_SUCCESS;
} /* magma_slaex3 */
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A, magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F, magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0

    OFFSET  (input) INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    NB      (input) INTEGER
            The number of columns to factorize.

    KB      (output) INTEGER
            The number of columns actually factorized.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    DA      (input/output) REAL array on the GPU, leading dimension LDDA
            GPU-side counterpart of A: columns are fetched from it and
            written back to it around pivot swaps, the current Householder
            vector is sent to it, and the trailing block update is applied
            to it.  (Exact entry/exit contents are not stated in this
            file -- TODO confirm against the caller.)

    LDDA    (input) INTEGER
            The leading dimension of the array DA.

    JPVT    (input/output) INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    TAU     (output) REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    VN1     (input/output) REAL array, dimension (N)
            The vector with the partial column norms.

    VN2     (input/output) REAL array, dimension (N)
            The vector with the exact column norms.

    AUXV    (input/output) REAL array, dimension (NB)
            Auxiliary vector.

    F       (input/output) REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    LDF     (input) INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    DF      (workspace) REAL array on the GPU, leading dimension LDDF
            in the DF(i,j) addressing macro.  Receives the GPU part of
            each new column of F and, at the end, rows KB.. of F for the
            block update.

    LDDF    (input) INTEGER
            The leading dimension used to address DF.

    Returns MAGMA_SUCCESS.

    =====================================================================    */

// Element-address helpers (0-based, column-major); each yields a pointer.
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;     // scratch dimensions passed to BLAS/LAPACK
    float d__1;
    float z__1;

    magma_int_t j, k, rk;       // k: current panel column; rk = offset + k
    float  Akk;                 // saved diagonal entry A(rk,k)
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;         // head of the list of columns to recompute
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    // Queue used for the asynchronous host<->GPU copies below.
    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    // Stop early as soon as a column norm has to be recomputed
    // (lsticc becomes non-zero).
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k.
        // C:       pvt, k, isamax are all 0-based; don't subtract 1.
        pvt = k + cblas_isamax( n-k, &vn1[k], ione );

        if (pvt != k) {

            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb){
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            // Row k of F is conjugated around the product only in the
            // complex-precision builds.
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }

        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        // Temporarily set the diagonal entry to one; it is restored at the
        // end of this iteration.
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );

            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            // The product is split in three pieces: a GPU gemv starting
            // i__5 rows and i__3 columns in, a CPU gemv on the first i__3
            // columns, and a CPU gemv on the first i__5 rows of the
            // remaining i__4 columns that is added to the GPU result.
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );

            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );

            blasf77_sgemv( MagmaTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );

            // The GPU part of the column must have arrived before the CPU
            // contribution is accumulated into it (beta = one).
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        // Norm is no longer reliable: push column j onto the
                        // recompute list, using vn2[j] as the "next" link.
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }

        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) -
                                  A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }

    /* Recomputation of difficult columns. */
    // Walk the list threaded through vn2: vn2[lsticc] holds the index of the
    // next column; it is rounded back to an integer before being overwritten.
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        // NOTE(review): lsticc is a 0-based column index here; confirm that
        // "<= nb" (rather than "< nb") is the intended in-panel test.
        if (lsticc <= nb)
            vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione);
        else {
            /* Where is the data, CPU or GPU ? */
            // The norm is assembled from a host piece (r1) and a GPU piece (r2).
            float r1, r2;

            r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione);
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);

            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of
           SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
int main(int argc, char* argv[]) { bool velocity, causal, limit, verb, shape; int dimw, dimt, i, n[SF_MAX_DIM], rect[SF_MAX_DIM], iw, nw, ir, nr; long nt, *order; int iter, niter, cgiter, count; int *ff, *dp, *mp, nloop; float o[SF_MAX_DIM], d[SF_MAX_DIM], *dt, *dw, *dv, *t, *w, *t0, *w1, *p=NULL; float eps, tol, thres, rhsnorm, rhsnorm0, rhsnorm1, rate, gama; char key[6]; sf_file in, out, reco, grad, mask, prec; sf_init(argc,argv); in = sf_input("in"); out = sf_output("out"); /* read dimension */ dimw = sf_filedims(in,n); nw = 1; for (i=0; i < dimw; i++) { sprintf(key,"d%d",i+1); if (!sf_histfloat(in,key,d+i)) sf_error("No %s= in input.",key); sprintf(key,"o%d",i+1); if (!sf_histfloat(in,key,o+i)) o[i]=0.; nw *= n[i]; } if (dimw > 2) sf_error("Only works for 2D now."); n[2] = n[1]; d[2] = d[1]; o[2] = o[1]; dimt = 3; nr = n[1]*n[2]; nt = nw*n[2]; /* read initial velocity */ w = sf_floatalloc(nw); sf_floatread(w,nw,in); if (!sf_getbool("velocity",&velocity)) velocity=true; /* if y, the input is velocity; n, slowness-squared */ /* convert to slowness-squared */ if (velocity) { for (iw=0; iw < nw; iw++) w[iw] = 1./w[iw]*1./w[iw]; dv = sf_floatalloc(nw); } else { dv = NULL; } if (!sf_getbool("limit",&limit)) limit=false; /* if y, limit computation within receiver coverage */ if (!sf_getbool("shape",&shape)) shape=false; /* shaping regularization (default no) */ /* read record */ if (NULL == sf_getstring("reco")) sf_error("Need record reco="); reco = sf_input("reco"); t0 = sf_floatalloc(nr); sf_floatread(t0,nr,reco); sf_fileclose(reco); /* read receiver mask */ if (NULL == sf_getstring("mask")) { mask = NULL; dp = NULL; } else { mask = sf_input("mask"); dp = sf_intalloc(nr); sf_intread(dp,nr,mask); sf_fileclose(mask); } /* read model mask */ if (NULL == sf_getstring("prec")) { prec = NULL; mp = NULL; } else { prec = sf_input("prec"); mp = sf_intalloc(nw); sf_intread(mp,nw,prec); sf_fileclose(prec); } if (!sf_getbool("verb",&verb)) verb=false; /* verbosity flag */ if 
(!sf_getint("niter",&niter)) niter=5; /* number of inversion iterations */ if (!sf_getint("cgiter",&cgiter)) cgiter=10; /* number of conjugate-gradient iterations */ if (!sf_getfloat("thres",&thres)) thres=5.e-5; /* threshold (percentage) */ if (!sf_getfloat("tol",&tol)) tol=1.e-3; /* tolerance for bisection root-search */ if (!sf_getint("nloop",&nloop)) nloop=10; /* number of bisection root-search */ /* output gradient at each iteration */ if (NULL != sf_getstring("grad")) { grad = sf_output("grad"); sf_putint(grad,"n3",niter); } else { grad = NULL; } if (!sf_getfloat("eps",&eps)) eps=0.; /* regularization parameter */ if (shape) { for (i=0; i < dimw; i++) { sprintf(key,"rect%d",i+1); if (!sf_getint(key,rect+i)) rect[i]=1; /*( rect#=(1,1,...) smoothing radius on #-th axis )*/ } /* triangle smoothing operator */ sf_trianglen_init(dimw,rect,n); sf_repeat_init(nw,1,sf_trianglen_lop); sf_conjgrad_init(nw,nw,nr,nr,eps,1.e-6,verb,false); p = sf_floatalloc(nw); } else { /* initialize 2D gradient operator */ sf_igrad2_init(n[0],n[1]); } /* allocate temporary array */ t = sf_floatalloc(nt); dw = sf_floatalloc(nw); dt = sf_floatalloc(nr); w1 = sf_floatalloc(nw); ff = sf_intalloc(nt); if (!sf_getbool("causal",&causal)) causal=true; /* if y, neglect non-causal branches of DSR */ /* initialize eikonal */ dsreiko_init(n,o,d, thres,tol,nloop, causal,limit,dp); /* initialize operator */ dsrtomo_init(dimt,n,d); /* upwind order */ order = dsrtomo_order(); /* initial misfit */ dsreiko_fastmarch(t,w,ff,order); dsreiko_mirror(t); /* calculate L2 data-misfit */ for (ir=0; ir < nr; ir++) { if (dp == NULL || dp[ir] == 1) { dt[ir] = t0[ir]-t[(long) ir*n[0]]; } else { dt[ir] = 0.; } } rhsnorm0 = cblas_snrm2(nr,dt,1); rhsnorm = rhsnorm0; rhsnorm1 = rhsnorm; rate = rhsnorm1/rhsnorm0; sf_warning("L2 misfit after iteration 0 of %d: %g",niter,rate); /* iterations over inversion */ for (iter=0; iter < niter; iter++) { /* clean-up */ for (iw=0; iw < nw; iw++) dw[iw] = 0.; /* set operator */ 
dsrtomo_set(t,w,ff,dp,mp); /* solve dw */ if (shape) { sf_conjgrad(NULL,dsrtomo_oper,sf_repeat_lop,p,dw,dt,cgiter); } else { sf_solver_reg(dsrtomo_oper,sf_cgstep,sf_igrad2_lop,2*nw,nw,nr,dw,dt,cgiter,eps,"verb",verb,"end"); sf_cgstep_close(); } /* output gradient */ if (grad != NULL) { if (velocity) { for (iw=0; iw < nw; iw++) { dv[iw] = -dw[iw]/(2.*sqrtf(w[iw])*(w[iw]+dw[iw]/2.)); } sf_floatwrite(dv,nw,grad); } else { sf_floatwrite(dw,nw,grad); } } /* line search */ gama = 0.5; for (count=0; count < 5; count++) { /* update slowness */ for (iw=0; iw < nw; iw++) w1[iw] = (w[iw]+gama*dw[iw])*(w[iw]+gama*dw[iw])/w[iw]; /* compute new misfit */ dsreiko_fastmarch(t,w1,ff,order); dsreiko_mirror(t); for (ir=0; ir < nr; ir++) { if (dp == NULL || dp[ir] == 1) { dt[ir] = t0[ir]-t[(long) ir*n[0]]; } else { dt[ir] = 0.; } } rhsnorm = cblas_snrm2(nr,dt,1); rate = rhsnorm/rhsnorm1; if (rate < 1.) { for (iw=0; iw < nw; iw++) w[iw] = w1[iw]; rhsnorm1 = rhsnorm; rate = rhsnorm1/rhsnorm0; break; } gama *= 0.5; } if (count == 5) { sf_warning("Line-search failure at iteration %d of %d.",iter+1,niter); break; } sf_warning("L2 misfit after iteration %d of %d: %g (line-search %d)",iter+1,niter,rate,count); } /* convert to velocity */ if (velocity) { for (iw=0; iw < nw; iw++) { w[iw] = 1./sqrtf(w[iw]); } } sf_floatwrite(w,nw,out); exit(0); }