/* -------------------------------------------------------------------
 * Check the orthogonality of Q
 */
static magma_int_t check_orthogonality(magma_int_t M, magma_int_t N, float *Q, magma_int_t LDQ, float eps)
{
    float  done   =  1.0;
    float  mdone  = -1.0;
    float  c_zero = MAGMA_S_ZERO;
    float  c_one  = MAGMA_S_ONE;
    float  normQ, result;
    magma_int_t info_ortho;
    magma_int_t minMN = min(M, N);
    float *work = (float *)malloc(minMN*sizeof(float));

    /* Build the identity matrix */
    float *Id = (float *) malloc(minMN*minMN*sizeof(float));
    lapackf77_slaset("A", &minMN, &minMN, &c_zero, &c_one, Id, &minMN);

    /* Perform Id - Q'Q */
    if (M >= N)
        blasf77_ssyrk("U", "C", &N, &M, &done, Q, &LDQ, &mdone, Id, &N);
    else
        blasf77_ssyrk("U", "N", &M, &N, &done, Q, &LDQ, &mdone, Id, &M);

    normQ = lapackf77_slansy("I", "U", &minMN, Id, &minMN, work);

    result = normQ / (minMN * eps);
    printf(" ======================================================\n");
    printf(" ||Id-Q'*Q||_oo / (minMN*eps)          : %15.3E \n", result );
    printf(" ======================================================\n");

    if ( isnan(result) || isinf(result) || (result > 60.0) ) {
        printf("-- Orthogonality is suspicious ! \n");
        info_ortho = 1;
    }
    else {
        printf("-- Orthogonality is CORRECT ! \n");
        info_ortho = 0;
    }
    free(work);
    free(Id);
    return info_ortho;
}
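/*  Usage sketch (illustrative, not part of the original tester): the helper
    above is typically driven with the LAPACK machine epsilon, and flags the
    factorization when the scaled residual exceeds the usual O(10) acceptance
    threshold (60.0 here). Q and LDQ stand for whatever orthonormal factor the
    caller has formed; both names are the caller's.

        float eps = lapackf77_slamch("Epsilon");
        magma_int_t fail = check_orthogonality( M, N, Q, LDQ, eps );
        // fail == 0 means ||Id - Q'Q||_oo / (min(M,N)*eps) was acceptable
*/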
extern "C" magma_int_t magma_sorgqr(magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. DT (input) REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. NB (input) INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_slaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_sorgqr */
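/*  Usage sketch (hypothetical sequence, error handling omitted): magma_sorgqr
    is meant to consume the output of magma_sgeqrf_gpu -- the Householder
    vectors in dA, the scalars in tau, and the T factors in dT. The names
    dA/A/tau/dT and the sizing of dT follow magma_sgeqrf_gpu's documentation
    and are the caller's responsibility; the one-argument form of
    magma_get_sgeqrf_nb is the MAGMA 1.4-era API assumed here.

        magma_int_t info, nb = magma_get_sgeqrf_nb( m );
        magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );   // A = Q*R on the GPU
        magma_sgetmatrix( m, n, dA, ldda, A, lda );           // V's back to the CPU
        magma_sorgqr( m, n, k, A, lda, tau, dT, nb, &info );  // form Q in A
*/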
/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by SGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    T       REAL array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and correspondingly
            the size of the T matrices stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t d, i, ib, j, jb, ki, kk;
    float *work=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };
    float *dA[ MagmaMaxGPUs ] = { NULL };
    float *dT[ MagmaMaxGPUs ] = { NULL };
    float *dV[ MagmaMaxGPUs ] = { NULL };
    float *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };

    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }

        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;

        magma_queue_create( d, &queues[d] );
    }

    trace_init( 1, ngpu, 1, queues );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    float *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;

        // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );

        lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk,
                          A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one,
                          A(kk, kk), &lda );

        lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );

        if (kk > 0) {
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;
                di = ((j / nb) / ngpu) * nb;
                magma_setdevice( d );
                magma_ssetmatrix( m_kk, jb,
                                  A(kk, j),      lda,
                                  dA(d, kk, di), ldda, queues[d] );

                // Set A(1:kk,kk+1:n) to zero.
magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] ); } } trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] ); trace_gpu_end( d, 0 ); } // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to dV on the GPUs lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, queues[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda, queues[dpanel] ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork, queues[d] ); trace_gpu_end( d, 0 ); } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues ); trace_cpu_end( 0 ); } #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif cleanup: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( queues[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); return *info; } /* magma_sorgqr */
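/*  Usage sketch (hypothetical): unlike magma_sorgqr above, the multi-GPU
    variant takes A, tau and T on the host. Assuming dT holds the T factors
    on a GPU in the contiguous (nb x min(m,n)) layout described in the
    documentation above, one plausible sequence is:

        float *T;
        magma_smalloc_cpu( &T, nb*min(m,n) );
        magma_sgetmatrix( nb, min(m,n), dT, nb, T, nb );       // T factors to host
        magma_sorgqr_m( m, n, k, A, lda, tau, T, nb, &info );  // form Q across GPUs
        magma_free_cpu( T );
*/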
/** Purpose ------- SSTEDX computes some eigenvalues and, optionally, eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. See SLAEX3 for details. Arguments --------- @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] n INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[in,out] d REAL array, dimension (N) On entry, the diagonal elements of the tridiagonal matrix. On exit, if INFO = 0, the eigenvalues in ascending order. @param[in,out] e REAL array, dimension (N-1) On entry, the subdiagonal elements of the tridiagonal matrix. On exit, E has been destroyed. @param[in,out] Z REAL array, dimension (LDZ,N) On exit, if INFO = 0, Z contains the orthonormal eigenvectors of the symmetric tridiagonal matrix. @param[in] ldz INTEGER The leading dimension of the array Z. LDZ >= max(1,N). @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If N > 1 then LWORK >= ( 1 + 4*N + N**2 ). Note that if N is less than or equal to the minimum divide size, usually 25, then LWORK need only be max(1,2*(N-1)). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. LIWORK >= ( 3 + 5*N ). Note that if N is less than or equal to the minimum divide size, usually 25, then LIWORK need only be 1. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the IWORK array, returns this value as the first entry of the IWORK array, and no error message related to LIWORK is issued by XERBLA. @param dwork (workspace) REAL array, dimension (3*N*N/2+3*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. 
    @ingroup magma_ssyev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sstedx(
    magma_range_t range, magma_int_t n, float vl, float vu,
    magma_int_t il, magma_int_t iu, float *d, float *e,
    float *Z, magma_int_t ldz,
    float *work, magma_int_t lwork,
    magma_int_t *iwork, magma_int_t liwork,
    magmaFloat_ptr dwork,
    magma_int_t *info)
{
#define Z(i_,j_) (Z + (i_) + (j_)*ldz)

    float d_zero = 0.;
    float d_one  = 1.;
    magma_int_t izero = 0;
    magma_int_t ione = 1;

    magma_int_t alleig, indeig, valeig, lquery;
    magma_int_t i, j, k, m;
    magma_int_t liwmin, lwmin;
    magma_int_t start, end, smlsiz;
    float eps, orgnrm, p, tiny;

    // Test the input parameters.
    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);
    lquery = (lwork == -1 || liwork == -1);

    *info = 0;
    if (! (alleig || valeig || indeig)) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldz < max(1,n)) {
        *info = -10;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -4;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -5;
            } else if (iu < min(n,il) || iu > n) {
                *info = -6;
            }
        }
    }

    if (*info == 0) {
        // Compute the workspace requirements
        smlsiz = magma_get_smlsize_divideconquer();
        if ( n <= 1 ) {
            lwmin  = 1;
            liwmin = 1;
        } else {
            lwmin  = 1 + 4*n + n*n;
            liwmin = 3 + 5*n;
        }

        work[0]  = magma_smake_lwork( lwmin );
        iwork[0] = liwmin;

        if (lwork < lwmin && ! lquery) {
            *info = -12;
        } else if (liwork < liwmin && ! lquery) {
            *info = -14;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    } else if (lquery) {
        return *info;
    }

    // Quick return if possible
    if (n == 0)
        return *info;
    if (n == 1) {
        *Z = 1.;
        return *info;
    }

    /* determine the number of threads */  // not needed here; to be checked (Azzam)
    //magma_int_t threads = magma_get_parallel_numthreads();
    //magma_int_t mklth   = magma_get_lapack_numthreads();
    //magma_set_lapack_numthreads(mklth);

#ifdef ENABLE_DEBUG
    //printf("  D&C is using %d threads\n", threads);
#endif

    // If N is smaller than the minimum divide size (SMLSIZ+1), then
    // solve the problem with another solver.
    if (n < smlsiz) {
        lapackf77_ssteqr("I", &n, d, e, Z, &ldz, work, info);
    } else {
        lapackf77_slaset("F", &n, &n, &d_zero, &d_one, Z, &ldz);

        // Scale.
        orgnrm = lapackf77_slanst("M", &n, d, e);

        if (orgnrm == 0) {
            work[0]  = magma_smake_lwork( lwmin );
            iwork[0] = liwmin;
            return *info;
        }

        eps = lapackf77_slamch( "Epsilon" );

        if (alleig) {
            start = 0;
            while ( start < n ) {
                // Let END be the position of the next subdiagonal entry
                // such that E( END ) <= TINY, or END = N if no such
                // subdiagonal exists. The matrix identified by the elements
                // between START and END constitutes an independent
                // sub-problem.
                for (end = start+1; end < n; ++end) {
                    tiny = eps * sqrt( MAGMA_S_ABS(d[end-1]*d[end]));
                    if (MAGMA_S_ABS(e[end-1]) <= tiny)
                        break;
                }

                // (Sub) Problem determined. Compute its size and solve it.
m = end - start; if (m == 1) { start = end; continue; } if (m > smlsiz) { // Scale orgnrm = lapackf77_slanst("M", &m, &d[start], &e[start]); lapackf77_slascl("G", &izero, &izero, &orgnrm, &d_one, &m, &ione, &d[start], &m, info); magma_int_t mm = m-1; lapackf77_slascl("G", &izero, &izero, &orgnrm, &d_one, &mm, &ione, &e[start], &mm, info); magma_slaex0( m, &d[start], &e[start], Z(start, start), ldz, work, iwork, dwork, MagmaRangeAll, vl, vu, il, iu, info); if ( *info != 0) { return *info; } // Scale Back lapackf77_slascl("G", &izero, &izero, &d_one, &orgnrm, &m, &ione, &d[start], &m, info); } else { lapackf77_ssteqr( "I", &m, &d[start], &e[start], Z(start, start), &ldz, work, info); if (*info != 0) { *info = (start+1) *(n+1) + end; } } start = end; } // If the problem split any number of times, then the eigenvalues // will not be properly ordered. Here we permute the eigenvalues // (and the associated eigenvectors) into ascending order. if (m < n) { // Use Selection Sort to minimize swaps of eigenvectors for (i = 1; i < n; ++i) { k = i-1; p = d[i-1]; for (j = i; j < n; ++j) { if (d[j] < p) { k = j; p = d[j]; } } if (k != i-1) { d[k] = d[i-1]; d[i-1] = p; blasf77_sswap(&n, Z(0,i-1), &ione, Z(0,k), &ione); } } } } else { // Scale lapackf77_slascl("G", &izero, &izero, &orgnrm, &d_one, &n, &ione, d, &n, info); magma_int_t nm = n-1; lapackf77_slascl("G", &izero, &izero, &orgnrm, &d_one, &nm, &ione, e, &nm, info); magma_slaex0( n, d, e, Z, ldz, work, iwork, dwork, range, vl, vu, il, iu, info); if ( *info != 0) { return *info; } // Scale Back lapackf77_slascl("G", &izero, &izero, &d_one, &orgnrm, &n, &ione, d, &n, info); } } work[0] = magma_smake_lwork( lwmin ); iwork[0] = liwmin; return *info; } /* magma_sstedx */
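/*  Workspace-query sketch (illustrative): as documented above, passing
    lwork = -1 or liwork = -1 turns the call into a size query; the optimal
    sizes come back in work[0] and iwork[0] and no computation is performed.

        float lwork_query;
        magma_int_t liwork_query, info;
        magma_sstedx( MagmaRangeAll, n, 0, 0, 0, 0, d, e, Z, ldz,
                      &lwork_query, -1, &liwork_query, -1, dwork, &info );
        magma_int_t lwork  = (magma_int_t) lwork_query;
        magma_int_t liwork = liwork_query;
*/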
/* //////////////////////////////////////////////////////////////////////////// -- Testing slaset Code is very similar to testing_slacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; magmaFloat_ptr d_A; float offdiag, diag; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("%% uplo M N offdiag diag CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("%%===================================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { for( int ival = 0; ival < 4; ++ival ) { // test combinations of zero & non-zero: // ival offdiag diag // 0 0 0 // 1 0 3.14 // 2 1.23 0 // 3 1.23 3.14 offdiag = MAGMA_S_MAKE( 1.2345, 6.7890 ) * (ival / 2); diag = MAGMA_S_MAKE( 3.1415, 2.7183 ) * (ival % 2); M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = magma_roundup( M, opts.align ); size = lda*N; if ( uplo[iuplo] == MagmaLower ) { // save lower trapezoid (with diagonal) if ( M > N ) { gbytes = sizeof(float) * (1.*M*N - 0.5*N*(N-1)) / 1e9; } else { gbytes = sizeof(float) * 0.5*M*(M+1) / 1e9; } } else if ( uplo[iuplo] == MagmaUpper ) { // save upper trapezoid (with diagonal) if ( N > M ) { gbytes = sizeof(float) * (1.*M*N - 0.5*M*(M-1)) / 1e9; } else { gbytes = sizeof(float) * 0.5*N*(N+1) / 1e9; } } else { // save entire matrix gbytes = sizeof(float) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); //magmablas_slaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda, opts.queue ); // inset by 1 row & col magmablas_slaset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_sprint( M, N, h_A, lda ); printf( "dA=" ); magma_sprint_gpu( M, N, d_A, ldda ); } /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, 
&N, h_R, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5s %5d %5d %9.4f %6.4f %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, real(offdiag), real(diag), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K. It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required. It computes only that required part of the eigenvectors;
    the rest is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4. K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q. LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem. These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.
            TODO what is LDQ2?

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX. The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (workspace) REAL array, dimension (3*N*N/2+3*N)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from slaex1.
@param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* Q, magma_int_t ldq, float rho, float* dlamda, float* Q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). 
We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, Q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. timer_start( time ); magma_queue_sync( NULL ); if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); return *info; } /* magma_slaex3 */
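/*  Background note (standard divide-and-conquer theory, not MAGMA-specific):
    slaed4, called once per column j above, finds the j-th root lambda_j of
    the secular equation

        f(lambda) = 1 + rho * sum_{i=1..k} w(i)^2 / (dlamda(i) - lambda) = 0,

    and the eigenvector of the rank-one-modified system has components
    proportional to w(i) / (dlamda(i) - lambda_j). The s / ||s||_2
    normalization performed in the loops above is exactly that construction.
*/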
magma_int_t magma_strevc3( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in fortran magma_int_t n, float *T, magma_int_t ldt, float *VL, magma_int_t ldvl, float *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define T(i,j) (T + (i) + (j)*ldt) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define X(i,j) (X + (i)-1 + ((j)-1)*2) // still as 1-based indices #define work(i,j) (work + (i) + (j)*n) // constants const magma_int_t ione = 1; const float c_zero = 0; const float c_one = 1; const magma_int_t nbmin = 16, nbmax = 256; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, pair, rightv, somev; magma_int_t i, ierr, ii, ip, is, j, k, ki, ki2, iv, n2, nb, nb2, version; float emax, remax; // .. Local Arrays .. // since iv is a 1-based index, allocate one extra here magma_int_t iscomplex[ nbmax+1 ]; // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( lwork < max( 1, 3*n ) ) *info = -14; else { // Set mout to the number of columns required to store the selected // eigenvectors, standardize the array select if necessary, and // test mm. if ( somev ) { *mout = 0; pair = false; for( j=0; j < n; ++j ) { if ( pair ) { pair = false; select[j] = false; } else { if ( j < n-1 ) { if ( *T(j+1,j) == c_zero ) { if ( select[j] ) { *mout += 1; } } else { pair = true; if ( select[j] || select[j+1] ) { select[j] = true; *mout += 2; } } } else if ( select[n-1] ) { *mout += 1; } } } } else { *mout = n; } if ( mm < *mout ) { *info = -11; } } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector for 1-norms, and 2*nb vectors for x and Q*x. // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_slaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. *work(0,0) = c_zero; for( j=1; j < n; ++j ) { *work(j,0) = c_zero; for( i=0; i < j; ++i ) { *work(j,0) += fabsf( *T(i,j) ); } } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); // Index ip is used to specify the real or complex eigenvalue: // ip = 0, real eigenvalue (wr), // = 1, first of conjugate complex pair: (wr,wi) // = -1, second of conjugate complex pair: (wr,wi) // iscomplex array stores ip for each column in current block. if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block (1-based). // For complex right vector, uses iv-1 for real part and iv for complex part. 
// Non-blocked version always uses iv=2; // blocked version starts with iv=nb, goes down to 1 or 2. // (Note the "0-th" column is used for 1-norms computed above.) iv = 2; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); ip = 0; is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( ip == -1 ) { // previous iteration (ki+1) was second of conjugate pair, // so this ki is first of conjugate pair; skip to end of loop ip = 1; continue; } else if ( ki == 0 ) { // last column, so this ki must be real eigenvalue ip = 0; } else if ( *T(ki,ki-1) == c_zero ) { // zero on sub-diagonal, so this ki is real eigenvalue ip = 0; } else { // non-zero on sub-diagonal, so this ki is second of conjugate pair ip = -1; } if ( somev ) { if ( ip == 0 ) { if ( ! select[ki] ) { continue; } } else { if ( ! select[ki-1] ) { continue; } } } if ( ip == 0 ) { // ------------------------------------------------------------ // Real right eigenvector // Solve upper quasi-triangular system: // [ T(0:ki-1,0:ki-1) - wr ]*X = -T(0:ki-1,ki) magma_slaqtrsd( MagmaNoTrans, ki+1, T(0,0), ldt, work(0,iv), n, work(0,0), &ierr ); // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize. n2 = ki+1; blasf77_scopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_isamax( &n2, VR(0,is), &ione ) - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *VR(ii,is) ); blasf77_sscal( &n2, &remax, VR(0,is), &ione ); for( k=ki + 1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { n2 = ki; blasf77_sgemv( "n", &n, &n2, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_isamax( &n, VR(0,ki), &ione ) - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *VR(ii,ki) ); blasf77_sscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki + 1; k < n; ++k ) { *work(k,iv) = c_zero; } iscomplex[ iv ] = ip; // back-transform and normalization is done below } } // end real eigenvector else { // ------------------------------------------------------------ // Complex right eigenvector // Solve upper quasi-triangular system: // [ T(0:ki-2,0:ki-2) - (wr+i*wi) ]*x = u magma_slaqtrsd( MagmaNoTrans, ki+1, T(0,0), ldt, work(0,iv-1), n, work(0,0), &ierr ); // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize. n2 = ki+1; blasf77_scopy( &n2, work(0,iv-1), &ione, VR(0,is-1), &ione ); blasf77_scopy( &n2, work(0,iv ), &ione, VR(0,is ), &ione ); emax = c_zero; for( k=0; k <= ki; ++k ) { emax = max( emax, fabsf(*VR(k,is-1)) + fabsf(*VR(k,is)) ); } remax = c_one / emax; blasf77_sscal( &n2, &remax, VR(0,is-1), &ione ); blasf77_sscal( &n2, &remax, VR(0,is ), &ione ); for( k=ki + 1; k < n; ++k ) { *VR(k,is-1) = c_zero; *VR(k,is ) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. 
time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 1 ) { n2 = ki-1; blasf77_sgemv( "n", &n, &n2, &c_one, VR, &ldvr, work(0, iv-1), &ione, work(ki-1,iv-1), VR(0,ki-1), &ione ); blasf77_sgemv( "n", &n, &n2, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } else { blasf77_sscal( &n, work(ki-1,iv-1), VR(0,ki-1), &ione ); blasf77_sscal( &n, work(ki, iv ), VR(0,ki ), &ione ); } time_gemv_sum += timer_stop( time_gemv ); emax = c_zero; for( k=0; k < n; ++k ) { emax = max( emax, fabsf(*VR(k,ki-1)) + fabsf(*VR(k,ki)) ); } remax = c_one / emax; blasf77_sscal( &n, &remax, VR(0,ki-1), &ione ); blasf77_sscal( &n, &remax, VR(0,ki ), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki + 1; k < n; ++k ) { *work(k,iv-1) = c_zero; *work(k,iv ) = c_zero; } iscomplex[ iv-1 ] = -ip; iscomplex[ iv ] = ip; iv -= 1; // back-transform and normalization is done below } } // end real or complex vector if ( version == 2 ) { // ------------------------------------------------------------ // Blocked version of back-transform // For complex case, ki2 includes both vectors (ki-1 and ki) if ( ip == 0 ) { ki2 = ki; } else { ki2 = ki - 1; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb-1 or nb, // or if this was last vector, do the GEMM if ( (iv <= 2) || (ki2 == 0) ) { time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki2+nb-iv+1; blasf77_sgemm( "n", "n", &n, &nb2, &n2, &c_one, VR, &ldvr, work(0,iv), &n, &c_zero, work(0,nb+iv), &n ); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k=iv; k <= nb; ++k ) { if ( iscomplex[k] == 0 ) { // real eigenvector ii = blasf77_isamax( &n, work(0,nb+k), &ione ) - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *work(ii,nb+k) ); } else if ( iscomplex[k] == 1 ) { // first eigenvector of conjugate pair emax = c_zero; for( ii=0; ii < n; ++ii ) { emax = max( emax, fabsf( *work(ii,nb+k ) ) + fabsf( *work(ii,nb+k+1) ) ); } remax = c_one / emax; // else if iscomplex[k] == -1 // second eigenvector of conjugate pair // reuse same remax as previous k } blasf77_sscal( &n, &remax, work(0,nb+k), &ione ); } nb2 = nb-iv+1; lapackf77_slacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki2), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // end blocked back-transform is -= 1; if ( ip != 0 ) { is -= 1; } } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block (1-based). // For complex left vector, uses iv for real part and iv+1 for complex part. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb-1 or nb. // (Note the "0-th" column is used for 1-norms computed above.) 
iv = 1; ip = 0; is = 0; for( ki=0; ki < n; ++ki ) { if ( ip == 1 ) { // previous iteration (ki-1) was first of conjugate pair, // so this ki is second of conjugate pair; skip to end of loop ip = -1; continue; } else if ( ki == n-1 ) { // last column, so this ki must be real eigenvalue ip = 0; } else if ( *T(ki+1,ki) == c_zero ) { // zero on sub-diagonal, so this ki is real eigenvalue ip = 0; } else { // non-zero on sub-diagonal, so this ki is first of conjugate pair ip = 1; } if ( somev ) { if ( ! select[ki] ) { continue; } } if ( ip == 0 ) { // ------------------------------------------------------------ // Real left eigenvector // Solve transposed quasi-triangular system: // [ T(ki+1:n,ki+1:n) - wr ]**T * X = -T(ki+1:n,ki) magma_slaqtrsd( MagmaTrans, n-ki, T(ki,ki), ldt, work(ki,iv), n, work(ki,0), &ierr ); // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize. n2 = n-ki; blasf77_scopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_isamax( &n2, VL(ki,is), &ione ) + ki - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *VL(ii,is) ); blasf77_sscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. if ( ki < n-1 ) { n2 = n-ki-1; blasf77_sgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_isamax( &n, VL(0,ki), &ione ) - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *VL(ii,ki) ); blasf77_sscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } iscomplex[ iv ] = ip; // back-transform and normalization is done below } } // end real eigenvector else { // ------------------------------------------------------------ // Complex left eigenvector // Solve transposed quasi-triangular system: // [ T(ki+2:n,ki+2:n)**T - (wr-i*wi) ]*X = V magma_slaqtrsd( MagmaTrans, n-ki, T(ki,ki), ldt, work(ki,iv), n, work(ki,0), &ierr ); // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize. n2 = n-ki; blasf77_scopy( &n2, work(ki,iv ), &ione, VL(ki,is ), &ione ); blasf77_scopy( &n2, work(ki,iv+1), &ione, VL(ki,is+1), &ione ); emax = c_zero; for( k=ki; k < n; ++k ) { emax = max( emax, fabsf(*VL(k,is))+ fabsf(*VL(k,is+1)) ); } remax = c_one / emax; blasf77_sscal( &n2, &remax, VL(ki,is ), &ione ); blasf77_sscal( &n2, &remax, VL(ki,is+1), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is ) = c_zero; *VL(k,is+1) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. 
if ( ki < n-2 ) { n2 = n-ki-2; blasf77_sgemv( "n", &n, &n2, &c_one, VL(0,ki+2), &ldvl, work(ki+2,iv), &ione, work(ki, iv), VL(0,ki), &ione ); blasf77_sgemv( "n", &n, &n2, &c_one, VL(0,ki+2), &ldvl, work(ki+2,iv+1), &ione, work(ki+1,iv+1), VL(0,ki+1), &ione ); } else { blasf77_sscal( &n, work(ki, iv ), VL(0, ki ), &ione ); blasf77_sscal( &n, work(ki+1,iv+1), VL(0, ki+1), &ione ); } emax = c_zero; for( k=0; k < n; ++k ) { emax = max( emax, fabsf(*VL(k,ki))+ fabsf(*VL(k,ki+1)) ); } remax = c_one / emax; blasf77_sscal( &n, &remax, VL(0,ki ), &ione ); blasf77_sscal( &n, &remax, VL(0,ki+1), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv ) = c_zero; *work(k,iv+1) = c_zero; } iscomplex[ iv ] = ip; iscomplex[ iv+1 ] = -ip; iv += 1; // back-transform and normalization is done below } } // end real or complex eigenvector if ( version == 2 ) { // ------------------------------------------------- // Blocked version of back-transform // For complex case, (ki2+1) includes both vectors (ki+1) and (ki+2) if ( ip == 0 ) { ki2 = ki; } else { ki2 = ki + 1; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb-1 or nb, // or if this was last vector, do the GEMM if ( (iv >= nb-1) || (ki2 == n-1) ) { n2 = n-(ki2+1)+iv; blasf77_sgemm( "n", "n", &n, &iv, &n2, &c_one, VL(0,ki2-iv+1), &ldvl, work(ki2-iv+1,1), &n, &c_zero, work(0,nb+1), &n ); // normalize vectors for( k=1; k <= iv; ++k ) { if ( iscomplex[k] == 0 ) { // real eigenvector ii = blasf77_isamax( &n, work(0,nb+k), &ione ) - 1; // subtract 1; ii is 0-based remax = c_one / fabsf( *work(ii,nb+k) ); } else if ( iscomplex[k] == 1) { // first eigenvector of conjugate pair emax = c_zero; for( ii=0; ii < n; ++ii ) { emax = max( emax, fabsf( *work(ii,nb+k ) ) + fabsf( *work(ii,nb+k+1) ) ); } remax = c_one / emax; // else if iscomplex[k] == -1 // second eigenvector of conjugate pair // reuse same remax as previous k } blasf77_sscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_slacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki2-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform is += 1; if ( ip != 0 ) { is += 1; } } } return *info; } // end of STREVC3
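/*  Usage sketch (hypothetical, arguments assumed prepared by the caller):
    given a real Schur factorization A = Z*T*Z**T (e.g. from LAPACK sgees or
    sgehrd+shseqr), howmany = MagmaBacktransVec back-transforms in place, so
    VR enters holding Z and exits holding the right eigenvectors of A.
    select may be NULL since it is referenced only for MagmaSomeVec; work and
    lwork must satisfy the lwork >= max(1,3*n) check above (larger enables
    the blocked path).

        magma_int_t mout, info;
        magma_strevc3( MagmaRight, MagmaBacktransVec, NULL, n,
                       T, ldt, NULL, 1, VR, ldvr,
                       n, &mout, work, lwork, &info );
*/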
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    const float d_neg_one = MAGMA_D_NEG_ONE;
    const float d_one     = MAGMA_D_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t ione = 1;

    real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float Anorm, error=0, error2=0;
    float *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloat_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    opts.parse_opts( argc, argv );

    magma_int_t status = 0;
    float tol = opts.tolerance * lapackf77_slamch("E");

    // version 3 can do either check
    if (opts.check == 1 && opts.version == 1) {
        opts.check = 2;
        printf( "%% version 1 requires check 2 (solve A*x=b)\n" );
    }
    if (opts.check == 2 && opts.version == 2) {
        opts.check = 1;
        printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );
    }
    printf( "%% version %d\n", (int) opts.version );

    if ( opts.check == 1 ) {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");
        printf("%%==============================================================================\n");
    }
    else {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");
        printf("%%===============================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min( M, N );
            lda    = M;
            n2     = lda*N;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            nb     = magma_get_sgeqrf_nb( M, N );
            gflops = FLOPS_SGEQRF( M, N ) / 1e9;

            // query for workspace size
            lwork = -1;
            lapackf77_sgeqrf( &M, &N, NULL, &M, NULL, tmp, &lwork, &info );
            lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );

            TESTING_MALLOC_CPU( tau,    float, min_mn );
            TESTING_MALLOC_CPU( h_A,    float, n2     );
            TESTING_MALLOC_CPU( h_work, float, lwork  );

            TESTING_MALLOC_PIN( h_R,    float, n2     );

            TESTING_MALLOC_DEV( d_A,    float, ldda*N );

            if ( opts.version == 1 || opts.version == 3 ) {
                size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;
                TESTING_MALLOC_DEV( dT, float, size );
                magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size );
            }

            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &n2, h_A );
            lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_ssetmatrix( M, N, h_R, lda, d_A, ldda );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            nb = magma_get_sgeqrf_nb( M, N );

            gpu_time = magma_wtime();
            if ( opts.version == 1 ) {
                // stores dT, V blocks have zeros, R blocks inverted & stored in dT
                magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );
            }
            else if ( opts.version == 2 ) {
                // LAPACK compliant arguments
                magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );
            }
            #ifdef HAVE_CUBLAS
            else if ( opts.version == 3 ) {
                // stores dT, V blocks have zeros, R blocks stored in dT
                magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );
            }
            #endif
            else {
                printf( "Unknown version %d\n", (int) opts.version );
                return -1;
            }
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_sgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }

            if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {
                if ( opts.version == 3 ) {
                    // copy diagonal blocks of R back to A
i=0; i < min_mn-nb; i += nb ) { magma_int_t ib = min( min_mn-i, nb ); magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda ); } } /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. Or for version 3, after restoring diagonal blocks of A above. =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; float *Q, *R; float *work; TESTING_MALLOC_CPU( Q, float, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, float, ldr*N ); // K by N TESTING_MALLOC_CPU( work, float, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_slange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_slange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3) ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. 
=================================================================== */ magma_int_t lwork2; float *x, *b, *hwork; magmaFloat_ptr d_B; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, float, N ); TESTING_MALLOC_CPU( b, float, M ); lapackf77_slarnv( &ione, ISEED, &N, x ); blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, float, M ); magma_ssetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); return -1; } magma_sgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (max(m,n)*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (max(M,N) * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check == 1 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( opts.check == 2 ) { if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version == 1 || opts.version == 3 ) { TESTING_FREE_DEV( dT ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
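/*
   For reference, the minimal calling sequence exercised by the tester
   above, stripped of timing and error checks -- a hedged sketch assuming
   the MAGMA 1.x interfaces used in this file (magma_sgeqrf2_gpu is the
   LAPACK-compliant "version 2" path); all error handling is omitted.
*/
static magma_int_t sgeqrf_gpu_example( magma_int_t M, magma_int_t N )
{
    magma_int_t info, ione = 1, ISEED[4] = {0,0,0,1};
    magma_int_t lda    = M;
    magma_int_t ldda   = ((M + 31)/32)*32;   // GPU row stride, multiple of 32
    magma_int_t n2     = lda*N;
    magma_int_t min_mn = (M < N ? M : N);
    float *h_A, *tau;
    magmaFloat_ptr d_A;

    magma_smalloc_cpu( &h_A, n2 );
    magma_smalloc_cpu( &tau, min_mn );
    magma_smalloc( &d_A, ldda*N );

    lapackf77_slarnv( &ione, ISEED, &n2, h_A );        // random test matrix
    magma_ssetmatrix( M, N, h_A, lda, d_A, ldda );     // host -> device
    magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );  // QR factorization
    magma_sgetmatrix( M, N, d_A, ldda, h_A, lda );     // R above diagonal, V below

    magma_free( d_A );
    magma_free_cpu( tau );
    magma_free_cpu( h_A );
    return info;
}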
/***************************************************************************//** Purpose ------- SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= N*NB. where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] T REAL array, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices, for later use in magma_sorghr. 
@ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *T, magma_int_t *info) { #define A( i, j ) (A + (i) + (j)*lda) #define dA( dev, i, j ) (data.dA[dev] + (i) + (j)*ldda) float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; magma_int_t nb = magma_get_sgehrd_nb(n); magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, dev; magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork; magma_int_t iinfo; magma_int_t lquery; struct sgehrd_data data; magma_int_t ngpu = magma_num_gpus(); *info = 0; iws = n*(nb + nb*ngpu); work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < iws && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; // set T to zero lapackf77_slaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb ); // set to null, to simplify cleanup code for( dev = 0; dev < ngpu; ++dev ) { data.dA[dev] = NULL; data.queues[dev] = NULL; } // Now requires lwork >= iws; else dT won't be computed in unblocked code. // If not enough workspace, use unblocked code //if ( lwork < iws ) { // nb = 1; //} if (nb == 1 || nb >= nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // allocate memory on GPUs for A and workspaces ldda = magma_roundup( n, 32 ); min_lblocks = (n / nb) / ngpu; max_lblocks = ((n-1) / nb) / ngpu + 1; last_dev = (n / nb) % ngpu; // V and Vd need to be padded for copying in mslahr2 data.ngpu = ngpu; data.ldda = ldda; data.ldv = nb*max_lblocks*ngpu; data.ldvd = nb*max_lblocks; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); nlocal = min_lblocks*nb; if ( dev < last_dev ) { nlocal += nb; } else if ( dev == last_dev ) { nlocal += (n % nb); } ldwork = nlocal*ldda // A + nb*data.ldv // V + nb*data.ldvd // Vd + nb*ldda // Y + nb*ldda // W + nb*nb; // Ti if ( MAGMA_SUCCESS != magma_smalloc( &data.dA[dev], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } data.dV [dev] = data.dA [dev] + nlocal*ldda; data.dVd[dev] = data.dV [dev] + nb*data.ldv; data.dY [dev] = data.dVd[dev] + nb*data.ldvd; data.dW [dev] = data.dY [dev] + nb*ldda; data.dTi[dev] = data.dW [dev] + nb*ldda; magma_queue_create( dev, &data.queues[dev] ); } // Copy the matrix to GPUs magma_ssetmatrix_1D_col_bcyclic( ngpu, n, n, nb, A, lda, data.dA, ldda, data.queues ); // round ilo down to block boundary ilo = (ilo/nb)*nb; for (i = ilo; i < ihi - 1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; if ( i > ilo ) { magma_setdevice( dpanel ); magma_sgetmatrix( ihi-i, nb, dA(dpanel, i, di), ldda, A(i,i), lda, data.queues[dpanel] ); } 
// add 1 to i for 1-based index magma_slahr2_m( ihi, i+1, nb, A(0,i), lda, &tau[i], &T[i*nb], nb, work, n, &data ); magma_slahru_m( n, ihi, i, nb, A, lda, &data ); // copy first i rows above panel to host magma_setdevice( dpanel ); magma_sgetmatrix_async( i, nb, dA(dpanel, 0, di), ldda, A(0,i), lda, data.queues[dpanel] ); } // Copy remainder to host, block-by-block for( i2 = i; i2 < n; i2 += nb ) { ib = min( nb, n-i2 ); dev = (i2 / nb) % ngpu; di = (i2 / nb) / ngpu * nb; magma_setdevice( dev ); magma_sgetmatrix( n, ib, dA(dev, 0, di), ldda, A(0,i2), lda, data.queues[dev] ); } } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); CLEANUP: for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( data.dA[dev] ); magma_queue_destroy( data.queues[dev] ); } magma_setdevice( orig_dev ); return *info; } /* magma_sgehrd */
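/*
   The dpanel/di bookkeeping above maps a global panel column to its
   owning GPU and local column offset in the 1-D block-cyclic layout
   created by magma_ssetmatrix_1D_col_bcyclic. A hedged standalone sketch
   of that index arithmetic (illustration only, not a MAGMA routine):
*/
#include <stdio.h>

static void bcyclic_owner( int j, int nb, int ngpu, int *dev, int *dj )
{
    int blk = j / nb;                     // global block index
    *dev = blk % ngpu;                    // blocks are dealt round-robin
    *dj  = (blk / ngpu)*nb + (j % nb);    // local column on that device
}

int main( void )
{
    int dev, dj;
    for (int j = 0; j < 8; ++j) {
        bcyclic_owner( j, 2, 3, &dev, &dj );   // nb = 2, ngpu = 3
        printf( "global col %2d -> gpu %d, local col %d\n", j, dev, dj );
    }
    return 0;
}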
extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* q, magma_int_t ldq, float rho, float* dlamda, float* q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { /* Purpose ======= SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= K (input) INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. N (input) INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N>K). N1 (input) INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. D (output) REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. Q (output) REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. LDQ (input) INTEGER The leading dimension of the array Q. LDQ >= max(1,N). RHO (input) REAL The value of the parameter in the rank one update equation. RHO >= 0 required. DLAMDA (input/output) REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. Q2 (input) REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. INDX (input) INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. CTOT (input) INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. W (input/output) REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. S (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. INDXQ (output) INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) INFO (output) INTEGER = 0: successful exit. 
< 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = 1, an eigenvalue did not converge Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. ===================================================================== */ float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; char range_[] = {range, 0}; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i,iq2,j,n12,n2,n23,tmp,lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); *info = 0; if(k < 0) *info=-1; else if(n < k) *info=-2; else if(ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if(*info != 0){ magma_xerbla(__func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if(k == 0) return MAGMA_SUCCESS; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for(i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = ib; j < ie; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0){ #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if(*info == 0){ #pragma omp single { //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2){ #pragma omp single { for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if(k != 1){ // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for(j = 0; j < k; ++j){ magma_int_t i_tmp = min(j, ie); for(i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for(i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if(id < tot){ ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else{ ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for(j = ib; j < ie; ++j){ for(i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s+id*k, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return MAGMA_SUCCESS; //?????? #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif for(i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = 0; j < k; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0) *info=iinfo; } if(*info != 0) return MAGMA_SUCCESS; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2){ for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if(k != 1){ // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, q, &tmp, w, &ione); for(j = 0; j < k; ++j){ for(i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for(i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. 
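// Each eigenvector of the rank-one modified system D + rho*z*z' has
// components z_i / (d_i - lambda_j).  SLAED4 left the differences
// d_i - lambda_j in column j of Q and the adjusted z is in w, so below
// s_i = w_i / Q(i,j), permuted by INDX and normalized with snrm2.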
for(j = iil-1; j < iiu; ++j){ for(i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #endif //_OPENMP // Compute the updated eigenvectors. #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER start = get_current_time(); #endif magma_queue_sync( NULL ); if (rk != 0){ if( n23 != 0 ){ if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm('N', 'N', n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if( n12 != 0 ) { if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm('N', 'N', n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.); #endif return MAGMA_SUCCESS; } /*magma_slaed3*/
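/*
   SLAED4, called repeatedly above, finds the roots of the secular
   equation f(lambda) = 1 + rho * sum_i w_i^2 / (d_i - lambda).  For
   rho > 0, f is increasing on each open interval between adjacent poles
   d_j < d_{j+1}, running from -inf to +inf, so each interval holds
   exactly one root.  A hedged illustration locating that root by plain
   bisection (the real routine uses a safeguarded rational interpolation,
   not bisection):
*/
static float secular_f( int k, const float *d, const float *w,
                        float rho, float lam )
{
    float f = 1.0f;
    for (int i = 0; i < k; ++i)
        f += rho * w[i]*w[i] / (d[i] - lam);
    return f;
}

static float secular_root( int k, const float *d, const float *w,
                           float rho, int j )   // root in (d[j], d[j+1])
{
    float lo = d[j], hi = d[j+1];
    for (int it = 0; it < 100; ++it) {          // f < 0 left of the root
        float mid = 0.5f*(lo + hi);
        if (secular_f( k, d, w, rho, mid ) < 0.0f) lo = mid; else hi = mid;
    }
    return 0.5f*(lo + hi);
}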
/* //////////////////////////////////////////////////////////////////////////// -- Testing slarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float error, work[1]; // test all combinations of input parameters const char side[] = { MagmaLeft, MagmaRight }; const char trans[] = { MagmaTrans, MagmaNoTrans }; const char direct[] = { MagmaForward, MagmaBackward }; const char storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { M = opts.msize[i]; N = opts.nsize[i]; K = opts.ksize[i]; if ( M < K || N < K || K <= 0 ) { printf( "skipping M %d, N %d, K %d; requires M >= K, N >= K, K >= 0.\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices float *C, *R, *V, *T, *W; TESTING_MALLOC( C, float, ldc*N ); TESTING_MALLOC( R, float, ldc*N ); TESTING_MALLOC( V, float, ldv*K ); TESTING_MALLOC( T, float, ldt*K ); TESTING_MALLOC( W, float, ldw*K ); float *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, float, ldc*N ); TESTING_DEVALLOC( dV, float, ldv*K ); TESTING_DEVALLOC( dT, float, ldt*K ); TESTING_DEVALLOC( dW, float, ldw*K ); // C is M x N. size = ldc*N; lapackf77_slarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_sprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. 
// if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_slarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_sprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_slarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_slaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_sprint( K, K, T, ldt ); magma_ssetmatrix( M, N, C, ldc, dC, ldc ); magma_ssetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_ssetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_slarfb( &side[iside], &trans[itran], &direct[idir], &storev[istor], &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_sprint( M, N, C, ldc ); magma_slarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_sgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_sprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_slange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e\n", (int) M, (int) N, (int) K, storev[istor], side[iside], direct[idir], trans[itran], error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); }}}} printf( "\n" ); } TESTING_FINALIZE(); return 0; }
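/*
   The kernel tested above applies a block reflector H = I - V*T*V'.
   For the side=Left, storev=Columnwise, direct=Forward, trans=NoTrans
   case, a hedged reference implementation with three BLAS calls,
   mirroring the larfb algorithm.  It assumes V stores explicit zeros
   above its unit diagonal, as the tester arranges with slaset, so a
   plain gemm can stand in for larfb's trmm on the triangular part.
   W is an N-by-K workspace; C is overwritten by H*C.
*/
static void apply_H_left( magma_int_t M, magma_int_t N, magma_int_t K,
                          float *V, magma_int_t ldv,    // M-by-K, unit lower trapezoidal
                          float *T, magma_int_t ldt,    // K-by-K, upper triangular
                          float *C, magma_int_t ldc,
                          float *W, magma_int_t ldw )
{
    float one = 1.0f, zero = 0.0f, neg_one = -1.0f;
    // W := C' * V                                  (N by K)
    blasf77_sgemm( "Trans", "NoTrans", &N, &K, &M,
                   &one, C, &ldc, V, &ldv, &zero, W, &ldw );
    // W := W * T'                                  (N by K)
    blasf77_strmm( "Right", "Upper", "Trans", "NonUnit",
                   &N, &K, &one, T, &ldt, W, &ldw );
    // C := C - V * W'  =  (I - V*T*V') * C
    blasf77_sgemm( "NoTrans", "Trans", &M, &N, &K,
                   &neg_one, V, &ldv, W, &ldw, &one, C, &ldc );
}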
/** Purpose ------- SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by SGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T REAL array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y REAL array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. 
@ingroup magma_sgeev_aux ********************************************************************/ extern "C" magma_int_t magma_slahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, float *A, magma_int_t lda, float *tau, float *T, magma_int_t ldt, float *Y, magma_int_t ldy, struct sgehrd_data *data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; float scale; magma_int_t i; float ei = MAGMA_S_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv, data->queues[d] ); } // set all Y=0 lapackf77_slaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_S_NEGATE( tau[i-1] ); blasf77_saxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. 
// // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_scopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_strmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_sgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_strmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_sgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_strmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_saxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_slarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_ssetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->queues[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_slacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb, data->queues[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_sgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1, data->queues[d] ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_sgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->queues[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_S_NEGATE( tau[i] ); blasf77_sgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_strmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. 
// This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #ifdef COMPLEX lapackf77_slacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_sgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #ifdef COMPLEX lapackf77_slacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_sgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->queues[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_saxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_sgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda, data->queues[d] ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_sgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->queues[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_saxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_ssetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->queues[d] ); magma_ssetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->queues[d] ); } magma_setdevice( orig_dev ); return *info; } /* magma_slahr2 */
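/*
   Each panel column above is reduced with an elementary reflector
   H(i) = I - tau*v*v' generated by slarfg; the diagonal entry is saved
   in ei and v(1) is set to one so V can be multiplied as a full matrix.
   A hedged scalar sketch of applying one such reflector to a vector x
   of length n (x := H*x), assuming v[0] = 1:
*/
static void apply_reflector( int n, float tau, const float *v, float *x )
{
    float alpha = 0.0f;
    for (int i = 0; i < n; ++i)      // alpha = v' * x
        alpha += v[i] * x[i];
    for (int i = 0; i < n; ++i)      // x := x - tau * alpha * v
        x[i] -= tau * alpha * v[i];
}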
/* //////////////////////////////////////////////////////////////////////////// -- Testing slaset Code is very similar to testing_slacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; magmaFloat_ptr d_A; float offdiag = MAGMA_S_MAKE( 1.2000, 6.7000 ); float diag = MAGMA_S_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("=================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(float) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(float) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ); //magmablas_slaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, 0, ldda, opts.queue ); // inset by 1 row & col magmablas_slaset( uplo[iuplo], M, N, offdiag, diag, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_sprint( M, N, h_A, lda ); printf( "dA=" ); magma_sprint_gpu( M, N, d_A, 0, ldda, opts.queue ); } /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, lda, opts.queue ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work); printf("%5s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
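/*
   For reference, the operation timed above writes offdiag to every entry
   outside the main diagonal and diag on it, restricted to the selected
   triangle.  A hedged CPU sketch of the MagmaFull case (the Lower/Upper
   cases simply restrict the inner loop to i >= j or i <= j):
*/
static void slaset_full( int m, int n, float offdiag, float diag,
                         float *A, int lda )
{
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[i + j*lda] = (i == j ? diag : offdiag);
}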
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetri */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_Ainv, *h_R, *work; magmaFloat_ptr d_A, dwork; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float tmp; float error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default ldwork = N * magma_get_sgetri_nb( N ); gflops = FLOPS_SGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_sgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_S_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, float, lwork ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_Ainv, float, n2 ); TESTING_MALLOC_CPU( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. 
*/ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_sgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_sgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); if (info != 0) { printf("magma_sgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // check for exact singularity //h_Ainv[ 10 + 10*lda ] = MAGMA_S_MAKE( 0.0, 0.0 ); //magma_ssetmatrix( N, N, h_Ainv, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgetri( &N, h_Ainv, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_sgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); // compute 1-norm condition number estimate, following LAPACK's zget03 float normA, normAinv, rcond; normA = lapackf77_slange( "1", &N, &N, h_A, &lda, rwork ); normAinv = lapackf77_slange( "1", &N, &N, h_Ainv, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; error = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_slaset( "full", &N, &N, &c_zero, &c_one, h_R, &lda ); blasf77_sgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A, &lda, h_Ainv, &lda, &c_one, h_R, &lda ); error = lapackf77_slange( "1", &N, &N, h_R, &lda, rwork ); error = error * rcond / N; } bool okay = (error < tol); status += ! okay; printf( " %8.2e %s\n", error, (okay ? "ok" : "failed")); } else { printf( "\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
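/*
   The inverse check above scales the residual ||I - A*A^{-1}||_1 by
   rcond = 1/(||A||_1 * ||A^{-1}||_1) and by 1/N.  A hedged sketch of the
   matrix 1-norm (maximum absolute column sum) that the slange("1", ...)
   calls return:
*/
#include <math.h>

static float matrix_norm1( int m, int n, const float *A, int lda )
{
    float nrm = 0.0f;
    for (int j = 0; j < n; ++j) {
        float colsum = 0.0f;
        for (int i = 0; i < m; ++i)
            colsum += fabsf( A[i + j*lda] );
        if (colsum > nrm)
            nrm = colsum;
    }
    return nrm;
}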
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const float d_zero = 0; const magma_int_t izero = 0; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_R, *h_Z, *h_work, aux_work[1]; #ifdef COMPLEX float *rwork, aux_rwork[1]; magma_int_t lrwork; #endif float *w1, *w2, result[4]={0, 0, 0, 0}, eps, abstol; magma_int_t *iwork, *isuppz, *ifail, aux_iwork[1]; magma_int_t N, n2, info, lwork, liwork, lda; magma_int_t ISEED[4] = {0,0,0,1}; eps = lapackf77_slamch( "E" ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // checking NoVec requires LAPACK opts.lapack |= (opts.check && opts.jobz == MagmaNoVec); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; #ifdef REAL if (opts.version == 3 || opts.version == 4) { printf("%% magma_cheevr and magma_cheevx are not available for real precisions (single, float).\n"); return status; } #endif float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); printf("%% N CPU Time (sec) GPU Time (sec) |S-S_magma| |A-USU^H| |I-U^H U|\n"); printf("%%============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; abstol = 0; // auto, in cheevr // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } // query for workspace sizes if ( opts.version == 1 || opts.version == 2 ) { magma_cheevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, // A, w aux_work, -1, #ifdef COMPLEX aux_rwork, -1, #endif aux_iwork, -1, &info ); } else if ( opts.version == 3 ) { #ifdef COMPLEX magma_cheevr( opts.jobz, range, opts.uplo, N, NULL, lda, // A vl, vu, il, iu, abstol, &m1, NULL, // w NULL, lda, NULL, // Z, isuppz aux_work, -1, #ifdef COMPLEX aux_rwork, -1, #endif aux_iwork, -1, &info ); #endif } else if ( opts.version == 4 ) { #ifdef COMPLEX magma_cheevx( opts.jobz, range, opts.uplo, N, NULL, lda, // A vl, vu, il, iu, abstol, &m1, NULL, // w NULL, lda, // Z aux_work, -1, #ifdef COMPLEX aux_rwork, #endif aux_iwork, NULL, // ifail &info ); // cheevx doesn't query rwork, iwork; set them for consistency aux_rwork[0] = float(7*N); aux_iwork[0] = float(5*N); #endif } lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); #ifdef COMPLEX lrwork = (magma_int_t) aux_rwork[0]; #endif liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, N*lda ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); #ifdef COMPLEX TESTING_MALLOC_CPU( rwork, float, lrwork ); #endif TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, N*lda ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); if (opts.version == 3) { TESTING_MALLOC_CPU( h_Z, magmaFloatComplex, N*lda ); 
                TESTING_MALLOC_CPU( isuppz, magma_int_t, 2*max(1,N) );
            }
            if (opts.version == 4) {
                TESTING_MALLOC_CPU( h_Z,    magmaFloatComplex, N*lda );
                TESTING_MALLOC_CPU( ifail,  magma_int_t,       N     );
            }

            /* Clear eigenvalues, for |S-S_magma| check when fraction < 1. */
            lapackf77_slaset( "Full", &N, &ione, &d_zero, &d_zero, w1, &N );
            lapackf77_slaset( "Full", &N, &ione, &d_zero, &d_zero, w2, &N );

            /* Initialize the matrix */
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            magma_cmake_hermitian( N, h_A, lda );

            lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            if (opts.version == 1) {
                if (opts.ngpu == 1) {
                    magma_cheevd( opts.jobz, opts.uplo,
                                  N, h_R, lda, w1,
                                  h_work, lwork,
                                  #ifdef COMPLEX
                                  rwork, lrwork,
                                  #endif
                                  iwork, liwork,
                                  &info );
                }
                else {
                    //printf( "magma_cheevd_m, ngpu %d (%d)\n", opts.ngpu, abs_ngpu );
                    magma_cheevd_m( abs_ngpu, opts.jobz, opts.uplo,
                                    N, h_R, lda, w1,
                                    h_work, lwork,
                                    #ifdef COMPLEX
                                    rwork, lrwork,
                                    #endif
                                    iwork, liwork,
                                    &info );
                }
            }
            else if ( opts.version == 2 ) {
                // version 2: cheevdx computes selected eigenvalues/vectors
                if (opts.ngpu == 1) {
                    magma_cheevdx( opts.jobz, range, opts.uplo,
                                   N, h_R, lda,
                                   vl, vu, il, iu,
                                   &m1, w1,
                                   h_work, lwork,
                                   #ifdef COMPLEX
                                   rwork, lrwork,
                                   #endif
                                   iwork, liwork,
                                   &info );
                }
                else {
                    //printf( "magma_cheevdx_m, ngpu %d (%d)\n", opts.ngpu, abs_ngpu );
                    magma_cheevdx_m( abs_ngpu, opts.jobz, range, opts.uplo,
                                     N, h_R, lda,
                                     vl, vu, il, iu,
                                     &m1, w1,
                                     h_work, lwork,
                                     #ifdef COMPLEX
                                     rwork, lrwork,
                                     #endif
                                     iwork, liwork,
                                     &info );
                }
                //printf( "il %d, iu %d, m1 %d\n", il, iu, m1 );
            }
            else if ( opts.version == 3 ) {
                // version 3: MRRR, computes selected eigenvalues/vectors
                // only complex version available
                #ifdef COMPLEX
                magma_cheevr( opts.jobz, range, opts.uplo,
                              N, h_R, lda,
                              vl, vu, il, iu, abstol, &m1, w1,
                              h_Z, lda, isuppz,
                              h_work, lwork,
                              #ifdef COMPLEX
                              rwork, lrwork,
                              #endif
                              iwork, liwork,
                              &info );
                lapackf77_clacpy( "Full", &N, &N, h_Z, &lda, h_R, &lda );
                #endif
            }
            else if ( opts.version == 4 ) {
                // version 4: cheevx (QR iteration), computes selected eigenvalues/vectors
                // only complex version available
                #ifdef COMPLEX
                magma_cheevx( opts.jobz, range, opts.uplo,
                              N, h_R, lda,
                              vl, vu, il, iu, abstol, &m1, w1,
                              h_Z, lda,
                              h_work, lwork,
                              #ifdef COMPLEX
                              rwork, /*lrwork,*/
                              #endif
                              iwork, /*liwork,*/
                              ifail,
                              &info );
                lapackf77_clacpy( "Full", &N, &N, h_Z, &lda, h_R, &lda );
                #endif
            }
            gpu_time = magma_wtime() - gpu_time;
            if (info != 0) {
                printf("magma_cheevd returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }

            bool okay = true;
            if ( opts.check && opts.jobz != MagmaNoVec ) {
                /* =====================================================================
                   Check the results following LAPACK's [zcds]drvst routine.
A is factored as A = U S U^H and the following 3 tests computed: (1) | A - U S U^H | / ( |A| N ) (2) | I - U^H U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | // currently disabled, but compares to LAPACK =================================================================== */ magmaFloatComplex *work; TESTING_MALLOC_CPU( work, magmaFloatComplex, 2*N*N ); // e=NULL is unused since kband=0; tau=NULL is unused since itype=1 lapackf77_chet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, NULL, h_R, &lda, h_R, &lda, NULL, work, #ifdef COMPLEX rwork, #endif &result[0] ); result[0] *= eps; result[1] *= eps; TESTING_FREE_CPU( work ); work=NULL; // Disable third eigenvalue check that calls routine again -- // it obscures whether error occurs in first call above or in this call. // But see comparison to LAPACK below. // //lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_cheevd( MagmaNoVec, opts.uplo, // N, h_R, lda, w2, // h_work, lwork, // #ifdef COMPLEX // rwork, lrwork, // #endif // iwork, liwork, // &info ); //if (info != 0) { // printf("magma_cheevd returned error %d: %s.\n", // (int) info, magma_strerror( info )); //} // //float maxw=0, diff=0; //for( int j=0; j < N; j++ ) { // maxw = max(maxw, fabs(w1[j])); // maxw = max(maxw, fabs(w2[j])); // diff = max(diff, fabs(w1[j]-w2[j])); //} //result[2] = diff / (N*maxw); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); if ( opts.version == 1 || opts.version == 2 ) { lapackf77_cheevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); } else if ( opts.version == 3 ) { lapackf77_cheevr( lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), &N, h_A, &lda, &vl, &vu, &il, &iu, &abstol, &m1, w2, h_Z, &lda, isuppz, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); lapackf77_clacpy( "Full", &N, &N, h_Z, &lda, h_A, &lda ); } else if ( opts.version == 4 ) { lapackf77_cheevx( lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), &N, h_A, &lda, &vl, &vu, &il, &iu, &abstol, &m1, w2, h_Z, &lda, h_work, &lwork, #ifdef COMPLEX rwork, #endif iwork, ifail, &info ); lapackf77_clacpy( "Full", &N, &N, h_Z, &lda, h_A, &lda ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); } // compare eigenvalues float maxw=0, diff=0; for( int j=0; j < N; j++ ) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[3] = diff / (N*maxw); okay = okay && (result[3] < tolulp); printf("%5d %9.4f %9.4f %8.2e ", (int) N, cpu_time, gpu_time, result[3] ); } else { printf("%5d --- %9.4f --- ", (int) N, gpu_time); } // print error checks if ( opts.check && opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol) && (result[1] < tol); printf(" %8.2e %8.2e", result[0], result[1] ); } else { printf(" --- --- "); } printf(" %s\n", (okay ? "ok" : "failed")); status += ! 
okay; TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); #ifdef COMPLEX TESTING_FREE_CPU( rwork ); #endif TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); if ( opts.version == 3 ) { TESTING_FREE_CPU( h_Z ); TESTING_FREE_CPU( isuppz ); } if ( opts.version == 4 ) { TESTING_FREE_CPU( h_Z ); TESTING_FREE_CPU( ifail ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
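/* ----------------------------------------------------------------------------
   The eigenvalue comparison above reduces to a scaled max-norm difference of
   the two eigenvalue arrays.  A minimal, self-contained sketch of that check
   (the helper name and standalone form are illustrative, not MAGMA API):
*/
#include <math.h>

/* Returns max_j |w1[j]-w2[j]| / (n * max_j max(|w1[j]|,|w2[j]|)),
   i.e. the quantity stored in result[3] above; compare against tolulp. */
static float eigenvalue_rel_diff( int n, const float *w1, const float *w2 )
{
    float maxw = 0, diff = 0;
    for( int j = 0; j < n; ++j ) {
        maxw = fmaxf( maxw, fmaxf( fabsf( w1[j] ), fabsf( w2[j] )));
        diff = fmaxf( diff, fabsf( w1[j] - w2[j] ));
    }
    return (maxw > 0 ? diff / (n * maxw) : diff);
}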
/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by SGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    dT      REAL array on the GPU device.
            DT contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu.

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and correspondingly
            the size of the T matrices used in the factorization and
            stored in DT.

    @param[out]
    info    INTEGER
            - = 0:  successful exit
            - < 0:  if INFO = -i, the i-th argument has an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    magmaFloat_ptr dT, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)
#define dT(j)   (dT + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    magma_int_t lddwork;
    float *dA, *dV, *dW;
    float *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = ((m + 31) / 32) * 32;
    lddwork = ((n + 31) / 32) * 32;
    if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;

    // Allocate CPU work space
    lwork = (n+m+nb) * nb;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        magma_free( dA );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    float *V = work + (n+nb)*nb;

    magma_queue_t stream;
    magma_queue_create( &stream );

    // Use unblocked code for the last or only block.
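    // The unblocked path below builds Q's trailing columns on the CPU:
    //   slacpy saves the Householder vectors from A into V,
    //   slaset overwrites A(kk:m, kk:n) with the identity,
    //   slarft forms the triangular factor T of the block reflector, and
    //   slarfb applies H = I - V*T*V^T to that identity, producing the
    //   last n-kk columns of Q in place.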
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sorgqr */
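/* ----------------------------------------------------------------------------
   How sorgqr splits the k reflectors between the blocked (GPU) and unblocked
   (CPU) paths above: ki is the start of the second-to-last block and kk the
   number of columns handled by blocked larfb updates.  A standalone sketch of
   that index computation (the helper and example are illustrative, not part
   of the MAGMA API):
*/
static void sorgqr_split_blocks( magma_int_t k, magma_int_t nb,
                                 magma_int_t *ki, magma_int_t *kk )
{
    if ( nb > 1 && nb < k ) {
        *ki = (k - nb - 1) / nb * nb;    // start of 2nd-to-last block
        *kk = min( k, *ki + nb );        // columns handled by blocked code
    }
    else {
        *ki = 0;
        *kk = 0;                         // everything goes through the unblocked path
    }
}
// e.g., k = 300, nb = 128 gives ki = 128, kk = 256: panels starting at
// columns 128 and 0 are applied with larfb, columns 256..299 unblocked.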
int main(int argc, char **argv) { TESTING_INIT(); const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf, atomics_time; real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, atomics_error, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaFloat_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("======================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Yatomics, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, ldda*N ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, float, ldwork ); magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork ); magmablas_slaset( MagmaFull, ldda, N, MAGMA_S_NAN, MAGMA_S_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda ); } else { lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda ); } lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, ldda ); magma_ssetvector( N, X, incx, dX, incx ); magma_ssetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_ssetvector( N, Y, incy, dY, incy ); atomics_time = magma_sync_wtime( 0 ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( 0 ) - atomics_time; atomics_perf = gflops / atomics_time; magma_sgetvector( N, 
dY, incy, Yatomics, incy );
            cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED );

            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_ssetvector( N, Y, incy, dY, incy );

            magma_time = magma_sync_wtime( 0 );
            if ( opts.version == 1 ) {
                magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy,
                                      dwork, ldwork, opts.queue );
            }
            else {
                // non-work interface (has added overhead)
                magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy );
            }
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;

            magma_sgetvector( N, dY, incy, Ymagma, incy );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;

            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N;

            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N;

            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy );
            atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N;

            bool ok = (magma_error < tol && cublas_error < tol && atomics_error < tol);
            status += ! ok;
            printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n",
                   (int) N,
                   magma_perf,   1000.*magma_time,
                   atomics_perf, 1000.*atomics_time,
                   cublas_perf,  1000.*cublas_time,
                   cpu_perf,     1000.*cpu_time,
                   // errors in header order: MAGMA, Atomics, CUBLAS
                   magma_error, atomics_error, cublas_error,
                   (ok ? "ok" : "failed"));

            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Yatomics );
            TESTING_FREE_CPU( Ymagma );

            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            TESTING_FREE_DEV( dwork );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
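/* ----------------------------------------------------------------------------
   Each error above is the max-norm of (Y_gpu - Y_cpu), scaled by N so it is
   comparable across problem sizes.  A minimal sketch of the same measure
   without the BLAS/LAPACK calls (the helper name is illustrative):
*/
#include <math.h>

/* max_i |y[i] - yref[i]| / n  -- mirrors saxpy(-1) followed by slange("M")/N */
static float symv_max_error( int n, const float *yref, const float *y )
{
    float err = 0;
    for( int i = 0; i < n; ++i )
        err = fmaxf( err, fabsf( y[i] - yref[i] ));
    return err / n;
}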
/* //////////////////////////////////////////////////////////////////////////// -- Testing slarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; // local variables magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, ldw2, nv; magma_int_t ISEED[4] = {0,0,0,1}; float Cnorm, error, work[1]; magma_int_t status = 0; // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans [] = { MagmaTrans, MagmaNoTrans }; magma_direct_t direct[] = { MagmaForward, MagmaBackward }; magma_storev_t storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("%%=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; if ( M < K || N < K || K <= 0 ) { printf( "%5d %5d %5d skipping because slarfb requires M >= K, N >= K, K >= 0\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { ldc = magma_roundup( M, opts.align ); // multiple of 32 by default ldt = magma_roundup( K, opts.align ); // multiple of 32 by default ldw = (side[iside] == MagmaLeft ? N : M); ldw2 = min( M, N ); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices float *C, *R, *V, *T, *W; TESTING_MALLOC_CPU( C, float, ldc*N ); TESTING_MALLOC_CPU( R, float, ldc*N ); TESTING_MALLOC_CPU( V, float, ldv*K ); TESTING_MALLOC_CPU( T, float, ldt*K ); TESTING_MALLOC_CPU( W, float, ldw*K ); magmaFloat_ptr dC, dV, dT, dW, dW2; TESTING_MALLOC_DEV( dC, float, ldc*N ); TESTING_MALLOC_DEV( dV, float, ldv*K ); TESTING_MALLOC_DEV( dT, float, ldt*K ); TESTING_MALLOC_DEV( dW, float, ldw*K ); TESTING_MALLOC_DEV( dW2, float, ldw2*K ); // C is M x N. size = ldc*N; lapackf77_slarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_sprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. 
// if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_slarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_sprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_slarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_slaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_sprint( K, K, T, ldt ); magma_ssetmatrix( M, N, C, ldc, dC, ldc, opts.queue ); magma_ssetmatrix( ldv, nv, V, ldv, dV, ldv, opts.queue ); magma_ssetmatrix( K, K, T, ldt, dT, ldt, opts.queue ); lapackf77_slarfb( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), lapack_direct_const( direct[idir] ), lapack_storev_const( storev[istor] ), &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_sprint( M, N, C, ldc ); if ( opts.version == 1 ) { magma_slarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw, opts.queue ); } else { magma_slarfb_gpu_gemm( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw, dW2, ldw2, opts.queue ); } magma_sgetmatrix( M, N, dC, ldc, R, ldc, opts.queue ); //printf( "dHC=" ); magma_sprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| size = ldc*N; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_slange( "Fro", &M, &N, C, &ldc, work ); error = lapackf77_slange( "Fro", &M, &N, R, &ldc, work ) / Cnorm; printf( "%5d %5d %5d %c %c %c %c %8.2e %s\n", (int) M, (int) N, (int) K, lapacke_storev_const(storev[istor]), lapacke_side_const(side[iside]), lapacke_direct_const(direct[idir]), lapacke_trans_const(trans[itran]), error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( V ); TESTING_FREE_CPU( T ); TESTING_FREE_CPU( W ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dV ); TESTING_FREE_DEV( dT ); TESTING_FREE_DEV( dW ); TESTING_FREE_DEV( dW2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}}} printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
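/* ----------------------------------------------------------------------------
   The tester above has to size V differently for each storev/side combination.
   A small illustrative helper (not MAGMA API) capturing the shapes listed in
   the comments:
       columnwise, left :  V is M x K      rowwise, left :  V is K x M
       columnwise, right:  V is N x K      rowwise, right:  V is K x N
*/
static void larfb_v_shape( magma_side_t side, magma_storev_t storev,
                           magma_int_t M, magma_int_t N, magma_int_t K,
                           magma_int_t *rows, magma_int_t *cols )
{
    magma_int_t len = (side == MagmaLeft ? M : N);
    if ( storev == MagmaColumnwise ) { *rows = len; *cols = K;   }
    else                             { *rows = K;   *cols = len; }
}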
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.  It computes only the required part of the eigenvectors;
    the rest is not used.

    This code makes very mild assumptions about floating point
    arithmetic.  It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation.  May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector.  Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (device workspaces) REAL array of arrays, dimension NRGPU.
if NRGPU = 1 the dimension of the first workspace should be (3*N*N/2+3*N) otherwise the NRGPU workspaces should have the size ceil((N-N1) * (N-N1) / floor(ngpu/2)) + NB * ((N-N1) + (N-N1) / floor(ngpu/2)) @param queues (device queues) magma_queue_t array, dimension (MagmaMaxGPUs,2) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from slaex1. @param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3_m( magma_int_t ngpu, magma_int_t k, magma_int_t n, magma_int_t n1, float *d, float *Q, magma_int_t ldq, float rho, float *dlamda, float *Q2, magma_int_t *indx, magma_int_t *ctot, float *w, float *s, magma_int_t *indxq, magmaFloat_ptr dwork[], magma_queue_t queues[MagmaMaxGPUs][2], magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) #define dQ2(id) (dwork[id]) #define dS(id, ii) (dwork[id] + n2*n2_loc + (ii)*(n2*nb)) #define dQ(id, ii) (dwork[id] + n2*n2_loc + 2*(n2*nb) + (ii)*(n2_loc*nb)) if (ngpu == 1) { magma_setdevice(0); magma_slaex3(k, n, n1, d, Q, ldq, rho, dlamda, Q2, indx, ctot, w, s, indxq, *dwork, range, vl, vu, il, iu, info ); return *info; } float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu; magma_int_t ni_loc[MagmaMaxGPUs]; magma_int_t i, ind, iq2, j, n12, n2, n23, tmp; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). 
    The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
    which on any of these machines zeros out the bottommost
    bit of DLAMDA(I) if it is 1; this makes the subsequent
    subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
    occurs.  On binary machines with a guard digit (almost all
    machines) it does not change DLAMDA(I) at all.  On hexadecimal
    and decimal machines with a guard digit, it slightly
    changes the bottommost bits of DLAMDA(I).  It does not account
    for hexadecimal or decimal machines without guard digits
    (we know of none).  We use a subroutine call to compute
    2*DLAMBDA(I) to prevent optimizing compilers from eliminating
    this code.*/

//#define CHECK_CPU
#ifdef CHECK_CPU
    float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
    #define hQ2(id)    (hwQ2[id])
    #define hS(id, ii) (hwS[ii][id])
    #define hQ(id, ii) (hwQ[ii][id])
#endif
    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    //lq2 = iq2 + n2 * n23;

    n1_loc = (n1-1) / (ngpu/2) + 1;
    n2_loc = (n2-1) / (ngpu/2) + 1;

    nb = magma_get_slaex3_m_nb();

    if (n1 >= magma_get_slaex3_m_k()) {
#ifdef CHECK_CPU
        for (igpu = 0; igpu < ngpu; ++igpu) {
            magma_smalloc_pinned( &(hwS[0][igpu]),  n2*nb );
            magma_smalloc_pinned( &(hwS[1][igpu]),  n2*nb );
            magma_smalloc_pinned( &(hwQ2[igpu]),    n2*n2_loc );
            magma_smalloc_pinned( &(hwQ[0][igpu]),  n2_loc*nb );
            magma_smalloc_pinned( &(hwQ[1][igpu]),  n2_loc*nb );
        }
#endif
        for (igpu = 0; igpu < ngpu-1; igpu += 2) {
            ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu], &n12, Q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
            magma_setdevice(igpu);
            magma_ssetmatrix_async( ni_loc[igpu], n12,
                                    Q2+n1_loc*(igpu/2), n1,
                                    dQ2(igpu),          n1_loc, queues[igpu][0] );
            ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, Q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
            magma_setdevice(igpu+1);
            magma_ssetmatrix_async( ni_loc[igpu+1], n23,
                                    Q2+iq2+n2_loc*(igpu/2), n2,
                                    dQ2(igpu+1),            n2_loc, queues[igpu+1][0] );
        }
    }

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    #pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;            //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
                #pragma omp critical (info)
                *info = iinfo;
                break;
            }
        }

        #pragma omp barrier

        if (*info == 0) {
            #pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. 
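    // The update below amounts to two GEMMs per block of eigenvector columns:
    //   Q(0:n1,  cols) = Q2(upper, n1 x n12) * S12   (non-deflated upper part)
    //   Q(n1:n,  cols) = Q2(lower, n2 x n23) * S23   (non-deflated lower part)
    // On the multi-GPU path, even GPUs hold the upper part and odd GPUs the
    // lower part, and the rk columns are processed nb at a time with double
    // buffering: 'ind' flips between two staging buffers so the next panel's
    // host-to-device copy overlaps the current GEMM.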
timer_start( time ); if (rk > 0) { if (n1 < magma_get_slaex3_m_k()) { // stay on the CPU if ( n23 != 0 ) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } else { //use the gpus ib = min(nb, rk); for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_ssetmatrix_async( n23, ib, Q(ctot[0],iil-1), ldq, dS(igpu+1,0), n23, queues[igpu+1][0] ); } if (n12 != 0) { magma_setdevice(igpu); magma_ssetmatrix_async( n12, ib, Q(0,iil-1), ldq, dS(igpu,0), n12, queues[igpu][0] ); } } for (i = 0; i < rk; i += nb) { ib = min(nb, rk - i); ind = (i/nb)%2; if (i+nb < rk) { ib2 = min(nb, rk - i - nb); for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_ssetmatrix_async( n23, ib2, Q(ctot[0],iil-1+i+nb), ldq, dS(igpu+1,(ind+1)%2), n23, queues[igpu+1][(ind+1)%2] ); } if (n12 != 0) { magma_setdevice(igpu); magma_ssetmatrix_async( n12, ib2, Q(0,iil-1+i+nb), ldq, dS(igpu,(ind+1)%2), n12, queues[igpu][(ind+1)%2] ); } } } // Ensure that the data is copied on gpu since we will overwrite it. for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { #ifdef CHECK_CPU lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23); #endif magma_setdevice(igpu+1); magma_queue_sync( queues[igpu+1][ind] ); } if (n12 != 0) { #ifdef CHECK_CPU lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12); #endif magma_setdevice(igpu); magma_queue_sync( queues[igpu][ind] ); } } for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { #ifdef CHECK_CPU blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc, hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc); #endif magma_setdevice(igpu+1); magmablasSetKernelStream(queues[igpu+1][ind]); magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc, dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc); #ifdef CHECK_CPU printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc)); #endif } if (n12 != 0) { #ifdef CHECK_CPU blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc, hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc); #endif magma_setdevice(igpu); magmablasSetKernelStream(queues[igpu][ind]); magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc, dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc); #ifdef CHECK_CPU printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc)); #endif } } for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc, Q(n1+n2_loc*(igpu/2),iil-1+i), ldq ); // magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc, // Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, queues[igpu+1][ind] ); } if (n12 != 0) { magma_setdevice(igpu); magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc, Q(n1_loc*(igpu/2),iil-1+i), ldq ); // magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc, // 
//                                        Q(n1_loc*(igpu/2),iil-1+i), ldq, queues[igpu][ind] );
                }
            }
        }

        for (igpu = 0; igpu < ngpu; ++igpu) {
#ifdef CHECK_CPU
            magma_free_pinned( hwS[1][igpu] );
            magma_free_pinned( hwS[0][igpu] );
            magma_free_pinned( hwQ2[igpu] );
            magma_free_pinned( hwQ[1][igpu] );
            magma_free_pinned( hwQ[0][igpu] );
#endif
            magma_setdevice(igpu);
            magma_queue_sync( queues[igpu][0] );
            magma_queue_sync( queues[igpu][1] );
        }

        if ( n23 == 0 )
            lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);
        if ( n12 == 0 )
            lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_slaex3_m */
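/* ----------------------------------------------------------------------------
   The heart of the eigenvector update above: slaed4 leaves the differences
   dlamda[i] - lambda_j in column j of Q; each eigenvector is then w[i]
   divided by those differences, normalized, and permuted by indx.  A serial
   sketch of that inner step for one column (the helper name is illustrative;
   the routine itself overlaps this with the multi-GPU gemms):
*/
#include <math.h>

static void secular_eigvec_column(
    magma_int_t k,
    const float *w,           /* deflation-adjusted updating vector   */
    const float *qcol,        /* column j of Q after slaed4           */
    const magma_int_t *indx,  /* 1-based permutation from slaed2      */
    float *s,                 /* workspace, length k                  */
    float *out )              /* receives the normalized eigenvector  */
{
    float nrm = 0;
    for( magma_int_t i = 0; i < k; ++i ) {
        s[i] = w[i] / qcol[i];
        nrm += s[i] * s[i];
    }
    nrm = sqrtf( nrm );
    for( magma_int_t i = 0; i < k; ++i )
        out[i] = s[ indx[i] - 1 ] / nrm;
}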
extern "C" magma_int_t magma_sstedx(magma_vec_t range, magma_int_t n, float vl, float vu, magma_int_t il, magma_int_t iu, float* d, float* e, float* z, magma_int_t ldz, float* work, magma_int_t lwork, magma_int_t* iwork, magma_int_t liwork, magmaFloat_ptr dwork, magma_int_t* info, magma_queue_t queue) { /* -- MAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 .. Scalar Arguments .. CHARACTER RANGE INTEGER IL, IU, INFO, LDZ, LIWORK, LWORK, N REAL VL, VU .. .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), WORK( * ), Z( LDZ, * ), $ DWORK ( * ) .. Purpose ======= SSTEDX computes some eigenvalues and, optionally, eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. See SLAEX3 for details. Arguments ========= RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. N (input) INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. D (input/output) REAL array, dimension (N) On entry, the diagonal elements of the tridiagonal matrix. On exit, if INFO = 0, the eigenvalues in ascending order. E (input/output) REAL array, dimension (N-1) On entry, the subdiagonal elements of the tridiagonal matrix. On exit, E has been destroyed. Z (input/output) REAL array, dimension (LDZ,N) On exit, if INFO = 0, Z contains the orthonormal eigenvectors of the symmetric tridiagonal matrix. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= max(1,N). WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If N > 1 then LWORK must be at least ( 1 + 4*N + N**2 ). Note that if N is less than or equal to the minimum divide size, usually 25, then LWORK need only be max(1,2*(N-1)). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. LIWORK must be at least ( 3 + 5*N ). Note that if N is less than or equal to the minimum divide size, usually 25, then LIWORK need only be 1. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the IWORK array, returns this value as the first entry of the IWORK array, and no error message related to LIWORK is issued by XERBLA. 
DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. ===================================================================== */ magma_vec_t range_ = range; float d_zero = 0.; float d_one = 1.; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t alleig, indeig, valeig, lquery; magma_int_t i, j, k, m; magma_int_t liwmin, lwmin; magma_int_t start, end, smlsiz; float eps, orgnrm, p, tiny; // Test the input parameters. alleig = lapackf77_lsame(lapack_const(range_), "A"); valeig = lapackf77_lsame(lapack_const(range_), "V"); indeig = lapackf77_lsame(lapack_const(range_), "I"); lquery = lwork == -1 || liwork == -1; *info = 0; if (! (alleig || valeig || indeig)) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldz < max(1,n)) { *info = -10; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -4; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -5; } else if (iu < min(n,il) || iu > n) { *info = -6; } } } if (*info == 0) { // Compute the workspace requirements smlsiz = get_sstedx_smlsize(); if( n <= 1 ){ lwmin = 1; liwmin = 1; } else { lwmin = 1 + 4*n + n*n; liwmin = 3 + 5*n; } work[0] = lwmin; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -12; } else if (liwork < liwmin && ! lquery) { *info = -14; } } if (*info != 0) { magma_xerbla( __func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) { return MAGMA_SUCCESS; } // Quick return if possible if(n==0) return MAGMA_SUCCESS; if(n==1){ *z = 1.; return MAGMA_SUCCESS; } // If N is smaller than the minimum divide size (SMLSIZ+1), then // solve the problem with another solver. if (n < smlsiz){ char char_I[]= {'I', 0}; lapackf77_ssteqr(char_I, &n, d, e, z, &ldz, work, info); } else { char char_F[]= {'F', 0}; lapackf77_slaset(char_F, &n, &n, &d_zero, &d_one, z, &ldz); //Scale. char char_M[]= {'M', 0}; orgnrm = lapackf77_slanst(char_M, &n, d, e); if (orgnrm == 0){ work[0] = lwmin; iwork[0] = liwmin; return MAGMA_SUCCESS; } eps = lapackf77_slamch( "Epsilon" ); if (alleig){ start = 0; while ( start < n ){ // Let FINISH be the position of the next subdiagonal entry // such that E( END ) <= TINY or FINISH = N if no such // subdiagonal exists. The matrix identified by the elements // between START and END constitutes an independent // sub-problem. for(end = start+1; end < n; ++end){ tiny = eps * sqrt( MAGMA_S_ABS(d[end-1]*d[end])); if (MAGMA_S_ABS(e[end-1]) <= tiny) break; } // (Sub) Problem determined. Compute its size and solve it. 
m = end - start; if (m==1){ start = end; continue; } if (m > smlsiz){ // Scale char char_G[] = {'G', 0}; orgnrm = lapackf77_slanst(char_M, &m, &d[start], &e[start]); lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &m, &ione, &d[start], &m, info); magma_int_t mm = m-1; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &mm, &ione, &e[start], &mm, info); magma_slaex0( m, &d[start], &e[start], Z(start, start), ldz, work, iwork, dwork, MagmaAllVec, vl, vu, il, iu, info, queue); if( *info != 0) { return MAGMA_SUCCESS; } // Scale Back lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &m, &ione, &d[start], &m, info); } else { char char_I[]= {'I', 0}; lapackf77_ssteqr( char_I, &m, &d[start], &e[start], Z(start, start), &ldz, work, info); if (*info != 0){ *info = (start+1) *(n+1) + end; } } start = end; } // If the problem split any number of times, then the eigenvalues // will not be properly ordered. Here we permute the eigenvalues // (and the associated eigenvectors) into ascending order. if (m < n){ // Use Selection Sort to minimize swaps of eigenvectors for (i = 1; i < n; ++i){ k = i-1; p = d[i-1]; for (j = i; j < n; ++j){ if (d[j] < p){ k = j; p = d[j]; } } if(k != i-1) { d[k] = d[i-1]; d[i-1] = p; blasf77_sswap(&n, Z(0,i-1), &ione, Z(0,k), &ione); } } } } else { // Scale char char_G[] = {'G', 0}; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &n, &ione, d, &n, info); magma_int_t nm = n-1; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &nm, &ione, e, &nm, info); magma_slaex0( n, d, e, z, ldz, work, iwork, dwork, range, vl, vu, il, iu, info, queue); if( *info != 0) { return MAGMA_SUCCESS; } // Scale Back lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &n, &ione, d, &n, info); } } work[0] = lwmin; iwork[0] = liwmin; return MAGMA_SUCCESS; } /* sstedx */
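/* ----------------------------------------------------------------------------
   The divide step above scans for a negligible subdiagonal entry to split the
   tridiagonal matrix into independent subproblems.  A standalone sketch of
   that test (the helper name is illustrative, not MAGMA API); it returns the
   end of the subproblem starting at 'start':
*/
#include <math.h>

static magma_int_t next_split( magma_int_t n, const float *d, const float *e,
                               magma_int_t start, float eps )
{
    magma_int_t end;
    for( end = start+1; end < n; ++end ) {
        /* e[end-1] couples d[end-1] and d[end]; treat it as zero when it is
           small relative to the geometric mean of its neighbors */
        float tiny = eps * sqrtf( fabsf( d[end-1] * d[end] ));
        if ( fabsf( e[end-1] ) <= tiny )
            break;
    }
    return end;  /* subproblem occupies rows/columns [start, end) */
}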