/** Purpose ------- ZGEQRF2_MGPU computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaDoubleComplex **dlA, magma_int_t ldda, magmaDoubleComplex *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaDoubleComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaDoubleComplex *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_zgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and zlarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think zlarfb work could be smaller, max(n_local[:]). * Oddly, T and zlarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_zmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for zgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_zgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_zgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); zpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_zsetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if zpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ zq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_zsetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_zsetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_zgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for zgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_zgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_zsetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_zgeqrf2_mgpu */
extern "C" magma_int_t magma_zgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define d_ref(a_1) (dT+(lddwork+(a_1))*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_zunmqr_gpu, hwork contains // the last block of A and B (i.e., C in zunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ztrsm and drop the zsetmatrix. if ( nrhs == 1 ) { blasf77_ztrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ztrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >=0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_zgemv( MagmaNoTrans, ib, ib, c_one, d_ref(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, d_ref(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_zcopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
extern "C" magma_int_t magma_zgeqrf2_2q_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex *tau, magma_queue_t* queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex_ptr dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_zmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*lwork, NULL, NULL); work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL); nbmin = 2; nx = 2*nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), ldwork, queues[0], NULL); clFlush(queues[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, queues[1], NULL); clFlush(queues[1]); } magma_queue_sync(queues[0]); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[0]); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, 0, lddwork, queues[1]); if (i+nb < k-nx){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_queue_sync(queues[1]); }else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[1]); clFlush(queues[1]); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows, queues[0]); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda, queues[0]); } clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); // magma_free_cpu(work); return *info; } /* magma_zgeqrf2_gpu */
/** Purpose ------- Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF3. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaDoubleComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zungqr_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Anorm, error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *hA, *hR, *tau, *h_work; magmaDoubleComplex_ptr dA, dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; if ( m < n || n < k ) { printf( "%5d %5d %5d skipping because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_zgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_ZUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaDoubleComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlacpy( MagmaFullStr, &m, &n, hA, &lda, hR, &lda ); Anorm = lapackf77_zlange("f", &m, &n, hA, &lda, work ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // first, get QR factors in both hA and dA // okay that magma_zgeqrf_gpu has special structure for R; R isn't used here. magma_zsetmatrix( m, n, hA, lda, dA, ldda ); magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_zgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( m, n, dA, ldda, hA, lda ); gpu_time = magma_wtime(); magma_zungqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix( m, n, dA, ldda, hR, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_zaxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlange("f", &m, &n, hR, &lda, work) / Anorm; bool okay = (error < tol); status += ! okay; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. This version assumes the computation runs through the NULL stream and therefore is not overlapping some computation with communication. Other versions (magma_zgeqrf_gpu and magma_zgeqrf3_gpu) store the intermediate T matrices. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex *dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (n)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda ); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, lddwork ); if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); } else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); return *info; } /* magma_zgeqrf2_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb( M ); gflops = FLOPS_ZGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_zgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lhwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { magmaDoubleComplex *tau2; TESTING_MALLOC_CPU( tau2, magmaDoubleComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_zgeqrf( &M, &N, h_A, &M, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_zgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; magmaDoubleComplex *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, magmaDoubleComplex, n2 ); // Q TESTING_MALLOC_CPU( h_W2, magmaDoubleComplex, n2 ); // R TESTING_MALLOC_CPU( h_W3, magmaDoubleComplex, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_zlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_zqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work ); blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ){ magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zcgeqrsv_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). dB (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,M). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGEQRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dworkd, *hworkd; magmaFloatComplex *dworks, *hworks; magmaDoubleComplex *dR, *tau, *dT; magmaFloatComplex *dSA, *dSX, *dST, *stau; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange('I', m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zungqr_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *hA, *hR, *tau, *h_work; magmaDoubleComplex *dA, *dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; if ( m < n || n < k ) { printf( "skipping m %d, n %d, k %d because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_zgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_ZUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaDoubleComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlacpy( MagmaUpperLowerStr, &m, &n, hA, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( m, n, hA, lda, dA, ldda ); magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_zgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_time = magma_wtime(); magma_zungqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix( m, n, dA, ldda, hR, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { error = lapackf77_zlange("f", &m, &n, hA, &lda, work ); lapackf77_zgeqrf( &m, &n, hA, &lda, tau, h_work, &lwork, &info ); if (info != 0) printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); cpu_time = magma_wtime(); lapackf77_zungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_zaxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlange("f", &m, &n, hR, &lda, work) / error; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_zgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGEQRF4 computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUs to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ cuDoubleComplex *da[4]; cuDoubleComplex c_one = MAGMA_Z_ONE; int i, k, ldda; *info = 0; int nb = magma_get_zgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus <0 || num_gpus > 4) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } ldda = ((m+31)/32)*32; magma_int_t n_local[4]; for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_zmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magmablas_zsetmatrix_1D_bcyclic(m, n, a, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_zgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magmablas_zgetmatrix_1D_bcyclic(m, n, da, ldda, a, lda, num_gpus, nb); } else { lapackf77_zgeqrf(&m, &n, a, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_free( da[i] ); } return *info; } /* magma_zgeqrf4 */
/** Purpose ------- ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) COMPLEX_16 array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaDoubleComplex *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(magmaDoubleComplex)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_zsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); zsplit_diag_block(ib, work(i), ldwork, ut); magma_zsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_zsetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_zsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_zgeqrf_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; cuDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t nsize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t i, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } n2 = M * N; min_mn = min(M, N); nb = magma_get_zgeqrf_nb(M); /* Allocate memory for the matrix */ TESTING_MALLOC( tau, cuDoubleComplex, min_mn ); TESTING_MALLOC( h_A, cuDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, cuDoubleComplex, n2 ); lwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); TESTING_MALLOC( h_work, cuDoubleComplex, lwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; min_mn= min(M, N); lda = M; n2 = lda*N; gflops = FLOPS_ZGEQRF( M, N ) / 1e9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqrf returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); TESTING_CUDA_FINALIZE(); return 0; }
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrs_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex_ptr dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_zunmqr_gpu, hwork contains // the last block of A and B (i.e., C in zunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ztrsm and drop the zsetmatrix. if ( nrhs == 1 ) { blasf77_ztrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ztrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >= 0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_zgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_zcopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double matnorm, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, flag = 0, n2, check = 0; magma_int_t n_local[MagmaMaxGPUs*MagmaMaxSubs], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, info, min_mn, nb; int num_gpus = 1, num_subs = 1, tot_subs = 1; magma_int_t ione = 1; M = N = size[9]; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); flag = 1; } else if (strcmp("-M", argv[i])==0) { M = atoi(argv[++i]); flag = 1; } else if (strcmp("-NGPU", argv[i])==0) { num_gpus = atoi(argv[++i]); } else if (strcmp("-NSUB", argv[i])==0) { num_subs = atoi(argv[++i]); } else if (strcmp("-check", argv[i])==0) { check = 1; } } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } } if (num_gpus > MagmaMaxGPUs){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = MagmaMaxGPUs; } if (num_subs > MagmaMaxSubs) { printf("More buffers requested than available. Have to change it.\n"); num_subs = MagmaMaxSubs; } tot_subs = num_gpus * num_subs; printf("\nNumber of GPUs to be used = %d\n", (int) num_gpus); printf("Usage: \n"); printf(" testing_zgeqrf_msub -M %d -N %d -NGPU %d -NSUB %d %s\n\n", M, N, num_gpus, num_subs, (check == 1 ? "-check" : " ")); /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for (i=0; i<num_gpus; i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf( "\n" ); printf(" M N CPU GFlop/s (sec.) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("==========================================================================\n"); for(i=0; i<10; i++){ if (flag == 0) { M = N = size[i]; } nb = magma_get_zgeqrf_nb(M); min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_ZGEQRF( (double)M, (double)N ) / 1e9; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); /* Allocate host workspace */ lhwork = -1; lapackf77_zgeqrf(&M, &N, h_R, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lhwork ); /* Allocate device memory for the matrix */ for (int j=0; j<tot_subs; j++) { n_local[j] = ((N/nb)/tot_subs)*nb; if (j < (N/nb)%tot_subs) n_local[j] += nb; else if (j == (N/nb)%tot_subs) n_local[j] += N%nb; TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local[j] ); } /* Initialize the matrix */ init_matrix( M, N, h_R, lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_queue_t *trans_queues = (magma_queue_t*)malloc(tot_subs*sizeof(magma_queue_t)); for (int j=0; j<tot_subs; j++) { trans_queues[j] = queues[2*(j%num_gpus)]; } // warm-up magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); gpu_time = magma_wtime(); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info < 0) printf("Argument %d of magma_zgeqrf_msub had an illegal value.\n", (int) -info); if (check == 1) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_zgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, tot_subs, nb, trans_queues); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); init_matrix( M, N, h_A, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapack_zgeqrf had an illegal value.\n", (int) -info); matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); TESTING_FREE_PIN( h_A ); } else { printf("%5d %5d -- ( -- ) %6.2f (%6.2f) --\n", (int) M, (int) N, gpu_perf, gpu_time ); } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for (int j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } for (i=0; i<num_gpus; i++) { magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
extern "C" int magma_ztsqrt_gpu(int *m, int *n, magmaDoubleComplex *a1, magmaDoubleComplex *a2, int *lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, int *lwork, magmaDoubleComplex *dwork, int *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (M+N+NB)*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. DWORK (workspace/output) COMPLEX_16 array on the GPU, dimension 2*N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). It starts with NB*NB blocks that store the triangular T matrices, followed by the NB*NB blocks of the diagonal inverses for the R matrix. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a1_ref(a_1,a_2) ( a1+(a_2)*(*lda) + (a_1)) #define a2_ref(a_1,a_2) ( a2+(a_2)*(*lda) + (a_1)) #define t_ref(a_1) (dwork+(a_1)) #define d_ref(a_1) (dwork+(lddwork+(a_1))*nb) #define dd_ref(a_1) (dwork+(2*lddwork+(a_1))*nb) #define work_a1 ( work ) #define work_a2 ( work + nb ) #define hwork ( work + (nb)*(*m)) int i, k, ldwork, lddwork, old_i, old_ib, rows, cols; int nbmin, ib, ldda; /* Function Body */ *info = 0; int nb = magma_get_zgeqrf_nb(*m); int lwkopt = (*n+*m) * nb; work[0] = (magmaDoubleComplex) lwkopt; int lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*m)) { *info = -4; } else if (*lwork < max(1,*n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(*m,*n); if (k == 0) { work[0] = 1.f; return *info; } int lhwork = *lwork - (*m)*nb; magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldda = *m; nbmin = 2; ldwork = *m; lddwork= k; // This is only blocked code for now for (i = 0; i < k; i += nb) { ib = min(k-i, nb); rows = *m -i; rows = *m; // Send the next panel (diagonal block of A1 & block column of A2) // to the CPU (in work_a1 and work_a2) magma_zgetmatrix_async( rows, ib, a2_ref(0,i), (*lda), work_a2, ldwork, stream[1] ); // a1_ref(i,i), (*lda)*sizeof(magmaDoubleComplex), // the diagonal of a1 is in d_ref generated and // passed from magma_zgeqrf_gpu magma_zgetmatrix_async( ib, ib, d_ref(i), ib, work_a1, ldwork, stream[1] ); if (i>0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ // update T2 cols = *n-old_i-2*old_ib; magma_zssrfb(*m, cols, &old_ib, a2_ref( 0, old_i), lda, t_ref(old_i), &lddwork, a1_ref(old_i, old_i+2*old_ib), lda, a2_ref( 0, old_i+2*old_ib), lda, dd_ref(0), &lddwork); } magma_queue_sync( stream[1] ); // TTT - here goes the CPU PLASMA code // Matrix T has to be put in hwork with lda = ib and 0s // in the parts that are not used - copied on GPU in t_ref(i) // Now diag of A1 is updated, send it back asynchronously to the GPU. // We have to play interchaning these copies to see which is faster magma_zsetmatrix_async( ib, ib, work_a1, ib, d_ref(i), ib, stream[0] ); // Send the panel from A2 back to the GPU magma_zsetmatrix( *m, ib, work_a2, ldwork, a2_ref(0,i), *lda ); if (i + ib < *n) { // Send the triangular factor T from hwork to the GPU in t_ref(i) magma_zsetmatrix( ib, ib, hwork, ib, t_ref(i), lddwork ); if (i+nb < k){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ // if we can do one more step, first update T1 magma_zssrfb(*m, ib, &ib, a2_ref(0, i), lda, t_ref(i), &lddwork, a1_ref(i, i+ib), lda, a2_ref(0, i+ib), lda, dd_ref(0), &lddwork); } else { cols = *n-i-ib; // otherwise, update until the end and fix the panel magma_zssrfb(*m, cols, &ib, a2_ref(0, i), lda, t_ref(i), &lddwork, a1_ref(i, i+ib), lda, a2_ref(0, i+ib), lda, dd_ref(0), &lddwork); } old_i = i; old_ib = ib; } } return *info; } /* magma_ztsqrt_gpu */
extern "C" magma_int_t magma_zgeqrf(magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) cuDoubleComplex *da, *dwork; cuDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_zgeqrf_nb(min(m, n)); magma_int_t lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_zgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info); } if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_zgeqrf_ooc(m, n, a, lda, tau, work, lwork, info); } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); dwork = da + ldda*(n); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ magma_zsetmatrix_async( (m), (n-nb), a_ref(0,nb), lda, da_ref(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i>0){ magma_zgetmatrix_async( (m-i), ib, da_ref(i,i), ldda, a_ref(i,i), lda, stream[1] ); magma_zgetmatrix_async( i, ib, da_ref(0,i), ldda, a_ref(0,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &ib); zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); magma_zsetmatrix( rows, ib, a_ref(i,i), lda, da_ref(i,i), ldda ); zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); if (i + ib < n) { magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork ); if (i+ib < k-nb) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) magma_zgetmatrix( m, ib, da_ref(0,i), ldda, a_ref(0,i), lda ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqrf */
/** Purpose ------- ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zcgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *hworkd; magmaFloatComplex *hworks; magmaDoubleComplex *tau; magmaFloatComplex *stau; magmaDoubleComplex_ptr dworkd; magmaFloatComplex_ptr dworks; magmaDoubleComplex_ptr dR, dT; magmaFloatComplex_ptr dSA, dSX, dST; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
extern "C" magma_int_t magma_zunmqr(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *C, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define A(a_1,a_2) ( A + (a_1) + (a_2)*lda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) magma_int_t nb = magma_get_zgeqrf_nb( min( m, n )); cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nq_i, lddwork; magma_int_t i; cuDoubleComplex T[ 2*nb*nb ]; magma_int_t i1, i2, step, ib, ic, jc, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = max(1,nw) * nb; work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, MagmaConjTransStr)) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } /* Allocate work space on the GPU */ magma_int_t lddc = m; cuDoubleComplex *dwork, *dC; magma_zmalloc( &dC, lddc*n ); magma_zmalloc( &dwork, (m + n + nb)*nb ); /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, C, ldc, dC, lddc ); if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, &tau[1], C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( (left && (! notran)) || ((! left) && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for( i=i1; (step<0 ? i>=i2 : i<i2); i += step ) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_zlarft("F", "C", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q('U', ib, A(i,i), lda, T+ib*ib); magma_zsetmatrix( nq_i, ib, A(i,i), lda, dwork, nq_i ); zq_to_panel('U', ib, A(i,i), lda, T+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dwork+nq_i*ib, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, nq_i, dwork+nq_i*ib, ib, dC(ic,jc), lddc, dwork+nq_i*ib + ib*ib, lddwork); } magma_zgetmatrix( m, n, dC, lddc, C, ldc ); } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); magma_free( dC ); magma_free( dwork ); return *info; } /* magma_zunmqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC( h_work, magmaDoubleComplex, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ magmaDoubleComplex *tau; TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; magmaDoubleComplex *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, magmaDoubleComplex, n2 ); // Q TESTING_MALLOC( h_W2, magmaDoubleComplex, n2 ); // R TESTING_MALLOC( h_W3, magmaDoubleComplex, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_zlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_zqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaDoubleComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_zgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_ZUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaDoubleComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaDoubleComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, k ); // C is full, m x n size = ldc*n; lapackf77_zlarnv( &ione, ISEED, &size, C ); lapackf77_zlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_zlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_zgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_zunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_Z_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_zlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zqr( magma_int_t m, magma_int_t n, magma_z_matrix A, magma_int_t lda, magma_z_matrix *Q, magma_z_matrix *R, magma_queue_t queue ) { magma_int_t info = 0; // local constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; // local variables magma_int_t inc = 1; magma_int_t k = min(m,n); magma_int_t ldt; magma_int_t nb; magmaDoubleComplex *tau = NULL; magmaDoubleComplex *dT = NULL; magmaDoubleComplex *dA = NULL; magma_z_matrix dR1 = {Magma_CSR}; // allocate CPU resources CHECK( magma_zmalloc_pinned( &tau, k ) ); // query number of blocks required for QR factorization nb = magma_get_zgeqrf_nb( m, n ); ldt = (2 * k + magma_roundup(n, 32)) * nb; CHECK( magma_zmalloc( &dT, ldt ) ); // get copy of matrix array if ( A.memory_location == Magma_DEV ) { dA = A.dval; } else { CHECK( magma_zmalloc( &dA, lda * n ) ); magma_zsetvector( lda * n, A.val, inc, dA, inc, queue ); } // QR factorization magma_zgeqrf_gpu( m, n, dA, lda, tau, dT, &info ); // construct R matrix if ( R != NULL ) { if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( R, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, R->dval, lda, queue ); } else { CHECK( magma_zvinit( &dR1, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, dR1.dval, lda, queue ); CHECK( magma_zvinit( R, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dR1.dval, inc, R->val, inc, queue ); } } // construct Q matrix if ( Q != NULL ) { magma_zungqr_gpu( m, n, k, dA, lda, tau, dT, nb, &info ); if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( Q, Magma_DEV, lda, n, c_zero, queue ) ); magma_zcopyvector( lda * n, dA, inc, Q->dval, inc, queue ); } else { CHECK( magma_zvinit( Q, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dA, inc, Q->val, inc, queue ); } } cleanup: if( info != 0 ){ magma_zmfree( Q, queue ); magma_zmfree( R, queue ); magma_zmfree( &dR1, queue ); } // free resources magma_free_pinned( tau ); magma_free( dT ); if ( A.memory_location == Magma_CPU ) { magma_free( dA ); } return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 2. * opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF3_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork= k; /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the block diagonal submatrices from dT to R 2. Solve 3. Restore the data format moving data from R back to dT */ magmablas_zswapdblk(k, nb, dA(0,0), ldda, 1, dT(0), nb, 0); if ( nrhs == 1 ) { magma_ztrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1); } else { magma_ztrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb); } magmablas_zswapdblk(k, nb, dT(0), nb, 0, dA(0,0), ldda, 1); return *info; }
extern "C" magma_int_t magma_zgels3_gpu( char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments ========= TRANS (input) CHARACTER*1 = 'N': the linear system involves A. Only trans='N' is currently handled. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF3. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) magmaDoubleComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( (trans != 'N') && (trans != 'n' ) ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
extern "C" magma_int_t magma_zgeqrf_ooc(magma_int_t m, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF_OOC computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This is an out-of-core (ooc) version that is similar to magma_zgeqrf but the difference is that this version can use a GPU even if the matrix does not fit into the GPU memory at once. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) magmaDoubleComplex *da, *dwork; magmaDoubleComplex c_one = MAGMA_Z_ONE; int k, lddwork, ldda; *info = 0; int nb = magma_get_zgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Check how much memory do we have */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); magma_int_t IB, NB = (magma_int_t)(0.8*freeMem/m); NB = (NB / nb) * nb; if (NB >= n) return magma_zgeqrf(m, n, a, lda, tau, work, lwork, info); k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((NB+31)/32)*32+nb; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &da, (NB + nb)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); // magmablasSetKernelStream(stream[1]); magmaDoubleComplex *ptr = da + ldda * NB; dwork = da + ldda*(NB + nb); /* start the main loop over the blocks that fit in the GPU memory */ for(int i=0; i<n; i+=NB) { IB = min(n-i, NB); //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB); /* 1. Copy the next part of the matrix to the GPU */ magma_zsetmatrix_async( (m), IB, a_ref(0,i), lda, da_ref(0,0), ldda, stream[0] ); magma_queue_sync( stream[0] ); /* 2. Update it with the previous transformations */ for(int j=0; j<min(i,k); j+=nb) { magma_int_t ib = min(k-j, nb); /* Get a panel in ptr. */ // 1. Form the triangular factor of the block reflector // 2. Send it to the GPU. // 3. Put 0s in the upper triangular part of V. // 4. Send V to the GPU in ptr. // 5. Update the matrix. // 6. Restore the upper part of V. magma_int_t rows = m-j; lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib); magma_zsetmatrix_async( ib, ib, work, ib, dwork, lddwork, stream[1] ); zpanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); magma_zsetmatrix_async( rows, ib, a_ref(j,j), lda, ptr, rows, stream[1] ); magma_queue_sync( stream[1] ); magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, IB, ib, ptr, rows, dwork, lddwork, da_ref(j, 0), ldda, dwork+ib, lddwork); zq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); } /* 3. Do a QR on the current part */ if (i<k) magma_zgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info); /* 4. Copy the current part back to the CPU */ magma_zgetmatrix_async( (m), IB, da_ref(0,0), ldda, a_ref(0,i), lda, stream[0] ); } magma_queue_sync( stream[0] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqrf_ooc */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zcgeqrsv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs; double error, gpu_error, cpu_error, Anorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R; magmaDoubleComplex_ptr d_A, d_B, d_X, d_T; magmaFloatComplex *d_SA, *d_SB; magmaDoubleComplex *h_workd, *tau, tmp[1]; magmaFloatComplex *h_works; magma_int_t lda, ldb, lhwork, lworkgpu; magma_int_t ldda, lddb, lddx; magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS double double single mixed Iter CPU GPU \n"); printf("=============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32) * 32; lddb = ((max_mn+31)/32)*32; lddx = ((N+31)/32) * 32; nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) ); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork ); h_works = (magmaFloatComplex*)h_workd; TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs ); TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb ); /* Initialize the matrices */ size = lda*N; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcgeqrsv_gpu( M, N, nrhs, d_A, ldda, d_B, lddb, d_X, lddx, &qrsv_iters, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zcgeqrsv returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb ); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_workd, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfd = gflops / gpu_time; //===================================================================== // Single Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); /* The allocation of d_SA and d_SB is done here to avoid * to double the memory used on GPU with zcgeqrsv */ TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs ); magmablas_zlag2c( M, N, d_A, ldda, d_SA, ldda, &info ); magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info ); gpu_time = magma_wtime(); magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda, d_SB, lddb, h_works, lhwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfs = gflops / gpu_time; TESTING_FREE_DEV( d_SA ); TESTING_FREE_DEV( d_SB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f %7.2f %7.2f %7.2f %4d %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, (int) nrhs, cpu_perf, gpu_perfd, gpu_perfs, gpu_perf, (int) qrsv_iters, cpu_error, gpu_error, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_T ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF3_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF3_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define d_ref(a_1) (dT+(lddwork+(a_1))*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork= k; /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the block diagonal submatrices from d_ref to R 2. Solve 3. Restore the data format moving data from R back to d_ref */ magmablas_zswapdblk(k, nb, a_ref(0,0), ldda, 1, d_ref(0), nb, 0); if ( nrhs == 1 ) { magma_ztrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, a_ref(0,0), ldda, dB, 1); } else { magma_ztrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, a_ref(0,0), ldda, dB, lddb); } magmablas_zswapdblk(k, nb, d_ref(0), nb, 0, a_ref(0,0), ldda, 1); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrs */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, matnorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *d_B; magma_int_t M, N, n2, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===============================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; if ( M < N ) { printf( "skipping M=%d, N=%d because M < N is not yet supported.\n", (int) M, (int) N ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; n2 = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; // query for workspace size lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; lhwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = -1; lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_work, magmaDoubleComplex, lhwork ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS n2 = M*nrhs; lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //n2 = N*nrhs; //lapackf77_zlarnv( &ione, ISEED, &n2, h_X ); //blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*matnorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*matnorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error ); printf("%s\n", (gpu_error < tol ? "" : " failed")); status |= ! (gpu_error < tol); TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgeqrf(magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) magmaDoubleComplex *dA, *dwork, *dT; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_zgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_zgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_zgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[3], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); magma_queue_create( &stream[2] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_zsetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[2] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i>0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_zgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[2] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); zpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* download the i-th V matrix */ magma_zsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_zsetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); zq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); zq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i != 0) { magma_zgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda ); } magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[2] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( dA ); return *info; } /* magma_zgeqrf */