/** Purpose ------- CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. @param[in] T COMPLEX array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_cgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in CGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t d, i, ib, j, jb, ki, kk; magmaFloatComplex *work=NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = magma_roundup( m, 32 ); magma_int_t lddwork = magma_roundup( n, 32 ); magma_int_t min_lblocks = (n / nb) / ngpu; // min. blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t queues[ MagmaMaxGPUs ] = { NULL }; for( d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( d, &queues[d] ); } trace_init( 1, ngpu, 1, queues ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for larfb work // m*nb for V // nb*nb for T lwork = (n + m + nb) * nb; magma_cmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } magmaFloatComplex *work_T, *work_V; work_T = work + n*nb; work_V = work + n*nb + nb*nb; // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; // cungqr requires less workspace (n*nb), but is slow if k < cungqr's block size. // replacing it with the 4 routines below is much faster (e.g., 60x). //magma_int_t iinfo; //lapackf77_cungqr( &m_kk, &n_kk, &k_kk, // A(kk, kk), &lda, // &tau[kk], work, &lwork, &iinfo ); lapackf77_clacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk); lapackf77_claset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, work_V, &m_kk, &tau[kk], work_T, &k_kk); lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk ); if (kk > 0) { for( j=kk; j < n; j += nb ) { jb = min( n-j, nb ); d = (j / nb) % ngpu; di = ((j / nb) / ngpu) * nb; magma_setdevice( d ); magma_csetmatrix( m_kk, jb, A(kk, j), lda, dA(d, kk, di), ldda, queues[d] ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] ); } } trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] ); trace_gpu_end( d, 0 ); } // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to dV on the GPUs lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, queues[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_claset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] ); magmablas_claset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda, queues[dpanel] ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork, queues[d] ); trace_gpu_end( d, 0 ); } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues ); trace_cpu_end( 0 ); } #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif cleanup: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( queues[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); return *info; } /* magma_cungqr */
extern "C" magma_int_t magma_cungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. T (input) COMPLEX array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_cgeqrf_gpu (except stored on the CPU, not the GPU). NB (input) INTEGER This is the block size used in CGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t i, ib, ki, kk, iinfo; magmaFloatComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; int dpanel; int ngpu = magma_num_gpus(); int doriginal; magma_getdevice( &doriginal ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = ((m + 31) / 32) * 32; magma_int_t lddwork = ((n + 31) / 32) * 32; magma_int_t min_lblocks = (n / nb) / ngpu; // min. blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t stream[ MagmaMaxGPUs ] = { NULL }; for( int d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block , cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( &stream[d] ); } trace_init( 1, ngpu, 1, stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for cungqr workspace lwork = n * nb; magma_cmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; dpanel = (kk / nb) % ngpu; di = ((kk / nb) / ngpu) * nb; magma_setdevice( dpanel ); lapackf77_cungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); magma_csetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(dpanel, kk, di), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaUpperLower, kk, n - kk, dA(dpanel, 0, di), ldda ); trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] ); trace_gpu_end( d, 0 ); } // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to the GPUs lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, stream[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); magmablasSetKernelStream( stream[dpanel] ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_claset( MagmaUpperLower, i, ib, dA(dpanel, 0, di), ldda ); magmablas_claset_identity( mi, ib, dA(dpanel, i, di), ldda ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( stream[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork ); trace_gpu_end( d, 0 ); } } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb ); trace_cpu_end( 0 ); #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif CLEANUP: for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( NULL ); magma_free( dA[d] ); dA[d] = NULL; if ( stream[d] != NULL ) { magma_queue_destroy( stream[d] ); } } magma_free_cpu( work ); magma_setdevice( doriginal ); return *info; } /* magma_cungqr */
extern "C" magma_int_t magma_cgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CGEQRF4 computes a QR factorization of a COMPLEX M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUs to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_cgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ magmaFloatComplex *da[MagmaMaxGPUs]; magmaFloatComplex c_one = MAGMA_C_ONE; int i, k, ldda; *info = 0; int nb = magma_get_cgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus <0 || num_gpus > 4) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_csetmatrix_1D_col_bcyclic(m, n, a, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_cgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_cgetmatrix_1D_col_bcyclic(m, n, da, ldda, a, lda, num_gpus, nb); } else { lapackf77_cgeqrf(&m, &n, a, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_free( da[i] ); } return *info; } /* magma_cgeqrf4 */
/***************************************************************************//** Purpose ------- CGEQRF computes a QR factorization of a COMPLEX M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_cgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf *******************************************************************************/ extern "C" magma_int_t magma_cgeqrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info ) { magmaFloatComplex *da[MagmaMaxGPUs]; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t i, min_mn, ldda; *info = 0; magma_int_t nb = magma_get_cgeqrf_nb( m, n ); magma_int_t lwkopt = n * nb; work[0] = magma_cmake_lwork( lwkopt ); bool lquery = (lwork == -1); if (ngpu < 0 || ngpu > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); ldda = magma_roundup( m, 32 ); magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < ngpu; i++) { n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { magma_queue_t queues[MagmaMaxGPUs]; for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev] ); } /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_csetmatrix_1D_col_bcyclic( ngpu, m, n, nb, A, lda, da, ldda, queues ); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); } /* Factor using the GPU interface */ magma_cgeqrf2_mgpu( ngpu, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_cgetmatrix_1D_col_bcyclic( ngpu, m, n, nb, da, ldda, A, lda, queues ); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); magma_queue_destroy( queues[dev] ); } } else { lapackf77_cgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_free( da[i] ); } magma_setdevice( orig_dev ); return *info; } /* magma_cgeqrf4 */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/** Purpose ------- CGEQRF4 computes a QR factorization of a COMPLEX M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_cgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info ) { magmaFloatComplex *da[MagmaMaxGPUs]; magmaFloatComplex c_one = MAGMA_C_ONE; int i, k, ldda; *info = 0; int nb = magma_get_cgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < num_gpus; i++) { n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_csetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_cgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_cgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb); } else { lapackf77_cgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); magma_free( da[i] ); } return *info; } /* magma_cgeqrf4 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; float tol, eps = lapackf77_slamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_cgeqrf_nb( M ); gflops = FLOPS_CGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_cgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_C_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaFloatComplex, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { magmaFloatComplex *tau2; TESTING_MALLOC_CPU( tau2, magmaFloatComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_cgeqrf( &M, &N, h_A, &M, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; magmaFloatComplex *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, magmaFloatComplex, n2 ); // Q TESTING_MALLOC_CPU( h_W2, magmaFloatComplex, n2 ); // R TESTING_MALLOC_CPU( h_W3, magmaFloatComplex, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); lapackf77_cqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_clange("f", &M, &N, h_A, &lda, work ); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_clange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ){ magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magmaFloatComplex *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_cgetrf_nb( M ); gflops = FLOPS_CGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaFloatComplex, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_csetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_cgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }