extern "C" magma_int_t magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date July 2014 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= CNTXT (input) MAGMA_CONTEXT CNTXT specifies the MAGMA hardware context for this routine. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using cudaMallocHost. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -8, the GPU memory allocation failed Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) int cnt=-1; cuDoubleComplex c_one = MAGMA_Z_ONE; int i, k, lddwork, old_i, old_ib; int nbmin, nx, ib, ldda; *info = 0; magma_qr_params *qr_params = (magma_qr_params *)cntxt->params; int nb = qr_params->nb; int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); long int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) return MAGMA_SUCCESS; k = min(m,n); if (k == 0) { work[0] = c_one; return MAGMA_SUCCESS; } cublasStatus status; static cudaStream_t stream[2]; cudaStreamCreate(&stream[0]); cudaStreamCreate(&stream[1]); nbmin = 2; nx = nb; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; cuDoubleComplex *da; status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da); if (status != CUBLAS_STATUS_SUCCESS) { *info = -8; return 0; } cuDoubleComplex *dwork = da + ldda*(n); if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex), a_ref(0,nb), lda *sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m), (n-nb), cudaMemcpyHostToDevice,stream[0]); old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); if (i>0){ cudaMemcpy2DAsync( a_ref(i,i), lda *sizeof(cuDoubleComplex), da_ref(i,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m-i), ib, cudaMemcpyDeviceToHost,stream[1]); cudaMemcpy2DAsync( a_ref(0,i), lda *sizeof(cuDoubleComplex), da_ref(0,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*i, ib, cudaMemcpyDeviceToHost,stream[0]); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } cudaStreamSynchronize(stream[1]); int rows = m-i; cnt++; cntxt->nb = qr_params->ib; magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); cntxt->nb = nb; /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib); if (cnt < qr_params->np_gpu) { qr_params->p[cnt]=a; } zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex), a_ref(i,i), lda, da_ref(i,i), ldda); if (qr_params->flag == 1) zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); if (i + ib < n) { cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork); if (i+ib < k-nx) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) cublasGetMatrix(m, ib, sizeof(cuDoubleComplex), da_ref(0,i), ldda, a_ref(0,i), lda); int rows = m-i; cnt++; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); if (cnt < qr_params->np_gpu) { int ib2=min(ib,nb); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2); qr_params->p[cnt]=a; } } cudaStreamDestroy( stream[0] ); cudaStreamDestroy( stream[1] ); cublasFree(da); return MAGMA_SUCCESS; } /* magma_zgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_A2, *h_A3, *h_work, *h_work2, *tau, *d_work2; cuDoubleComplex *d_A, *d_work; float gpu_perf, cpu_perf, cpu2_perf; double flops; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda, M=0; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, j, info[1]; magma_int_t loop = argc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; magma_int_t num_gpus = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf_mc -M %d -N %d \n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; magma_int_t min_mn = min(M,N); /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } magma_int_t lwork = n2; h_work2 = (cuDoubleComplex*)malloc(lwork * sizeof(cuDoubleComplex)); if (h_work2 == 0) { fprintf (stderr, "!!!! host memory allocation error (h_work2)\n"); } h_A3 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A3[0])); if (h_A3 == 0) { fprintf (stderr, "!!!! host memory allocation error (A3)\n"); } tau = (cuDoubleComplex*)malloc(min_mn * sizeof(cuDoubleComplex)); if (tau == 0) { fprintf (stderr, "!!!! host memory allocation error (tau)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" M N LAPACK Gflop/s Multi-core Gflop/s ||R||_F / ||A||_F\n"); printf("===========================================================================\n"); for(i=0; i<10; i++){ if (loop == 1) { M = N = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A2 ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A2, &M, h_A3, &M ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zgeqrf(&M, &N, h_A3, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu2_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using multicore =================================================================== */ start = get_current_time(); magma_zgeqrf_mc(context, &M, &N, h_A2, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; matnorm = lapackf77_zlange("f", &M, &N, h_A2, &M, work); blasf77_zaxpy(&n2, &mone, h_A2, &one, h_A3, &one); printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu2_perf, cpu_perf, lapackf77_zlange("f", &M, &N, h_A3, &M, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A2); free(tau); free(h_A3); free(h_work2); /* Shut down the MAGMA context */ magma_finalize(context); }