int main(int argc, char **argv) { const char *fin = argv[1]; const char *fout1 = argv[2]; const char *fout2 = argv[3]; struct timeval timer1, timer2; size_t D_dims[2], E_dims[2], Q_dims[2]; double *D = read_mat(fin, "D", D_dims); double *E = read_mat(fin, "E", E_dims); double *Q; int N = (D_dims[0] > D_dims[1]) ? (int)D_dims[0] : (int)D_dims[1]; double *WORK; int *IWORK; Q_dims[0] = Q_dims[1] = (size_t)N; Q = (double *)malloc(N * N * sizeof(double)); WORK = (double *)malloc((2 * N + 2 * N * N) * sizeof(double)); IWORK = (int *)malloc((3 + 5 * N) * sizeof(int)); gettimeofday(&timer1, NULL); dlaed0(N, D, E, Q, N, WORK, IWORK); gettimeofday(&timer2, NULL); printf("Time: %.3lf s\n", GetTimerValue(timer1, timer2) / 1000.0 ); write_mat(fout1, "D", D, D_dims); write_mat(fout2, "Q", Q, Q_dims); return 0; }
void kernel_main(void) { //Initialize process manager, setup timers, get threading up and running // Enumerate and initialize drivers // Load UI elf from disk // Switch to usermode // Execute UI //Seed the rng with the timer value seed(GetTimerValue()); InterruptMan_Initialize(); SyscallMan_Initialize(); Syscall_Initialize(); DeviceManager_Initialize(); //smp_unlock_cores(); SetupPreemption(); target_device_setup(); UID cpid = 0; if(CreateProcess(ROOT_PID, 0, &cpid) != ProcessErrors_None) HaltProcessor(); load_exec(cpid, "userboot.bin"); while(1) WakeReadyThreads(); }
EFI_STATUS EFIAPI EndGauge ( IN EFI_PERFORMANCE_PROTOCOL *This, IN EFI_HANDLE Handle, IN UINT16 *Token, IN UINT16 *Host, IN UINT64 Ticker ) /*++ Routine Description: End all unfinished gauge data node that match specified handle, token and host. Arguments: This - Calling context Handle - Handle to stop Token - Token to stop Host - Host to stop Ticker - End tick, if 0 then get current timer Returns: EFI_NOT_FOUND - Node not found EFI_SUCCESS - Gauge data node successfully ended. --*/ { EFI_PERFORMANCE_INSTANCE *PerfInstance; EFI_PERF_DATA_LIST *Node; UINT64 TimerValue; TimerValue = 0; PerfInstance = EFI_PERFORMANCE_FROM_THIS (This); Node = GetDataNode (Handle, Token, Host, NULL, NULL); if (!Node) { return EFI_NOT_FOUND; } while (Node->GaugeData.EndTick != 0) { Node = GetDataNode (Handle, Token, Host, NULL, &(Node->GaugeData)); if (!Node) { return EFI_NOT_FOUND; } } if (Ticker != 0) { TimerValue = Ticker; } else { GetTimerValue (&TimerValue); } Node->GaugeData.EndTick = TimerValue; return EFI_SUCCESS; }
void magma_gettimervalue_f(unsigned int *start, unsigned int *end, double *result) { magma_timestr_t time1, time2; time1.sec = start[0]; time1.usec = start[1]; time2.sec = end[0]; time2.usec = end[1]; *result = GetTimerValue(time1, time2); }
////////////////////////////////////////////////////////////// // ZSTEDC Divide and Conquer for tridiag ////////////////////////////////////////////////////////////// extern "C" void magma_zstedx_withZ(magma_int_t N, magma_int_t NE, double *D, double * E, magmaDoubleComplex *Z, magma_int_t LDZ) { double *RWORK; double *dwork; magma_int_t *IWORK; magma_int_t LWORK, LIWORK, LRWORK; magma_int_t INFO; LWORK = N; LRWORK = 2*N*N+4*N+1+256*N; LIWORK = 256*N; RWORK = (double*) malloc( LRWORK*sizeof( double) ); IWORK = (magma_int_t*) malloc( LIWORK*sizeof( magma_int_t) ); if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*N*(N/2 + 1) )) { printf("=================================================\n"); printf("ZSTEDC ERROR OCCURED IN CUDAMALLOC\n"); printf("=================================================\n"); return; } printf("using magma_zstedx\n"); #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif char job = 'I'; if(NE==N) job = 'A'; magma_zstedx(job, N, 0.,0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO); if(INFO!=0){ printf("=================================================\n"); printf("ZSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO); printf("=================================================\n"); //assert(INFO==0); } #ifdef ENABLE_TIMER end = get_current_time(); printf("time zstevx = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_free( dwork ); magma_free_cpu( IWORK ); magma_free_cpu( RWORK ); }
int magma_solve ( int *dA_dim, int *lWork, double2 *A, int *ipiv, int *N ){ // Check inputs // fprintf (stderr, "Using MAGMA solve\n" ); fprintf (stderr, " dA_dim: %i\n", *dA_dim ); fprintf (stderr, " N: %i\n", *N ); fprintf (stderr, " lWork: %i\n", *lWork ); cuInit(0); cublasInit(); printout_devices(); cublasStatus status; double2 *d_A, *work; status = cublasAlloc ( *dA_dim, sizeof(double2), (void**)&d_A ); if ( status != CUBLAS_STATUS_SUCCESS ){ fprintf (stderr, "ERROR: device memory allocation error (d_A)\n" ); fprintf (stderr, "ERROR: dA_dim: %i\n", dA_dim ); } cudaError_t err; err = cudaMallocHost ( (void**)&work, *lWork * sizeof(double2) ); if(err != cudaSuccess){ fprintf (stderr, "ERROR: cudaMallocHost error (work)\n" ); } int info[1]; TimeStruct start, end; start = get_current_time (); magma_zgetrf ( N, N, A, N, ipiv, work, d_A, info ); end = get_current_time (); double gpu_perf; gpu_perf = 4.*2.*(*N)*(*N)*(*N)/(3.*1000000*GetTimerValue(start,end)); if ( info[0] != 0 ){ fprintf (stderr, "ERROR: magma_zgetrf failed\n" ); } printf(" GPU performance: %6.2f GFlop/s\n", gpu_perf); int stat = 0; return stat; }
/******************************************************************************* Purpose: Check timeout. Re-design in 20 May 2004, Yu Wei. It will use relative timer. Input unTimeValue -- Timeout value. (in) Return 1 - Timeout. 0 - Not timeout. *******************************************************************************/ bool CheckTimeOut(unsigned int unTimeValue) { struct tTimerValue tTime; unsigned int unTimerDiff = 0; GetTimerValue(&tTime); unTimerDiff = ( tTime.High - g_tRMMTimeOut.High ) * TIMER_LOW_MSEC; //Get high part different. unTimerDiff += tTime.Low; //Added low part different. unTimerDiff -= g_tRMMTimeOut.Low; #if ((defined CFG_DEBUG_MSG) && _CFG_DEBUG_RMM) printf("[RMM] CheckTimeOut, unTimerDiff = %d, unTimeValue = %d\n", unTimerDiff, unTimeValue); #endif // ((defined CFG_DEBUG_MSG) && (CFG_DEBUG_RMM_STATE)) if (unTimerDiff >= unTimeValue) //Check timeout. return true; else return false; }
int main(int argc, char** argv) { magma_init(); magma_timestr_t start , end; double gpu_time ; double *c; int dim[] = {20000,30000,40000}; int i,n; n = sizeof(dim) / sizeof(dim[0]); for(i=0; i < n; i++) { magma_int_t m = dim[i]; magma_int_t mm=m*m; magma_err_t err; err = magma_dmalloc_cpu ( &c , mm ); //generate random symmetric, positive matrix double *ml = generate_sym_matrix(m); start = get_current_time(); //find the inverse matrix for MxM symmetric, positive definite matrix using the cholesky decomposition. //Compute GPU cholesky decomposition with CPU interface c = cholesky(ml, m); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1e3; printf("gpu time for %dx%d: %7.5f sec\n", m, m, gpu_time); //copy upper diag copy_upper_diag(c,m); free(c); } magma_finalize (); return 0; }
extern "C" magma_int_t magma_ssyevdx_2stage(char jobz, char range, char uplo, magma_int_t n, float *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 2). If JOBZ = 'V' and N > 1, LWORK >= LQ2 + 1 + 6*N + 2*N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by MAGMA_BULGE_GET_LQ2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; float d_one = 1.; magma_int_t ione = 1; magma_int_t izero = 0; float d__1; float eps; float anrm; magma_int_t imax; float rmin, rmax; float sigma; magma_int_t lwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; float safmin; float bignum; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float* dwork; /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame( range_, "A" ); valeig = lapackf77_lsame( range_, "V" ); indeig = lapackf77_lsame( range_, "I" ); lquery = lwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_sbulge_nb(n, threads); magma_int_t Vblksiz = magma_sbulge_get_Vblksiz(n, nb, threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_sbulge_get_lq2(n, threads); if (wantz) { lwmin = lq2 + 1 + 6 * n + 2 * n * n; liwmin = 5 * n + 3; } else { lwmin = lq2 + n * (nb + 1); liwmin = 1; } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((liwork < liwmin) && ! lquery) { *info = -16; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = a[0]; if (wantz) { a[0] = MAGMA_S_ONE; } return *info; } #ifdef ENABLE_TIMER printf("using %d threads\n", threads); #endif /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if( ( ntiles < 2 ) || ( n <= 128 ) ){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int ) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_ssyevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_slansy("M", uplo_, &n, a, &lda, work); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_slascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } magma_int_t inde = 0; magma_int_t indT2 = inde + n; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; magma_int_t indwk2 = indwrk + n * n; magma_int_t llwork = lwork - indwrk; magma_int_t llwrk2 = lwork - indwk2; #ifdef ENABLE_TIMER magma_timestr_t start, st1, st2, end; start = get_current_time(); #endif float *dT1; if (MAGMA_SUCCESS != magma_smalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssytrd_sy2sb(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, dT1, threads, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time ssytrd_sy2sb = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); float* A2 = &work[indwrk]; memset(A2 , 0, n*lda2*sizeof(float)); for (magma_int_t j = 0; j < n-nb; j++) { cblas_scopy(nb+1, &a[j*(lda+1)], 1, &A2[j*lda2], 1); memset(&a[j*(lda+1)], 0, (nb+1)*sizeof(float)); a[nb + j*(lda+1)] = d_one; } for (magma_int_t j = 0; j < nb; j++) { cblas_scopy(nb-j, &a[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1); memset(&a[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(float)); } #ifdef ENABLE_TIMER st2 = get_current_time(); printf(" time ssytrd_convert = %6.2f\n" , GetTimerValue(st1,st2)/1000.); #endif magma_ssytrd_sb2st(threads, uplo, n, nb, Vblksiz, A2, lda2, w, &work[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time ssytrd_sy2st = %6.2f\n" , GetTimerValue(st2,end)/1000.); printf(" time ssytrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { #ifdef ENABLE_TIMER start = get_current_time(); #endif lapackf77_ssterf(&n, w, &work[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sstedc = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_sstedx(range, n, vl, vu, il, iu, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif float *dZ; magma_int_t lddz = n; float *da; magma_int_t ldda = n; magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); if (MAGMA_SUCCESS != magma_smalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_sbulge_back(threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time sbulge_back = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif magma_ssetmatrix( n, n, a, lda, da, ldda ); magma_sormqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, da+nb, ldda, dZ+nb, n, dT1, nb, info); magma_sgetmatrix( n, *m, dZ, lddz, a, lda ); magma_free(dT1); magma_free(dZ); magma_free(da); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sormqr + copy = %6.2f\n", GetTimerValue(st1,end)/1000.); printf(" time eigenvectors backtransf. = %6.2f\n" , GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_zheevdx_2stage */
/******************************************************************************* Purpose: Reset timeout counter. Re-design in 20 May 2004, Yu Wei. It will use relative timer. *******************************************************************************/ void ActivateTimeOut(void) { GetTimerValue(&g_tRMMTimeOut); }
extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* q, magma_int_t ldq, float rho, float* dlamda, float* q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { /* Purpose ======= SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= K (input) INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. N (input) INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N>K). N1 (input) INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. D (output) REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. Q (output) REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. LDQ (input) INTEGER The leading dimension of the array Q. LDQ >= max(1,N). RHO (input) REAL The value of the parameter in the rank one update equation. RHO >= 0 required. DLAMDA (input/output) REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. Q2 (input) REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. INDX (input) INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. CTOT (input) INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. W (input/output) REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. S (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. INDXQ (output) INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = 1, an eigenvalue did not converge Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. ===================================================================== */ float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; char range_[] = {range, 0}; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i,iq2,j,n12,n2,n23,tmp,lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); *info = 0; if(k < 0) *info=-1; else if(n < k) *info=-2; else if(ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if(*info != 0){ magma_xerbla(__func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if(k == 0) return MAGMA_SUCCESS; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for(i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = ib; j < ie; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0){ #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if(*info == 0){ #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2){ #pragma omp single { for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if(k != 1){ // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for(j = 0; j < k; ++j){ magma_int_t i_tmp = min(j, ie); for(i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for(i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if(id < tot){ ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else{ ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for(j = ib; j < ie; ++j){ for(i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s+id*k, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return MAGMA_SUCCESS; //?????? #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif for(i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = 0; j < k; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0) *info=iinfo; } if(*info != 0) return MAGMA_SUCCESS; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2){ for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if(k != 1){ // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, q, &tmp, w, &ione); for(j = 0; j < k; ++j){ for(i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for(i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for(j = iil-1; j < iiu; ++j){ for(i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #endif //_OPENMP // Compute the updated eigenvectors. #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER start = get_current_time(); #endif magma_queue_sync( NULL ); if (rk != 0){ if( n23 != 0 ){ if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm('N', 'N', n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if( n12 != 0 ) { if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm('N', 'N', n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.); #endif return MAGMA_SUCCESS; } /*magma_slaed3*/
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); cuFloatComplex *h_A, *h_R, *h_work; float *rwork, *w1, *w2; magma_int_t *iwork; float gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, info; magma_int_t ione = 1, izero = 0; magma_int_t ISEED[4] = {0,0,0,1}; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVectorsStr; magma_int_t checkres; float result[3], eps = lapackf77_slamch( "E" ); if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); } else if ( strcmp("-JV", argv[i]) == 0 ) { jobz = MagmaVectorsStr; } else if ( strcmp("-JN", argv[i]) == 0 ) { jobz = MagmaNoVectorsStr; } } if (N>0) printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; if ( checkres && jobz[0] == MagmaNoVectors ) { printf( "Cannot check results when vectors are not computed (jobz='N')\n" ); checkres = false; } /* Query for workspace sizes */ cuFloatComplex aux_work[1]; float aux_rwork[1]; magma_int_t aux_iwork[1]; magma_cheevd( jobz[0], uplo[0], N, h_R, N, w1, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); magma_int_t lwork, lrwork, liwork; lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuFloatComplex, N*N ); TESTING_MALLOC( w1, float , N ); TESTING_MALLOC( w2, float , N ); TESTING_HOSTALLOC( h_R, cuFloatComplex, N*N ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); TESTING_MALLOC( rwork, float, lrwork ); TESTING_MALLOC( iwork, magma_int_t, liwork ); printf(" N CPU Time(s) GPU Time(s) \n"); printf("===================================\n"); for(i=0; i<8; i++){ if (argc==1){ N = size[i]; } n2 = N*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for( int i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* warm up run */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* query for optimal workspace sizes */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, -1, rwork, -1, iwork, -1, &info); int lwork_save = lwork; int lrwork_save = lrwork; int liwork_save = liwork; lwork = min( lwork, (magma_int_t) MAGMA_C_REAL( h_work[0] )); lrwork = min( lrwork, (magma_int_t) rwork[0] ); liwork = min( liwork, iwork[0] ); //printf( "lwork %d, query %d, used %d; liwork %d, query %d, used %d\n", // lwork_save, (magma_int_t) h_work[0], lwork, // liwork_save, iwork[0], liwork ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; lwork = lwork_save; lrwork = lrwork_save; liwork = liwork_save; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; cuFloatComplex *tau; lapackf77_chet21(&ione, uplo, &N, &izero, h_A, &N, w1, w1, h_R, &N, h_R, &N, tau, h_work, rwork, &result[0]); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); magma_cheevd('N', uplo[0], N, h_R, N, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cheevd(jobz, uplo, &N, h_A, &N, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of cheevd had an illegal value.\n", (int) -info); cpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); if ( checkres ){ printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %e\n", result[0]*eps); printf("(2) | I - U'U | / N = %e\n", result[1]*eps); printf("(3) | S(w/ U)-S(w/o U)|/ |S| = %e\n\n", result[2]); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_FREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, gpu_perf, cpu_perf; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, 1024, 1024); M = N = size[9]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, lda*N ); TESTING_MALLOC( h_A2, double, lda*N ); TESTING_MALLOC( h_B, double, ldb*nrhs ); TESTING_MALLOC( h_X, double, ldb*nrhs ); TESTING_MALLOC( h_R, double, ldb*nrhs ); TESTING_DEVALLOC( d_A, double, ldda*N ); TESTING_DEVALLOC( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC( h_work, double, lhwork ); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; flops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1000000; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); start = get_current_time(); magma_dgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); start = get_current_time(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); end = get_current_time(); cpu_perf = flops / GetTimerValue(start, end); if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", (int) -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
int main(int argc, char **argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; FILE *fp ; magma_int_t i, lda, Xm, Ym; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t szeA, szeX, szeY; magma_int_t istart = 64; magma_int_t iend = 10240; magma_int_t incx = 1; magma_int_t incy = 1; char trans = MagmaNoTrans; cuDoubleComplex alpha = MAGMA_Z_MAKE(1., 0.); // MAGMA_Z_MAKE( 1.5, -2.3 ); cuDoubleComplex beta = MAGMA_Z_MAKE(0., 0.); // MAGMA_Z_MAKE( -0.6, 0.8 ); cuDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; cuDoubleComplex *dA, *dX, *dY; if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-n", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-m", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if (strcmp("-N", argv[i])==0){ trans = MagmaNoTrans; } else if (strcmp("-T", argv[i])==0){ trans = MagmaTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-C", argv[i])==0){ trans = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) ) iend = istart + 1; M = N = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; szeA = lda*N; szeX = incx*Xm; szeY = incy*Ym; TESTING_MALLOC( A, cuDoubleComplex, szeA ); TESTING_MALLOC( X, cuDoubleComplex, szeX ); TESTING_MALLOC( Y, cuDoubleComplex, szeY ); TESTING_MALLOC( Ycublas, cuDoubleComplex, szeY ); TESTING_MALLOC( Ymagma, cuDoubleComplex, szeY ); TESTING_DEVALLOC( dA, cuDoubleComplex, szeA ); TESTING_DEVALLOC( dX, cuDoubleComplex, szeX ); TESTING_DEVALLOC( dY, cuDoubleComplex, szeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &szeA, A ); lapackf77_zlarnv( &ione, ISEED, &szeX, X ); lapackf77_zlarnv( &ione, ISEED, &szeY, Y ); fp = fopen ("results_zgemv.txt", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("\nUsage: \n"); printf(" testing_zgemv [-N|T|C] [-m %d] [-n %d]\n\n", 1024, 1024); printf( " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); for( i=istart; i < iend; i = (int)((i+1)*1.1) ) { M = N = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; flops = FLOPS( (double)M, (double)N ) / 1000000; printf( "%5d %5d ", (int) M, (int) N ); fprintf( fp, "%5d %5d ", (int) M, (int) N ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); /* * Cublas Version */ start = get_current_time(); cublasZgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incy, Ycublas, incy ); cuda_perf = flops / GetTimerValue(start, end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f", cuda_perf ); /* * Magma Version */ magma_zsetvector( Ym, Y, incy, dY, incy ); start = get_current_time(); magmablas_zgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incx, Ymagma, incx ); magma_perf = flops / GetTimerValue(start, end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f", magma_perf ); /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); #if 0 printf( "\t\t %8.6e", error / (double)Ym ); fprintf( fp, "\t\t %8.6e", error / (double)Ym ); /* * Blas comparaison */ { char *blastrans = MagmaNoTransStr; if ( trans == MagmaConjTrans ) blastrans = MagmaConjTransStr; else if ( trans == MagmaTrans ) blastrans = MagmaTransStr; blasf77_zcopy( &Ym, Y, &incy, Ycublas, &incy ); blasf77_zgemv( blastrans, &M, &N, &alpha, A, &lda, X, &incx, &beta, Ycublas, &incy ); blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); } #endif printf( "\t\t %8.6e\n", error / (double)Ym ); fprintf( fp, "\t\t %8.6e\n", error / (double)Ym ); } /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); /* Free device */ TESTING_CUDA_FINALIZE(); return EXIT_SUCCESS; }
EFI_STATUS GetPeiPerformance ( IN EFI_HANDLE ImageHandle, IN EFI_SYSTEM_TABLE *SystemTable, IN UINT64 Ticker ) /*++ Routine Description: Transfer PEI performance data to gauge data node. Arguments: ImageHandle - Standard entry point parameter SystemTable - Standard entry point parameter Ticker - Start tick Returns: EFI_OUT_OF_RESOURCES - No enough resource to create data node. EFI_SUCCESS - Transfer done successfully. --*/ { EFI_STATUS Status; VOID *HobList; EFI_HOB_GUID_DATA_PERFORMANCE_LOG *LogHob; PEI_PERFORMANCE_MEASURE_LOG_ENTRY *LogEntry; UINT32 Index; EFI_PERF_DATA_LIST *Node; UINT64 TimerValue; Node = CreateDataNode (0, PEI_TOK, NULL); if (!Node) { return EFI_OUT_OF_RESOURCES; } if (Ticker != 0) { TimerValue = Ticker; } else { GetTimerValue (&TimerValue); } (Node->GaugeData).EndTick = TimerValue; InsertTailList (&mPerfDataHead, &(Node->Link)); EfiLibGetSystemConfigurationTable (&gEfiHobListGuid, &HobList); do { Status = GetNextGuidHob (&HobList, &gEfiPeiPerformanceHobGuid, (VOID **) &LogHob, NULL); if (EFI_ERROR (Status)) { break; } for (Index = 0; Index < LogHob->NumberOfEntries; Index++) { LogEntry = &(LogHob->Log[Index]); Node = CreateDataNode (0, LogEntry->DescriptionString, NULL); if (!Node) { return EFI_OUT_OF_RESOURCES; } (Node->GaugeData).StartTick = LogEntry->StartTimeCount; EfiCopyMem (&(Node->GaugeData.GuidName), &LogEntry->Name, sizeof (EFI_GUID)); InsertTailList (&mPerfDataHead, &(Node->Link)); (Node->GaugeData).EndTick = LogEntry->StopTimeCount; } } while (!EFI_ERROR (Status)); return EFI_SUCCESS; }
extern "C" magma_int_t magma_ssyevd_gpu(char jobz, char uplo, magma_int_t n, float *da, magma_int_t ldda, float *w, float *wa, magma_int_t ldwa, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SSYEVD_GPU computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. DA (device input/output) REAL array on the GPU, dimension (LDDA, N). On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDDA (input) INTEGER The leading dimension of the array DA. LDDA >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WA (workspace) DOUBLE PRECISION array, dimension (LDWA, N) LDWA (input) INTEGER The leading dimension of the array WA. LDWA >= max(1,N). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= 2*N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; float *dwork; magma_int_t lddc = ldda; wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif char jobz_[2] = {jobz, 0}, uplo_[2] = {uplo, 0}; float *a = (float *) malloc( n * n * sizeof(float) ); magma_sgetmatrix(n, n, da, ldda, a, n); lapackf77_ssyevd(jobz_, uplo_, &n, a, &n, w, work, &lwork, iwork, &liwork, info); magma_ssetmatrix( n, n, a, n, da, ldda); free(a); return *info; } magma_queue_t stream; magma_queue_create( &stream ); // n*lddc for ssytrd2_gpu // n for slansy magma_int_t ldwork = n*lddc; if ( wantz ) { // need 3n^2/2 for sstedx ldwork = max( ldwork, 3*n*(n/2 + 1)); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_slansy('M', uplo, n, da, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_slascl(uplo, 0, 0, 1., sigma, n, n, da, ldda, info); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; // #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif #ifdef FAST_SYMV magma_ssytrd2_gpu(uplo, n, da, ldda, w, &work[inde], &work[indtau], wa, ldwa, &work[indwrk], llwork, dwork, n*lddc, &iinfo); #else magma_ssytrd_gpu(uplo, n, da, ldda, w, &work[inde], &work[indtau], wa, ldwa, &work[indwrk], llwork, &iinfo); #endif #ifdef ENABLE_TIMER end = get_current_time(); #ifdef FAST_SYMV printf("time ssytrd2 = %6.2f\n", GetTimerValue(start,end)/1000.); #else printf("time ssytrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &work[inde], info); } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_sstedx('A', n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time sstedx = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_ssetmatrix( n, n, &work[indwrk], n, dwork, lddc ); #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_sormtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, n, da, ldda, &work[indtau], dwork, lddc, wa, ldwa, &iinfo); magma_scopymatrix( n, n, dwork, lddc, da, ldda ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time sormtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_sscal(&n, &d__1, w, &ione); } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); // round up iwork[0] = liwmin; magma_queue_destroy( stream ); magma_free( dwork ); return *info; } /* magma_ssyevd_gpu */
EFI_STATUS EFIAPI StartGauge ( IN EFI_PERFORMANCE_PROTOCOL *This, IN EFI_HANDLE Handle, IN UINT16 *Token, IN UINT16 *Host, IN UINT64 Ticker ) /*++ Routine Description: Create a guage data node and initialized it. Arguments: This - Calling context Handle - Handle of gauge data Token - Token of gauge data Host - Host of gauge data Ticker - Set gauge data's StartTick. If 0, StartTick is current timer. Returns: EFI_SUCCESS - Successfully create and initialized a guage data node. EFI_OUT_OF_RESOURCES - No enough resource to create a guage data node. --*/ { EFI_PERFORMANCE_INSTANCE *PerfInstance; EFI_PERF_DATA_LIST *Node; UINT64 TimerValue; TimerValue = 0; PerfInstance = EFI_PERFORMANCE_FROM_THIS (This); Node = CreateDataNode (Handle, Token, Host); if (!Node) { return EFI_OUT_OF_RESOURCES; } if (Ticker != 0) { TimerValue = Ticker; } else { GetTimerValue (&TimerValue); } Node->GaugeData.StartTick = TimerValue; if (!EfiStrCmp (Token, DXE_TOK)) { PerfInstance->Phase = DXE_PHASE; } if (!EfiStrCmp (Token, SHELL_TOK)) { PerfInstance->Phase = SHELL_PHASE; } Node->GaugeData.Phase = PerfInstance->Phase; InsertTailList (&mPerfDataHead, &(Node->Link)); return EFI_SUCCESS; }
extern "C" magma_int_t magma_zheevd(char jobz, char uplo, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_zhetrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; magma_int_t inde; double anrm; magma_int_t imax; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; double smlnum; magma_int_t lquery; double* dwork; wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_zhetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = MAGMA_Z_MAKE( lwmin * (1. + lapackf77_dlamch("Epsilon")), 0.); rwork[0] = lrwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((lrwork < lrwmin) && ! lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL(a[0]); if (wantz) { a[0] = MAGMA_Z_ONE; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zheevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe("M", uplo_, &n, a, &lda, rwork); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */ // zhetrd rwork: e (n) // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2) ==> 1 + 5n + 2n^2 inde = 0; indrwk = inde + n; llrwk = lrwork - indrwk; // zhetrd work: tau (n) + llwork (n*nb) ==> n + n*nb // zstedx work: tau (n) + z (n^2) // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb) ==> 2n + n^2, or n + n*nb + n^2 indtau = 0; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; // #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_zhetrd(uplo_[0], n, a, lda, w, &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zhetrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call DSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf(&n, w, &rwork[inde], info); } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zstedx('A', n, 0., 0., 0, 0, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magma_zunmtr(MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, &iinfo); lapackf77_zlacpy("A", &n, &n, &work[indwrk], &n, a, &lda); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zunmtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_Z_MAKE( lwmin * (1. + lapackf77_dlamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_zheevd */
extern "C" magma_int_t magma_zgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv, cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw, cudaStream_t streaml[][2], magma_int_t *info) #endif { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2010 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[4]; magma_int_t maxm, mindim; magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4]; magma_int_t id, i_local, i_local2, nb0, nb1; cuDoubleComplex *d_panel[4], *panel_local[4]; //cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); //nb = magma_get_zgetrf_nb(m); if( num_gpus > ceil((double)n/nb) ) { *info = -1; return *info; } { /* Use hybrid blocked code. */ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for(i=0; i<num_gpus; i++){ magmaSetDevice(i); n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ d_panel[i] = &(d_lAP[i][nb*maxm]); /* temporary panel storage */ /* create local streams */ //magma_queue_create(&streaml[i][0]); //magma_queue_create(&streaml[i][1]); } trace_init( 1, num_gpus, 2, (CUstream_st**)streaml ); /* start sending the panel to cpu */ nb0 = min(mindim, nb); magmaSetDevice(0); magmablasSetKernelStream(streaml[0][1]); trace_gpu_start( 0, 1, "comm", "get" ); if( nb0 == nb ) magmablas_ztranspose( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); else magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); magma_zgetmatrix_async( m, nb0, d_lAP[0], maxm, W(0), ldw, streaml[0][1] ); trace_gpu_end( 0, 1 ); /* ------------------------------------------------------------------------------------- */ #ifdef PROFILE magma_timestr_t start_timer, end_timer; start_timer = get_current_time(); #endif s = mindim / nb; for( i=0; i<s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; magmaSetDevice(id); /* Set the local index where the current panel is */ i_local = i/num_gpus; cols = maxm - i*nb; rows = m - i*nb; /* synchrnoize i-th panel from id-th gpu into work */ magma_queue_sync( streaml[id][1] ); /* i-th panel factorization */ trace_cpu_start( 0, "getrf", "getrf" ); #ifdef PANEL_FACT_MC cntxt->nb = 12; magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #else lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #endif if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; //break; } trace_cpu_end( 0 ); /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); trace_gpu_start( 0, 1, "comm", "set" ); magma_zsetmatrix_async( rows, nb, W(i), ldw, d_lAP[d], cols, streaml[d][1] ); trace_gpu_end( 0, 1 ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 1, "pivot", "pivot" ); if( dd == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb, i*nb ); trace_gpu_end( d, 1 ); d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); /* storage for panel */ if( d == id ) { /* the panel belond to this gpu */ panel_local[d] = inAT(d,i,i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belong to another gpu */ panel_local[d] = &d_panel[d][(i%2)*nb*maxm]; //panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/num_gpus); if( d < s%num_gpus ) nb0 -= nb; } if( d == (i+1)%num_gpus) { /* owns the next column, look-ahead the column */ nb1 = nb0; magmablasSetKernelStream(streaml[d][1]); /* make sure all the pivoting has been applied */ magma_queue_sync(streaml[d][0]); trace_gpu_start( d, 1, "gemm", "gemm" ); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; magmablasSetKernelStream(streaml[d][0]); /* synchronization to make sure panel arrived on gpu */ magma_queue_sync(streaml[d][1]); trace_gpu_start( d, 0, "gemm", "gemm" ); } magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb); /* gpu updating the trailing matrix */ //magmablas_ztrsm( magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], ldpan[d], inAT(d, i, i_local2), lddat); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d, i+1, i_local2), lddat ); if( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ trace_gpu_end( d, 1 ); if( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ trace_gpu_start( d, 1, "comm", "get" ); if( i+1 < s ) magmablas_ztranspose( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); else magmablas_ztranspose2( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); /* send the panel to cpu */ magma_zgetmatrix_async( cols, nb0, d_lAP[d], ldda, W(i+1), ldw, streaml[d][1] ); trace_gpu_end( d, 1 ); } } else { trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 0, "gemm", "gemm" ); //magmablas_ztrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], ldpan[d], inAT(d,i,i_local+1), lddat ); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, inAT(d,i,i_local+1), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d,i+1, i_local+1), lddat ); trace_gpu_end( d, 0 ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if( nb0 > 0 ) { magmaSetDevice(id); /* wait for the last panel on cpu */ magma_queue_sync( streaml[id][1] ); /* factor on cpu */ lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); i_local2 = i_local; if( d < id ) i_local2 ++; if( d == id || n_local[d] > i_local2*nb ) { magma_zsetmatrix_async( rows, nb0, W(s), ldw, d_lAP[d], cols, streaml[d][1] ); } } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); if( d == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb0, s*nb ); } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][1]); /* wait for the pivoting to be done */ magma_queue_sync( streaml[d][0] ); i_local2 = i_local; if( d < id ) i_local2++; if( d == id ) { /* the panel belond to this gpu */ panel_local[d] = inAT(d,s,i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0); if( nb1 > 0 ) //cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], lddat, inAT(d,s,i_local)+nb0, lddat); } else if( n_local[d] > i_local2*nb ) { /* the panel belong to another gpu */ panel_local[d] = &d_panel[d][(s%2)*nb*maxm]; //panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0); //cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], nb, inAT(d,s,i_local2), lddat); } } } /* if( nb0 > 0 ) */ /* clean up */ trace_finalize( "zgetrf_mgpu.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magma_queue_sync( streaml[d][0] ); magma_queue_sync( streaml[d][1] ); //magma_queue_destroy(streaml[d][0]); //magma_queue_destroy(streaml[d][1]); magmablasSetKernelStream(NULL); } magmaSetDevice(0); #ifdef PROFILE end_timer = get_current_time(); printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer)); #endif } return *info; /* End of MAGMA_ZGETRF2_MGPU */ }
int main(int argc, char **argv) { TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; magmaFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; magmaFloatComplex *C_work; magmaFloatComplex *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, lwork; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } ////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC_CPU( A, magmaFloatComplex, matsize ); TESTING_MALLOC_CPU( X, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC_CPU( Y[i], magmaFloatComplex, vecsize ); } magma_setdevice(0); TESTING_MALLOC_DEV( dA, magmaFloatComplex, matsize ); TESTING_MALLOC_DEV( dYcublas, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, LDA*n_local[i] );// potentially bugged TESTING_MALLOC_DEV( dX[i], magmaFloatComplex, vecsize ); TESTING_MALLOC_DEV( dY[i], magmaFloatComplex, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &matsize, A ); magma_cmake_hermitian( N, A, LDA ); blocks = N / nb + (N % nb != 0); lwork = LDA * (blocks + 1); TESTING_MALLOC_CPU( C_work, magmaFloatComplex, lwork ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_MALLOC_DEV( dC_work[i], magmaFloatComplex, lwork ); //fillZero(dC_work[i], lwork); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// fp = fopen ("results_chemv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("CHEMV magmaFloatComplex precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_clarnv( &ione, ISEED, &vecsize, X ); lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_csetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_csetmatrix( m, m, A, LDA, dA, lda ); magma_csetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_csetvector( m, X, incx, dX[i], incx ); magma_csetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_cgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } else // nb = 64 { magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_cgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_ccopy( m, Y, incx, Ycublas, incx ); cblas_chemv( CblasColMajor, CblasLower, m, CBLAS_SADDR(alpha), A, LDA, X, incx, CBLAS_SADDR(beta), Ycublas, incx ); blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( C_work ); magma_setdevice(0); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE_CPU( Y[i] ); magma_setdevice(i); TESTING_FREE_DEV( d_lA[i] ); TESTING_FREE_DEV( dX[i] ); TESTING_FREE_DEV( dY[i] ); TESTING_FREE_DEV( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { //#define USE_MGPU #ifdef USE_MGPU TESTING_CUDA_INIT_MGPU(); #else TESTING_CUDA_INIT(); #endif magma_int_t nrgpu =1; cuDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; double *rwork, *w1, *w2; magma_int_t *iwork; double gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[4] = {1024,2048,4100,6001}; magma_int_t i, itype, info; magma_int_t ione = 1, izero = 0; magma_int_t five = 5; cuDoubleComplex c_zero = MAGMA_Z_ZERO; cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.; double d_neg_one = -1.; double d_ten = 10.; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t il,iu,m1,m2; double vl,vu; double fraction_ev = 0; //const char *uplo = MagmaLowerStr; char *uplo = (char*)MagmaLowerStr; //char *uplo = (char*)MagmaUpperStr; char *jobz = (char*)MagmaVectorsStr; char range = 'A'; itype = 1; magma_int_t checkres; double result[2]; int flagN = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0){ printf(" testing_zhegvdx -N %d\n\n", (int) N); flagN=1; } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -N %d\n\n", (int) N); exit(1); } } if (strcmp("-ngpu", argv[i])==0){ nrgpu = atoi(argv[++i]); if (nrgpu>0){ printf(" testing_zhegvdx -ngpu %d\n\n", (int) nrgpu); } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -ngpu %d\n\n", (int) nrgpu); exit(1); } } if (strcmp("-itype", argv[i])==0){ itype = atoi(argv[++i]); if (itype>0 && itype <= 3){ printf(" testing_zhegvdx -itype %d\n\n", (int) itype); } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -itype %d\n\n", (int) itype); exit(1); } } if (strcmp("-FE", argv[i])==0){ fraction_ev = atof(argv[++i]); if (fraction_ev > 0 && fraction_ev <= 1){ printf(" testing_zhegvdx -FE %f\n\n", fraction_ev); } else { fraction_ev = 0; } } if (strcmp("-L", argv[i])==0){ uplo = (char*)MagmaLowerStr; printf(" testing_zhegvdx -L"); } if (strcmp("-U", argv[i])==0){ uplo = (char*)MagmaUpperStr; printf(" testing_zhegvdx -U"); } } } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -L/U -N %d -itype %d\n\n", 1024, 1); } if(!flagN) N = size[3]; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; n2 = N * N; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuDoubleComplex, n2); TESTING_MALLOC( h_B, cuDoubleComplex, n2); TESTING_MALLOC( w1, double , N); TESTING_MALLOC( w2, double , N); TESTING_HOSTALLOC(h_R, cuDoubleComplex, n2); TESTING_HOSTALLOC(h_S, cuDoubleComplex, n2); magma_int_t nb = magma_get_zhetrd_nb(N); magma_int_t lwork = magma_zbulge_get_lq2(N) + 2*N + N*N; magma_int_t lrwork = 1 + 5*N +2*N*N; magma_int_t liwork = 3 + 5*N; TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork); TESTING_HOSTALLOC( rwork, double, lrwork); TESTING_MALLOC( iwork, magma_int_t, liwork); printf(" N M GPU Time(s) \n"); printf("==========================\n"); for(i=0; i<4; i++){ if (!flagN){ N = size[i]; n2 = N*N; } if (fraction_ev == 0){ il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int)(fraction_ev*N); if (iu < 1) iu = 1; } /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); //lapackf77_zlatms( &N, &N, "U", ISEED, "P", w1, &five, &d_ten, // &d_one, &N, &N, uplo, h_B, &N, h_work, &info); //lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_B, &N); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_B[i*N+i], ( MAGMA_Z_REAL(h_B[i*N+i]) + 1.*N ) ); } } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); #ifdef USE_MGPU magma_zhegvdx_2stage_m(nrgpu, itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #else magma_zhegvdx_2stage(itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #endif lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); #ifdef USE_MGPU magma_zhegvdx_2stage_m(nrgpu, itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #else magma_zhegvdx_2stage(itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #endif end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; cuDoubleComplex *tau; result[0] = 1.; result[0] /= lapackf77_zlanhe("1",uplo, &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1",&N , &m1, h_R, &N, rwork); if (itype == 1){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", uplo, &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (itype == 3){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx(itype, 'N', range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<m2; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[1] = temp2 / temp1; } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %6.2f\n", (int) N, (int) m1, gpu_time); if ( checkres ){ printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %e\n", result[0]); printf("(2) | D(w/ Z)-D(w/o Z)|/ |D| = %e\n\n", result[1]); } if (flagN) break; } cudaSetDevice(0); /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( h_B); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_HOSTFREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); TESTING_HOSTFREE( h_S); /* Shutdown */ #ifdef USE_MGPU TESTING_CUDA_FINALIZE_MGPU(); #else TESTING_CUDA_FINALIZE(); #endif }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; cuFloatComplex *h_A, *h_R; cuFloatComplex *d_A; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info; const char *uplo = MagmaUpperStr; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotri_gpu -N %d\n\n", 1024); } /* Allocate host memory for the matrix */ n2 = size[9] * size[9]; ldda = ((size[9]+31)/32) * 32; TESTING_MALLOC( h_A, cuFloatComplex, n2); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2); TESTING_DEVALLOC( d_A, cuFloatComplex, ldda*size[9] ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS_CPOTRI( (float)N ) / 1000000; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*lda+i], ( MAGMA_C_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConjf(h_A[j*lda+i]); } } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //cublasSetMatrix( N, N, sizeof(cuFloatComplex), h_A, lda, d_A, ldda); //magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); /* factorize matrix */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); // check for exact singularity //magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); //magma_csetmatrix( N, N, h_R, lda, d_A, ldda ); start = get_current_time(); magma_cpotri_gpu(uplo[0], N, d_A, ldda, &info); end = get_current_time(); if (info != 0) printf("magma_cpotri_gpu returned error %d\n", (int) info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_cpotrf(uplo, &N, h_A, &lda, &info); start = get_current_time(); lapackf77_cpotri(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info != 0) printf("lapackf77_cpotri returned error %d\n", (int) info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_clange("f", &N, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
extern "C" magma_int_t magma_ssygvd(magma_int_t itype, char jobz, char uplo, magma_int_t n, float *a, magma_int_t lda, float *b, magma_int_t ldb, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX*16 array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= 2*N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: SPOTRF or SSYEVD returned an error code: <= N: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details =============== Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if SSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; float d_one = MAGMA_S_ONE; float *da; float *db; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; char trans[1]; magma_int_t wantz, lquery; magma_int_t lwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || liwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) { return MAGMA_SUCCESS; } /* Quick return if possible */ if (n == 0) { return 0; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_ssygvd(&itype, jobz_, uplo_, &n, a, &lda, b, &ldb, w, work, &lwork, iwork, &liwork, info); return *info; } if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_ssetmatrix( n, n, b, ldb, db, lddb ); magma_ssetmatrix_async( n, n, a, lda, da, ldda, stream ); #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_spotrf_gpu(uplo, n, db, lddb, info); if (*info != 0) { *info = n + *info; return 0; } #ifdef ENABLE_TIMER end = get_current_time(); printf("time spotrf_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_queue_sync( stream ); magma_sgetmatrix_async( n, n, db, lddb, b, ldb, stream ); #ifdef ENABLE_TIMER start = get_current_time(); #endif /* Transform problem to standard eigenvalue problem and solve. */ magma_ssygst_gpu(itype, uplo, n, da, ldda, db, lddb, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time ssygst_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* simple fix to be able to run bigger size. * need to have a dwork here that will be used * a db and then passed to ssyevd. * */ if(n > 5000){ magma_queue_sync( stream ); magma_free( db ); } #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_ssyevd_gpu(jobz, uplo, n, da, ldda, w, a, lda, work, lwork, iwork, liwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time ssyevd_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif if (wantz && *info == 0) { #ifdef ENABLE_TIMER start = get_current_time(); #endif /* allocate and copy db back */ if(n > 5000){ if (MAGMA_SUCCESS != magma_smalloc( &db, n*lddb ) ){ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssetmatrix( n, n, b, ldb, db, lddb ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_strsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaTrans; } magma_strmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } magma_sgetmatrix( n, n, da, ldda, a, lda ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time strsm/mm + getmatrix = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* free db */ if(n > 5000){ magma_free( db ); } } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); // round up iwork[0] = liwmin; magma_free( da ); if(n <= 5000){ magma_free( db ); } return MAGMA_SUCCESS; } /* magma_ssygvd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgelqf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; float matnorm, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, lwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (N>0 && M>0) printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; min_mn = min(M, N); nb = magma_get_cgeqrf_nb(M); TESTING_MALLOC( tau, cuFloatComplex, min_mn ); TESTING_MALLOC( h_A, cuFloatComplex, n2 ); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2 ); lwork = -1; lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max( lwork, M*nb ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; flops = FLOPS( (float)M, (float)N ) / 1000000; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cgelqf( M, N, h_R, lda, tau, h_work, lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_cgelqf had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of lapack_cgelqf had an illegal value.\n", (int) -info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f %6.2f %e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_clange("f", &M, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_HOSTFREE( h_work ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf_mc */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_work, *h_A2; cuDoubleComplex *d_A; float gpu_perf, cpu_perf, cpu_perf2; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6048,7200,8064,8928,10080}; magma_int_t i, j, info[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; int num_gpus = 0; magma_int_t loop = argc; if (argc != 1) { for(i = 1; i<argc; i++) { if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if (N==0) { N = size[9]; loop = 1; } else { size[0] = size[9] = N; } } else { printf("\nUsage: \n"); printf(" testing_zpotrf_mc -N %d -B 128 \n\n", 1024); N = size[9]; } lda = N; n2 = size[9] * size[9]; /* Allocate host memory for the matrix */ h_A = (cuDoubleComplex*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); } /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" N Multicore GFlop/s ||R||_F / ||A||_F\n"); printf("=============================================\n"); for(i=0; i<10; i++) { N = lda = size[i]; n2 = N*N; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); for(j=0; j<N; j++) MAGMA_Z_SET2REAL( h_A[j*lda+j], ( MAGMA_Z_GET_X(h_A[j*lda+j]) + 2000. ) ); for(j=0; j<n2; j++) h_A2[j] = h_A[j]; /* ===================================================================== Performs operation using LAPACK =================================================================== */ //lapackf77_zpotrf("L", &N, h_A, &lda, info); lapackf77_zpotrf("U", &N, h_A, &lda, info); if (info[0] < 0) printf("Argument %d of zpotrf had an illegal value.\n", -info[0]); /* ===================================================================== Performs operation using multi-core =================================================================== */ start = get_current_time(); //magma_zpotrf_mc(context, "L", &N, h_A2, &lda, info); magma_zpotrf_mc(context, "U", &N, h_A2, &lda, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of magma_zpotrf_mc had an illegal value.\n", -info[0]); cpu_perf2 = FLOPS( (double)N ) / (1000000.*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; int one = 1; matnorm = lapackf77_zlange("f", &N, &N, h_A, &N, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_A2, &one); printf("%5d %6.2f %e\n", size[i], cpu_perf2, lapackf77_zlange("f", &N, &N, h_A2, &N, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A); free(h_A2); /* Shut down the MAGMA context */ magma_finalize(context); }
extern "C" magma_int_t magma_chegvdx_m(magma_int_t nrgpu, magma_int_t itype, char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *b, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHEGVD computes all the eigenvalues, and optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX*16 array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= N + 1. If JOBZ = 'V' and N > 1, LWORK >= 2*N*nb + N**2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: CPOTRF or CHEEVD returned an error code: <= N: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details =============== Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if CHEEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t lower; char trans[1]; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin; magma_int_t liwmin; magma_int_t lrwmin; wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -3; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_chetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = 2*n + n*n; lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } work[0] = MAGMA_C_MAKE( lwmin * (1. + lapackf77_slamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (lrwork < lrwmin && ! lquery) { *info = -19; } else if (liwork < liwmin && ! lquery) { *info = -21; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_chegvd(&itype, jobz_, uplo_, &n, a, &lda, b, &ldb, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } // #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_cpotrf_m(nrgpu, uplo_[0], n, b, ldb, info); if (*info != 0) { *info = n + *info; return *info; } #ifdef ENABLE_TIMER end = get_current_time(); printf("time cpotrf = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif /* Transform problem to standard eigenvalue problem and solve. */ magma_chegst_m(nrgpu, itype, uplo_[0], n, a, lda, b, ldb, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time chegst = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magma_cheevdx_m(nrgpu, jobz, range, uplo, n, a, lda, vl, vu, il, iu, m, w, work, lwork, rwork, lrwork, iwork, liwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time cheevd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif if (wantz && *info == 0) { #ifdef ENABLE_TIMER start = get_current_time(); #endif /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaConjTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_ctrsm_m(nrgpu, MagmaLeft, uplo_[0], *trans, MagmaNonUnit, n, *m, c_one, b, ldb, a, lda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaConjTrans; } //magma_ctrmm(MagmaLeft, uplo_[0], *trans, MagmaNonUnit, // n, n, c_one, db, lddb, da, ldda); } #ifdef ENABLE_TIMER end = get_current_time(); printf("time setmatrices trsm/mm + getmatrices = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } work[0] = MAGMA_C_MAKE( lwmin * (1. + lapackf77_slamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_chegvd_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; magmaDoubleComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif double *w1, result; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_timestr_t start, end; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); char jobz = opts.jobz; int checkres = opts.check; char range = 'A'; char uplo = opts.uplo; magma_int_t itype = opts.itype; double f = opts.fraction; if (f != 1) range='I'; if ( checkres && jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); jobz = MagmaVec; } printf("using: nrgpu = %d, itype = %d, jobz = %c, range = %c, uplo = %c, checkres = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) itype, jobz, range, uplo, (int) checkres, f); printf(" N M nr GPU MGPU Time(s) \n"); printf("====================================\n"); magma_int_t threads = magma_get_numthreads(); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_zbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_HOSTALLOC( h_A, magmaDoubleComplex, n2); TESTING_HOSTALLOC( h_B, magmaDoubleComplex, n2); TESTING_MALLOC( w1, double , N); TESTING_HOSTALLOC(h_work, magmaDoubleComplex, lwork); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTALLOC( rwork, double, lrwork); #endif TESTING_MALLOC( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ { for(int i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_B[i*N+i], ( MAGMA_Z_REAL(h_B[i*N+i]) + 1.*N ) ); MAGMA_Z_SET2REAL( h_A[i*N+i], MAGMA_Z_REAL(h_A[i*N+i]) ); } } if((opts.warmup)||( checkres )){ TESTING_MALLOC(h_Ainit, magmaDoubleComplex, n2); TESTING_MALLOC(h_Binit, magmaDoubleComplex, n2); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == 'I'){ il = 1; iu = (int) (f*N); } if(opts.warmup){ // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_zhegvdx_2stage_m(opts.ngpu, itype, jobz, range, uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== start = get_current_time(); magma_zhegvdx_2stage_m(opts.ngpu, itype, jobz, range, uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); end = get_current_time(); mgpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_zlanhe("1",&uplo, &N, h_Ainit, &N, rwork); result /= lapackf77_zlange("1",&N , &m1, h_A, &N, rwork); if (itype == 1){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_zlange("1", &N, &m1, h_A, &N, rwork)/N; } else if (itype == 3){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_zlange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %2d %6.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( checkres ){ printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); } TESTING_HOSTFREE( h_A); TESTING_HOSTFREE( h_B); TESTING_FREE( w1); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTFREE( rwork); #endif TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); if((opts.warmup)||( checkres )){ TESTING_FREE( h_Ainit); TESTING_FREE( h_Binit); } } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return 0; }
extern "C" magma_int_t magma_slaex0(magma_int_t n, float* d, float* e, float* q, magma_int_t ldq, float* work, magma_int_t* iwork, magmaFloat_ptr dwork, magma_vec_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info, magma_queue_t queue) { /* -- MAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 .. Scalar Arguments .. CHARACTER RANGE INTEGER IL, IU, INFO, LDQ, N REAL VL, VU .. .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), Q( LDQ, * ), $ WORK( * ), DWORK( * ) .. Purpose ======= SLAEX0 computes all eigenvalues and the choosen eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. Arguments ========= N (input) INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. D (input/output) REAL array, dimension (N) On entry, the main diagonal of the tridiagonal matrix. On exit, its eigenvalues. E (input) REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix. On exit, E has been destroyed. Q (input/output) REAL array, dimension (LDQ, N) On entry, Q will be the identity matrix. On exit, Q contains the eigenvectors of the tridiagonal matrix. LDQ (input) INTEGER The leading dimension of the array Q. If eigenvectors are desired, then LDQ >= max(1,N). In any case, LDQ >= 1. WORK (workspace) REAL array, the dimension of WORK must be at least 4*N + N**2. IWORK (workspace) INTEGER array, the dimension of IWORK must be at least 3 + 5*N. DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA ===================================================================== */ magma_int_t ione = 1; magma_vec_t range_ = range; magma_int_t curlvl, curprb, i, indxq; magma_int_t j, k, matsiz, msd2, smlsiz; magma_int_t submat, subpbs, tlvls; // Test the input parameters. *info = 0; if( n < 0 ) *info = -1; else if( ldq < max(1, n) ) *info = -5; if( *info != 0 ){ magma_xerbla( __func__, -*info ); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if(n == 0) return MAGMA_SUCCESS; smlsiz = get_slaex0_smlsize(); // Determine the size and placement of the submatrices, and save in // the leading elements of IWORK. iwork[0] = n; subpbs= 1; tlvls = 0; while (iwork[subpbs - 1] > smlsiz) { for (j = subpbs; j > 0; --j){ iwork[2*j - 1] = (iwork[j-1]+1)/2; iwork[2*j - 2] = iwork[j-1]/2; } ++tlvls; subpbs *= 2; } for (j=1; j<subpbs; ++j) iwork[j] += iwork[j-1]; // Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1 // using rank-1 modifications (cuts). for(i=0; i < subpbs-1; ++i){ submat = iwork[i]; d[submat-1] -= MAGMA_S_ABS(e[submat-1]); d[submat] -= MAGMA_S_ABS(e[submat-1]); } indxq = 4*n + 3; // Solve each submatrix eigenproblem at the bottom of the divide and // conquer tree. char char_I[] = {'I', 0}; //#define ENABLE_TIMER #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif for (i = 0; i < subpbs; ++i){ if(i == 0){ submat = 0; matsiz = iwork[0]; } else { submat = iwork[i-1]; matsiz = iwork[i] - iwork[i-1]; } lapackf77_ssteqr(char_I , &matsiz, &d[submat], &e[submat], Q(submat, submat), &ldq, work, info); // change to edc? if(*info != 0){ printf("info: %d\n, submat: %d\n", (int) *info, (int) submat); *info = (submat+1)*(n+1) + submat + matsiz; printf("info: %d\n", (int) *info); return MAGMA_SUCCESS; } k = 1; for(j = submat; j < iwork[i]; ++j){ iwork[indxq+j] = k; ++k; } } #ifdef ENABLE_TIMER end = get_current_time(); printf("for: ssteqr = %6.2f\n", GetTimerValue(start,end)/1000.); #endif // Successively merge eigensystems of adjacent submatrices // into eigensystem for the corresponding larger matrix. curlvl = 1; while (subpbs > 1){ #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif for (i=0; i<subpbs-1; i+=2){ if(i == 0){ submat = 0; matsiz = iwork[1]; msd2 = iwork[0]; } else { submat = iwork[i-1]; matsiz = iwork[i+1] - iwork[i-1]; msd2 = matsiz / 2; } // Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2) // into an eigensystem of size MATSIZ. // SLAEX1 is used only for the full eigensystem of a tridiagonal // matrix. if (matsiz == n) range_=range; else // We need all the eigenvectors if it is not last step range_= MagmaAllVec; magma_slaex1(matsiz, &d[submat], Q(submat, submat), ldq, &iwork[indxq+submat], e[submat+msd2-1], msd2, work, &iwork[subpbs], dwork, range_, vl, vu, il, iu, info, queue); if(*info != 0){ *info = (submat+1)*(n+1) + submat + matsiz; return MAGMA_SUCCESS; } iwork[i/2]= iwork[i+1]; } subpbs /= 2; ++curlvl; #ifdef ENABLE_TIMER end = get_current_time(); printf("%d: time: %6.2f\n", curlvl, GetTimerValue(start,end)/1000.); #endif } // Re-merge the eigenvalues/vectors which were deflated at the final // merge step. for(i = 0; i<n; ++i){ j = iwork[indxq+i] - 1; work[i] = d[j]; blasf77_scopy(&n, Q(0, j), &ione, &work[ n*(i+1) ], &ione); } blasf77_scopy(&n, work, &ione, d, &ione); char char_A[] = {'A',0}; lapackf77_slacpy ( char_A, &n, &n, &work[n], &n, q, &ldq ); return MAGMA_SUCCESS; } /* magma_slaex0 */
std::vector<EigenComponent> solver_magma(const Eigen::MatrixXf& A, unsigned int num_ev) { static MagmaSpectralSolver magma; magma_int_t N = A.rows(); std::cout << "MAGMA Solver N=" << N << std::endl; magma_timestr_t start, end; float gpu_time; start = get_current_time(); magma_int_t info; const float *h_A = A.data(); float *h_R, *h_work; float *w1; magma_int_t *iwork; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVecStr; /* Query for workspace sizes */ float aux_work[1]; magma_int_t aux_iwork[1]; std::cout << "Querying workspace size" << std::endl; magma_ssyevd( jobz[0], uplo[0], N, h_R, N, w1, aux_work, -1, aux_iwork, -1, &info ); magma_int_t lwork = (magma_int_t) aux_work[0]; magma_int_t liwork = aux_iwork[0]; std::cout << lwork << " " << liwork << std::endl; std::cout << "Allocating" << std::endl; w1 = magma.malloc<float>(N ); h_R = magma.hostmalloc<float>(N*N); h_work = magma.hostmalloc<float>(lwork); iwork = magma.malloc<magma_int_t>(liwork); std::cout << "Copying" << std::endl; slacpy_( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); std::cout << "Solving" << std::endl; magma_ssyevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, iwork, liwork, &info); std::cout << "Collecting" << std::endl; // save eigenvectors and eigenvalues std::vector<EigenComponent> solution(std::min<int>(N, num_ev)); for(unsigned int i=0; i<solution.size(); i++) { solution[i].eigenvalue = w1[i+1]; Eigen::VectorXf ev(N); for(unsigned int j=0; j<N; j++) { ev[j] = *(h_R + i*N + j); } solution[i].eigenvector = ev; } std::cout << "Freeing" << std::endl; magma.free(w1); magma.hostfree(h_R); magma.hostfree(h_work); magma.free(iwork); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; std::cout << "Time: " << gpu_time << std::endl; return solution; }