//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaFloatComplex *E = data -> E; magma_int_t lde = data -> lde; magmaFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaFloatComplex *TAU = data -> TAU; magmaFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; magmaFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_setlapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_csetmatrix(n, n_gpu, E, lde, dE, ldde); magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f \n" ,timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if(my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f \n" ,timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
extern "C" magma_int_t magma_ssyevdx_2stage(char jobz, char range, char uplo, magma_int_t n, float *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 2). If JOBZ = 'V' and N > 1, LWORK >= LQ2 + 1 + 6*N + 2*N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by MAGMA_BULGE_GET_LQ2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; float d_one = 1.; magma_int_t ione = 1; magma_int_t izero = 0; float d__1; float eps; float anrm; magma_int_t imax; float rmin, rmax; float sigma; magma_int_t lwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; float safmin; float bignum; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float* dwork; /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame( range_, "A" ); valeig = lapackf77_lsame( range_, "V" ); indeig = lapackf77_lsame( range_, "I" ); lquery = lwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_sbulge_nb(n, threads); magma_int_t Vblksiz = magma_sbulge_get_Vblksiz(n, nb, threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_sbulge_get_lq2(n, threads); if (wantz) { lwmin = lq2 + 1 + 6 * n + 2 * n * n; liwmin = 5 * n + 3; } else { lwmin = lq2 + n * (nb + 1); liwmin = 1; } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((liwork < liwmin) && ! lquery) { *info = -16; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = a[0]; if (wantz) { a[0] = MAGMA_S_ONE; } return *info; } #ifdef ENABLE_TIMER printf("using %d threads\n", threads); #endif /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if( ( ntiles < 2 ) || ( n <= 128 ) ){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int ) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_ssyevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_slansy("M", uplo_, &n, a, &lda, work); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_slascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } magma_int_t inde = 0; magma_int_t indT2 = inde + n; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; magma_int_t indwk2 = indwrk + n * n; magma_int_t llwork = lwork - indwrk; magma_int_t llwrk2 = lwork - indwk2; #ifdef ENABLE_TIMER magma_timestr_t start, st1, st2, end; start = get_current_time(); #endif float *dT1; if (MAGMA_SUCCESS != magma_smalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssytrd_sy2sb(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, dT1, threads, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time ssytrd_sy2sb = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); float* A2 = &work[indwrk]; memset(A2 , 0, n*lda2*sizeof(float)); for (magma_int_t j = 0; j < n-nb; j++) { cblas_scopy(nb+1, &a[j*(lda+1)], 1, &A2[j*lda2], 1); memset(&a[j*(lda+1)], 0, (nb+1)*sizeof(float)); a[nb + j*(lda+1)] = d_one; } for (magma_int_t j = 0; j < nb; j++) { cblas_scopy(nb-j, &a[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1); memset(&a[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(float)); } #ifdef ENABLE_TIMER st2 = get_current_time(); printf(" time ssytrd_convert = %6.2f\n" , GetTimerValue(st1,st2)/1000.); #endif magma_ssytrd_sb2st(threads, uplo, n, nb, Vblksiz, A2, lda2, w, &work[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time ssytrd_sy2st = %6.2f\n" , GetTimerValue(st2,end)/1000.); printf(" time ssytrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { #ifdef ENABLE_TIMER start = get_current_time(); #endif lapackf77_ssterf(&n, w, &work[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sstedc = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_sstedx(range, n, vl, vu, il, iu, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif float *dZ; magma_int_t lddz = n; float *da; magma_int_t ldda = n; magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); if (MAGMA_SUCCESS != magma_smalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_sbulge_back(threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time sbulge_back = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif magma_ssetmatrix( n, n, a, lda, da, ldda ); magma_sormqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, da+nb, ldda, dZ+nb, n, dT1, nb, info); magma_sgetmatrix( n, *m, dZ, lddz, a, lda ); magma_free(dT1); magma_free(dZ); magma_free(da); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time sormqr + copy = %6.2f\n", GetTimerValue(st1,end)/1000.); printf(" time eigenvectors backtransf. = %6.2f\n" , GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_zheevdx_2stage */
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaFloatComplex *Z, magma_int_t ldz, magmaFloatComplex *dZ, magma_int_t lddz, magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *TAU, magmaFloatComplex *T, magma_int_t ldt, magma_int_t* info) { magma_setlapack_numthreads(1); float timeaplQ2=0.0; float f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) //float gpu_cpu_perf = 50; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) //float gpu_cpu_perf = 16; // gpu over cpu performance //100% ev // SandyB. - Fermi (M2090) //#else // float gpu_cpu_perf = 27.5; // gpu over cpu performance //100% ev // Westmere - Fermi (M2090) //float gpu_cpu_perf = 37; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) // float gpu_cpu_perf = 130; // gpu over cpu performance //100% ev // Bulldozer - Kepler (K20X) //#endif magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf(); if(threads>1) { f = 1. / (1. + (float)(threads-1)/ ((float)gpu_cpu_perf) ); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //n_gpu=ne; //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if(n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_capplyQ_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]); } arg[0] = magma_capplyQ_id_data(0, &data_applyQ); magma_capplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ } else { magma_csetmatrix(n, ne, Z, ldz, dZ, lddz); magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_setlapack_numthreads(threads); return MAGMA_SUCCESS; }
extern "C" magma_int_t magma_ssytrd_sb2st(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, float *A, magma_int_t lda, float *D, float *E, float *V, magma_int_t ldv, float *TAU, magma_int_t compT, float *T, magma_int_t ldt) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Arguments ========= THREADS (input) INTEGER Specifies the number of pthreads used. THREADS > 0 UPLO (input) CHARACTER*1 = 'U': Upper triangles of A is stored; = 'L': Lower triangles of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NB (input) INTEGER The order of the band matrix A. N >= NB >= 0. VBLKSIZ (input) INTEGER The size of the block of householder vectors applied at once. A (input/workspace) REAL array, dimension (LDA, N) On entry the band matrix stored in the following way: LDA (input) INTEGER The leading dimension of the array A. LDA >= 2*NB. D (output) DOUBLE array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) DOUBLE array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. V (output) REAL array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. LDV (input) INTEGER The leading dimension of V. LDV > NB + VBLKSIZ + 1 TAU (output) REAL dimension(BLKCNT, VBLKSIZ) ??? COMPT (input) INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed T (output) REAL dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced LDT (input) INTEGER The leading dimension of T. LDT > Vblksiz INFO (output) INTEGER ???????????????????????????????????????????????????????????????????????????????????? = 0: successful exit ===================================================================== */ #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif //char uplo_[2] = {uplo, 0}; magma_int_t mklth = threads; magma_int_t INgrsiz=1; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t nbtiles = magma_ceildiv(n, nb); memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(float)); memset(TAU, 0, blkcnt*Vblksiz*sizeof(float)); memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(float)); magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t)); memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t)); magma_sbulge_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_setlapack_numthreads(1); magma_sbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_sbulge_id_data(thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_ssytrd_sb2st_parallel_section, &arg[thread]); } arg[0] = magma_sbulge_id_data(0, &data_bulge); magma_ssytrd_sb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f \n" ,timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu(prog); magma_setlapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag D and E * note that D and E are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In real case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the SLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #if defined(PRECISION_z) || defined(PRECISION_c) if(uplo==MagmaLower){ for (magma_int_t i=0; i < n-1 ; i++) { D[i] = MAGMA_S_REAL(A[i*lda ]); E[i] = MAGMA_S_REAL(A[i*lda+1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i<n-1; i++) { D[i] = MAGMA_S_REAL(A[i*lda+nb]); E[i] = MAGMA_S_REAL(A[i*lda+nb-1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if( uplo == MagmaLower ){ for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda]; // diag E[i] = A[i*lda+1]; //lower diag } D[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda+nb]; // diag E[i] = A[i*lda+nb-1]; //lower diag } D[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
static void *magma_ssytrd_sb2st_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_sbulge_id_data*)arg) -> id; magma_sbulge_data* data = ((magma_sbulge_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t nb = data -> nb; magma_int_t nbtiles = data -> nbtiles; magma_int_t grsiz = data -> grsiz; magma_int_t Vblksiz = data -> Vblksiz; magma_int_t compT = data -> compT; float *A = data -> A; magma_int_t lda = data -> lda; float *V = data -> V; magma_int_t ldv = data -> ldv; float *TAU = data -> TAU; float *T = data -> T; magma_int_t ldt = data -> ldt; volatile magma_int_t* prog = data -> prog; pthread_barrier_t* barrier = &(data -> barrier); //magma_int_t sys_corenbr = 1; #ifdef ENABLE_TIMER real_Double_t timeB=0.0, timeT=0.0; #endif // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_setlapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if(compT==1) { /* compute the Q1 overlapped with the bulge chasing+T. * if all_cores_num=1 it call Q1 on GPU and then bulgechasing. * otherwise the first thread run Q1 on GPU and * the other threads run the bulgechasing. * */ if(allcores_num==1) { //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER timeB = magma_wtime(); #endif magma_stile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); #ifdef ENABLE_TIMER timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER timeT = magma_wtime(); #endif magma_stile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz); #ifdef ENABLE_TIMER timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f \n" ,timeT); #endif }else{ // allcore_num > 1 magma_int_t id = my_core_id; magma_int_t tot = allcores_num; //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if(id == 0) timeB = magma_wtime(); #endif magma_stile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(id == 0){ timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); } #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER if(id == 0) timeT = magma_wtime(); #endif magma_stile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (id == 0){ timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f \n" ,timeT); } #endif } // allcore == 1 }else{ // WANTZ = 0 //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if(my_core_id == 0) timeB = magma_wtime(); #endif magma_stile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(my_core_id == 0){ timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); } #endif } // WANTZ > 0 #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0){ check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
extern "C" magma_int_t magma_zbulge_back_m(magma_int_t nrgpu, magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaDoubleComplex *Z, magma_int_t ldz, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t* info) { magma_setlapack_numthreads(1); double timeaplQ2=0.0; double f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) // double gpu_cpu_perf = 32; //gpu over cpu performance //#else // double gpu_cpu_perf = 32; // gpu over cpu performance //#endif double perf_temp= .85; double perf_temp2= perf_temp; for (magma_int_t itmp=1; itmp<nrgpu; ++itmp) perf_temp2*=perf_temp; magma_int_t gpu_cpu_perf = magma_get_zbulge_gcperf(); if(threads>1){ f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp))); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ //n_gpu = ne; if(n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_zapplyQ_m_data data_applyQ(nrgpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt); magma_zapplyQ_m_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zapplyQ_m_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_zapplyQ_m_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_zapplyQ_m_parallel_section, &arg[thread]); } arg[0] = magma_zapplyQ_m_id_data(0, &data_applyQ); magma_zapplyQ_m_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); /*============================ * use only GPU *==========================*/ }else{ magma_zbulge_applyQ_v2_m(nrgpu, 'L', ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_setlapack_numthreads(threads); return MAGMA_SUCCESS; }
extern "C" magma_int_t magma_chetrd_he2hb_mgpu( char uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dAmgpu[], magma_int_t ldda, magmaFloatComplex *dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t streams[][20], magma_int_t nstream, magma_int_t threads, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CHETRD_HE2HB reduces a complex Hermitian matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) COMPLEX array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a + ((a_2)-1)*( lda) + (a_1)-1) #define da_ref(a_1,a_2) (da + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define t_ref(a_1) (dT + ((a_1)-1)*(lddt)) #define Atest(a_1,a_2) ( Atest + ((a_2)-1)*( lda) + (a_1)-1) #define dttest(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt)) #define datest(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) char uplo_[2] = {uplo, 0}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex c_one = MAGMA_C_ONE ; magmaFloatComplex c_zero = MAGMA_C_ZERO; float d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nstream>=3); assert (nstream>=(ngpu+1)); *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { MAGMA_C_SET2REAL( work[0], lwkopt ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_int_t mklth = min(threads,16); magma_setlapack_numthreads(mklth); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n" , nbcmplx); #endif magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magmaFloatComplex *dspace[MagmaMaxGPUs]; magmaFloatComplex *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; magmaFloatComplex *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; magmaFloatComplex *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_cmalloc( &dspace[dev], devworksiz ); magma_cmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( streams[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_cmalloc_pinned ( &workngpu[ngpu], worksiz); magmaFloatComplex *worktest = NULL; //(magmaFloatComplex *) malloc(n*nb*sizeof(magmaFloatComplex)); // not used // ====================== magmaFloatComplex *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(magmaFloatComplex)); if (upper) { printf("CHETRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); }else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ){ // cpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. cpanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_cgetmatrix_async( (pm+pn), pn, datest(idev, i, di+1), ldda, a_ref ( i, i), lda, streams[ idev ][ nstream-1 ] ); /* magma_device_sync(); cudaMemcpy2DAsync(a_ref(i,i), lda*sizeof(magmaFloatComplex), datest(idev,i,di+1), ldda*sizeof(magmaFloatComplex), (pm+pn)*sizeof(magmaFloatComplex), pn, cudaMemcpyDeviceToHost, streams[ idev ][ nstream-1 ]); */ //magma_setdevice( 0 ); //printf("updating cher2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute CHER2K_MGPU magmablas_cher2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, streams, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( streams[idev][ nstream-1 ] ); //magma_setdevice( 0 ); cq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ lapackf77_cgeqrf(&pm, &pn, a_ref(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, a_ref(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ cpanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the CHER2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by cher2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_csetmatrix_async( pm, pk, a_ref(indi, indj), lda, dv[dev], pm, streams[dev][nstream-1] ); // Send the triangular factor T to the GPU magma_csetmatrix_async( pk, pk, hT, nb, dttest(dev, 1, i), lddt, streams[dev][nstream-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ nstream-1 ] ); magma_queue_sync( streams[dev][nstream-1] ); magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dttest(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nstream; ++s ) magma_queue_sync( streams[dev][s] ); } // compute CHEMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if(ngpu==1){ magmablasSetKernelStream( streams[ 0 ][ 0 ] ); magma_chemm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); }else{ magmablas_chemm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, streams, nstream-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of CHEMM has been done. // Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); // magma_queue_sync( streams[dev][0] ); magma_cgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ cq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next CHER2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( streams[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb){ /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( streams[ idev ][ nstream-1 ] ); //magma_queue_sync( streams[idev][0] ); removed because the sync has been done in the loop above magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev] , pm, c_one, datest(idev, indi, di+1), ldda); magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev] , pm, dv[idev], pm, c_one, datest(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( streams[ idev ][ 0 ] ); //printf("LAST CHER2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_cher2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev] , pm, d_one, datest(idev, indi, di+1), ldda); /* Send the last block to the CPU */ cpanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); magma_cgetmatrix( pk, pk, datest(idev, indi, di+1), ldda, a_ref(n-pk+1, n-pk+1), lda ); cq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for(i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { cudaEventDestroy(redevents[dev][e]); } } magma_free_pinned(workngpu[ngpu]); free(worktest); magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); MAGMA_C_SET2REAL( work[0], lwkopt ); magma_setlapack_numthreads(1); return *info; } /* chetrd_he2hb_ */
extern "C" magma_int_t magma_ssygvdx_2stage(magma_int_t itype, char jobz, char range, char uplo, magma_int_t n, float *a, magma_int_t lda, float *b, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SSYGVDX_2STAGE computes all the eigenvalues, and optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) DOUBLE PRECISION array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) DOUBLE PRECISION array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 2). If JOBZ = 'V' and N > 1, LWORK >= LQ2 + 1 + 6*N + 2*N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by MAGMA_BULGE_GET_LQ2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: ZPOTRF or ZHEEVD returned an error code: <= N: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details =============== Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if ZHEEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; float d_one = MAGMA_S_ONE; float *da; float *db; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; char trans[1]; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin; magma_int_t liwmin; magma_queue_t stream; magma_queue_create( &stream ); /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1 || liwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -3; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_sbulge_nb(n, threads); magma_int_t lq2 = magma_sbulge_get_lq2(n, threads); if (wantz) { lwmin = lq2 + 1 + 6*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n * (nb + 2); liwmin = 1; } work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (liwork < liwmin && ! lquery) { *info = -19; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_ssygvd(&itype, jobz_, uplo_, &n, a, &lda, b, &ldb, w, work, &lwork, iwork, &liwork, info); *m = n; return *info; } if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_ssetmatrix( n, n, b, ldb, db, lddb ); magma_ssetmatrix_async( n, n, a, lda, da, ldda, stream ); #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_spotrf_gpu(uplo_[0], n, db, lddb, info); if (*info != 0) { *info = n + *info; return *info; } #ifdef ENABLE_TIMER end = get_current_time(); printf("time spotrf_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_queue_sync( stream ); magma_sgetmatrix_async( n, n, db, lddb, b, ldb, stream ); #ifdef ENABLE_TIMER start = get_current_time(); #endif /* Transform problem to standard eigenvalue problem and solve. */ magma_ssygst_gpu(itype, uplo, n, da, ldda, db, lddb, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time ssygst_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_sgetmatrix( n, n, da, ldda, a, lda ); magma_queue_sync( stream ); magma_free( da ); magma_free( db ); #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_ssyevdx_2stage(jobz, range, uplo, n, a, lda, vl, vu, il, iu, m, w, work, lwork, iwork, liwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time ssyevdx_2stage = %6.2f\n", GetTimerValue(start,end)/1000.); #endif if (wantz && *info == 0) { if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_ssetmatrix( n, *m, a, lda, da, ldda ); magma_ssetmatrix( n, n, b, ldb, db, lddb ); /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaConjTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_strsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, d_one, db, lddb, da, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaConjTrans; } magma_strmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, d_one, db, lddb, da, ldda); } magma_sgetmatrix( n, *m, da, ldda, a, lda ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time strsm/mm + getmatrix = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_free( da ); magma_free( db ); } magma_queue_destroy( stream ); work[0] = lwmin * (1. + lapackf77_slamch("Epsilon")); iwork[0] = liwmin; return *info; } /* ssygvdx_2stage */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; real_Double_t gpu_perf1, gpu_time1, gpu_perf2, gpu_time2, gpu_perf3, gpu_time3, alloc_time, free_time; double error; double *h_A; double *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu, NB; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_int_t P=-1; /*Number of threads in the CPU part*/ double d_cpu=-1; /*pourcentgae of the matrix to allocate in the cpu part*/ magma_int_t Pr=-1; /*Number of threads for the panel*/ magma_int_t async_nb; /*Block size*/ double *WORK; magma_int_t WORK_LD, WORK_n; double **dlpanelT; magma_int_t dlpanelT_m, dlpanelT_n; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); P = opts.nthread; async_nb = opts.nb; Pr = opts.panel_nthread; d_cpu = 0.0; #if defined(CPU_PEAK) && defined(GPU_PEAK) d_cpu = magma_amc_recommanded_dcpu(opts.nthread, CPU_PEAK, opts.ngpu, GPU_PEAK); #endif if(opts.fraction_dcpu!=0){ /*Overwrite the one computed with the model*/ d_cpu = opts.fraction_dcpu; } magma_assert(d_cpu > 0 && d_cpu<=1.0, "error: The cpu fraction is invalid. Ensure you use --fraction_dcpu with fraction_dcpu in [0.0, 1.0] or compile with both -DCPU_PEAK=<cpu peak performance> and -DGPU_PEAK=<gpu peak performance> set.\n"); printf("Asynchronous recursif LU... nb:%d, nbcores:%d, dcpu:%f, panel_nbcores:%d, ngpu: %d\n", async_nb, P, d_cpu, Pr, opts.ngpu); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) GPU_Async_v2 GFlop/s (sec) GPU_Async_work_v2 GFlop/s (sec)"); if ( opts.check == 2 ) { printf(" |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; //nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); /*set default number of threads for lapack*/ magma_setlapack_numthreads(P); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_dgetrf_nb( M ); // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time1 = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time1 = magma_wtime() - gpu_time1; gpu_perf1 = gflops / gpu_time1; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ==================================================================== Performs operation using MAGMA_Async: This interface allocate workspace internally =================================================================== */ /*For the benchmark we have 2 approaches*/ /*1. use directly magma_amc */ /*2. use magma_amc_work and add pinned memory time*/ /*We choose approach 2*/ /* nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); magma_amc_init(P, d_cpu, Pr, nb); gpu_time2 = magma_wtime(); magma_dgetrf_async_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time2 = magma_wtime() - gpu_time2; gpu_perf2 = gflops / gpu_time2; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } */ /* ==================================================================== Performs operation using MAGMA_Async_Work =================================================================== */ nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); //Compute workspace dimension WORK_LD = M; NB = (int) ceil( (double) N / nb); WORK_n = (int) ceil(N*d_cpu)+nb; /*TODO:remove +nb replace with A_N*/ //WORK_n = NSplit(NB, d_cpu)*nb; if(WORK_n<nb) WORK_n = nb;//make sure workspace has at least one block column //Make LD and n multiple of 32 //if(WORK_LD%32!=0) WORK_LD = ((WORK_LD + 31)/32)*32; //if(WORK_n%32!=0) WORK_n = ((WORK_n + 31)/32)*32; //Allocate workspace alloc_time = magma_wtime(); if (MAGMA_SUCCESS != magma_dmalloc_pinned(&WORK, WORK_LD*WORK_n)) { //if (MAGMA_SUCCESS != magma_dmalloc_cpu(&WORK, WORK_LD*WORK_n)) { info = MAGMA_ERR_HOST_ALLOC; printf("magma_dmalloc_pinned returned error %d: %s.\n ", (int) info); } /* Workspace for the panels on the GPU*/ dlpanelT_m = WORK_n; /*assume that the cpu and gpu use the same buffer size*/ dlpanelT_n = M; dlpanelT = (double **) malloc(ngpu*sizeof(double*)); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); if (MAGMA_SUCCESS != magma_dmalloc(&dlpanelT[dev], dlpanelT_m*dlpanelT_n)) { info = MAGMA_ERR_DEVICE_ALLOC; printf("magma_dmalloc returned error %d: %s.\n ", (int) info); } } alloc_time = magma_wtime() - alloc_time; //First touch the workspace with each thread. This may be needed to avoid using numactl --interleave //magma_amc_dmemset(WORK, 0.0, WORK_LD*WORK_n, 256, P); //nb //#pragma omp parallel for private(info) schedule(static,nb) //for(info=0;info<WORK_LD*WORK_n;info++) WORK[info] = 0.0; //alternative first touch by the thread magma_amc_init(P, d_cpu, Pr, nb); gpu_time3 = magma_wtime(); magma_dgetrf_mgpu_work_amc_v3(ngpu, M, N, d_lA, ldda, ipiv, &info, WORK, WORK_LD, WORK_n); gpu_time3 = magma_wtime() - gpu_time3; gpu_perf3 = gflops / gpu_time3; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); //Free workspace free_time = magma_wtime(); magma_free_pinned(WORK); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); magma_free(dlpanelT[dev]); } free(dlpanelT); free_time = magma_wtime() - free_time; /*DEDUCE t2, JUST FOR THE BENCHMARK*/ gpu_time2 = gpu_time3 + alloc_time + free_time; gpu_perf2 = gflops / gpu_time2; for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } */ printf("%5d %5d", (int) M, (int) N); if(cpu_perf!=0.0){ printf(" %7.2f (%7.2f)", cpu_perf, cpu_time); } else{ printf(" --- ( --- )"); } if(gpu_perf1!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf1, gpu_time1); } else{ printf(" --- ( --- )"); } if(gpu_perf2!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf2, gpu_time2); } else{ printf(" --- ( --- )"); } if(gpu_perf3!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf3, gpu_time3); } else{ printf(" --- ( --- )"); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_sstedx_m(magma_int_t nrgpu, char range, magma_int_t n, float vl, float vu, magma_int_t il, magma_int_t iu, float* d, float* e, float* z, magma_int_t ldz, float* work, magma_int_t lwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 .. Scalar Arguments .. CHARACTER RANGE INTEGER IL, IU, INFO, LDZ, LIWORK, LWORK, N REAL VL, VU .. .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), WORK( * ), Z( LDZ, * ), $ DWORK ( * ) .. Purpose ======= SSTEDX computes some eigenvalues and, optionally, eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. See SLAEX3 for details. Arguments ========= RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. N (input) INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. D (input/output) REAL array, dimension (N) On entry, the diagonal elements of the tridiagonal matrix. On exit, if INFO = 0, the eigenvalues in ascending order. E (input/output) REAL array, dimension (N-1) On entry, the subdiagonal elements of the tridiagonal matrix. On exit, E has been destroyed. Z (input/output) REAL array, dimension (LDZ,N) On exit, if INFO = 0, Z contains the orthonormal eigenvectors of the symmetric tridiagonal matrix. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= max(1,N). WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If N > 1 then LWORK >= ( 1 + 4*N + N**2 ). Note that if N is less than or equal to the minimum divide size, usually 25, then LWORK need only be max(1,2*(N-1)). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. LIWORK >= ( 3 + 5*N ). Note that if N is less than or equal to the minimum divide size, usually 25, then LIWORK need only be 1. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the IWORK array, returns this value as the first entry of the IWORK array, and no error message related to LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. ===================================================================== */ char range_[2] = {range, 0}; float d_zero = 0.; float d_one = 1.; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t alleig, indeig, valeig, lquery; magma_int_t i, j, k, m; magma_int_t liwmin, lwmin; magma_int_t start, end, smlsiz; float eps, orgnrm, p, tiny; // Test the input parameters. alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1 || liwork == -1; *info = 0; if (! (alleig || valeig || indeig)) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldz < max(1,n)) { *info = -10; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -4; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -5; } else if (iu < min(n,il) || iu > n) { *info = -6; } } } if (*info == 0) { // Compute the workspace requirements smlsiz = magma_get_smlsize_divideconquer(); if( n <= 1 ){ lwmin = 1; liwmin = 1; } else { lwmin = 1 + 4*n + n*n; liwmin = 3 + 5*n; } work[0] = lwmin; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -12; } else if (liwork < liwmin && ! lquery) { *info = -14; } } if (*info != 0) { magma_xerbla( __func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) { return MAGMA_SUCCESS; } // Quick return if possible if(n==0) return MAGMA_SUCCESS; if(n==1){ *z = 1.; return MAGMA_SUCCESS; } /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); #ifdef ENABLE_DEBUG printf(" D&C_m is using %d threads\n", threads); #endif // If N is smaller than the minimum divide size (SMLSIZ+1), then // solve the problem with another solver. if (n < smlsiz){ char char_I[]= {'I', 0}; lapackf77_ssteqr(char_I, &n, d, e, z, &ldz, work, info); } else { char char_F[]= {'F', 0}; lapackf77_slaset(char_F, &n, &n, &d_zero, &d_one, z, &ldz); //Scale. char char_M[]= {'M', 0}; orgnrm = lapackf77_slanst(char_M, &n, d, e); if (orgnrm == 0){ work[0] = lwmin; iwork[0] = liwmin; return MAGMA_SUCCESS; } eps = lapackf77_slamch( "Epsilon" ); if (alleig){ start = 0; while ( start < n ){ // Let FINISH be the position of the next subdiagonal entry // such that E( END ) <= TINY or FINISH = N if no such // subdiagonal exists. The matrix identified by the elements // between START and END constitutes an independent // sub-problem. for(end = start+1; end < n; ++end){ tiny = eps * sqrt( MAGMA_S_ABS(d[end-1]*d[end])); if (MAGMA_S_ABS(e[end-1]) <= tiny) break; } // (Sub) Problem determined. Compute its size and solve it. m = end - start; if (m==1){ start = end; continue; } if (m > smlsiz){ // Scale char char_G[] = {'G', 0}; orgnrm = lapackf77_slanst(char_M, &m, &d[start], &e[start]); lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &m, &ione, &d[start], &m, info); magma_int_t mm = m-1; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &mm, &ione, &e[start], &mm, info); magma_slaex0_m( nrgpu, m, &d[start], &e[start], Z(start, start), ldz, work, iwork, 'A', vl, vu, il, iu, info); if( *info != 0) { return MAGMA_SUCCESS; } // Scale Back lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &m, &ione, &d[start], &m, info); } else { char char_I[]= {'I', 0}; lapackf77_ssteqr( char_I, &m, &d[start], &e[start], Z(start, start), &ldz, work, info); if (*info != 0){ *info = (start+1) *(n+1) + end; } } start = end; } // If the problem split any number of times, then the eigenvalues // will not be properly ordered. Here we permute the eigenvalues // (and the associated eigenvectors) into ascending order. if (m < n){ // Use Selection Sort to minimize swaps of eigenvectors for (i = 1; i < n; ++i){ k = i-1; p = d[i-1]; for (j = i; j < n; ++j){ if (d[j] < p){ k = j; p = d[j]; } } if(k != i-1) { d[k] = d[i-1]; d[i-1] = p; blasf77_sswap(&n, Z(0,i-1), &ione, Z(0,k), &ione); } } } } else { // Scale char char_G[] = {'G', 0}; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &n, &ione, d, &n, info); magma_int_t nm = n-1; lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &nm, &ione, e, &nm, info); magma_slaex0_m(nrgpu, n, d, e, z, ldz, work, iwork, range, vl, vu, il, iu, info); if( *info != 0) { return MAGMA_SUCCESS; } // Scale Back lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &n, &ione, d, &n, info); } } work[0] = lwmin; iwork[0] = liwmin; return MAGMA_SUCCESS; } /* magma_sstedx_m */