extern "C" void magma_slarfxsym( magma_int_t N, float *A, magma_int_t LDA, float *V, float *TAU) { magma_int_t IONE=1; float dtmp; float Z_ZERO = MAGMA_S_ZERO; //float Z_ONE = MAGMA_S_ONE; float Z_MONE = MAGMA_S_NEG_ONE; float Z_HALF = MAGMA_S_HALF; //float WORK[N]; float *WORK; magma_smalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_slarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_ssymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_sdot(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_saxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_ssyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
extern "C" void magma_strdtype1cbHLsym_withQ( magma_int_t N, magma_int_t NB, float *A, magma_int_t LDA, float *V, float *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { //magma_int_t J1, J2, J3, i, j; magma_int_t len, LDX; magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; //float conjtmp; float Z_ONE = MAGMA_S_ONE; float *WORK; magma_smalloc_cpu( &WORK, N ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); //printf("voici vpos %d taupos %d tpos %d blkid %d \n", vpos, taupos, tpos, blkid); LDX = LDA-1; len = ed-st+1; *V(vpos) = Z_ONE; memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(float)); memset(A(st+1, st-1), 0, (len-1)*sizeof(float)); /* Eliminate the col at st-1 */ lapackf77_slarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) ); /* apply left and right on A(st:ed,st:ed)*/ magma_slarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos)); //conjtmp = MAGMA_S_CNJG(*TAU(taupos)); //lapackf77_slarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_S_CNJG(*TAU(taupos))) magma_free_cpu(WORK); }
magma_int_t magma_snan_inf_gpu( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloat_const_ptr dA, magma_int_t dA_offset, magma_int_t ldda, magma_int_t *cnt_nan, magma_int_t *cnt_inf, magma_queue_t queue ) { magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper && uplo != MagmaFull ) info = -1; else if ( m < 0 ) info = -2; else if ( n < 0 ) info = -3; else if ( ldda < max(1,m) ) info = -5; if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } magma_int_t lda = m; float* A; magma_smalloc_cpu( &A, lda*n ); magma_sgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue ); magma_int_t cnt = magma_snan_inf( uplo, m, n, A, lda, cnt_nan, cnt_inf ); magma_free_cpu( A ); return cnt; }
extern "C" void magma_strdtype3cbHLsym_withQ( magma_int_t N, magma_int_t NB, float *A, magma_int_t LDA, float *V, float *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { //magma_int_t J1, J2, J3, i, j; magma_int_t len, LDX; //magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; //float conjtmp; float *WORK; magma_smalloc_cpu( &WORK, N ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); LDX = LDA-1; len = ed-st+1; /* apply left and right on A(st:ed,st:ed)*/ magma_slarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos)); //conjtmp = MAGMA_S_CNJG(*TAU(taupos)); //lapackf77_slarfy("L", &len, V(vpos), &IONE, &(MAGMA_S_CNJG(*TAU(taupos))), A(st,st), &LDX, WORK); magma_free_cpu(WORK); }
void magma_sprint_gpu( magma_int_t m, magma_int_t n, const float *dA, magma_int_t ldda ) { magma_int_t info = 0; if ( m < 0 ) info = -1; else if ( n < 0 ) info = -2; else if ( ldda < max(1,m) ) info = -4; if (info != 0) { magma_xerbla( __func__, -(info) ); return; //info; } magma_int_t lda = m; float* A; magma_smalloc_cpu( &A, lda*n ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_sgetmatrix( m, n, dA, ldda, A, lda, queue ); magma_queue_destroy( queue ); magma_sprint( m, n, A, lda ); magma_free_cpu( A ); }
void magma_sprint_gpu( magma_int_t m, magma_int_t n, const float *dA, magma_int_t ldda ) { magma_int_t info = 0; if ( m < 0 ) info = -1; else if ( n < 0 ) info = -2; else if ( magma_is_devptr( dA ) == 0 ) info = -3; else if ( ldda < max(1,m) ) info = -4; if (info != 0) { magma_xerbla( __func__, -(info) ); return; //info; } magma_int_t lda = m; float* A; magma_smalloc_cpu( &A, lda*n ); magma_sgetmatrix( m, n, dA, ldda, A, lda ); magma_sprint( m, n, A, lda ); magma_free_cpu( A ); }
extern "C" magma_int_t magma_svinit( magma_s_matrix *x, magma_location_t mem_loc, magma_int_t num_rows, magma_int_t num_cols, float values, magma_queue_t queue ) { magma_int_t info = 0; x->val = NULL; x->diag = NULL; x->row = NULL; x->rowidx = NULL; x->col = NULL; x->list = NULL; x->blockinfo = NULL; x->dval = NULL; x->ddiag = NULL; x->drow = NULL; x->drowidx = NULL; x->dcol = NULL; x->dlist = NULL; x->storage_type = Magma_DENSE; x->memory_location = mem_loc; x->sym = Magma_GENERAL; x->diagorder_type = Magma_VALUE; x->fill_mode = MagmaFull; x->num_rows = num_rows; x->num_cols = num_cols; x->nnz = num_rows*num_cols; x->max_nnz_row = num_cols; x->diameter = 0; x->blocksize = 1; x->numblocks = 1; x->alignment = 1; x->major = MagmaColMajor; x->ld = num_rows; if ( mem_loc == Magma_CPU ) { CHECK( magma_smalloc_cpu( &x->val, x->nnz )); for( magma_int_t i=0; i<x->nnz; i++) { x->val[i] = values; } } else if ( mem_loc == Magma_DEV ) { CHECK( magma_smalloc( &x->val, x->nnz )); magmablas_slaset( MagmaFull, x->num_rows, x->num_cols, values, values, x->val, x->num_rows, queue ); } cleanup: return info; }
magma_err_t magma_stranspose_host( int flag, int d, magmaFloat_ptr odata, int offseto, int ldo, magmaFloat_ptr idata, int offseti, int ldi, int m, int n, magma_queue_t queue ) { int i, j; float *work1, *work2; if (MAGMA_SUCCESS != magma_smalloc_cpu( &work1, m*n )) { return MAGMA_ERR_HOST_ALLOC; } if (MAGMA_SUCCESS != magma_smalloc_cpu( &work2, m*n )) { return MAGMA_ERR_HOST_ALLOC; } /* download to CPU */ magma_sgetmatrix_async( m, n, idata, offseti, ldi, work1, 0, m, queue, NULL ); magma_queue_sync( queue ); /* transpose it on CPU */ for( i=0; i<m; i++ ) { for( j=0; j<n; j++ ) work2[j+i*n] = work1[i+j*m]; } /* upload to GPU */ magma_ssetmatrix_async( n, m, work2, 0, n, odata, offseto, ldo, queue, NULL ); magma_queue_sync( queue ); magma_free_cpu( work2 ); magma_free_cpu( work1 ); return 0; }
////////////////////////////////////////////////////////////// // CSTEDC Divide and Conquer for tridiag ////////////////////////////////////////////////////////////// extern "C" void magma_cstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float * E, magmaFloatComplex *Z, magma_int_t LDZ) { float *RWORK; float *dwork; magma_int_t *IWORK; magma_int_t LIWORK, LRWORK; magma_int_t INFO; //LWORK = N; LRWORK = 2*N*N + 4*N + 1 + 256*N; LIWORK = 256*N; magma_smalloc_cpu( &RWORK, LRWORK ); magma_imalloc_cpu( &IWORK, LIWORK ); if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) { printf("=================================================\n"); printf("CSTEDC ERROR OCCURED IN CUDAMALLOC\n"); printf("=================================================\n"); return; } printf("using magma_cstedx\n"); magma_timer_t time=0; timer_start( time ); magma_range_t job = MagmaRangeI; if (NE == N) job = MagmaRangeAll; magma_cstedx(job, N, 0., 0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO); if (INFO != 0) { printf("=================================================\n"); printf("CSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO); printf("=================================================\n"); //assert(INFO == 0); } timer_stop( time ); timer_printf( "time zstevx = %6.2f\n", time ); magma_free( dwork ); magma_free_cpu( IWORK ); magma_free_cpu( RWORK ); }
////////////////////////////////////////////////////////////// // SSTEDX Divide and Conquer for tridiag ////////////////////////////////////////////////////////////// extern "C" void magma_sstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float * E, float *Z, magma_int_t LDZ) { float *WORK; float *dwork; magma_int_t *IWORK; magma_int_t LWORK, LIWORK; magma_int_t INFO; LWORK = N*N+4*N+1; LIWORK = 3 + 5*N; magma_smalloc_cpu( &WORK, LWORK ); magma_imalloc_cpu( &IWORK, LIWORK ); if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) { printf("=================================================\n"); printf("SSTEDC ERROR OCCURED IN CUDAMALLOC\n"); printf("=================================================\n"); return; } printf("using magma_sstedx\n"); magma_timer_t time=0; timer_start( time ); //magma_range_t job = MagmaRangeI; //if (NE == N) // job = MagmaRangeAll; magma_sstedx(MagmaRangeI, N, 0., 0., 1, NE, D, E, Z, LDZ, WORK, LWORK, IWORK, LIWORK, dwork, &INFO); if (INFO != 0) { printf("=================================================\n"); printf("SSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO); printf("=================================================\n"); //assert(INFO == 0); } timer_stop( time ); timer_printf( "time sstedx = %6.2f\n", time ); magma_free( dwork ); magma_free_cpu( IWORK ); magma_free_cpu( WORK ); }
////////////////////////////////////////////////////////////// // CSTEDC Divide and Conquer for tridiag ////////////////////////////////////////////////////////////// extern "C" void magma_cstedc_withZ(magma_vec_t JOBZ, magma_int_t N, float *D, float * E, magmaFloatComplex *Z, magma_int_t LDZ) { magmaFloatComplex *WORK; float *RWORK; magma_int_t *IWORK; magma_int_t LWORK, LIWORK, LRWORK; magma_int_t INFO; // use log() as log2() is not defined everywhere (e.g., Windows) const float log_2 = 0.6931471805599453; if (JOBZ == MagmaVec) { LWORK = N*N; LRWORK = 1 + 3*N + 3*N*((magma_int_t)(log( (float)N )/log_2) + 1) + 4*N*N + 256*N; LIWORK = 6 + 6*N + 6*N*((magma_int_t)(log( (float)N )/log_2) + 1) + 256*N; } else if (JOBZ == MagmaIVec) { LWORK = N; LRWORK = 2*N*N + 4*N + 1 + 256*N; LIWORK = 256*N; } else if (JOBZ == MagmaNoVec) { LWORK = N; LRWORK = 256*N + 1; LIWORK = 256*N; } else { printf("ERROR JOBZ %c\n", JOBZ); exit(-1); } magma_smalloc_cpu( &RWORK, LRWORK ); magma_cmalloc_cpu( &WORK, LWORK ); magma_imalloc_cpu( &IWORK, LIWORK ); lapackf77_cstedc( lapack_vec_const(JOBZ), &N, D, E, Z, &LDZ, WORK, &LWORK, RWORK, &LRWORK, IWORK, &LIWORK, &INFO); if (INFO != 0) { printf("=================================================\n"); printf("CSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO); printf("=================================================\n"); //assert(INFO == 0); } magma_free_cpu( IWORK ); magma_free_cpu( WORK ); magma_free_cpu( RWORK ); }
extern "C" void magma_strdtype2cbHLsym_withQ( magma_int_t N, magma_int_t NB, float *A, magma_int_t LDA, float *V, float *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { magma_int_t J1, J2, len, lem, LDX; //magma_int_t i, j; magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; float conjtmp; float Z_ONE = MAGMA_S_ONE; //float WORK[NB]; float *WORK; magma_smalloc_cpu( &WORK, NB ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); LDX = LDA-1; J1 = ed+1; J2 = min(ed+NB,N); len = ed-st+1; lem = J2-J1+1; if (lem > 0) { /* apply remaining right commming from the top block */ lapackf77_slarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK); } if (lem > 1) { findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid); /* remove the first column of the created bulge */ *V(vpos) = Z_ONE; memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(float)); memset(A(J1+1, st),0,(lem-1)*sizeof(float)); /* Eliminate the col at st */ lapackf77_slarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) ); /* apply left on A(J1:J2,st+1:ed) */ len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */ conjtmp = MAGMA_S_CNJG(*TAU(taupos)); lapackf77_slarfx("L", &lem, &len, V(vpos), &conjtmp, A(J1, st+1), &LDX, WORK); } magma_free_cpu(WORK); }
extern "C" magma_int_t magma_slobpcg( magma_s_sparse_matrix A, magma_s_solver_par *solver_par ) { #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_s_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_stranspose( m, n, X, m, blockW, n ); \ magma_s_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_s_spmv(alpha, A, x, beta, ax ); \ magmablas_stranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); float *blockX = solver_par->eigenvectors; float *evalues = solver_par->eigenvalues; float *dwork, *hwork; float *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; float *gramA, *gramB, *gramM; float *gevectors, *h_gramB; float *pointer, *origX = blockX; float *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_smalloc_pinned( &hwork , lwork ); magma_smalloc( &blockAX , m*n ); magma_smalloc( &blockAR , m*n ); magma_smalloc( &blockAP , m*n ); magma_smalloc( &blockR , m*n ); magma_smalloc( &blockP , m*n ); magma_smalloc( &blockW , m*n ); magma_smalloc( &dwork , m*n ); magma_smalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === float residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === float c_one = MAGMA_S_ONE, c_zero = MAGMA_S_ZERO; float *residualNorms, *condestGhistory, condestG; float *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_smalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_smalloc_cpu(&condestGhistory, maxIterations+1); magma_smalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); float *hW; magma_smalloc_pinned(&hW, n*n); magma_smalloc_pinned(&gevectors, 9*n*n); magma_smalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_smalloc(&gramM, n * n); magma_smalloc(&gramA, 9 * n * n); magma_smalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_smalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial vectors orthonormal === magma_sgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_sorthomgs( m, n, blockX ); magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_saxpy(m, MAGMA_S_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_slobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_scompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_slacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_sgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockR ); // === compute AR magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_scompactActive(m, n, blockP, m, activeMask); magma_scompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ? magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_sgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockP ); //magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_ssetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); float condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_ssyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_S_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_ssygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_ssetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_sgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_saxpy(m*n, c_one, blockP, 1, blockX, 1); magma_slobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_saxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_slobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // float res; // magma_sgetmatrix(1, 1, // (float*)residualNorms(0, iterationNumber), 1, // (float*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter) { solver_par->info = 0; } else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_saxpy(m, MAGMA_S_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_slacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== float *hresidualNorms; magma_smalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_sgetmatrix(n, iterationNumber, (float*)residualNorms, n, (float*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr( magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, magmaFloat_ptr dT, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sorgqr */
extern "C" magma_int_t magma_sorgqr(magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. DT (input) REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. NB (input) INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_slaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_sorgqr */
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] T REAL array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr_m( magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t d, i, ib, j, jb, ki, kk; float *work=NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = magma_roundup( m, 32 ); magma_int_t lddwork = magma_roundup( n, 32 ); magma_int_t min_lblocks = (n / nb) / ngpu; // min. blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; float *dA[ MagmaMaxGPUs ] = { NULL }; float *dT[ MagmaMaxGPUs ] = { NULL }; float *dV[ MagmaMaxGPUs ] = { NULL }; float *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t queues[ MagmaMaxGPUs ] = { NULL }; for( d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( d, &queues[d] ); } trace_init( 1, ngpu, 1, queues ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for larfb work // m*nb for V // nb*nb for T lwork = (n + m + nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } float *work_T, *work_V; work_T = work + n*nb; work_V = work + n*nb + nb*nb; // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size. // replacing it with the 4 routines below is much faster (e.g., 60x). //magma_int_t iinfo; //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, // A(kk, kk), &lda, // &tau[kk], work, &lwork, &iinfo ); lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk); lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, work_V, &m_kk, &tau[kk], work_T, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk ); if (kk > 0) { for( j=kk; j < n; j += nb ) { jb = min( n-j, nb ); d = (j / nb) % ngpu; di = ((j / nb) / ngpu) * nb; magma_setdevice( d ); magma_ssetmatrix( m_kk, jb, A(kk, j), lda, dA(d, kk, di), ldda, queues[d] ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] ); } } trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] ); trace_gpu_end( d, 0 ); } // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to dV on the GPUs lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, queues[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda, queues[dpanel] ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork, queues[d] ); trace_gpu_end( d, 0 ); } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues ); trace_cpu_end( 0 ); } #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif cleanup: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( queues[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); return *info; } /* magma_sorgqr */
magma_int_t magma_ssolverinfo_init( magma_s_solver_par *solver_par, magma_s_preconditioner *precond_par ){ /* solver_par->solver = Magma_CG; solver_par->maxiter = 1000; solver_par->numiter = 0; solver_par->ortho = Magma_CGS; solver_par->epsilon = RTOLERANCE; solver_par->restart = 30; solver_par->init_res = 0.; solver_par->final_res = 0.; solver_par->runtime = 0.; solver_par->verbose = 0; solver_par->info = 0; */ if( solver_par->verbose > 0 ){ magma_malloc_cpu( (void **)&solver_par->res_vec, sizeof(real_Double_t) * ( (solver_par->maxiter)/(solver_par->verbose)+1) ); magma_malloc_cpu( (void **)&solver_par->timing, sizeof(real_Double_t) *( (solver_par->maxiter)/(solver_par->verbose)+1) ); }else{ solver_par->res_vec = NULL; solver_par->timing = NULL; } if( solver_par->solver == Magma_LOBPCG ){ magma_smalloc_cpu( &solver_par->eigenvalues , 3*solver_par->num_eigenvalues ); // setup initial guess EV using lapack // then copy to GPU magma_int_t ev = solver_par->num_eigenvalues * solver_par->ev_length; float *initial_guess; magma_smalloc_cpu( &initial_guess, ev ); magma_smalloc( &solver_par->eigenvectors, ev ); magma_int_t ISEED[4] = {0,0,0,1}, ione = 1; lapackf77_slarnv( &ione, ISEED, &ev, initial_guess ); magma_ssetmatrix( solver_par->ev_length, solver_par->num_eigenvalues, initial_guess, solver_par->ev_length, solver_par->eigenvectors, solver_par->ev_length ); magma_free_cpu( initial_guess ); }else{ solver_par->eigenvectors = NULL; solver_par->eigenvalues = NULL; } precond_par->d.val = NULL; precond_par->M.val = NULL; precond_par->M.col = NULL; precond_par->M.row = NULL; precond_par->M.blockinfo = NULL; precond_par->L.val = NULL; precond_par->L.col = NULL; precond_par->L.row = NULL; precond_par->L.blockinfo = NULL; precond_par->U.val = NULL; precond_par->U.col = NULL; precond_par->U.row = NULL; precond_par->U.blockinfo = NULL; precond_par->LD.val = NULL; precond_par->LD.col = NULL; precond_par->LD.row = NULL; precond_par->LD.blockinfo = NULL; precond_par->UD.val = NULL; precond_par->UD.col = NULL; precond_par->UD.row = NULL; precond_par->UD.blockinfo = NULL; return MAGMA_SUCCESS; }
/** Purpose ------- SGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] wr REAL array, dimension (N) @param[out] wi REAL array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. @param[out] VL REAL array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR REAL array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (2 + nb + nb*ngpu)*N. For optimal performance, LWORK >= (2 + 2*nb + nb*ngpu)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. @ingroup magma_sgeev_driver ********************************************************************/ extern "C" magma_int_t magma_sgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, float *A, magma_int_t lda, #ifdef COMPLEX float *w, #else float *wr, float *wi, #endif float *VL, magma_int_t ldvl, float *VR, magma_int_t ldvr, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; float d__1, d__2; float r, cs, sn, scl; float dum[1], eps; float anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, optwrk, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; magma_int_t ngpu = magma_num_gpus(); magma_timer_t time_total=0, time_gehrd=0, time_unghr=0, time_hseqr=0, time_trevc=0, time_sum=0; magma_flops_t flop_total=0, flop_gehrd=0, flop_unghr=0, flop_hseqr=0, flop_trevc=0, flop_sum=0; timer_start( time_total ); flops_start( flop_total ); *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ nb = magma_get_sgehrd_nb( n ); if (*info == 0) { minwrk = (2 + nb + nb*ngpu)*n; optwrk = (2 + 2*nb + nb*ngpu)*n; work[0] = magma_smake_lwork( optwrk ); if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) float *dT; if (MAGMA_SUCCESS != magma_smalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version5) float *T; if (MAGMA_SUCCESS != magma_smalloc_cpu( &T, nb*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_slamch( "P" ); smlnum = lapackf77_slamch( "S" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum = magma_ssqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_slange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_slascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (Workspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_sgebal( "B", &n, A, &lda, &ilo, &ihi, &work[ibal], &ierr ); /* Reduce to upper Hessenberg form * (Workspace: need 3*N, prefer 2*N + N*NB + NB*NGPU) * - added NB*NGPU needed for multi-GPU magma_sgehrd_m * - including N reserved for gebal/gebak, unused by sgehrd */ itau = ibal + n; iwrk = itau + n; liwrk = lwork - iwrk; timer_start( time_gehrd ); flops_start( flop_gehrd ); #if defined(Version1) // Version 1 - LAPACK lapackf77_sgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_sgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_sgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version5) // Version 4 - Multi-GPU, T on host magma_sgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); #endif time_sum += timer_stop( time_gehrd ); flop_sum += flops_stop( flop_gehrd ); if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_slacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by sorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_sorghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_sorghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_sorghr_m( n, ilo, ihi, VL, ldvl, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); timer_start( time_hseqr ); flops_start( flop_hseqr ); /* Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VL, &ldvl, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_slacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_slacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by sorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_sorghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_sorghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_sorghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); /* Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } else { /* Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "E", "N", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } /* If INFO > 0 from SHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } timer_start( time_trevc ); flops_start( flop_trevc ); if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (Workspace: need 4*N, prefer (2 + 2*nb)*N) * - including N reserved for gebal/gebak, unused by strevc */ liwrk = lwork - iwrk; #if TREVC_VERSION == 1 lapackf77_strevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &ierr ); #elif TREVC_VERSION == 2 lapackf77_strevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &ierr ); #elif TREVC_VERSION == 3 magma_strevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 4 magma_strevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 5 magma_strevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #else #error Unknown TREVC_VERSION #endif } time_sum += timer_stop( time_trevc ); flop_sum += flops_stop( flop_trevc ); if (wantvl) { /* Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_sgebak( "B", "L", &n, &ilo, &ihi, &work[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_snrm2( n, VL(0,i), 1 ); blasf77_sscal( &n, &scl, VL(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_snrm2( n, VL(0,i), 1 ); d__2 = magma_cblas_snrm2( n, VL(0,i+1), 1 ); scl = 1. / lapackf77_slapy2( &d__1, &d__2 ); blasf77_sscal( &n, &scl, VL(0,i), &ione ); blasf77_sscal( &n, &scl, VL(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VL(k,i); d__2 = *VL(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_slartg( VL(k,i), VL(k,i+1), &cs, &sn, &r ); blasf77_srot( &n, VL(0,i), &ione, VL(0,i+1), &ione, &cs, &sn ); *VL(k,i+1) = 0.; } } } if (wantvr) { /* Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_sgebak( "B", "R", &n, &ilo, &ihi, &work[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_snrm2( n, VR(0,i), 1 ); blasf77_sscal( &n, &scl, VR(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_snrm2( n, VR(0,i), 1 ); d__2 = magma_cblas_snrm2( n, VR(0,i+1), 1 ); scl = 1. / lapackf77_slapy2( &d__1, &d__2 ); blasf77_sscal( &n, &scl, VR(0,i), &ione ); blasf77_sscal( &n, &scl, VR(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VR(k,i); d__2 = *VR(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_slartg( VR(k,i), VR(k,i+1), &cs, &sn, &r ); blasf77_srot( &n, VR(0,i), &ione, VR(0,i+1), &ione, &cs, &sn ); *VR(k,i+1) = 0.; } } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in wr[i+1:n] and wi[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr + (*info), &ld, &ierr ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. nval = ilo - 1; lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr, &n, &ierr ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi, &n, &ierr ); } } #if defined(Version3) magma_free( dT ); #endif #if defined(Version5) magma_free_cpu( T ); #endif timer_stop( time_total ); flops_stop( flop_total ); timer_printf( "sgeev times n %5d, gehrd %7.3f, unghr %7.3f, hseqr %7.3f, trevc %7.3f, total %7.3f, sum %7.3f\n", (int) n, time_gehrd, time_unghr, time_hseqr, time_trevc, time_total, time_sum ); timer_printf( "sgeev flops n %5d, gehrd %7lld, unghr %7lld, hseqr %7lld, trevc %7lld, total %7lld, sum %7lld\n", (int) n, flop_gehrd, flop_unghr, flop_hseqr, flop_trevc, flop_total, flop_sum ); work[0] = magma_smake_lwork( optwrk ); return *info; } /* magma_sgeev */
extern "C" magma_int_t magma_sgeqrf_msub( magma_int_t num_subs, magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloat_ptr *dlA, magma_int_t ldda, float *tau, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(gpu,a_1,a_2) dlA[gpu], ((a_2)*(ldda) + (a_1)) #define dlA_offset(a_1, a_2) ((a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) magmaFloat_ptr dwork[MagmaMaxGPUs], panel[MagmaMaxGPUs]; size_t panel_offset[MagmaMaxGPUs]; float *local_work = NULL; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; int panel_id = -1, i_local, n_local[MagmaMaxGPUs * MagmaMaxSubs], la_id, displacement, tot_subs = num_gpus * num_subs; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for (i=0; i<num_gpus; i++) { if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for (i=0; i<tot_subs; i++) { n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; } #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*lwork, NULL, NULL); for (j=0; j<num_gpus; j++) { local_work = (float*)clEnqueueMapBuffer(queues[2*j], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(float)*lwork, 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_smalloc_cpu( (&local_work), lwork )) { *info = -9; for (i=0; i<num_gpus; i++) { magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_id = (i/nb)%tot_subs; /* Set the local index where the current panel is */ i_local = i/(nb*tot_subs)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ magma_queue_sync(queues[2*(panel_id%num_gpus)]); magma_sgetmatrix_async( rows, ib, dlA(panel_id, i, i_local), ldda, hwrk(i), ldwork, queues[2*(panel_id%num_gpus)+1], NULL ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_id = panel_id; /* only the GPU that has next panel is done look-ahead */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_id]-i_local-old_ib, old_ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, old_i, i_local+old_ib), ldda, dwork[la_id%num_gpus], old_ib, lddwork, queues[2*(la_id%num_gpus)]); la_id = ((i-nb)/nb)%tot_subs; magma_ssetmatrix_async( old_ib, old_ib, hwrk(old_i), ldwork, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, queues[2*(la_id%num_gpus)], NULL ); } magma_queue_sync( queues[2*(panel_id%num_gpus)+1] ); lapackf77_sgeqrf(&rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for (j=0; j<num_gpus; j++) { if (j == panel_id%num_gpus){ panel[j] = dlA(panel_id, i, i_local); panel_offset[j] = dlA_offset(i, i_local); } else { panel[j] = dwork[j]; panel_offset[j] = displacement; } magma_queue_sync( queues[2*j] ); magma_ssetmatrix_async( rows, ib, hwrk(i), ldwork, panel[j], panel_offset[j], ldda, queues[2*j+1], NULL ); /* Send the T matrix to the GPU. Has to be done with asynchronous copies */ magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], 0, lddwork, queues[2*j+1], NULL ); } for(j=0; j<num_gpus; j++) { magma_queue_sync( queues[2*j+1] ); } if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; for (j=0; j<tot_subs; j++) { if (j == la_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_loc), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else if (j <= panel_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local+ib), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); } else { /* do the entire update as we exit and there would be no lookahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_id]-i_loc, ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, i, i_loc), ldda, dwork[la_id%num_gpus], ib, lddwork, queues[2*(la_id%num_gpus)]); /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); magma_ssetmatrix( ib, ib, hwrk(i), ldwork, dlA(panel_id, i, i_local), ldda, queues[2*(panel_id%num_gpus)]); } old_i = i; old_ib = ib; } } } else { i = 0; } for (j=0; j<num_gpus; j++) { magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_id = (panel_id+1)%tot_subs; int i_loc = (i)/(nb*tot_subs)*nb; magma_sgetmatrix( rows, ib, dlA(panel_id, i, i_loc), ldda, lhwrk, rows, queues[2*(panel_id%num_gpus)]); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_id, i, i_loc), ldda, queues[2*(panel_id%num_gpus)]); } #ifdef USE_PINNED_CLMEMORY #else magma_free_cpu( local_work ); #endif return *info; } /* magma_sgeqrf_msub */
/** Purpose ------- SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_nopiv_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info) { #define dA(i,j) (dA + (i)*nb + (j)*nb*ldda) float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, mindim; magma_int_t i, rows, s, lddwork; float *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_sgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_smalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgetmatrix( m, n, dA, ldda, work, m ); magma_sgetrf_nopiv( m, n, work, m, info); magma_ssetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; lddwork = maxm; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, maxm*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( i=0; i < s; i++ ) { // download i-th panel magma_queue_sync( stream[1] ); magma_sgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] ); if ( i > 0 ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (i+1)*nb, c_one, dA(i-1,i-1), ldda, dA(i-1,i+1), ldda ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i*nb, n-(i+1)*nb, nb, c_neg_one, dA(i, i-1), ldda, dA(i-1,i+1), ldda, c_one, dA(i, i+1), ldda ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); magma_sgetrf_nopiv( rows, nb, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_ssetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] ); magma_queue_sync( stream[0] ); // do the small non-parallel computations if ( s > (i+1) ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } else { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, n-(i+1)*nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; magma_sgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part magma_sgetrf_nopiv( rows, nb0, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; // upload i-th panel magma_ssetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda ); magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_sgetrf_nopiv_gpu */
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA REAL array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT (workspace) REAL work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_sgeqrf_gpu [ note that if N here is bigger than N in magma_sgeqrf_gpu, the workspace requirement DT in magma_sgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr_gpu(magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; float *dV, *dW; float *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for sorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_smalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_smalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_sgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_ssetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_scopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magmablasSetKernelStream( NULL ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); return *info; } /* magma_sorgqr_gpu */
extern "C" magma_int_t magma_smlumerge( magma_s_sparse_matrix L, magma_s_sparse_matrix U, magma_s_sparse_matrix *A, magma_queue_t queue ){ if( L.storage_type == Magma_CSR && U.storage_type == Magma_CSR ){ if( L.memory_location == Magma_CPU && U.memory_location == Magma_CPU ){ magma_s_mtransfer( L, A, Magma_CPU, Magma_CPU, queue ); magma_free_cpu( A->col ); magma_free_cpu( A->val ); // make sure it is strictly lower triangular magma_int_t z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ z++; } } A->nnz = z; // fill A with the new structure; magma_int_t stat_cpu = 0; stat_cpu += magma_index_malloc_cpu( &A->col, A->nnz ); stat_cpu += magma_smalloc_cpu( &A->val, A->nnz ); if( stat_cpu != 0 ){ magma_s_mfree( A, queue ); printf("error: memory allocation.\n"); return MAGMA_ERR_HOST_ALLOC; } z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ A->row[i] = z; for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular A->col[z] = L.col[j]; A->val[z] = L.val[j]; z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ A->col[z] = U.col[j]; A->val[z] = U.val[j]; z++; } } A->row[A->num_rows] = z; A->nnz = z; return MAGMA_SUCCESS; } else{ printf("error: matrix not on CPU.\n"); return MAGMA_SUCCESS; } } else{ printf("error: matrix not on CPU.\n"); return MAGMA_SUCCESS; } }
/** Purpose ------- SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_nopiv_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, (dA_offset + (i_)*nb + (j_)*nb*ldda) #else #define dA(i_, j_) (dA + (i_)*nb + (j_)*nb*ldda) #endif float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, mindim; magma_int_t j, rows, s, ldwork; float *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min( m, n ); nb = magma_get_sgetrf_nb( m, n ); s = mindim / nb; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] ); magma_sgetrf_nopiv( m, n, work, m, info ); magma_ssetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); } else { /* Use hybrid blocked code. */ maxm = magma_roundup( m, 32 ); ldwork = maxm; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, ldwork*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( j=0; j < s; j++ ) { // get j-th panel from device magma_queue_sync( queues[1] ); magma_sgetmatrix_async( m-j*nb, nb, dA(j,j), ldda, work, ldwork, queues[0] ); if ( j > 0 ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (j+1)*nb, c_one, dA(j-1,j-1), ldda, dA(j-1,j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-j*nb, n-(j+1)*nb, nb, c_neg_one, dA(j, j-1), ldda, dA(j-1,j+1), ldda, c_one, dA(j, j+1), ldda, queues[1] ); } // do the cpu part rows = m - j*nb; magma_queue_sync( queues[0] ); magma_sgetrf_nopiv( rows, nb, work, ldwork, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; // send j-th panel to device magma_ssetmatrix_async( m-j*nb, nb, work, ldwork, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); // do the small non-parallel computations (next panel update) if ( s > j+1 ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(j, j ), ldda, dA(j, j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(j+1)*nb, nb, nb, c_neg_one, dA(j+1, j ), ldda, dA(j, j+1), ldda, c_one, dA(j+1, j+1), ldda, queues[1] ); } else { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(j, j ), ldda, dA(j, j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(j+1)*nb, n-(j+1)*nb, nb, c_neg_one, dA(j+1, j ), ldda, dA(j, j+1), ldda, c_one, dA(j+1, j+1), ldda, queues[1] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magma_sgetmatrix( rows, nb0, dA(s,s), ldda, work, ldwork, queues[1] ); // do the cpu part magma_sgetrf_nopiv( rows, nb0, work, ldwork, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; // send j-th panel to device magma_ssetmatrix( rows, nb0, work, ldwork, dA(s,s), ldda, queues[1] ); magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda, queues[1] ); } magma_free_pinned( work ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); return *info; } /* magma_sgetrf_nopiv_gpu */
static void magma_stile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *A, magma_int_t lda, float *V, magma_int_t ldv, float *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog) { magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind; magma_int_t blklastind, colpt; magma_int_t stepercol; magma_int_t i,j,m,k; magma_int_t thgrsiz, thgrnb, thgrid, thed; magma_int_t coreid; magma_int_t colblktile,maxrequiredcores,colpercore,mycoresnb; magma_int_t fin; float *work; if(n<=0) return ; if(grsiz<=0) return ; //printf("=================> my core id %d of %d \n",my_core_id, cores_num); /* As I store V in the V vector there are overlap between * tasks so shift is now 4 where group need to be always * multiple of 2, because as example if grs=1 task 2 from * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2 * will overwrite the V of tasks 5 sweep 1 which are used by * task 6, so keep in mind that group need to be multiple of 2, * and thus tasks 2 sweep 2 will never run with task 6 sweep 1. * However, when storing V in A, shift could be back to 3. * */ magma_smalloc_cpu(&work, n); mycoresnb = cores_num; shift = 5; if(grsiz==1) colblktile=1; else colblktile=grsiz/2; maxrequiredcores = nbtiles/colblktile; if(maxrequiredcores<1)maxrequiredcores=1; colpercore = colblktile*nb; if(mycoresnb > maxrequiredcores) mycoresnb = maxrequiredcores; thgrsiz = n; stepercol = magma_ceildiv(shift, grsiz); thgrnb = magma_ceildiv(n-1, thgrsiz); #ifdef ENABLE_DEBUG if(my_core_id==0){ if(cores_num > maxrequiredcores) { printf("==================================================================================\n"); printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores); printf("==================================================================================\n"); } printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d \n",cores_num, n, nb, grsiz,thgrsiz); } #endif for (thgrid = 1; thgrid<=thgrnb; thgrid++){ stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (n-1)); for (i = stt; i <= n-1; i++){ ed=min(i,thed); if(stt>ed)break; for (m = 1; m <=stepercol; m++){ st=stt; for (sweepid = st; sweepid <=ed; sweepid++){ for (k = 1; k <=grsiz; k++){ myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k; if(myid%2 ==0){ colpt = (myid/2)*nb+1+sweepid-1; stind = colpt-nb+1; edind = min(colpt,n); blklastind = colpt; if(stind>=edind){ printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } }else{ colpt = ((myid+1)/2)*nb + 1 +sweepid -1 ; stind = colpt-nb+1; edind = min(colpt,n); if( (stind>=edind-1) && (edind==n) ) blklastind=n; else blklastind=0; if(stind>edind){ printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } } coreid = (stind/colpercore)%mycoresnb; if(my_core_id==coreid) { fin=0; while(fin==0) { if(myid==1) { if( (prog[myid+shift-1]== (sweepid-1)) ) { magma_strdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if(blklastind >= (n-1)) { for (j = 1; j <= shift; j++) prog[myid+j]=sweepid; } } // END progress condition }else{ if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) ) { if(myid%2 == 0) magma_strdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); else magma_strdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if(blklastind >= (n-1)) { for (j = 1; j <= shift+mycoresnb; j++) prog[myid+j]=sweepid; } } // END progress condition } // END if myid==1 } // END while loop } // END if my_core_id==coreid if(blklastind >= (n-1)) { stt=stt+1; break; } } // END for k=1:grsiz } // END for sweepid=st:ed } // END for m=1:stepercol } // END for i=1:n-1 } // END for thgrid=1:thgrnb magma_free_cpu(work); } // END FUNCTION
extern "C" magma_int_t magma_smlumerge( magma_s_matrix L, magma_s_matrix U, magma_s_matrix *A, magma_queue_t queue ) { magma_int_t info = 0; if( L.storage_type == Magma_CSR && U.storage_type == Magma_CSR ){ if( L.memory_location == Magma_CPU && U.memory_location == Magma_CPU ){ CHECK( magma_smtransfer( L, A, Magma_CPU, Magma_CPU, queue )); magma_free_cpu( A->col ); magma_free_cpu( A->val ); // make sure it is strictly lower triangular magma_int_t z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ z++; } } A->nnz = z; // fill A with the new structure; CHECK( magma_index_malloc_cpu( &A->col, A->nnz )); CHECK( magma_smalloc_cpu( &A->val, A->nnz )); z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ A->row[i] = z; for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular A->col[z] = L.col[j]; A->val[z] = L.val[j]; z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ A->col[z] = U.col[j]; A->val[z] = U.val[j]; z++; } } A->row[A->num_rows] = z; A->nnz = z; } else{ printf("error: matrix not on CPU.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else{ printf("error: matrix in wrong format.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: if( info != 0 ){ magma_smfree( A, queue ); } return info; }
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**T * X = B with a general N-by-N matrix A using the LU factorization computed by SGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaTrans: A**T * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by SGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from SGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB REAL array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dB, magma_int_t lddb, magma_int_t *info) { float c_one = MAGMA_S_ONE; float *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_smalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_strsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A**T * X = B or A**T * X = B. */ if ( nrhs == 1) { magma_strsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
/** Purpose ------- SSYEVDX computes selected eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA, N). On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param wA (workspace) REAL array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_ssyev_driver ********************************************************************/ extern "C" magma_int_t magma_ssyevdx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, float *dA, magma_int_t ldda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, float *wA, magma_int_t ldwa, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float *dwork; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (ldwa < max(1,n)) { *info = -14; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif const char* jobz_ = lapack_vec_const( jobz ); const char* uplo_ = lapack_uplo_const( uplo ); float *A; magma_smalloc_cpu( &A, n*n ); magma_sgetmatrix(n, n, dA, ldda, A, n); lapackf77_ssyevd(jobz_, uplo_, &n, A, &n, w, work, &lwork, iwork, &liwork, info); magma_ssetmatrix( n, n, A, n, dA, ldda); magma_free_cpu(A); return *info; } magma_queue_t stream; magma_queue_create( &stream ); // n*lddc for ssytrd2_gpu // n for slansy magma_int_t ldwork = n*lddc; if ( wantz ) { // need 3n^2/2 for sstedx ldwork = max( ldwork, 3*n*(n/2 + 1)); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_slansy(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_slascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_SYMV magma_ssytrd2_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dwork, n*lddc, &iinfo); #else magma_ssytrd_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif timer_stop( time ); timer_printf( "time ssytrd = %6.2f\n", time ); /* For eigenvalues only, call SSTERF. For eigenvectors, first call SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &work[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); } else { timer_start( time ); magma_sstedx(range, n, vl, vu, il, iu, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); timer_stop( time ); timer_printf( "time sstedx = %6.2f\n", time ); timer_start( time ); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); magma_ssetmatrix( n, *m, &work[indwrk + n* (il-1) ], n, dwork, lddc ); magma_sormtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dwork, lddc, wA, ldwa, &iinfo); magma_scopymatrix( n, *m, dwork, lddc, dA, ldda ); timer_stop( time ); timer_printf( "time sormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_sscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; magma_queue_destroy( stream ); magma_free( dwork ); return *info; } /* magma_ssyevd_gpu */
static void magma_stile_bulge_applyQ(char side, magma_int_t n_loc, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, float *E, magma_int_t lde, float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt)//, magma_int_t* info) { //%=========================== //% local variables //%=========================== magma_int_t firstcolj; magma_int_t bg, rownbm; magma_int_t st,ed,fst,vlen,vnb,colj; magma_int_t vpos,tpos; magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd; magma_int_t nbgr, colst, coled; if(n<=0) return ; if(n_loc<=0) return ; //info = 0; magma_int_t INFO=0; magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz); /* * version v1: for each chunck it apply all the V's then move to * the other chunck. the locality here inside each * chunck meaning that thread t apply V_k then move * to V_k+1 which overlap with V_k meaning that the * E_k+1 overlap with E_k. so here is the * locality however thread t had to read V_k+1 and * T_k+1 at each apply. note that all thread if they * run at same speed they might reading the same V_k * and T_k at the same time. * */ magma_int_t nb_loc = 128; //$$$$$$$$ magma_int_t lwork = 2*nb_loc*max(Vblksiz,64); float *work, *work2; magma_smalloc_cpu(&work, lwork); magma_smalloc_cpu(&work2, lwork); magma_int_t nbchunk = magma_ceildiv(n_loc, nb_loc); /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * each q_i consist of applying V to a block of row E(row_i,:) and applies are overlapped meaning * that q_i+1 overlap a portion of the E(row_i, :). * IN parallel E is splitten in vertical block over the threads */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * each q_i consist of applying V to a block of col E(:, col_i,:) and the applies are overlapped meaning * that q_i+1 overlap a portion of the E(:, col_i). * IN parallel E is splitten in horizontal block over the threads */ //printf(" APPLY Q2 N %d N_loc %d nbchunk %d NB %d Vblksiz %d SIDE %c \n", n, n_loc, nbchunk, nb, Vblksiz, side); for (magma_int_t i = 0; i<nbchunk; i++) { magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc)); if(side=='L'){ for (bg = nbGblk; bg>0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = magma_ceildiv((n-(firstcolj+1)),nb); if(bg==nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb); // last blk has size=1 used for real to handle A(N,N-1) for (magma_int_t j = rownbm; j>0; j--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop fst = (rownbm -j)*nb+colj +1; for (magma_int_t k=0; k<Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -j)*nb+colj +1; ed = min(st+nb-1,n-1); if(st>ed) break; if((st==ed)&&(colj!=n-2)) break; vlen=ed-fst+1; vnb=k+1; } colst = (bg-1)*Vblksiz; magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if((vlen>0)&&(vnb>0)){ lapackf77_slarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc); } if(INFO!=0) printf("ERROR SORMQR INFO %d \n",INFO); } } }else if (side=='R'){ rownbm = magma_ceildiv((n-1),nb); for (magma_int_t k = 1; k<=rownbm; k++) { ncolinvolvd = min(n-1, k*nb); avai_blksiz=min(Vblksiz,ncolinvolvd); nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz); for (magma_int_t j = 1; j<=nbgr; j++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz); colst = (j-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -k)*nb+colst +1; for (colj=colst; colj<=coled; colj++) { st = (rownbm -k)*nb+colj +1; ed = min(st+nb-1,n-1); if(st>ed) break; if((st==ed)&&(colj!=n-2)) break; vlen=ed-fst+1; vnb=vnb+1; } magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if((vlen>0)&&(vnb>0)){ lapackf77_slarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc); } } } }else{ printf("ERROR SIDE %d \n",side); } } // END loop over the chunks magma_free_cpu(work); magma_free_cpu(work2); }
/** Purpose ------- SGELS solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA REAL array, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by SGEQRF3. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_driver ********************************************************************/ extern "C" magma_int_t magma_sgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { magmaFloat_ptr dT; float *tau; magma_int_t min_mn; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_smake_lwork( lwkopt ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = MAGMA_S_ONE; return *info; } /* * Allocate temporary buffers */ magma_int_t ldtwork = ( 2*min_mn + magma_roundup( n, 32 ) )*nb; if (nb < nrhs) ldtwork = ( 2*min_mn + magma_roundup( n, 32 ) )*nrhs; if (MAGMA_SUCCESS != magma_smalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_smalloc_cpu( &tau, min_mn ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_sgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu( tau ); return *info; }
/***************************************************************************//** Purpose ------- SGEHRD2 reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd2( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda*2) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dV(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda) #define dwork(i_) dwork, ((i_)) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dV(i_,j_) (dV + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) #endif // Constants const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; // Local variables magma_int_t nb = magma_get_sgehrd_nb( n ); magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // GPU workspace is: // nb*ldda for dwork for slahru // nb*ldda for dV // n*ldda for dA // nb*nb for dT magmaFloat_ptr dwork; if (MAGMA_SUCCESS != magma_smalloc( &dwork, 2*nb*ldda + n*ldda + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } float *dV = dwork + nb*ldda; float *dA = dwork + nb*ldda*2; float *dT = dwork + nb*ldda*2 + n*ldda; float *T; magma_smalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; // Copy the matrix to the GPU magma_ssetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_sgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda, queue ); // add 1 to i for 1-based index magma_slahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV(0,0), ldda, A(0,i), lda, &tau[i], T, nb, work, n, queue ); // Copy T from the CPU to dT on the GPU magma_ssetmatrix( nb, nb, T, nb, dT(0,0), nb, queue ); magma_slahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV(0,0), ldda, dT(0,0), dwork, queue ); } // Copy remainder to host magma_sgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda, queue ); magma_free( dwork ); magma_free_cpu( T ); magma_queue_destroy( queue ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); return *info; } /* magma_sgehrd2 */