// Solve dA * dX = dB, where dA and dB are stored in GPU device memory.
// Internally, MAGMA uses a hybrid CPU + GPU algorithm.
void GpuSolver::solve( magmaDouble_ptr X )
{
    real_Double_t gpu_time;
    magmaDouble_ptr dX = NULL, dWORKD = NULL;
    float *dWORKS = NULL;
    magma_int_t qrsv_iters;
    magma_int_t info = 0;

    magma_dmalloc( &dX,     n*nrhs );
    magma_dmalloc( &dWORKD, n*nrhs );
    magma_smalloc( &dWORKS, n*(n+nrhs) );
    if ( dX == NULL || dWORKD == NULL || dWORKS == NULL ) {
        fprintf( stderr, "malloc failed - not enough GPU memory?\n" );
        goto cleanup;
    }

    gpu_time = magma_wtime();
    magma_dsposv_gpu( MagmaUpper, n, nrhs,
                      dA, n, dB, n, dX, n,
                      dWORKD, dWORKS, &qrsv_iters, &info );
    gpu_time = magma_wtime() - gpu_time;
    fprintf( stdout, "DSPOSV GPU solution time = %fs\n", gpu_time );

    if ( qrsv_iters == -3 ) {
        fprintf( stderr, "cannot factor input matrix in single precision, bad initialization?\n" );
    }
    if ( info != 0 ) {
        fprintf( stderr, "magma_dsposv_gpu failed with info=%d\n", (int) info );
    }

    magma_dgetmatrix( n, nrhs, dX, n, X, n );

cleanup:
    magma_free( dX );
    magma_free( dWORKD );
    magma_free( dWORKS );
}
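// ---------------------------------------------------------------------------
// A minimal stand-alone usage sketch of the same magma_dsposv_gpu pattern as the
// member function above (mixed-precision Cholesky solve with iterative refinement).
// The helper name example_dsposv_usage and the problem sizes are illustrative only;
// the GpuSolver members dA/dB/n/nrhs are replaced by local allocations here, and the
// usual headers ("magma.h", <cstdio>) are assumed to be included at the top of the file.
// ---------------------------------------------------------------------------
static void example_dsposv_usage()
{
    const magma_int_t n = 64, nrhs = 1;
    magma_init();

    // Host data: A = n*I + ones (symmetric positive definite), B = ones.
    double *hA, *hB, *hX;
    magma_dmalloc_cpu( &hA, n*n );
    magma_dmalloc_cpu( &hB, n*nrhs );
    magma_dmalloc_cpu( &hX, n*nrhs );
    for (magma_int_t j = 0; j < n; ++j)
        for (magma_int_t i = 0; i < n; ++i)
            hA[i + j*n] = (i == j) ? 1.0 + n : 1.0;
    for (magma_int_t i = 0; i < n*nrhs; ++i)
        hB[i] = 1.0;

    magmaDouble_ptr dA = NULL, dB = NULL, dX = NULL, dworkd = NULL;
    float *dworks = NULL;
    magma_dmalloc( &dA, n*n );
    magma_dmalloc( &dB, n*nrhs );
    magma_dmalloc( &dX, n*nrhs );
    magma_dmalloc( &dworkd, n*nrhs );
    magma_smalloc( &dworks, n*(n+nrhs) );

    magma_dsetmatrix( n, n,    hA, n, dA, n );
    magma_dsetmatrix( n, nrhs, hB, n, dB, n );

    magma_int_t iter, info;
    magma_dsposv_gpu( MagmaUpper, n, nrhs, dA, n, dB, n, dX, n,
                      dworkd, dworks, &iter, &info );
    magma_dgetmatrix( n, nrhs, dX, n, hX, n );
    printf( "dsposv: info=%d, refinement iterations=%d\n", (int) info, (int) iter );

    magma_free( dA );  magma_free( dB );  magma_free( dX );
    magma_free( dworkd );  magma_free( dworks );
    magma_free_cpu( hA );  magma_free_cpu( hB );  magma_free_cpu( hX );
    magma_finalize();
}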
extern "C" magma_int_t magma_svinit( magma_s_matrix *x, magma_location_t mem_loc, magma_int_t num_rows, magma_int_t num_cols, float values, magma_queue_t queue ) { magma_int_t info = 0; x->val = NULL; x->diag = NULL; x->row = NULL; x->rowidx = NULL; x->col = NULL; x->list = NULL; x->blockinfo = NULL; x->dval = NULL; x->ddiag = NULL; x->drow = NULL; x->drowidx = NULL; x->dcol = NULL; x->dlist = NULL; x->storage_type = Magma_DENSE; x->memory_location = mem_loc; x->sym = Magma_GENERAL; x->diagorder_type = Magma_VALUE; x->fill_mode = MagmaFull; x->num_rows = num_rows; x->num_cols = num_cols; x->nnz = num_rows*num_cols; x->max_nnz_row = num_cols; x->diameter = 0; x->blocksize = 1; x->numblocks = 1; x->alignment = 1; x->major = MagmaColMajor; x->ld = num_rows; if ( mem_loc == Magma_CPU ) { CHECK( magma_smalloc_cpu( &x->val, x->nnz )); for( magma_int_t i=0; i<x->nnz; i++) { x->val[i] = values; } } else if ( mem_loc == Magma_DEV ) { CHECK( magma_smalloc( &x->val, x->nnz )); magmablas_slaset( MagmaFull, x->num_rows, x->num_cols, values, values, x->val, x->num_rows, queue ); } cleanup: return info; }
void gmm_magma( const Tensor_core<float,2>& A, const Tensor_core<float,2>& B, Tensor_core<float,2>& C,
                char TRANSA, char TRANSB, float alpha, float beta )
{
    int AL0 = A.rank(0); int AL1 = A.rank(1);
    int BL0 = B.rank(0); int BL1 = B.rank(1);
    int CL0 = C.rank(0); int CL1 = C.rank(1);

    magma_int_t M, N, K, LDA, LDB, LDC;
    magma_trans_t transA = magma_trans_const(TRANSA), transB = magma_trans_const(TRANSB);
    magmaFloat_ptr d_A, d_B, d_C;

    // Set LDA, LDB, and LDC, rounded up to a multiple of 32 for best GPU performance
    LDA = ((AL0+31)/32)*32;
    LDB = ((BL0+31)/32)*32;
    LDC = ((CL0+31)/32)*32;

    // Allocate memory for the matrices on the GPU
    magma_smalloc( &d_A, LDA*AL1 );
    magma_smalloc( &d_B, LDB*BL1 );
    magma_smalloc( &d_C, LDC*CL1 );

    // Copy data from host (CPU) to device (GPU); C only needs to be sent if beta != 0
    magma_ssetmatrix( AL0, AL1, A.data(), AL0, d_A, LDA );
    magma_ssetmatrix( BL0, BL1, B.data(), BL0, d_B, LDB );
    if ( fabs(beta) > 1e-32 )
        magma_ssetmatrix( CL0, CL1, C.data(), CL0, d_C, LDC );

    // Call magma_sgemm
    M = ( TRANSA=='N' || TRANSA=='n' ) ? AL0 : AL1;
    K = ( TRANSA=='N' || TRANSA=='n' ) ? AL1 : AL0;
    N = ( TRANSB=='N' || TRANSB=='n' ) ? BL1 : BL0;
    magma_sgemm( transA, transB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC );

    // Copy the result from device (GPU) to host (CPU)
    magma_sgetmatrix( CL0, CL1, d_C, LDC, C.data(), CL0 );

    // Free memory on the GPU
    magma_free(d_A);
    magma_free(d_B);
    magma_free(d_C);
}
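// ---------------------------------------------------------------------------
// Call-pattern sketch for the wrapper above: C = alpha * op(A) * op(B) + beta * C.
// Construction of Tensor_core<float,2> is project-specific and not shown here
// (assumption: some rows-by-columns constructor or factory exists), so only the
// call itself is illustrated, as comments.
//
//   Tensor_core<float,2> A = ...;   // stored k x m; used as A^T (m x k) below
//   Tensor_core<float,2> B = ...;   // stored k x n
//   Tensor_core<float,2> C = ...;   // m x n result
//   gmm_magma( A, B, C, 'T', 'N', 1.0f, 0.0f );  // with beta == 0, C is not uploaded first
// ---------------------------------------------------------------------------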
//////////////////////////////////////////////////////////////
//           CSTEDX Divide and Conquer for tridiag
//////////////////////////////////////////////////////////////
extern "C" void magma_cstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float *E, magmaFloatComplex *Z, magma_int_t LDZ)
{
    float *RWORK;
    float *dwork;
    magma_int_t *IWORK;
    magma_int_t LIWORK, LRWORK;
    magma_int_t INFO;

    LRWORK = 2*N*N + 4*N + 1 + 256*N;
    LIWORK = 256*N;

    magma_smalloc_cpu( &RWORK, LRWORK );
    magma_imalloc_cpu( &IWORK, LIWORK );

    if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) {
        printf("=================================================\n");
        printf("CSTEDX ERROR OCCURRED IN CUDAMALLOC\n");
        printf("=================================================\n");
        return;
    }
    printf("using magma_cstedx\n");

    magma_timer_t time=0;
    timer_start( time );

    magma_range_t job = MagmaRangeI;
    if (NE == N)
        job = MagmaRangeAll;

    magma_cstedx(job, N, 0., 0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO);

    if (INFO != 0) {
        printf("=================================================\n");
        printf("CSTEDX ERROR OCCURRED. HERE IS INFO %d \n ", (int) INFO);
        printf("=================================================\n");
        //assert(INFO == 0);
    }

    timer_stop( time );
    timer_printf( "time cstedx = %6.2f\n", time );

    magma_free( dwork );
    magma_free_cpu( IWORK );
    magma_free_cpu( RWORK );
}
//////////////////////////////////////////////////////////////
//           SSTEDX Divide and Conquer for tridiag
//////////////////////////////////////////////////////////////
extern "C" void magma_sstedx_withZ(magma_int_t N, magma_int_t NE, float *D, float *E, float *Z, magma_int_t LDZ)
{
    float *WORK;
    float *dwork;
    magma_int_t *IWORK;
    magma_int_t LWORK, LIWORK;
    magma_int_t INFO;

    LWORK  = N*N + 4*N + 1;
    LIWORK = 3 + 5*N;

    magma_smalloc_cpu( &WORK, LWORK );
    magma_imalloc_cpu( &IWORK, LIWORK );

    if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*N*(N/2 + 1) )) {
        printf("=================================================\n");
        printf("SSTEDX ERROR OCCURRED IN CUDAMALLOC\n");
        printf("=================================================\n");
        return;
    }
    printf("using magma_sstedx\n");

    magma_timer_t time=0;
    timer_start( time );

    //magma_range_t job = MagmaRangeI;
    //if (NE == N)
    //    job = MagmaRangeAll;

    magma_sstedx(MagmaRangeI, N, 0., 0., 1, NE, D, E, Z, LDZ, WORK, LWORK, IWORK, LIWORK, dwork, &INFO);

    if (INFO != 0) {
        printf("=================================================\n");
        printf("SSTEDX ERROR OCCURRED. HERE IS INFO %d \n ", (int) INFO);
        printf("=================================================\n");
        //assert(INFO == 0);
    }

    timer_stop( time );
    timer_printf( "time sstedx = %6.2f\n", time );

    magma_free( dwork );
    magma_free_cpu( IWORK );
    magma_free_cpu( WORK );
}
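// ---------------------------------------------------------------------------
// Usage sketch for the single-precision wrapper above (the complex wrapper is used
// the same way): compute the NE smallest eigenpairs of a symmetric tridiagonal
// matrix.  Assumptions flagged: magma_init() has already been called (the wrapper
// allocates GPU workspace), and the output convention follows LAPACK stedx, i.e.
// eigenvalues overwrite the leading entries of D and eigenvectors land in the
// leading NE columns of Z.  Sizes are illustrative.
// ---------------------------------------------------------------------------
static void example_sstedx_usage()
{
    const magma_int_t N = 128, NE = 4, LDZ = N;
    float *D, *E, *Z;
    magma_smalloc_cpu( &D, N );
    magma_smalloc_cpu( &E, N );
    magma_smalloc_cpu( &Z, LDZ*N );

    // 1D discrete Laplacian: diagonal = 2, off-diagonal = -1
    for (magma_int_t i = 0; i < N; ++i) { D[i] = 2.0f; E[i] = -1.0f; }

    magma_sstedx_withZ( N, NE, D, E, Z, LDZ );
    printf( "smallest eigenvalue ~ %f\n", D[0] );

    magma_free_cpu( D );
    magma_free_cpu( E );
    magma_free_cpu( Z );
}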
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = MagmaUpper, or dA = L * L**T, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float **d_lA, magma_int_t ldda, float **d_lP, magma_int_t lddp, float *A, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { #define Alo(i, j) (A + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i)) #define Aup(i, j) (A + (nb*(((j)/nb)%h)+off_j)*lda + (i+off_i)) #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i)) #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i)) #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i)) magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); float *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef STRSM_WORK float *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by strsm_work */ #endif *info = 0; if (! 
upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d < num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, (magma_queue_t*)stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(STRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_smalloc( &d_dinvA[d][j], nb*nb ); magma_smalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(float)); } } magma_setdevice(0); #endif for (j=0; j < m; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if ( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_ssetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_ssyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_sgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. 
*/ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if ( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_sgemm(MagmaTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] ); */ #else /*nb2 = n_local[d] - j_local2*nb; magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_sgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], 
stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if ( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of strsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(STRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_smalloc( &d_dinvA[d][j], nb*nb ); magma_smalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(float)); } } magma_setdevice(0); #endif for (j=0; j < n; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if ( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_ssetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_sgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( nb0 < n_local[d] ) { if ( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_sgemm( MagmaNoTrans, MagmaTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], 
event[d][3] ); // wait for lookahead magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if ( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "spotrf.svg", "trace.css" ); /* clean up */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_spotrf_mgpu */
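// ---------------------------------------------------------------------------
// The multi-GPU factorization above distributes column blocks cyclically: global
// block J (columns J*nb .. J*nb+nb-1) lives on GPU J % num_gpus at local block
// index J / num_gpus, which is exactly what the id = (j/nb)%num_gpus and
// j_local = j/(nb*num_gpus) computations in the code encode.  A minimal sketch of
// that mapping (helper name is illustrative, not part of MAGMA):
// ---------------------------------------------------------------------------
static void block_cyclic_owner( magma_int_t j, magma_int_t nb, magma_int_t num_gpus,
                                magma_int_t *gpu_id, magma_int_t *local_offset )
{
    magma_int_t block = j / nb;               // global block index of column j
    *gpu_id       = block % num_gpus;         // GPU that owns this column block
    *local_offset = (block / num_gpus) * nb;  // starting column inside that GPU's local panel
}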
/** Purpose ------- SGELS solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA REAL array, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by SGEQRF3. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_driver ********************************************************************/ extern "C" magma_int_t magma_sgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { magmaFloat_ptr dT; float *tau; magma_int_t min_mn; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_smake_lwork( lwkopt ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = MAGMA_S_ONE; return *info; } /* * Allocate temporary buffers */ magma_int_t ldtwork = ( 2*min_mn + magma_roundup( n, 32 ) )*nb; if (nb < nrhs) ldtwork = ( 2*min_mn + magma_roundup( n, 32 ) )*nrhs; if (MAGMA_SUCCESS != magma_smalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_smalloc_cpu( &tau, min_mn ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_sgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu( tau ); return *info; }
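// ---------------------------------------------------------------------------
// Usage sketch for magma_sgels3_gpu (illustrative, minimal error handling): solve an
// overdetermined least-squares problem min ||A*X - B|| with A and B already resident
// on the GPU.  The workspace size follows the LWORK formula in the documentation
// above; an lwork = -1 workspace query would work equally well.
// ---------------------------------------------------------------------------
static void example_sgels3_usage( magmaFloat_ptr dA, magma_int_t m, magma_int_t n,
                                  magmaFloat_ptr dB, magma_int_t nrhs,
                                  magma_int_t ldda, magma_int_t lddb )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_sgeqrf_nb( m, n );
    magma_int_t lwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;

    float *hwork;
    magma_smalloc_cpu( &hwork, lwork );

    magma_sgels3_gpu( MagmaNoTrans, m, n, nrhs,
                      dA, ldda, dB, lddb, hwork, lwork, &info );
    if (info != 0)
        fprintf( stderr, "magma_sgels3_gpu returned info=%d\n", (int) info );

    // the leading n-by-nrhs block of dB now holds the solution X
    magma_free_cpu( hwork );
}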
/** @deprecated Purpose ------- CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @ingroup magma_cgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_claqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf) { #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //float d__1; magmaFloatComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaFloatComplex Akk; magmaFloatComplex *Aks; magmaFloatComplex tauk = MAGMA_C_ZERO; magma_int_t pvt; //float temp, temp2; float tol3z; magma_int_t itemp; float lsticc, *lsticcs; magma_int_t lastrk; magma_smalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_cmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based. pvt = k + magma_isamax( n-k, &vn1[k], ione ) - 1; if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb) { // no need of transfer if pivot is within the panel blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. 
Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_cswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_cswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_clarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_cgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_ccopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_cscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_cgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. 
for (j = 0; j <= k; ++j) { magma_csetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k < n-1) { if (k > 0) { //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_cgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_cgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_scnrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. 
- temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_csetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_cswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_ccopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione ); else { // Where is the data, CPU or GPU ? float r1, r2; r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_scnrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_claqps */
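// ---------------------------------------------------------------------------
// The (commented-out) norm-update logic above follows LAPACK Working Note 176:
// after a reflector is applied, each partial column norm vn1[j] is downdated, and
// if it has lost too much accuracy relative to the reference norm vn2[j] the column
// is flagged for recomputation (this is what lsticc and
// magmablas_scnrm2_row_check_adjust implement on the GPU).  A scalar sketch of that
// test, with names matching the code above (<math.h> assumed for sqrtf/fmaxf):
// ---------------------------------------------------------------------------
static bool downdate_colnorm( float *vn1_j, float *vn2_j, float a_rkj_abs, float tol3z )
{
    if (*vn1_j == 0.f)
        return false;                              // nothing to downdate
    float temp  = a_rkj_abs / *vn1_j;
    temp        = fmaxf( 0.f, (1.f + temp)*(1.f - temp) );
    float ratio = *vn1_j / *vn2_j;
    if (temp * ratio * ratio <= tol3z)
        return true;                               // norm unreliable: recompute later
    *vn1_j *= sqrtf( temp );                       // safe downdate
    return false;
}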
extern "C" magma_int_t magma_sgeqrf_msub( magma_int_t num_subs, magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloat_ptr *dlA, magma_int_t ldda, float *tau, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
===================================================================== */ #define dlA(gpu,a_1,a_2) dlA[gpu], ((a_2)*(ldda) + (a_1)) #define dlA_offset(a_1, a_2) ((a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) magmaFloat_ptr dwork[MagmaMaxGPUs], panel[MagmaMaxGPUs]; size_t panel_offset[MagmaMaxGPUs]; float *local_work = NULL; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; int panel_id = -1, i_local, n_local[MagmaMaxGPUs * MagmaMaxSubs], la_id, displacement, tot_subs = num_gpus * num_subs; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for (i=0; i<num_gpus; i++) { if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for (i=0; i<tot_subs; i++) { n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; } #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*lwork, NULL, NULL); for (j=0; j<num_gpus; j++) { local_work = (float*)clEnqueueMapBuffer(queues[2*j], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(float)*lwork, 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_smalloc_cpu( (&local_work), lwork )) { *info = -9; for (i=0; i<num_gpus; i++) { magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_id = (i/nb)%tot_subs; /* Set the local index where the current panel is */ i_local = i/(nb*tot_subs)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ magma_queue_sync(queues[2*(panel_id%num_gpus)]); magma_sgetmatrix_async( rows, ib, dlA(panel_id, i, i_local), ldda, hwrk(i), ldwork, queues[2*(panel_id%num_gpus)+1], NULL ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_id = panel_id; /* only the GPU that has next panel is done look-ahead */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_id]-i_local-old_ib, old_ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, old_i, i_local+old_ib), ldda, dwork[la_id%num_gpus], old_ib, lddwork, queues[2*(la_id%num_gpus)]); la_id = ((i-nb)/nb)%tot_subs; magma_ssetmatrix_async( old_ib, old_ib, hwrk(old_i), ldwork, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, queues[2*(la_id%num_gpus)], NULL ); } magma_queue_sync( queues[2*(panel_id%num_gpus)+1] ); lapackf77_sgeqrf(&rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for (j=0; j<num_gpus; j++) { if (j == panel_id%num_gpus){ panel[j] = dlA(panel_id, i, i_local); panel_offset[j] = dlA_offset(i, i_local); } else { panel[j] = dwork[j]; panel_offset[j] = displacement; } magma_queue_sync( queues[2*j] ); magma_ssetmatrix_async( rows, ib, hwrk(i), ldwork, panel[j], panel_offset[j], ldda, queues[2*j+1], NULL ); /* Send the T matrix to the GPU. Has to be done with asynchronous copies */ magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], 0, lddwork, queues[2*j+1], NULL ); } for(j=0; j<num_gpus; j++) { magma_queue_sync( queues[2*j+1] ); } if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; for (j=0; j<tot_subs; j++) { if (j == la_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_loc), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else if (j <= panel_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local+ib), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); } else { /* do the entire update as we exit and there would be no lookahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_id]-i_loc, ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, i, i_loc), ldda, dwork[la_id%num_gpus], ib, lddwork, queues[2*(la_id%num_gpus)]); /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); magma_ssetmatrix( ib, ib, hwrk(i), ldwork, dlA(panel_id, i, i_local), ldda, queues[2*(panel_id%num_gpus)]); } old_i = i; old_ib = ib; } } } else { i = 0; } for (j=0; j<num_gpus; j++) { magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_id = (panel_id+1)%tot_subs; int i_loc = (i)/(nb*tot_subs)*nb; magma_sgetmatrix( rows, ib, dlA(panel_id, i, i_loc), ldda, lhwrk, rows, queues[2*(panel_id%num_gpus)]); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_id, i, i_loc), ldda, queues[2*(panel_id%num_gpus)]); } #ifdef USE_PINNED_CLMEMORY #else magma_free_cpu( local_work ); #endif return *info; } /* magma_sgeqrf_msub */
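// ---------------------------------------------------------------------------
// The routine above splits the n columns over tot_subs = num_gpus * num_subs virtual
// GPUs in a block-cyclic fashion; n_local[i] is the number of columns owned by
// virtual GPU i.  A stand-alone version of that size computation, using the same
// formula as the code (helper name is illustrative):
// ---------------------------------------------------------------------------
static magma_int_t columns_owned( magma_int_t i, magma_int_t n,
                                  magma_int_t nb, magma_int_t tot_subs )
{
    magma_int_t n_local = ((n/nb)/tot_subs)*nb;      // full rounds of nb-wide blocks
    if      (i <  (n/nb)%tot_subs) n_local += nb;    // one extra full block
    else if (i == (n/nb)%tot_subs) n_local += n%nb;  // the final partial block
    return n_local;
}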
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA REAL array A on the GPU device, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT REAL work space array on the GPU device, dimension (MIN(M, N) )*NB. This must be the 6th argument of magma_sgeqrf_gpu [ note that if N here is bigger than N in magma_sgeqrf_gpu, the workspace requirement DT in magma_sgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_ssyev_2stage ********************************************************************/ extern "C" magma_int_t magma_sorgqr_2stage_gpu(magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t i__1, i__2, i__3; //magma_int_t lwork; magma_int_t i, ib, ki, kk; //, iinfo; //magma_int_t lddwork = min(m, n); //float *work, *panel; float *dwork; //magma_queue_t stream[2]; magma_int_t ldt=nb; // need to be an input parameter *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) return *info; if (MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) { printf ("!!!! sorgqr_2stage magma_alloc failed for: dwork\n" ); exit(-1); } if ( (nb > 1) && (nb < k) ) { /* Use blocked code after the last block. The first kk columns are handled by the block method. ki is start of 2nd-to-last block. */ ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); /* Set A(1:kk,kk+1:n) to zero. */ /* and A(kk+1:m, kk+1:n) = I */ magmablas_slaset( MagmaFull, kk, n-kk, c_zero, c_zero, dA(0, kk), ldda ); magmablas_slaset( MagmaFull, m-kk, n-kk, c_zero, c_one, dA(kk,kk), ldda ); } else { ki = 0; kk = 0; } /* Allocate work space on CPU in pinned memory */ //lwork = (n+m) * nb; //if (kk < n) // lwork = max(lwork, n * nb + (m-kk)*(n-kk)); //if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, (lwork) )) { // *info = MAGMA_ERR_HOST_ALLOC; // return *info; //} //panel = work + n * nb; //magma_queue_create( &stream[0] ); //magma_queue_create( &stream[1] ); /* Use unblocked code for the last or only block. 
*/ if (kk < n) { i__1 = m - kk; i__2 = n - kk; i__3 = k - kk; //magma_sgetmatrix(i__1, i__2, dA(kk, kk), ldda, panel, i__1); //lapackf77_sorgqr(&i__1, &i__2, &i__3, panel, &i__1, &tau[kk], // work, &lwork, &iinfo); // //magma_ssetmatrix(i__1, i__2, panel, i__1, dA(kk, kk), ldda); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, i__1, i__2, i__3, dA(kk, kk-nb), ldda, dT(kk-nb), ldt, dA(kk, kk), ldda, dwork, i__2); //magmablas_slaset(MagmaFull, kk-nb, nb, c_zero, c_zero, dA(0,kk-nb), ldda); //magmablas_slaset(MagmaFull, m-(kk-nb), nb, c_zero, c_one, dA(kk-nb,kk-nb), ldda); } if (kk > 0) { /* Use blocked code */ for (i = ki; i >= nb; i -= nb) { ib = min(nb, k - i); /* Send current panel to the CPU for update */ i__2 = m - i; //magma_sgetmatrix_async( i__2, ib, dA(i,i), ldda, panel, i__2, stream[0] ); // verify if (i + ib < n) { /* Apply H to A(i:m,i+ib:n) from the left */ i__3 = n - i; magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0,i), ldda ); magmablas_slaset( MagmaFull, m-i, ib, c_zero, c_one, dA(i,i), ldda ); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, i__2, i__3, ib, dA(i, i-nb), ldda, dT(i-nb), ldt, dA(i, i), ldda, dwork, i__3); } /* Apply H to rows i:m of current block on the CPU */ //magma_queue_sync( stream[0] ); //lapackf77_sorgqr(&i__2, &ib, &ib, panel, &i__2, &tau[i], // work, &lwork, &iinfo); //magma_ssetmatrix_async( i__2, ib, panel, i__2, dA(i,i), ldda, stream[1] ); // verify /* Set rows 1:i-1 of current block to zero */ i__2 = i + ib; //magmablas_slaset(MagmaFull, i-ib, ib, c_zero, c_zero, dA(0,i-ib), ldda); //magmablas_slaset(MagmaFull, m-(i-ib), ib, c_zero, c_one, dA(i-ib,i-ib), ldda); } } magmablas_slaset( MagmaFull, m, nb, c_zero, c_one, dA(0,0), ldda ); magma_free( dwork ); //magma_free_pinned( work ); //magma_queue_destroy( stream[0] ); //magma_queue_destroy( stream[1] ); return *info; } /* magma_sorgqr_gpu */
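// ---------------------------------------------------------------------------
// Usage sketch (assumptions flagged): the documentation above says dT must be the
// T-factor workspace produced by SGEQRF_GPU, so this sketch factors A = Q*R with
// magma_sgeqrf_gpu and then expands the first n columns of Q in place.  The dT
// allocation reuses the generous sizing formula from the SGELS code earlier in this
// file; nb must match the block size used by the factorization
// (magma_get_sgeqrf_nb), and m >= n is assumed.
// ---------------------------------------------------------------------------
static void example_sorgqr_2stage( float *dA, magma_int_t m, magma_int_t n,
                                   magma_int_t ldda, magma_int_t nb )
{
    magma_int_t info, min_mn = min(m, n);
    float *tau, *dT;

    magma_smalloc_cpu( &tau, min_mn );
    magma_smalloc( &dT, ( 2*min_mn + magma_roundup( n, 32 ) )*nb );

    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );   // A = Q*R, T factors stored in dT
    if (info == 0)
        magma_sorgqr_2stage_gpu( m, n, min_mn, dA, ldda, tau, dT, nb, &info );  // dA := Q

    magma_free( dT );
    magma_free_cpu( tau );
}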
extern "C" magma_int_t magma_ssygvd(magma_int_t itype, char jobz, char uplo, magma_int_t n, float *a, magma_int_t lda, float *b, magma_int_t ldb, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX*16 array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= 2*N*nb + 1. If JOBZ = 'V' and N > 1, LWORK >= 1 + 6*N*nb + 2*N**2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. 
IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: SPOTRF or SSYEVD returned an error code: <= N: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details =============== Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if SSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; float d_one = MAGMA_S_ONE; float *da; float *db; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; char trans[1]; magma_int_t wantz, lquery; magma_int_t lopt, lwmin, liopt, liwmin; cudaStream_t stream; magma_queue_create( &stream ); wantz = lapackf77_lsame(jobz_, MagmaVectorsStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || liwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVectorsStr))) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_ssytrd_nb(n); if (n < 1) { liwmin = 1; lwmin = 1; } else if (wantz) { lwmin = 1 + 6 * n * nb + 2* n * n; liwmin = 5 * n + 3; } else { lwmin = 2 * n * nb + 1; liwmin = 1; } lopt = lwmin; liopt = liwmin; work[ 0] = lopt; iwork[0] = liopt; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) { return MAGMA_SUCCESS; } /* Quick return if possible */ if (n == 0) { return 0; } if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) { *info = -17; return MAGMA_ERR_DEVICE_ALLOC; } /* Form a Cholesky factorization of B. */ magma_ssetmatrix( n, n, b, ldb, db, lddb ); magma_ssetmatrix_async( n, n, a, lda, da, ldda, stream ); magma_spotrf_gpu(uplo_[0], n, db, lddb, info); if (*info != 0) { *info = n + *info; return 0; } magma_queue_sync( stream ); magma_sgetmatrix_async( n, n, db, lddb, b, ldb, stream ); /* Transform problem to standard eigenvalue problem and solve. 
*/ magma_ssygst_gpu(itype, uplo_[0], n, da, ldda, db, lddb, info); magma_ssyevd_gpu(jobz_[0], uplo_[0], n, da, ldda, w, a, lda, work, lwork, iwork, liwork, info); lopt = max( lopt, (magma_int_t) work[0]); liopt = max(liopt, iwork[0]); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_strsm(MagmaLeft, uplo_[0], *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaTrans; } magma_strmm(MagmaLeft, uplo_[0], *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } magma_sgetmatrix( n, n, da, ldda, a, lda ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = (float) lopt; iwork[0] = liopt; magma_free( da ); magma_free( db ); return MAGMA_SUCCESS; } /* magma_ssygvd */
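/*
 * Editor's note: the sketch below is a hypothetical caller for magma_ssygvd
 * above, not part of MAGMA. It assumes magma.h (v1.x-style interfaces) and
 * <stdio.h> are included, and that the caller supplies column-major host
 * arrays A (n x n, symmetric) and B (n x n, symmetric positive definite) plus
 * an output array w of length n. It demonstrates the LWORK = -1 / LIWORK = -1
 * workspace query described in the header comment.
 */
static void example_ssygvd_call( magma_int_t n,
                                 float *A, magma_int_t lda,
                                 float *B, magma_int_t ldb, float *w )
{
    magma_int_t info        = 0;
    float       work_query  = 0;
    magma_int_t iwork_query = 0;

    /* Workspace query: optimal sizes come back in work[0] and iwork[0]. */
    magma_ssygvd( 1, 'V', 'L', n, A, lda, B, ldb, w,
                  &work_query, -1, &iwork_query, -1, &info );

    magma_int_t lwork  = (magma_int_t) work_query;
    magma_int_t liwork = iwork_query;
    float       *work  = NULL;
    magma_int_t *iwork = NULL;
    magma_smalloc_cpu( &work,  lwork  );
    magma_imalloc_cpu( &iwork, liwork );
    if ( work == NULL || iwork == NULL ) {
        fprintf( stderr, "host malloc failed\n" );
    }
    else {
        /* itype = 1: A*x = (lambda)*B*x; eigenvalues go to w, and the
           eigenvectors overwrite A since jobz = 'V'. */
        magma_ssygvd( 1, 'V', 'L', n, A, lda, B, ldb, w,
                      work, lwork, iwork, liwork, &info );
        if ( info != 0 )
            fprintf( stderr, "magma_ssygvd returned info=%d\n", (int) info );
    }
    if ( work  != NULL ) magma_free_cpu( work );
    if ( iwork != NULL ) magma_free_cpu( iwork );
}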
extern "C" magma_int_t magma_sbulge_applyQ_v2( magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaFloat_ptr dE, magma_int_t ldde, float *V, magma_int_t ldv, float *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_t myevent[2]; cudaEventCreateWithFlags(&myevent[0],cudaEventDisableTiming); cudaEventCreateWithFlags(&myevent[1],cudaEventDisableTiming); // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel float *dwork, *dwork0, *dwork1, *dwvt0, *dwvt1; float *dT0, *dV0, *dT1, *dV1; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t lddw = 0; magma_int_t lddwork = ((NE+31)/32)*32; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*lddwork; (v2) // lddv*Vblksiz; (v1,v3) magma_int_t dworksiz = lddwork*Vblksiz; // lddv*Vblksiz; (v2) // NE*Vblksiz=lddwork*Vblksiz; (v1,v3) if (MAGMA_SUCCESS != magma_smalloc( &dwork, 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! magma_sbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0 = dwork; // size = dworksiz; dwork1 = dwork0 + dworksiz; // size = dworksiz; dwvt0 = dwork + 2*dworksiz; // size = dwVTsiz; dwvt1 = dwvt0 + dwVTsiz; // size = dwVTsiz; dV0 = dwork + 2*dworksiz + 2*dwVTsiz; dT0 = dV0 + Vchunksiz*Vblksiz*lddv; dV1 = dT0 + Vchunksiz*Vblksiz*lddt; dT1 = dV1 + Vchunksiz*Vblksiz*lddv; // make overlapped copy magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, ¬hing); flip = 0; // performance loss if the reflector are applied to a big number of eigenvectors (~10000) // => apply the reflectors to blocks of eigenvectors. 
//magma_int_t nr_bl = magma_ceildiv(NE,10000); //nr of blocks magma_int_t sz_bl = NE; //magma_ceildiv(NE,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */ #ifdef ENABLE_DEBUG printf(" APPLY Q_v22 GPU with N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n", N, NE, NB, Vblksiz, versionL, versionR, side); #endif /* * MagmamaLeft */ if (side == MagmaLeft) { /* * Version 113: * loop over the block_col (nt) and for each find the * number of tiles (mt) in this block_col. then loop over mt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ if ( versionL == 113 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=nt-1; blkj >= 0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=mt; blki > 0; blki--) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos); magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid); // COPY Vchunksiz Vs and Vchunksiz Ts to GPU and store it in dV0/dV1 and dT0/dT1 if (ncpy == 0) { // flip = 1 for this. 
copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; magmablasSetKernelStream(stream[1]); magma_ssetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_ssetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[1]); magma_ssetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_ssetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[0]); magma_ssetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0, vld, stream[0]); magma_ssetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0, tld, stream[0]); } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { magmablasSetKernelStream(stream[0]); magma_queue_wait_event( stream[0], myevent[1] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw); magma_slarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw, dwvt0, lddv); } magma_event_record( myevent[0], stream[0] ); } else { magmablasSetKernelStream(stream[1]); magma_queue_wait_event( stream[1], myevent[0] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw); magma_slarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw, dwvt1, lddv); } magma_event_record( myevent[1], stream[1] ); } } // end for (Vm &Vn) > 0 } // end for 
blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block_row (mt) and for each find diagonally the * number of tiles (nt) in this block_row. then loop over nt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ else { mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_ssetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_ssetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("voici blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first myrow on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=1; blki <= mt; blki++) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min( Vblksiz, Vm ); } else { Vn = min( Vblksiz, Vm-1 ); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. 
* Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_ssetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_ssetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_slarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } // end for blki } // end fo blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_ssetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_ssetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_slarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } //end for blkj } // end for blki } //end of version 92 } // end RIGHT magma_device_sync(); magmablasSetKernelStream( orig_stream ); magma_event_destroy( myevent[0] ); magma_event_destroy( myevent[1] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free(dwork); return *info; }
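/*
 * Editor's note: the copyst/copyed/flip bookkeeping above is easier to follow
 * in isolation. The helper below is an illustration only: it uses printf
 * instead of the real magma_ssetmatrix_async calls and walks the blocks
 * sequentially, whereas the routine above visits blkid in the order produced
 * by the bulge-chasing storage. "Buffer 0" and "buffer 1" stand for dV0/dT0
 * and dV1/dT1. Hypothetical, not part of MAGMA; assumes <stdio.h> and the
 * min() macro used throughout this file. For example,
 * print_chunk_schedule( 23, 10 ) replays the schedule for 23 blocks staged in
 * chunks of 10.
 */
static void print_chunk_schedule( magma_int_t blkcnt, magma_int_t chunk )
{
    magma_int_t copyst = 0, copyed = 0, ncpy = 0, flip = 0;

    for (magma_int_t blkid = 0; blkid < blkcnt; ++blkid) {
        if (ncpy == 0) {
            /* first chunk: stage blocks [copyst, copyed) into buffer 1 */
            copyed = min( copyst + chunk, blkcnt );
            ncpy   = 1;
            flip   = 1;
            printf( "stage    blocks [%2d,%2d) -> buffer 1\n",
                    (int) copyst, (int) copyed );
        }
        if (blkid == copyst) {
            /* entering a new chunk: prefetch the next one into the idle buffer */
            flip   = ncpy % 2;              /* buffer consumed from now on */
            copyst = copyed;
            copyed = min( copyst + chunk, blkcnt );
            if (copyed > copyst) {
                ncpy += 1;
                printf( "prefetch blocks [%2d,%2d) -> buffer %d\n",
                        (int) copyst, (int) copyed, (int) (1 - flip) );
            }
        }
        printf( "apply    block %2d from buffer %d\n", (int) blkid, (int) flip );
    }
}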
extern "C" magma_int_t magma_sorgqr(magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. DT (input) REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. NB (input) INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_slaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_sorgqr */
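/*
 * Editor's note: a hypothetical usage sketch for magma_sorgqr above, not part
 * of MAGMA. It factors A with magma_sgeqrf_gpu (whose 6th argument returns
 * the block-reflector T factors) and then forms the explicit Q. It assumes
 * the MAGMA 1.x-style interfaces used in this file, m >= n, and a column-major
 * host array A (m x n); the dT size below follows the magma_sgeqrf_gpu
 * documentation and should be checked against the release in use.
 */
static void example_form_q( magma_int_t m, magma_int_t n,
                            float *A, magma_int_t lda, float *tau )
{
    magma_int_t info  = 0;
    magma_int_t nb    = magma_get_sgeqrf_nb( m );
    magma_int_t ldda  = ((m + 31)/32)*32;
    magma_int_t minmn = (m < n) ? m : n;
    float *dA = NULL, *dT = NULL;

    magma_smalloc( &dA, ldda*n );
    magma_smalloc( &dT, (2*minmn + ((n + 31)/32)*32) * nb );
    if ( dA == NULL || dT == NULL ) {
        fprintf( stderr, "device malloc failed\n" );
        magma_free( dA );  magma_free( dT );
        return;
    }

    /* QR factorization on the GPU: reflectors and R overwrite dA, T's go to dT. */
    magma_ssetmatrix( m, n, A, lda, dA, ldda );
    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );

    /* magma_sorgqr expects the reflectors in the host array A. */
    magma_sgetmatrix( m, n, dA, ldda, A, lda );

    /* Generate the m-by-n Q from the k = n reflectors; Q overwrites A. */
    magma_sorgqr( m, n, n, A, lda, tau, dT, nb, &info );
    if ( info != 0 )
        fprintf( stderr, "magma_sorgqr returned info=%d\n", (int) info );

    magma_free( dA );
    magma_free( dT );
}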
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] T REAL array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr_m( magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t d, i, ib, j, jb, ki, kk; float *work=NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = magma_roundup( m, 32 ); magma_int_t lddwork = magma_roundup( n, 32 ); magma_int_t min_lblocks = (n / nb) / ngpu; // min. 
blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; float *dA[ MagmaMaxGPUs ] = { NULL }; float *dT[ MagmaMaxGPUs ] = { NULL }; float *dV[ MagmaMaxGPUs ] = { NULL }; float *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t queues[ MagmaMaxGPUs ] = { NULL }; for( d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( d, &queues[d] ); } trace_init( 1, ngpu, 1, queues ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for larfb work // m*nb for V // nb*nb for T lwork = (n + m + nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } float *work_T, *work_V; work_T = work + n*nb; work_V = work + n*nb + nb*nb; // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size. // replacing it with the 4 routines below is much faster (e.g., 60x). //magma_int_t iinfo; //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, // A(kk, kk), &lda, // &tau[kk], work, &lwork, &iinfo ); lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk); lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, work_V, &m_kk, &tau[kk], work_T, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk ); if (kk > 0) { for( j=kk; j < n; j += nb ) { jb = min( n-j, nb ); d = (j / nb) % ngpu; di = ((j / nb) / ngpu) * nb; magma_setdevice( d ); magma_ssetmatrix( m_kk, jb, A(kk, j), lda, dA(d, kk, di), ldda, queues[d] ); // Set A(1:kk,kk+1:n) to zero. 
magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] ); } } trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] ); trace_gpu_end( d, 0 ); } // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to dV on the GPUs lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, queues[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda, queues[dpanel] ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork, queues[d] ); trace_gpu_end( d, 0 ); } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues ); trace_cpu_end( 0 ); } #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif cleanup: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( queues[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); return *info; } /* magma_sorgqr */
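/*
 * Editor's note: a small, hypothetical helper (not part of MAGMA) that prints
 * the 1-D block-cyclic column layout used above, i.e. d = (j/nb) % ngpu and
 * di = ((j/nb)/ngpu)*nb. Calling it with n = 75, nb = 10, ngpu = 3 reproduces
 * the distribution listed in the comment inside the allocation loop. Assumes
 * <stdio.h> and the min() macro used in this file.
 */
static void print_bcyclic_layout( magma_int_t n, magma_int_t nb, magma_int_t ngpu )
{
    for (magma_int_t j = 0; j < n; j += nb) {
        magma_int_t jb = min( nb, n - j );          /* width of this block column   */
        magma_int_t d  = (j / nb) % ngpu;           /* GPU that owns the block      */
        magma_int_t di = ((j / nb) / ngpu) * nb;    /* local column offset on GPU d */
        printf( "cols %2d-%2d -> gpu %d, local offset %2d\n",
                (int) j, (int) (j + jb - 1), (int) d, (int) di );
    }
}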
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA REAL array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over GPUs (dl_A[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf_mgpu_right( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) float z_one = MAGMA_S_MAKE( 1.0, 0.0 ); float mz_one = MAGMA_S_MAKE( -1.0, 0.0 ); float one = 1.0; float m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevtrsmrows=0, nqueue = 5; float *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; float *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, igpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t queues[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_spotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_smalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. 
magma_sgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_spotrf( uplo_, &n, panel, &ldpanel, info); magma_ssetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < ngpu; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / ngpu) * nb; if (d < (n / nb) % ngpu) n_local[d] += nb; else if (d == (n / nb) % ngpu) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_smalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < nqueue; j++ ) { magma_queue_create( &queues[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (ngpu == 4) crosspoint = n; else if (ngpu == 3) crosspoint = n; else if (ngpu == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = magma_wtime(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_sgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, queues[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = magma_wtime()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % ngpu; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( queues[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = magma_wtime(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? 
tmppanel0 : tmppanel1; blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = magma_wtime() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = magma_wtime(); #endif lapackf77_spotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = magma_wtime() - tcchol; ttot_cchol += tcchol; tctrsm = magma_wtime(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_strsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = magma_wtime() - tctrsm; ttot_ctrsm += tctrsm; tctm = magma_wtime(); #endif d = (id + 1) % ngpu; // send current panel to gpus for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_ssetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, queues[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % ngpu; for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); } #if defined (ENABLE_TIMER) tctm = magma_wtime() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % ngpu; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * ngpu) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = magma_wtime(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream( queues[d][STREAM_ID(j_local2)] ); #define SSYRK_ON_DIAG #ifdef SSYRK_ON_DIAG magma_ssyrk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_sgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_sgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = magma_wtime() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( queues[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_sgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, queues[d][0] ); tmppanel = (blkid % 2 == 0) ? 
tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < ngpu; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) therk[d] = magma_wtime(); #endif //magmablasSetKernelStream( queues[d] ); //magma_ssyrk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef SSYRK_ON_DIAG magma_ssyrk_mgpu #else magma_ssyrk_mgpu2 #endif (ngpu, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, nqueue, queues ); #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = magma_wtime() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % ngpu, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( id=0; id < nqueue; id++ ) { magma_queue_sync( queues[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % ngpu; // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = magma_wtime(); #endif magma_sgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_spotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_ssetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = magma_wtime() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < nqueue; j++ ) { magma_queue_destroy( queues[d][j] ); } magma_free( d_lP[d] ); } } // 
end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_spotrf_mgpu_right */
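/*
 * Editor's note: a hypothetical caller sketch for magma_spotrf_mgpu_right
 * above, not part of MAGMA. For simplicity it uses ngpu = 1, in which case
 * d_lA[0] simply holds the whole matrix; with more GPUs the matrix must be
 * distributed block-cyclically with block size magma_get_spotrf_nb(n), as
 * described in the header. Assumes device 0 is the current device and that
 * magma.h (v1.x-style interfaces) is included.
 */
static magma_int_t example_spotrf_1gpu( magma_int_t n, float *A, magma_int_t lda )
{
    magma_int_t info = 0;
    magma_int_t ldda = ((n + 31)/32)*32;
    magmaFloat_ptr d_lA[1] = { NULL };

    if (MAGMA_SUCCESS != magma_smalloc( &d_lA[0], n*ldda ))
        return MAGMA_ERR_DEVICE_ALLOC;

    magma_ssetmatrix( n, n, A, lda, d_lA[0], ldda );
    magma_spotrf_mgpu_right( 1, MagmaLower, n, d_lA, ldda, &info );
    magma_sgetmatrix( n, n, d_lA[0], ldda, A, lda );  /* L overwrites the lower triangle of A */

    magma_free( d_lA[0] );
    return info;
}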
/** Purpose ------- SSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A REAL array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B REAL array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] w REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. 
If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: SPOTRF or SSYEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if SSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. @ingroup magma_ssygv_driver ********************************************************************/ extern "C" magma_int_t magma_ssygvd_m( magma_int_t ngpu, magma_int_t itype, magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *B, magma_int_t ldb, float *w, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); float d_one = MAGMA_S_ONE; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t lwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! 
lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { lapackf77_ssygvd( &itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info ); return *info; } magma_timer_t time=0; timer_start( time ); magma_spotrf_m( ngpu, uplo, n, B, ldb, info ); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time spotrf = %6.2f\n", time ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_ssygst_m( ngpu, itype, uplo, n, A, lda, B, ldb, info ); timer_stop( time ); timer_printf( "time ssygst = %6.2f\n", time ); timer_start( time ); magma_ssyevd_m( ngpu, jobz, uplo, n, A, lda, w, work, lwork, iwork, liwork, info ); timer_stop( time ); timer_printf( "time ssyevd = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_strsm_m( ngpu, MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, B, ldb, A, lda ); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } printf("--- the multi GPU version is falling back to 1 GPU to perform the last TRMM since there is no TRMM_mgpu --- \n"); float *dA=NULL, *dB=NULL; magma_int_t ldda = roundup( n, 32 ); magma_int_t lddb = ldda; if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_smalloc( &dB, n*lddb ) ) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssetmatrix( n, n, B, ldb, dB, lddb ); magma_ssetmatrix( n, n, A, lda, dA, ldda ); magma_strmm( MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, dB, lddb, dA, ldda ); magma_sgetmatrix( n, n, dA, ldda, A, lda ); magma_free( dA ); magma_free( dB ); } timer_stop( time ); timer_printf( "time setmatrices trsm/mm + getmatrices = %6.2f\n", time ); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; return *info; } /* magma_ssygvd_m */
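/*
 * Editor's note: a hypothetical calling sketch for magma_ssygvd_m above, not
 * part of MAGMA. Workspace sizes follow the formulas in the header for the
 * JOBZ = MagmaVec case; the caller is assumed to supply column-major host
 * arrays A and B (B symmetric positive definite) and an output array w of
 * length n. Assumes magma.h and <stdio.h> are included.
 */
static void example_ssygvd_m_call( magma_int_t ngpu, magma_int_t n,
                                   float *A, magma_int_t lda,
                                   float *B, magma_int_t ldb, float *w )
{
    magma_int_t info   = 0;
    magma_int_t nb     = magma_get_ssytrd_nb( n );
    magma_int_t lwork  = 2*n + n*nb;
    magma_int_t lwvec  = 1 + 6*n + 2*n*n;
    if ( lwvec > lwork ) lwork = lwvec;      /* max( 2N + N*NB, 1 + 6N + 2N^2 ) */
    magma_int_t liwork = 3 + 5*n;

    float       *work  = NULL;
    magma_int_t *iwork = NULL;
    magma_smalloc_cpu( &work,  lwork  );
    magma_imalloc_cpu( &iwork, liwork );
    if ( work == NULL || iwork == NULL ) {
        fprintf( stderr, "host malloc failed\n" );
    }
    else {
        /* itype = 1: solve A*x = (lambda)*B*x; eigenvalues go to w, and the
           eigenvectors overwrite A since jobz = MagmaVec. */
        magma_ssygvd_m( ngpu, 1, MagmaVec, MagmaLower, n, A, lda, B, ldb, w,
                        work, lwork, iwork, liwork, &info );
        if ( info != 0 )
            fprintf( stderr, "magma_ssygvd_m returned info=%d\n", (int) info );
    }
    if ( work  != NULL ) magma_free_cpu( work );
    if ( iwork != NULL ) magma_free_cpu( iwork );
}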
/** Purpose ------- SGEBRD reduces a general real M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**H * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. M >= 0. @param[in] n INTEGER The number of columns in the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; \n if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] d real array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). @param[out] e real array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. @param[out] tauq REAL array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). 
The contents of A on exit are illustrated by the following examples: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) @endverbatim where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_sgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_sgebrd( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tauq, float *taup, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float *dA, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_sgebrd_nb(n); ldda = m; lwkopt = (m + n) * nb; work[0] = MAGMA_S_MAKE( lwkopt, 0. ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + (m + n)*nb )) { fprintf (stderr, "!!!! device memory allocation error in sgebrd\n" ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + n*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. */ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_ssetmatrix( m, n, A, lda, dA, ldda ); } for (i=0; i < (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_sgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda ); magma_sgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda ); } magma_slabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_ssetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx ); magma_ssetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky ); magma_sgemm( MagmaNoTrans, MagmaConjTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda); magma_sgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_S_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_S_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_S_MAKE( d[j], 0. 
); *A(j+1, j ) = MAGMA_S_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_sgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda ); } lapackf77_sgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = MAGMA_S_MAKE( lwkopt, 0. ); magma_free( dA ); return *info; } /* magma_sgebrd */
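/*
 * Editor's note: a hypothetical usage sketch for magma_sgebrd above, not part
 * of MAGMA. The (M+N)*NB workspace size matches the lwkopt computed inside
 * the routine; A (m x n, column-major, host) and the output arrays d, e,
 * tauq, taup of the sizes given in the header are assumed to be supplied by
 * the caller. Assumes magma.h and <stdio.h> are included.
 */
static void example_sgebrd_call( magma_int_t m, magma_int_t n,
                                 float *A, magma_int_t lda,
                                 float *d, float *e, float *tauq, float *taup )
{
    magma_int_t info  = 0;
    magma_int_t nb    = magma_get_sgebrd_nb( n );
    magma_int_t lwork = (m + n) * nb;
    float *work = NULL;

    if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, lwork ) )
        return;

    magma_sgebrd( m, n, A, lda, d, e, tauq, taup, work, lwork, &info );
    if ( info != 0 )
        fprintf( stderr, "magma_sgebrd returned info=%d\n", (int) info );

    magma_free_cpu( work );
}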
/** Purpose ======= SSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U^H * D * U, if UPLO = MagmaUpper, or A = L * D * L^H, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_ssysv_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A +(j)*lda + (i)) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaFloat_ptr dA; magmaFloat_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = magma_roundup( n, 32 ); nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_queue_t queues[2]; magma_event_t event; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// copy matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if ( j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0]); if (k == j+jb) { // magma_event_record( event, queues[0] ); magma_queue_sync( queues[0] ); } } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if (j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) { //magma_event_record( event, queues[0] ); magma_queue_sync(queues[0]); } } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(queues[0]); magma_queue_destroy(queues[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
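/* Minimal usage sketch for magma_ssytrf_nopiv above. Illustrative only: the matrix
   data is omitted, MagmaLower is an arbitrary choice, and it is assumed that
   magma_init() has already been called and the usual MAGMA headers are included. */
{
    magma_int_t n = 2048, lda = n, info = 0;
    float *A;
    magma_smalloc_pinned( &A, lda*n );   // pinned host memory speeds up the async copies
    // ... fill the lower triangle of A with a symmetric matrix ...

    magma_ssytrf_nopiv( MagmaLower, n, A, lda, &info );
    if (info < 0)
        fprintf( stderr, "argument %lld had an illegal value\n", (long long) -info );
    else if (info > 0)
        fprintf( stderr, "factorization stopped at step %lld (see INFO description above)\n",
                 (long long) info );
    // on success A holds the factors of A = L*D*L^T (no pivoting)

    magma_free_pinned( A );
}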
/** Purpose ------- SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. It uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_)*nb + (j_)*nb*ldda + dA_offset) #define dAT(i_, j_) dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset) #define dwork(i_) dwork, (i_) #else #define dA(i_, j_) ( dA + (i_)*nb + (j_)*nb*ldda) #define dAT(i_, j_) ( dAT + (i_)*nb*lddat + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif // Constants const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; // Local variables float *work; magmaFloat_ptr dA, dAT, dwork; magma_int_t iinfo, nb; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_sgetrf_nb( m, n ); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_sgetrf( &m, &n, A, &lda, ipiv, info ); } else { /* Use hybrid blocked code. 
*/ magma_int_t maxm, maxn, ldda, lddat, maxdim; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); maxdim = max( maxm, maxn ); lddat = maxn; ldda = maxm; /* set number of GPUs */ magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); /* check the memory requirement */ size_t mem_size = magma_queue_mem_size( queues[0] ); mem_size /= sizeof(float); magma_int_t h = 1+(2+ngpu); magma_int_t ngpu2 = ngpu; magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( ngpu > ceil((float)NB/nb) ) { ngpu2 = (magma_int_t)ceil((float)NB/nb); h = 1+(2+ngpu2); NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); } if ( ngpu2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place // dwork is nb*maxm for panel, and maxdim*maxdim for A if (MAGMA_SUCCESS != magma_smalloc( &dwork, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; ldda = lddat = maxdim; magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); dAT = dA; magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place // dwork is nb*maxm for panel, and maxm*maxn for A if (MAGMA_SUCCESS != magma_smalloc( &dwork, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dwork ); magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } lapackf77_sgetrf( &m, &nb, work, &lda, ipiv, &iinfo ); for( j = 0; j < s; j++ ) { // get j-th panel from device cols = maxm - j*nb; if (j > 0) { magmablas_stranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] ); magma_queue_sync( queues[0] ); magma_sgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] ); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); // do the cpu part rows = m - j*nb; magma_queue_sync( queues[1] ); lapackf77_sgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo ); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; // put j-th panel onto device magma_ssetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] ); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_slaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); magma_queue_sync( queues[1] ); magmablas_stranspose( cols, nb, 
dwork(0), cols, dAT(j,j), lddat, queues[0] ); // do the small non-parallel computations (next panel update) if (s > (j+1)) { magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_stranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] ); magma_sgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] ); magma_queue_sync( queues[0] ); // do the cpu part lapackf77_sgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo ); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_slaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); // put j-th panel onto device magma_ssetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] ); magmablas_stranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] ); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), lddat, dAT(s, s)+nb0, lddat, queues[0] ); } // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); magma_sgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] ); } else { magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] ); magma_free( dAT ); } magma_free( dwork ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); } return *info; } /* magma_sgetrf */
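/* Minimal usage sketch for the queue-based magma_sgetrf above. Illustrative only:
   the sizes are placeholders and magma_init() is assumed to have been called. As
   the routine shows, the amount of GPU memory it keeps resident can be capped via
   the MAGMA_NGR_NB environment variable; otherwise it falls back to magma_sgetrf_m. */
{
    magma_int_t m = 10000, n = 6000, lda = m, info = 0;
    float *A;
    magma_int_t *ipiv;
    magma_smalloc_pinned( &A, lda*n );               // pinned memory speeds up the async panel copies
    magma_imalloc_cpu( &ipiv, (m < n ? m : n) );     // one pivot per factored column
    // ... fill A ...

    magma_sgetrf( m, n, A, lda, ipiv, &info );
    if (info > 0)
        fprintf( stderr, "U(%lld,%lld) is exactly zero: the factor U is singular\n",
                 (long long) info, (long long) info );
    // on success A holds L (unit lower triangular, below the diagonal) and U; ipiv holds the row interchanges

    magma_free_cpu( ipiv );  magma_free_pinned( A );
}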
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr( magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, magmaFloat_ptr dT, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sorgqr */
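/* Sketch of how magma_sorgqr above is typically paired with magma_sgeqrf_gpu, which
   produces both tau and the dT array of block-reflector T factors (the documentation
   above notes dT can be the 6th argument of magma_sgeqrf_gpu). Illustrative only:
   the dT allocation size ((2*min(m,n) + ceil(n/32)*32)*nb) and the queue-less
   set/get-matrix calls follow the interface generation used in the routine above and
   may differ in other MAGMA releases. */
{
    magma_int_t m = 4096, n = 1024, k = n, lda = m, ldda = ((m+31)/32)*32, info = 0;
    magma_int_t nb = magma_get_sgeqrf_nb( m );       // must match the block size used by magma_sgeqrf_gpu
    magma_int_t minmn = (m < n ? m : n);
    float *A, *tau, *dA, *dT;
    magma_smalloc_pinned( &A, lda*n );
    magma_smalloc_cpu( &tau, minmn );
    magma_smalloc( &dA, ldda*n );
    magma_smalloc( &dT, (2*minmn + ((n+31)/32)*32) * nb );
    // ... fill A, then copy it to the GPU ...
    magma_ssetmatrix( m, n, A, lda, dA, ldda );

    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );   // QR factorization, fills tau and dT
    magma_sgetmatrix( m, n, dA, ldda, A, lda );            // bring the Householder vectors back to the CPU
    magma_sorgqr( m, n, k, A, lda, tau, dT, nb, &info );   // form the explicit m-by-n Q in A

    magma_free( dT );       magma_free( dA );
    magma_free_cpu( tau );  magma_free_pinned( A );
}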
/** Purpose ------- SSYEVDX computes selected eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA, N). On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param wA (workspace) REAL array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. 
@param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_ssyev_driver ********************************************************************/ extern "C" magma_int_t magma_ssyevdx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, float *dA, magma_int_t ldda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, float *wA, magma_int_t ldwa, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float *dwork; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (ldwa < max(1,n)) { *info = -14; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -16; } else if ((liwork < liwmin) && ! 
lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif const char* jobz_ = lapack_vec_const( jobz ); const char* uplo_ = lapack_uplo_const( uplo ); float *A; magma_smalloc_cpu( &A, n*n ); magma_sgetmatrix(n, n, dA, ldda, A, n); lapackf77_ssyevd(jobz_, uplo_, &n, A, &n, w, work, &lwork, iwork, &liwork, info); magma_ssetmatrix( n, n, A, n, dA, ldda); magma_free_cpu(A); return *info; } magma_queue_t stream; magma_queue_create( &stream ); // n*lddc for ssytrd2_gpu // n for slansy magma_int_t ldwork = n*lddc; if ( wantz ) { // need 3n^2/2 for sstedx ldwork = max( ldwork, 3*n*(n/2 + 1)); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_slansy(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_slascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_SYMV magma_ssytrd2_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dwork, n*lddc, &iinfo); #else magma_ssytrd_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif timer_stop( time ); timer_printf( "time ssytrd = %6.2f\n", time ); /* For eigenvalues only, call SSTERF. For eigenvectors, first call SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &work[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); } else { timer_start( time ); magma_sstedx(range, n, vl, vu, il, iu, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); timer_stop( time ); timer_printf( "time sstedx = %6.2f\n", time ); timer_start( time ); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); magma_ssetmatrix( n, *m, &work[indwrk + n* (il-1) ], n, dwork, lddc ); magma_sormtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dwork, lddc, wA, ldwa, &iinfo); magma_scopymatrix( n, *m, dwork, lddc, dA, ldda ); timer_stop( time ); timer_printf( "time sormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. 
/ sigma; blasf77_sscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; /* round up */ iwork[0] = liwmin; magma_queue_destroy( stream ); magma_free( dwork ); return *info; } /* magma_ssyevdx_gpu */
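/* Usage sketch for magma_ssyevdx_gpu above, computing the 10 smallest eigenpairs via
   an index range. Illustrative only: the workspace query (lwork = liwork = -1) follows
   the convention documented above, and matrix data / magma_init() setup is omitted. */
{
    magma_int_t n = 5000, ldda = ((n+31)/32)*32, ldwa = n, info = 0, m = 0;
    magma_int_t il = 1, iu = 10;                  // indices of the wanted eigenvalues
    float *dA, *w, *wA;
    magma_smalloc( &dA, ldda*n );                 // symmetric matrix, already on the GPU
    magma_smalloc_cpu( &w,  n );
    magma_smalloc_cpu( &wA, ldwa*n );
    // ... set dA ...

    // workspace query: optimal sizes are returned in work[0] and iwork[0]
    float lwork_query;
    magma_int_t liwork_query;
    magma_ssyevdx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                       0., 0., il, iu, &m, w, wA, ldwa,
                       &lwork_query, -1, &liwork_query, -1, &info );
    magma_int_t lwork  = (magma_int_t) lwork_query;
    magma_int_t liwork = liwork_query;
    float *work;  magma_int_t *iwork;
    magma_smalloc_cpu( &work, lwork );
    magma_imalloc_cpu( &iwork, liwork );

    magma_ssyevdx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                       0., 0., il, iu, &m, w, wA, ldwa,
                       work, lwork, iwork, liwork, &info );
    // on success: w[0..m-1] holds the eigenvalues, the first m columns of dA the eigenvectors

    magma_free_cpu( iwork );  magma_free_cpu( work );
    magma_free_cpu( wA );     magma_free_cpu( w );   magma_free( dA );
}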
extern "C" magma_int_t magma_slobpcg( magma_s_sparse_matrix A, magma_s_solver_par *solver_par ) { #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_s_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_stranspose( m, n, X, m, blockW, n ); \ magma_s_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_s_spmv(alpha, A, x, beta, ax ); \ magmablas_stranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); float *blockX = solver_par->eigenvectors; float *evalues = solver_par->eigenvalues; float *dwork, *hwork; float *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; float *gramA, *gramB, *gramM; float *gevectors, *h_gramB; float *pointer, *origX = blockX; float *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_smalloc_pinned( &hwork , lwork ); magma_smalloc( &blockAX , m*n ); magma_smalloc( &blockAR , m*n ); magma_smalloc( &blockAP , m*n ); magma_smalloc( &blockR , m*n ); magma_smalloc( &blockP , m*n ); magma_smalloc( &blockW , m*n ); magma_smalloc( &dwork , m*n ); magma_smalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === float residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === float c_one = MAGMA_S_ONE, c_zero = MAGMA_S_ZERO; float *residualNorms, *condestGhistory, condestG; float *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_smalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_smalloc_cpu(&condestGhistory, maxIterations+1); magma_smalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); float *hW; magma_smalloc_pinned(&hW, n*n); magma_smalloc_pinned(&gevectors, 9*n*n); magma_smalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_smalloc(&gramM, n * n); magma_smalloc(&gramA, 9 * n * n); magma_smalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_smalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial vectors 
orthonormal === magma_sgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_sorthomgs( m, n, blockX ); magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_saxpy(m, MAGMA_S_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_slobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_scompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_slacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_sgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockR ); // === compute AR magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_scompactActive(m, n, blockP, m, activeMask); magma_scompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ? 
magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_sgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockP ); //magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_ssetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); float condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_ssyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, 
gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_S_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_ssygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_ssetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_sgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_saxpy(m*n, c_one, blockP, 1, blockX, 1); magma_slobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_saxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_slobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // float res; // magma_sgetmatrix(1, 1, // (float*)residualNorms(0, iterationNumber), 1, // (float*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter) { solver_par->info = 0; } else if( 
solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_saxpy(m, MAGMA_S_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_slacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== float *hresidualNorms; magma_smalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_sgetmatrix(n, iterationNumber, (float*)residualNorms, n, (float*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
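/* Sketch of driving magma_slobpcg above. Illustrative only: assembling the sparse
   matrix A is omitted, the solver_par field names and memory placement are inferred
   from their use in the routine above (eigenvectors is a device block of size
   num_rows*num_eigenvalues, eigenvalues a host array of at least 3*num_eigenvalues),
   and any solver-parameter helper routines of the sparse package are not shown. */
{
    magma_s_sparse_matrix A;            // assembled elsewhere; must reside on the device
    // ... build / transfer A ...

    magma_s_solver_par solver_par;
    solver_par.num_eigenvalues = 32;    // block size = number of wanted eigenpairs
    solver_par.maxiter         = 500;
    solver_par.epsilon         = 1e-5;  // residual tolerance used by the solver

    magma_int_t m = A.num_rows, n = solver_par.num_eigenvalues;
    magma_smalloc( &solver_par.eigenvectors, m*n );       // initial guess / result block X (device)
    magma_smalloc_cpu( &solver_par.eigenvalues, 3*n );    // the routine writes up to 3*n values (host)
    // ... fill the eigenvector block with a random or problem-specific initial guess ...

    magma_slobpcg( A, &solver_par );
    printf( "LOBPCG: info=%lld, iterations=%lld, runtime=%f s\n",
            (long long) solver_par.info, (long long) solver_par.numiter,
            solver_par.runtime );
}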
extern "C" magma_int_t magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magma_queue_t* queue, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaFloat_ptr dwork; float *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*lwork, NULL, NULL); work = (float*)clEnqueueMapBuffer(queue[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(float), 0, NULL, NULL, NULL); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_queue_sync( queue[1] ); magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work(i), ldwork, queue[0], NULL); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue[1]); magma_ssetmatrix_async( old_ib, old_ib, work(old_i), ldwork, dA(old_i, old_i), ldda, queue[1], NULL); } 
magma_queue_sync(queue[0]); lapackf77_sgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); spanel_to_q( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_ssetmatrix_async(rows, ib, work(i), ldwork, dA(i,i), ldda, queue[0], NULL); /* download the T matrix */ magma_queue_sync( queue[1] ); magma_ssetmatrix_async( ib, ib, hwork, ib, dwork, 0, lddwork, queue[0], NULL); magma_queue_sync( queue[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); sq_to_panel( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); } else { magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); sq_to_panel( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); magma_ssetmatrix_async(ib, ib, work(i), ldwork, dA(i,i), ldda, queue[1], NULL); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work, rows, queue[1], NULL); magma_queue_sync(queue[1]); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix_async(rows, ib, work, rows, dA(i, i), ldda, queue[1], NULL); } magma_queue_sync(queue[0]); magma_queue_sync(queue[1]); // magma_free_cpu(work); clEnqueueUnmapMemObject(queue[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); return *info; } /* magma_sgeqrf2_gpu */
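/* The routine above is the clMAGMA (OpenCL) variant, which passes the device matrix
   as a cl_mem plus an explicit dA_offset and an array of command queues. For
   comparison, below is a sketch of calling the analogous CUDA MAGMA routine of the
   same era; the exact signature there (no offset, no queue argument) is an assumption
   and should be checked against the installed MAGMA headers. */
{
    magma_int_t m = 8192, n = 2048, ldda = ((m+31)/32)*32, info = 0;
    float *tau, *dA;
    magma_smalloc_cpu( &tau, (m < n ? m : n) );
    magma_smalloc( &dA, ldda*n );
    // ... upload the matrix to dA with magma_ssetmatrix ...

    magma_sgeqrf2_gpu( m, n, dA, ldda, tau, &info );   // QR factorization; R and the reflectors stay in dA
    if (info != 0)
        fprintf( stderr, "magma_sgeqrf2_gpu failed, info=%lld\n", (long long) info );

    magma_free( dA );
    magma_free_cpu( tau );
}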
/** Purpose ------- CGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] dwork (workspace) COMPLEX array on the GPU, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n Note: unlike the CPU interface of this routine, the GPU interface does not support a workspace query. @param rwork (workspace, for [cz]geqp3 only) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
@ingroup magma_cgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqp3_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaFloatComplex *tau, magmaFloatComplex_ptr dwork, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define dA(i_, j_) (dA + (i_) + (j_)*ldda) const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magma_int_t ione = 1; //magma_int_t na; magma_int_t n_j; magma_int_t j, jb, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, lwkopt; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } nb = magma_get_cgeqp3_nb( m, n ); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } if (lwork < lwkopt) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (minmn == 0) return *info; #ifdef REAL float *rwork = dwork + (n + 1)*nb; #endif magmaFloatComplex_ptr df; if (MAGMA_SUCCESS != magma_cmalloc( &df, (n+1)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloat_ptr dlsticcs; if (MAGMA_SUCCESS != magma_smalloc( &dlsticcs, 1+256*(n+255)/256 )) { magma_free( df ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magmablas_claset( MagmaFull, n+1, nb, c_zero, c_zero, df, n+1, queue ); nfxd = 0; /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_cswap(&m, dA(0, j), &ione, dA(0, nfxd), &ione); // TODO: ERROR, matrix not on CPU! jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* // TODO: Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. if (nfxd > 0) { na = min(m,nfxd); lapackf77_cgeqrf(&m, &na, dA, &ldda, tau, dwork, &lwork, info); if (na < n) { n_j = n - na; lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, dA, &ldda, tau, dA(0, na), &ldda, dwork, &lwork, info ); } }*/ /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; //sminmn = minmn - nfxd; /* Initialize partial column norms. */ magmablas_scnrm2_cols( sm, sn, dA(nfxd,nfxd), ldda, &rwork[nfxd], queue ); magma_scopymatrix( sn, 1, &rwork[nfxd], sn, &rwork[n+nfxd], sn, queue ); j = nfxd; //if (nb < sminmn) { /* Use blocked code initially. */ /* Compute factorization: while loop. */ topbmn = minmn; // - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; //magma_claqps_gpu // this is a cpp-file magma_claqps2_gpu // this is a cuda-file ( m, n_j, j, jb, &fjb, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], dwork, &df[jb], n_j, dlsticcs, queue ); j += fjb; /* fjb is actual number of columns factored */ } } /* // Use unblocked code to factor the last or only block. if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_cgetmatrix( m-j, n_j, dA(j,j), ldda, A(j,j), lda, queue ); } lapackf77_claqp2(&m, &n_j, &j, dA(0, j), &ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], dwork ); }*/ } magma_queue_destroy( queue ); magma_free( df ); magma_free( dlsticcs ); return *info; } /* magma_cgeqp3_gpu */
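/* Small host-side helper illustrating the jpvt convention documented above: on entry,
   setting jpvt[j] = 0 marks column j as a free column; on exit jpvt is 1-based and
   jpvt[j] = k means column j of A*P was column k of the original A. The helper builds
   A*P explicitly from a saved copy of A so the factorization A*P = Q*R can be checked.
   Illustrative only; Aorig and AP are hypothetical caller-owned host buffers. */
static void build_AP( magma_int_t m, magma_int_t n,
                      const magmaFloatComplex *Aorig, magma_int_t lda,
                      const magma_int_t *jpvt,
                      magmaFloatComplex *AP, magma_int_t ldap )
{
    for (magma_int_t j = 0; j < n; ++j) {
        const magmaFloatComplex *src = Aorig + (jpvt[j] - 1)*lda;  // jpvt is 1-based on exit
        for (magma_int_t i = 0; i < m; ++i)
            AP[i + j*ldap] = src[i];
    }
    // comparing AP against Q*R (e.g. with a LAPACK-style residual check) verifies the factorization
}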
/** Purpose ------- SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i_, j_) (dAT + (i_)*nb*ldda + (j_)*nb) float *dAT, *dA, *da, *work; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo, nb; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_sgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ magma_int_t maxm, maxn, ldda, maxdim; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); /* set number of GPUs */ magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_sgetrf_m(ngpu, m, n, A, lda, ipiv, info); return *info; } /* explicitly checking the memory requirement */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); int h = 1+(2+ngpu), ngpu2 = ngpu; int NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( ngpu > ceil((float)NB/nb) ) { ngpu2 = (int)ceil((float)NB/nb); h = 1+(2+ngpu2); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } if ( ngpu2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_sgetrf_m(ngpu, m, n, A, lda, ipiv, info); return *info; } ldda = maxn; work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place if (MAGMA_SUCCESS != magma_smalloc( &dA, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_sgetrf_m(ngpu, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; ldda = maxdim; magma_ssetmatrix( m, n, A, lda, da, ldda ); dAT = da; magmablas_stranspose_inplace( ldda, dAT, ldda ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place if (MAGMA_SUCCESS != magma_smalloc( &dA, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_sgetrf_m(ngpu, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; magma_ssetmatrix( m, n, A, lda, da, maxm ); if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dA ); magma_sgetrf_m(ngpu, m, n, A, lda, ipiv, info); return *info; } magmablas_stranspose( m, n, da, maxm, dAT, ldda ); } lapackf77_sgetrf( &m, &nb, work, &lda, ipiv, &iinfo); /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( j = 0; j < s; j++ ) { // download j-th panel cols = maxm - j*nb; if (j > 0) { magmablas_stranspose( nb, cols, dAT(j,j), ldda, dA, cols ); // make sure that gpu queue is empty magma_device_sync(); magma_sgetmatrix_async( m-j*nb, nb, dA, cols, work, lda, stream[0]); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), ldda, dAT(j-1,j+1), ldda ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), ldda, dAT(j, j-1), ldda, c_one, dAT(j, j+1), ldda ); // do the cpu part rows = m - j*nb; magma_queue_sync( stream[0] ); lapackf77_sgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; // upload j-th panel magma_ssetmatrix_async( m-j*nb, nb, work, lda, dA, cols, stream[0]); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_slaswp( n, dAT, ldda, j*nb + 1, j*nb + nb, ipiv, 1 ); magma_queue_sync( stream[0] ); magmablas_stranspose( cols, nb, dA, cols, dAT(j,j), ldda ); // do the small non-parallel computations (next panel update) if (s > (j+1)) { magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j 
), ldda, dAT(j, j+1), ldda); magma_sgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), ldda, dAT(j+1, j ), ldda, c_one, dAT(j+1, j+1), ldda ); } else { magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), ldda, dAT(j, j+1), ldda); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), ldda, dAT(j+1, j ), ldda, c_one, dAT(j+1, j+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_stranspose( nb0, rows, dAT(s,s), ldda, dA, cols ); magma_sgetmatrix( rows, nb0, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_sgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_slaswp( n, dAT, ldda, s*nb + 1, s*nb + nb0, ipiv, 1 ); // upload j-th panel magma_ssetmatrix( rows, nb0, work, lda, dA, cols ); magmablas_stranspose( rows, nb0, dA, cols, dAT(s,s), ldda ); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), ldda, dAT(s,s)+nb0, ldda); } // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_stranspose_inplace( ldda, dAT, ldda ); magma_sgetmatrix( m, n, da, ldda, A, lda ); } else { magmablas_stranspose( n, m, dAT, ldda, da, maxm ); magma_sgetmatrix( m, n, da, maxm, A, lda ); magma_free( dAT ); } magma_free( dA ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_sgetrf */
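// Usage sketch (illustration only, not from the MAGMA sources): factor a square
// host matrix with magma_sgetrf and reuse the host-side factors to solve A*X = B
// through the LAPACK wrapper lapackf77_sgetrs, in the same style as the other
// lapackf77_ calls in this file. n, nrhs, lda and ldb are assumed set by the
// caller; pinned memory lets the routine's internal async copies overlap with
// computation, as the documentation above suggests.
float *A, *B;
magma_int_t *ipiv, info;
magma_smalloc_pinned( &A, lda*n );
magma_smalloc_pinned( &B, ldb*nrhs );
magma_imalloc_cpu( &ipiv, n );
/* ... fill A and B ... */
magma_sgetrf( n, n, A, lda, ipiv, &info );
if (info > 0)
    fprintf( stderr, "U(%d,%d) is exactly zero; U is singular\n", (int) info, (int) info );
else if (info == 0)
    lapackf77_sgetrs( "N", &n, &nrhs, A, &lda, ipiv, B, &ldb, &info );  // solve with the factors
magma_free_pinned( A );
magma_free_pinned( B );
magma_free_cpu( ipiv );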
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf(magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) float *dA, *dwork, *dT; float c_one = MAGMA_S_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_sgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_sgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_sgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[2], current_stream; magmablasGetKernelStream( &current_stream ); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = current_stream; } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_ssetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i > 0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_sgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_sgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); spanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* send the i-th V matrix to the GPU */ magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* send the T matrix to the GPU */ magma_queue_sync( stream[1] ); magma_ssetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. 
*/ if (i < k) { ib = n-i; if (i != 0) { magma_sgetmatrix_async( m, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[1] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( dA ); return *info; } /* magma_sgeqrf */
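// Workspace-query sketch (illustration only): the documentation above allows
// LWORK = -1, so a caller can ask magma_sgeqrf for its optimal workspace size
// first and then allocate pinned work. m, n, A and lda are assumed to be set
// by the caller.
float lwork_query, *work, *tau;
magma_int_t info, lwork;
magma_smalloc_cpu( &tau, min(m,n) );
magma_sgeqrf( m, n, A, lda, tau, &lwork_query, -1, &info );   // query only
lwork = (magma_int_t) lwork_query;
magma_smalloc_pinned( &work, lwork );                         // pinned for faster transfers
magma_sgeqrf( m, n, A, lda, tau, work, lwork, &info );
if (info != 0)
    fprintf( stderr, "magma_sgeqrf failed with info=%d\n", (int) info );
magma_free_pinned( work );
magma_free_cpu( tau );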
/** Purpose ======= SSYTRF_nopiv_gpu computes the LDLt factorization of a real symmetric matrix A. The factorization has the form A = U^H * D * U , if UPLO = 'U', or A = L * D * L^H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_ssytrf_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Local variables */ float zone = MAGMA_S_ONE; float mzone = MAGMA_S_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, stream ); // CPU workspace float *A; if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaFloat_ptr dW; if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[1], event ); magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[0], event ); magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); magmablasSetKernelStream( orig_stream ); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
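// Usage sketch (illustration only): copy a symmetric host matrix to the GPU,
// factor it as L * D * L^T without pivoting, and copy the factors back. The
// leading dimension is rounded up to a multiple of 32, as done elsewhere in
// this file for better memory-access performance; n, A and lda are assumed to
// be set by the caller and only the lower triangle of A is referenced.
magmaFloat_ptr dA;
magma_int_t ldda = ((n + 31)/32)*32, info;
magma_smalloc( &dA, ldda*n );
magma_ssetmatrix( n, n, A, lda, dA, ldda );
magma_ssytrf_nopiv_gpu( MagmaLower, n, dA, ldda, &info );
if (info != 0)
    fprintf( stderr, "magma_ssytrf_nopiv_gpu failed with info=%d\n", (int) info );
magma_sgetmatrix( n, n, dA, ldda, A, lda );   // L and D overwrite the lower triangle
magma_free( dA );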
/** Purpose ------- SSYEVD computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. 
@ingroup magma_ssyev_driver ********************************************************************/ extern "C" magma_int_t magma_ssyevd(magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; magma_int_t izero = 0; float d_one = 1.; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; float* dwork; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. float one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((liwork < liwmin) && ! lquery) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = A[0]; if (wantz) { A[0] = 1.; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_ssyevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_slansy("M", uplo_, &n, A, &lda, work ); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_slascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_ssytrd(uplo, n, A, lda, w, &work[inde], &work[indtau], &work[indwrk], llwork, &iinfo); /* For eigenvalues only, call SSTERF. 
For eigenvectors, first call SSTEDX to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it by the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &work[inde], info); } else { if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // TTT Possible bug for n < 128 magma_sstedx(MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); magma_free( dwork ); magma_sormtr(MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, &iinfo); lapackf77_slacpy("A", &n, &n, &work[indwrk], &n, A, &lda); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_sscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; return *info; } /* magma_ssyevd */
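// Workspace-query sketch (illustration only): LWORK = -1 / LIWORK = -1 make
// magma_ssyevd report the optimal workspace sizes, which the caller then
// allocates before the real call. n, A and lda are assumed to be set by the
// caller; this computes all eigenvalues and eigenvectors of the lower-stored
// symmetric matrix A.
float lwork_query, *w, *work;
magma_int_t liwork_query, *iwork, lwork, liwork, info;
magma_smalloc_cpu( &w, n );
magma_ssyevd( MagmaVec, MagmaLower, n, A, lda, w,
              &lwork_query, -1, &liwork_query, -1, &info );   // query only
lwork  = (magma_int_t) lwork_query;
liwork = liwork_query;
magma_smalloc_cpu( &work,  lwork  );
magma_imalloc_cpu( &iwork, liwork );
magma_ssyevd( MagmaVec, MagmaLower, n, A, lda, w, work, lwork, iwork, liwork, &info );
if (info != 0)
    fprintf( stderr, "magma_ssyevd failed with info=%d\n", (int) info );
magma_free_cpu( work ); magma_free_cpu( iwork ); magma_free_cpu( w );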
extern "C" magma_int_t magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk_ref(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) float *dwork[4], *panel[4], *local_work; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; magma_device_t cdevice; magma_getdevice(&cdevice); int panel_gpunum, i_local, n_local[4], la_gpu, displacement; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; } if (MAGMA_SUCCESS != magma_smalloc_pinned( &local_work, lwork )) { *info = -9; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } cudaStream_t streaml[4][2]; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; 
i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_gpunum = (i/nb)%num_gpus; /* Set the local index where the current panel is */ i_local = i/(nb*num_gpus)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix_async( rows, ib, dlA(panel_gpunum, i, i_local), ldda, hwrk_ref(i), ldwork, streaml[panel_gpunum][1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_gpu = panel_gpunum; /* only the GPU that has next panel is done look-ahead */ #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, old_i, i_local+old_ib), ldda, dwork[la_gpu]+old_ib, lddwork); la_gpu = ((i-nb)/nb)%num_gpus; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_ssetmatrix_async( old_ib, old_ib, hwrk_ref(old_i), ldwork, panel[la_gpu], ldda, streaml[la_gpu][0] ); } #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_queue_sync( streaml[panel_gpunum][1] ); lapackf77_sgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif if (j == panel_gpunum) panel[j] = dlA(j, i, i_local); else panel[j] = dwork[j]+displacement; magma_ssetmatrix_async( rows, ib, hwrk_ref(i), ldwork, panel[j], ldda, streaml[j][0] ); } for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_queue_sync( streaml[j][0] ); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
Has to be done with asynchronous copies */ for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], lddwork, streaml[j][0] ); } if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif //magma_queue_sync( streaml[j][0] ); if (j==la_gpu) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork); else if (j<=panel_gpunum) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork); else magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork); } } else { /* do the entire update as we exit and there would be no lookahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_gpu]-i_loc, ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork); #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_ssetmatrix( ib, ib, hwrk_ref(i), ldwork, dlA(panel_gpunum, i, i_local), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_gpunum = (panel_gpunum+1)%num_gpus; int i_loc = (i)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix( rows, ib, dlA(panel_gpunum, i, i_loc), ldda, lhwrk, rows ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_gpunum, i, i_loc), ldda ); } for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_destroy( streaml[i][0] ); magma_queue_destroy( streaml[i][1] ); } magma_setdevice(cdevice); magma_free_pinned( local_work ); return *info; } /* magma_sgeqrf2_mgpu */
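// Distribution sketch (illustration only): magma_sgeqrf2_mgpu expects one device
// pointer per GPU, with the column blocks of A laid out 1-D block-cyclically
// (block j of width nb lives on GPU j % num_gpus), matching the n_local[]
// formula inside the routine. num_gpus, m, n, A, lda and tau are assumed to be
// set by the caller; the documentation above requires ldda to be divisible by
// 16 (a multiple of 32 is used here).
magma_int_t nb   = magma_get_sgeqrf_nb( m );
magma_int_t ldda = ((m + 31)/32)*32;
float *dlA[MagmaMaxGPUs];
for (magma_int_t d = 0; d < num_gpus; ++d) {
    magma_int_t n_local = ((n/nb)/num_gpus)*nb;           // same formula as the routine
    if      (d <  (n/nb) % num_gpus) n_local += nb;
    else if (d == (n/nb) % num_gpus) n_local += n % nb;
    magma_setdevice( d );
    magma_smalloc( &dlA[d], ldda*n_local );
    // copy the column blocks owned by GPU d
    for (magma_int_t j = d*nb, jloc = 0; j < n; j += num_gpus*nb, jloc += nb) {
        magma_int_t jb = min( nb, n-j );
        magma_ssetmatrix( m, jb, A + j*lda, lda, dlA[d] + jloc*ldda, ldda );
    }
}
magma_int_t info;
magma_sgeqrf2_mgpu( num_gpus, m, n, dlA, ldda, tau, &info );
if (info != 0)
    fprintf( stderr, "magma_sgeqrf2_mgpu failed with info=%d\n", (int) info );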
/** Purpose ------- SSYTRD reduces a real symmetric matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] num_streams INTEGER The number of GPU streams used for update. 10 >= num_streams > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). 
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_comp ********************************************************************/ extern "C" magma_int_t magma_ssytrd_mgpu( magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ln, ldda; magma_int_t nb = magma_get_ssytrd_nb(n), ib; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; //float mv_time = 0.0; #ifdef PROFILE_SY2RK float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i = 0, ii, iii, j, did, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt, ldwork2; magma_int_t lquery; magma_queue_t stream[MagmaMaxGPUs][10]; float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork; float *dwork2[MagmaMaxGPUs]; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( num_streams > 2 ) { *info = 2; // TODO fix } /* Determine the block size. */ ldwork = lddwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); float *dA[MagmaMaxGPUs]; float *dwork[MagmaMaxGPUs]; float times[11]; for( did=0; did < 11; did++ ) times[did] = 0; //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK magma_event_t start, stop; float etime; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = lda; ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32; ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda; for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_smalloc(&dA[did], ln*ldda+3*lddwork*nb) || MAGMA_SUCCESS != magma_smalloc(&dx[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dy[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) { for( i=0; i < did; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[did] = dA[did] + ln*ldda; for( kk=0; kk < num_streams; kk++ ) magma_queue_create(&stream[did][kk]); } magma_setdevice(0); // TODO fix memory leak dwork2 if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) { for( i=0; i < num_gpus; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the 
upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dwork, i+ib, dwork2, ldwork2, 1, dx, dy, hwork, stream, times); magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dwork, i+ib, 0, d_one, dA, ldda, 0, num_streams, stream); /* get the next panel */ if (i-nb >= nx ) { ib = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*num_gpus)); did = ((i-nb)/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( (i-nb)+ib, ib, dA(did, 0, ii), ldda, A(0, i-nb), lda, stream[did][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( nx, ib, dA(did, 0, ii), ldda, A(0, i), lda, stream[did][0] ); } } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } /* Use unblocked code to reduce the last or only block */ lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo); } } else { trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice(did); trace_gpu_start( did, 0, "comm", "get" ); magma_sgetmatrix_async( n-i, ib, dA(did, i, ii), ldda, A(i,i), lda, stream[did][0] ); trace_gpu_end( did, 0 ); magma_queue_sync(stream[did][0]); magma_setdevice(0); } magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dwork, (n-i), dwork2, ldwork2, 1, dx, dy, hwork, stream, times ); #ifdef PROFILE_SY2RK magma_setdevice(0); if ( i > 0 ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_record(start, 0); #endif magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dwork, n-i, ib, d_one, dA, ldda, i+ib, num_streams, stream); #ifdef PROFILE_SY2RK magma_setdevice(0); magma_event_record(stop, 0); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* for i=... 
*/ /* Use unblocked code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( i_n, ib, dA(did, iii, ii), ldda, A(iii, i), lda, stream[did][0] ); } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } } lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo); } } #ifdef PROFILE_SY2RK magma_setdevice(0); if ( n > nx ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "ssytrd.svg", "trace.css" ); for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); for( kk=0; kk < num_streams; kk++ ) magma_queue_sync(stream[did][kk]); for( kk=0; kk < num_streams; kk++ ) magma_queue_destroy(stream[did][kk]); magma_free(dA[did]); magma_free(dx[did]); magma_free(dy[did]); magma_free(dwork2[did]); } magma_free_pinned(hwork); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_S_MAKE( lwkopt, 0 ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in SLARFG: %.2e seconds\n", times[0] ); //printf( " Time in SSYMV : %.2e seconds\n", mv_time ); printf( " Time in SSYR2K: %.2e seconds\n", up_time ); #endif return *info; } /* magma_ssytrd */
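// Usage sketch (illustration only): reduce a symmetric host matrix to
// tridiagonal form on num_gpus GPUs with one update stream per GPU. d and e
// receive the tridiagonal matrix and tau the reflector scalars, as in the
// single-GPU SSYTRD interface; num_gpus, n, A and lda are assumed to be set by
// the caller, and lwork = n*nb satisfies the routine's minimum.
magma_int_t nb = magma_get_ssytrd_nb( n ), lwork = n*nb, info;
float *d, *e, *tau, *work;
magma_smalloc_cpu( &d,   n   );
magma_smalloc_cpu( &e,   n-1 );
magma_smalloc_cpu( &tau, n-1 );
magma_smalloc_pinned( &work, lwork );
magma_ssytrd_mgpu( num_gpus, 1, MagmaLower, n, A, lda, d, e, tau, work, lwork, &info );
if (info != 0)
    fprintf( stderr, "magma_ssytrd_mgpu failed with info=%d\n", (int) info );
magma_free_cpu( d ); magma_free_cpu( e ); magma_free_cpu( tau );
magma_free_pinned( work );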