extern "C" magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t stat = 0; // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); magma_int_t stat_cpu = 0, stat_dev = 0; // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace //magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; stat_cpu += magma_cmalloc_pinned( &H, (ldh+1)*ldh ); stat_cpu += magma_cmalloc_pinned( &y, ldh ); stat_cpu += magma_cmalloc_pinned( &HH, ldh*ldh ); stat_cpu += magma_cmalloc_pinned( &h1, ldh ); if( stat_cpu != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_HOST_ALLOC; } // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue ); q_t.memory_location = Magma_DEV; q_t.dval = NULL; q_t.num_rows = q_t.nnz = dofs; q_t.num_cols = 1; magmaFloatComplex *dy = NULL, *dH = NULL; stat_dev += magma_cmalloc( &dy, ldh ); stat_dev += magma_cmalloc( &dH, (ldh+1)*ldh ); if( stat_dev != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magma_free( dH ); magma_free( dy ); magma_free( dH ); magma_free( dy ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_DEVICE_ALLOC; } // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); //magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->dval, 1 ); // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ){ r0 = solver_par->epsilon; } if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { for(k=1; k<=restart; k++) { magma_ccopy(dofs, r.dval, 1, q(k-1), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(k,k-1), q(k-1), 1); // (to be fused) q_t.dval = q(k-1); //magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r, queue ); // r = A q[k] // if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.dval, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.dval, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = ||r|| /*} else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.dval, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1, queue ); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = sqrt(r . r) if ( k<solver_par->restart ) { magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif }*/ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { HH(k,i) = magma_cblas_cdotc( i+1, &H(1,k), 1, &H(1,i), 1 ); } h1[k] = H(1,k)*H(1,0); if (k != 1) { for (i=1; i<k; i++) { HH(k,i) = HH(k,i)/HH(i,i);// for (m=i+1; m<=k; m++) { HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i); } h1[k] -= h1[i] * HH(k,i); } } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); }/* Minimization done */ // compute solution approximation magma_csetmatrix(m, 1, y+1, m, dy, m ); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->dval, 1); // compute residual magma_c_spmv( c_mone, A, *x, c_zero, r, queue ); // r = - A * x magma_caxpy(dofs, c_one, b.dval, 1, r.dval, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r, queue ); magma_c_vfree(&q, queue ); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); //magmablasSetKernelStream(NULL); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_cgmres */
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
magma_int_t magma_dpgmres( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PGMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; double nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); double *H, *HH, *y, *h1; magma_dmalloc_pinned( &H, (ldh+1)*ldh ); magma_dmalloc_pinned( &y, ldh ); magma_dmalloc_pinned( &HH, ldh*ldh ); magma_dmalloc_pinned( &h1, ldh ); // GPU workspace magma_d_vector r, q, q_t, z, z_t, t; magma_d_vinit( &t, Magma_DEV, dofs, c_zero ); magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z_t, Magma_DEV, dofs, c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; double *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_dmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_dmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_dscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_D_MAKE( nom0, 0. ); magma_dsetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_dcopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/H(1,0) r magma_dscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); // preconditioner // z[k] = M^(-1) q(k) magma_d_applyprecond_left( A, q_t, &t, precond_par ); magma_d_applyprecond_right( A, t, &z_t, precond_par ); magma_dcopy(dofs, z_t.val, 1, z(k-1), 1); // r = A q[k] magma_d_spmv( c_one, A, z_t, c_zero, r ); if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_ddot(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_daxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing dgemv with dnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_dgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_dcopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_dgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef DNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef DNRM2SCALE magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_dnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_ddot(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_D_REAL(H(k+1,k))); } magma_dsetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_dgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, c_one, x->val, 1); magma_d_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_daxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_D_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_d_vfree(&t); magma_d_vfree(&r); magma_d_vfree(&q); magma_d_vfree(&z); magma_d_vfree(&z_t); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_dgmres */
/** Purpose ------- SSYTRD reduces a real symmetric matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] num_streams INTEGER The number of GPU streams used for update. 10 >= num_streams > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_comp ********************************************************************/ extern "C" magma_int_t magma_ssytrd_mgpu( magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ln, ldda; magma_int_t nb = magma_get_ssytrd_nb(n), ib; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; //float mv_time = 0.0; #ifdef PROFILE_SY2RK float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i = 0, ii, iii, j, did, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt, ldwork2; magma_int_t lquery; magma_queue_t stream[MagmaMaxGPUs][10]; float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork; float *dwork2[MagmaMaxGPUs]; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( num_streams > 2 ) { *info = 2; // TODO fix } /* Determine the block size. */ ldwork = lddwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); float *dA[MagmaMaxGPUs]; float *dwork[MagmaMaxGPUs]; float times[11]; for( did=0; did < 11; did++ ) times[did] = 0; //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK magma_event_t start, stop; float etime; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = lda; ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32; ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda; for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_smalloc(&dA[did], ln*ldda+3*lddwork*nb) || MAGMA_SUCCESS != magma_smalloc(&dx[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dy[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) { for( i=0; i < did; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[did] = dA[did] + ln*ldda; for( kk=0; kk < num_streams; kk++ ) magma_queue_create(&stream[did][kk]); } magma_setdevice(0); // TODO fix memory leak dwork2 if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) { for( i=0; i < num_gpus; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dwork, i+ib, dwork2, ldwork2, 1, dx, dy, hwork, stream, times); magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dwork, i+ib, 0, d_one, dA, ldda, 0, num_streams, stream); /* get the next panel */ if (i-nb >= nx ) { ib = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*num_gpus)); did = ((i-nb)/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( (i-nb)+ib, ib, dA(did, 0, ii), ldda, A(0, i-nb), lda, stream[did][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( nx, ib, dA(did, 0, ii), ldda, A(0, i), lda, stream[did][0] ); } } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } /* Use unblocked code to reduce the last or only block */ lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo); } } else { trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice(did); trace_gpu_start( did, 0, "comm", "get" ); magma_sgetmatrix_async( n-i, ib, dA(did, i, ii), ldda, A(i,i), lda, stream[did][0] ); trace_gpu_end( did, 0 ); magma_queue_sync(stream[did][0]); magma_setdevice(0); } magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dwork, (n-i), dwork2, ldwork2, 1, dx, dy, hwork, stream, times ); #ifdef PROFILE_SY2RK magma_setdevice(0); if ( i > 0 ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_record(start, 0); #endif magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dwork, n-i, ib, d_one, dA, ldda, i+ib, num_streams, stream); #ifdef PROFILE_SY2RK magma_setdevice(0); magma_event_record(stop, 0); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* for i=... */ /* Use unblocked code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( i_n, ib, dA(did, iii, ii), ldda, A(iii, i), lda, stream[did][0] ); } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } } lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo); } } #ifdef PROFILE_SY2RK magma_setdevice(0); if ( n > nx ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "ssytrd.svg", "trace.css" ); for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); for( kk=0; kk < num_streams; kk++ ) magma_queue_sync(stream[did][kk]); for( kk=0; kk < num_streams; kk++ ) magma_queue_destroy(stream[did][kk]); magma_free(dA[did]); magma_free(dx[did]); magma_free(dy[did]); magma_free(dwork2[did]); } magma_free_pinned(hwork); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_S_MAKE( lwkopt, 0 ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in SLARFG: %.2e seconds\n", times[0] ); //printf( " Time in SSYMV : %.2e seconds\n", mv_time ); printf( " Time in SSYR2K: %.2e seconds\n", up_time ); #endif return *info; } /* magma_ssytrd */
/** Purpose ------- DORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; double *T; magma_dmalloc_pinned(&T, nb*nb); //printf("calling dormqr_m with nb=%d\n", (int) nb); double* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l; ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_dlaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, T, ib, dT, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dormqr */
/** Purpose ======= SSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U^H * D * U, if UPLO = MagmaUpper, or A = L * D * L^H, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_ssysv_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A +(j)*lda + (i)) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaFloat_ptr dA; magmaFloat_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = magma_roundup( n, 32 ); nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_queue_t queues[2]; magma_event_t event; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. // copy matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if ( j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0]); if (k == j+jb) { // magma_event_record( event, queues[0] ); magma_queue_sync( queues[0] ); } } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // copy the matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if (j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) { //magma_event_record( event, queues[0] ); magma_queue_sync(queues[0]); } } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(queues[0]); magma_queue_destroy(queues[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA DOUBLE_PRECISION array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); double *work; int upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_dpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. */ magma_setdevice(0); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //ngpu; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_dpotrf_mgpu */
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T = NULL; magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_zsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_zgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_zmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_zunmqr */
extern "C" magma_int_t magma_scg_merge( magma_s_sparse_matrix A, magma_s_vector b, magma_s_vector *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // GPU workspace magma_s_vector r, d, z; magma_s_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_s_vinit( &d, Magma_DEV, dofs, c_zero, queue ); magma_s_vinit( &z, Magma_DEV, dofs, c_zero, queue ); float *d1, *d2, *skp; d1 = NULL; d2 = NULL; skp = NULL; magma_int_t stat_dev = 0, stat_cpu = 0; stat_dev += magma_smalloc( &d1, dofs*(1) ); stat_dev += magma_smalloc( &d2, dofs*(1) ); // array for the parameters stat_dev += magma_smalloc( &skp, 6 ); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] if( stat_dev != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); printf("error: memory allocation.\n"); return MAGMA_ERR_DEVICE_ALLOC; } // solver variables float alpha, beta, gamma, rho, tmp1, *skp_h; float nom, nom0, r0, betanom, den; // solver setup magma_sscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_scopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_scopy( dofs, b.dval, 1, d.dval, 1 ); // d = b nom0 = betanom = magma_snrm2( dofs, r.dval, 1 ); nom = nom0 * nom0; // nom = r' * r magma_s_spmv( c_one, A, d, c_zero, z, queue ); // z = A d den = MAGMA_S_REAL( magma_sdot(dofs, d.dval, 1, z.dval, 1) ); // den = d'* z solver_par->init_res = nom0; // array on host for the parameters stat_cpu += magma_smalloc_cpu( &skp_h, 6 ); if( stat_cpu != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); printf("error: memory allocation.\n"); return MAGMA_ERR_HOST_ALLOC; } alpha = rho = gamma = tmp1 = c_one; beta = magma_sdot(dofs, r.dval, 1, r.dval, 1); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_S_MAKE(nom, 0.0); magma_ssetvector( 6, skp_h, 1, skp, 1 ); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. (Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { magmablasSetKernelStream(stream[0]); // computes SpMV and dot product magma_scgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue ); // updates x, r, computes scalars and updates d magma_scgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue ); // check stopping criterion (asynchronous copy) magma_sgetvector_async( 1 , skp+1, 1, skp_h+1, 1, stream[1] ); betanom = sqrt(MAGMA_S_REAL(skp_h[1])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_sresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_s_vfree(&r, queue ); magma_s_vfree(&z, queue ); magma_s_vfree(&d, queue ); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_scg_merge */
/***************************************************************************//** Purpose ------- CGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dlA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf *******************************************************************************/ extern "C" magma_int_t magma_cgeqrf2_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dlA[], magma_int_t ldda, magmaFloatComplex *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaFloatComplex *hwork=NULL, *hpanel=NULL; magma_queue_t queues[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); nb = magma_get_cgeqrf_nb( m, n ); /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think clarfb work could be smaller, max(n_local[:]). * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < ngpu; dev++ ) { n_local[dev] = ((n/nb)/ngpu)*nb; if (dev < (n/nb) % ngpu) n_local[dev] += nb; else if (dev == (n/nb) % ngpu) n_local[dev] += n % nb; } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % ngpu; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*ngpu)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( queues[panel_dev][1], panel_event[panel_dev] ); magma_cgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, queues[panel_dev][1] ); magma_queue_sync( queues[panel_dev][1] ); // Factor panel lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); magma_cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_csetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } // TODO: if magma_cpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ magma_cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_csetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, queues[dev][0] ); } la_dev = (panel_dev+1) % ngpu; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*ngpu)*nb; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); magma_event_record( panel_event[dev], queues[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_csetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_cgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows, queues[panel_dev][0] ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for cgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_csetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); return *info; } /* magma_cgeqrf2_mgpu */
/** Purpose ======= DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. The form of the factorization is A = U*D*U^H or A = L*D*L^H where U (or L) is a product of permutation and unit upper (lower) triangular matrices, and D is symmetric and block diagonal with 1-by-1 and 2-by-2 diagonal blocks. This is the blocked version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, the block diagonal matrix D and the multipliers used to obtain the factor U or L (see below for further details). @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If IPIV(k) > 0, then rows and columns k and IPIV(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = MagmaUpper and IPIV(k) = IPIV(k-1) < 0, then rows and columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = MagmaLower and IPIV(k) = IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, D(i,i) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular, and division by zero will occur if it is used to solve a system of equations. Further Details =============== If UPLO = MagmaUpper, then A = U*D*U', where U = P(n)*U(n)* ... *P(k)U(k)* ..., i.e., U is a product of terms P(k)*U(k), where k decreases from n to 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and U(k) is a unit upper triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I v 0 ) k-s U(k) = ( 0 I 0 ) s ( 0 0 I ) n-k k-s s n-k If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k). If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k), and A(k,k), and v overwrites A(1:k-2,k-1:k). If UPLO = MagmaLower, then A = L*D*L', where L = P(1)*L(1)* ... *P(k)*L(k)* ..., i.e., L is a product of terms P(k)*L(k), where k increases from 1 to n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and L(k) is a unit lower triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I 0 0 ) k-1 L(k) = ( 0 I 0 ) s ( 0 v I ) n-k-s+1 k-1 s n-k-s+1 If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k). If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k), and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1). @ingroup magma_dsysv_comp ********************************************************************/ extern "C" magma_int_t magma_dsytrf( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) /* .. Local Scalars .. */ magma_int_t nb = magma_get_dsytrf_nb(n); magma_int_t iinfo = 0, nk, kb, j, k; /* Test the input parameters. */ *info = 0; bool upper = (uplo == MagmaUpper); if ( ! upper && uplo != MagmaLower ) { *info = -1; } else if ( n < 0 ) { *info = -2; } else if ( lda < max( 1, n ) ) { *info = -4; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // TODO fix memory leak of dA if dW fails magma_int_t ldda = magma_roundup( n, 32 ); magmaDouble_ptr dA, dW; if ((MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) || (MAGMA_SUCCESS != magma_dmalloc( &dW, (1+nb)*ldda ))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t queues[2]; magma_event_t event[2]; magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event[0] ); magma_event_create( &event[1] ); trace_init( 1, 1, 2, queues ); /* copy matrix to GPU */ trace_gpu_start( 0, 0, "set", "setA" ); //magma_dsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, queues[0] ); if ( upper ) { for (k = 0; k < n; k += nb ) { kb = min(nb, n-k); magma_dsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, queues[0] ); } } else { for (k = 0; k < n; k += nb ) { kb = min(nb, n-k); magma_dsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, queues[0] ); } } trace_gpu_end( 0, 0 ); if ( upper ) { /* Factorize A as U*D*U' using the upper triangle of A K is the main loop index, decreasing from N to 1 in steps of KB, where KB is the number of columns factorized by DLASYF; KB is either NB or NB-1, or K for the last block */ kb = min(n,nb); for (k = n-1; k >= 0; k -= kb ) { nk = k+1; kb = min(nb, nk); if ( k+1 > nb ) { /* Factorize columns k-kb+1:k of A and use blocked code to update columns 1:k-kb */ magma_dlasyf_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda, &ipiv[0], dW, ldda, queues, event, &iinfo ); } else { /* Use unblocked code to factorize columns 1:k of A */ magma_queue_sync( queues[0] ); magma_dgetmatrix( nk, nk, dA( 0, 0 ), ldda, A( 0, 0 ), lda, queues[0] ); lapackf77_dsytf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo ); kb = k+1; } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo; } } else { /* Factorize A as L*D*L' using the lower triangle of A K is the main loop index, increasing from 1 to N in steps of KB, where KB is the number of columns factorized by DLASYF; KB is either NB or NB-1, or N-K+1 for the last block */ for (k = 0; k < n; k += kb ) { nk = n-k; kb = min(nb, n - k); if ( k < n-nb ) { /* Factorize columns k:k+kb-1 of A and use blocked code to update columns k+kb:n */ magma_dlasyf_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda, &ipiv[k], dW, ldda, queues, event, &iinfo ); } else { /* Use unblocked code to factorize columns k:n of A */ magma_queue_sync( queues[0] ); magma_dgetmatrix( nk, nk, dA(k,k), ldda, A(k,k), lda, queues[0] ); lapackf77_dsytf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo ); } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo + k; /* Adjust IPIV */ for (j = k; j < k + kb; j ++) { if ( ipiv[j] > 0 ) { ipiv[j] = ipiv[j] + k; } else { ipiv[j] = ipiv[j] - k; } } } } trace_finalize( "dsytrf.svg", "trace.css" ); magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_event_destroy( event[0] ); magma_event_destroy( event[1] ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); magma_free( dW ); return *info; } /* End of DSYTRF */
/** Purpose ======= ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. The form of the factorization is A = U*D*U**H or A = L*D*L**H where U (or L) is a product of permutation and unit upper (lower) triangular matrices, and D is Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks. This is the blocked version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX*16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, the block diagonal matrix D and the multipliers used to obtain the factor U or L (see below for further details). @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] IPIV INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If IPIV(k) > 0, then rows and columns k and IPIV(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = 'U' and IPIV(k) = IPIV(k-1) < 0, then rows and columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = 'L' and IPIV(k) = IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, D(i,i) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular, and division by zero will occur if it is used to solve a system of equations. Further Details =============== If UPLO = 'U', then A = U*D*U', where U = P(n)*U(n)* ... *P(k)U(k)* ..., i.e., U is a product of terms P(k)*U(k), where k decreases from n to 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and U(k) is a unit upper triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I v 0 ) k-s U(k) = ( 0 I 0 ) s ( 0 0 I ) n-k k-s s n-k If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k). If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k), and A(k,k), and v overwrites A(1:k-2,k-1:k). If UPLO = 'L', then A = L*D*L', where L = P(1)*L(1)* ... *P(k)*L(k)* ..., i.e., L is a product of terms P(k)*L(k), where k increases from 1 to n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and L(k) is a unit lower triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I 0 0 ) k-1 L(k) = ( 0 I 0 ) s ( 0 v I ) n-k-s+1 k-1 s n-k-s+1 If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k). If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k), and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1). @ingroup magma_zhetrf_comp ********************************************************************/ extern "C" magma_int_t magma_zhetrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) /* .. Local Scalars .. */ magma_int_t upper; magma_int_t nb = magma_get_zhetrf_nb(n); magma_int_t iinfo = 0, nk, kb; /* .. Executable Statements .. */ /* Test the input parameters. */ *info = 0; upper = (uplo == MagmaUpper); if ( !upper && uplo != MagmaLower ) { *info = -1; } else if ( n < 0 ) { *info = -2; } else if ( lda < max( 1, n ) ) { *info = -4; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } magma_int_t ldda = 32*((n+31)/32); magmaDoubleComplex *dA, *dW; if ((MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda )) || (MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda ))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magma_event_create( &event[1] ); trace_init( 1, 1, 2, (CUstream_st**)stream ); /* copy matrix to GPU */ trace_gpu_start( 0, 0, "set", "setA" ); //magma_zsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, stream[0] ); if ( upper ) { for (int k = 0; k < n; k+=nb ) { kb = min(nb, n-k); magma_zsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, stream[0] ); } } else { for (int k = 0; k < n; k+=nb ) { kb = min(nb, n-k); magma_zsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, stream[0] ); } } trace_gpu_end( 0, 0 ); if ( upper ) { /* Factorize A as U*D*U' using the upper triangle of A K is the main loop index, decreasing from N to 1 in steps of KB, where KB is the number of columns factorized by ZLAHEF; KB is either NB or NB-1, or K for the last block */ kb = min(n,nb); for (int k = n-1; k >= 0; k-=kb ) { nk = k+1; kb = min(nb, nk); if ( k+1 > nb ) { /* Factorize columns k-kb+1:k of A and use blocked code to update columns 1:k-kb */ magma_zlahef_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda, &ipiv[0], dW, ldda, stream, event, &iinfo ); } else { /* Use unblocked code to factorize columns 1:k of A */ magma_queue_sync( stream[0] ); magma_zgetmatrix( nk, nk, dA( 0, 0 ),ldda, A( 0, 0 ),lda ); lapackf77_zhetf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo ); kb = k+1; } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo; } } else { /* Factorize A as L*D*L' using the lower triangle of A K is the main loop index, increasing from 1 to N in steps of KB, where KB is the number of columns factorized by ZLAHEF; KB is either NB or NB-1, or N-K+1 for the last block */ for (int k = 0; k < n; k += kb ) { nk = n-k; kb = min(nb, n - k); if ( k < n-nb ) { /* Factorize columns k:k+kb-1 of A and use blocked code to update columns k+kb:n */ magma_zlahef_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda, &ipiv[k], dW, ldda, stream, event, &iinfo ); } else { /* Use unblocked code to factorize columns k:n of A */ magma_queue_sync( stream[0] ); magma_zgetmatrix( nk,nk, dA(k,k),ldda, A(k,k),lda ); lapackf77_zhetf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo ); } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo + k; /* Adjust IPIV */ for (int j = k; j < k + kb; j ++) { if ( ipiv[j] > 0 ) { ipiv[j] = ipiv[j] + k; } else { ipiv[j] = ipiv[j] - k; } } } } trace_finalize( "zhetrf.svg","trace.css" ); magma_queue_sync( stream[0] ); magma_queue_sync( stream[1] ); magmablasSetKernelStream( NULL ); magma_event_destroy( event[0] ); magma_event_destroy( event[1] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); magma_free( dW ); return *info; /* End of ZHETRF */ }
/** Purpose ------- DLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e DOUBLE_PRECISION array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_aux ********************************************************************/ extern "C" double magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, double *A, magma_int_t lda, double *e, double *tau, double *W, magma_int_t ldw, double **dA, magma_int_t ldda, magma_int_t offset, double **dW, magma_int_t lddw, double *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, double *dx[MagmaMaxGPUs], double *dy[MagmaMaxGPUs], double *work, magma_queue_t stream[][10], double *times) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(id, i, j) (dA[(id)] + ((j)+loffset)*ldda + (i) + offset) #define dW(id, i, j) (dW[(id)] + (j) *lddw + (i)) #define dW1(id, i, j) (dW[(id)] + ((j)+nb) *lddw + (i)) double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_DSYMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *dx2[MagmaMaxGPUs]; double *f; magma_dmalloc_cpu( &f, n ); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ double wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_DSYMV_MGPU magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if ( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n + 1; lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if ( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); double wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_DSYMV_MGPU magma_setdevice(idw); magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_DSYMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if ( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i != 0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destory( start ); magma_event_destory( stop ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) magma_queue_sync(stream[id][1]); } magma_free_cpu(f); return mv_time; } /* magma_dlatrd_mgpu */
extern "C" magma_int_t magma_cpotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may not fit entirely in the GPU memory. The factorization has the form A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; char uplo_[2] = {uplo, 0}; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; #ifdef ROW_MAJOR_PROFILE magma_timestr_t start, end, start0, end0; float chol_time = 1.0; #endif *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb(n); if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if( NB >= n ) { #ifdef CHECK_CPOTRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CPOTRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_CPOTRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); fflush(stdout); #endif for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE start0 = get_current_time(); #endif if (nb <= 1 || nb >= n) { lapackf77_cpotrf(uplo_, &n, a, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. * * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo); #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif /* update with the previous big-panels */ for( j=0; j<J; j+=nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_cgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_cherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if( n > J+JB ) { for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal (and diagonal!!!) big panel */ magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo); //magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif for( j=0; j<J; j+=nb ) { /* upload the diagonal of big panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj<JB; jj+=nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_cgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_cherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if( n > J+JB ) { /* off-diagonal */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal big panel */ magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ #ifdef ROW_MAJOR_PROFILE end0 = get_current_time(); #endif if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { if( stream[d][j] != NULL ) magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j<5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE printf("\n n=%d NB=%d nb=%d\n",n,NB,nb); printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, GetTimerValue(start0, end0), FLOPS_CPOTRF(n)/(1000000*GetTimerValue(start0, end0))); printf(" Performance %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, chol_time, FLOPS_CPOTRF(n)/(1000000*chol_time)); #endif return *info; } /* magma_cpotrf_ooc */
extern "C" magma_int_t magma_dormqr_m(magma_int_t nrgpu, char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, double *a, magma_int_t lda, double *tau, double *c, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DORMQR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. C (input/output) DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ double c_one = MAGMA_D_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nb = 128; double *t ; magma_dmalloc_pinned (&t, nb*nb); //printf("calling dormqr_m with nb=%d\n", (int) nb); double* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; magma_int_t left = lapackf77_lsame(side_, "L"); magma_int_t notran = lapackf77_lsame(trans_, "N"); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw)) { magma_xerbla( __func__, -(*info) ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i){ magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // Put 0s in the upper triangular part of dA; magmablas_dsetdiag1subdiag0_stream('L', kb, kb, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0]); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], t, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, t, kb, dt(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i){ magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); magma_xerbla( __func__, 1 ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); // 1) copy the panel from A to the GPU, and // 2) Put 0s in the upper triangular part of dA; magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, t, ib, dt, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dt, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(NULL); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_dormqr */
extern "C" magma_int_t magma_cpotrf_mgpu(magma_int_t num_gpus, char uplo, magma_int_t n, magmaFloatComplex **d_lA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, nb, d, lddp, h; char uplo_[2] = {uplo, 0}; magmaFloatComplex *work; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_cpotrf_nb(n); if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*num_gpus)); if( n%(nb*num_gpus) != 0 ) lddp+=min(nb,n-num_gpus*lddp); if( ldda < lddp ) *info = -4; } else if( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. */ magma_setdevice(0); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dwork[d], num_gpus*nb*lddp )) { for( j=0; j<d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //num_gpus; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with two streams */ //magma_cpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, // h, stream, event, info); /* with three streams */ magma_cpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with two streams */ //magma_cpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, // h, stream, event, info); /* with three streams */ magma_cpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } magmablasSetKernelStream(NULL); for( j=0; j<5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_setdevice(0); magma_free_pinned( work ); } /* end of not lapack */ return *info; } /* magma_cpotrf_mgpu */
magma_int_t magma_dcg_merge( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->info = 0; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // GPU workspace magma_d_vector r, d, z; magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &d, Magma_DEV, dofs, c_zero ); magma_d_vinit( &z, Magma_DEV, dofs, c_zero ); double *d1, *d2, *skp; magma_dmalloc( &d1, dofs*(1) ); magma_dmalloc( &d2, dofs*(1) ); // array for the parameters magma_dmalloc( &skp, 6 ); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] // solver variables double alpha, beta, gamma, rho, tmp1, *skp_h; double nom, nom0, r0, betanom, den; // solver setup magma_dscal( dofs, c_zero, x->val, 1) ; // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b magma_dcopy( dofs, b.val, 1, d.val, 1 ); // d = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); nom = nom0 * nom0; // nom = r' * r magma_d_spmv( c_one, A, d, c_zero, z ); // z = A d den = MAGMA_D_REAL( magma_ddot(dofs, d.val, 1, z.val, 1) ); // den = d'* z solver_par->init_res = nom0; // array on host for the parameters magma_dmalloc_cpu( &skp_h, 6 ); alpha = rho = gamma = tmp1 = c_one; beta = magma_ddot(dofs, r.val, 1, r.val, 1); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); magma_dsetvector( 6, skp_h, 1, skp, 1 ); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. (Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magmablasSetKernelStream(stream[0]); // computes SpMV and dot product magma_dcgmerge_spmv1( A, d1, d2, d.val, z.val, skp ); // updates x, r, computes scalars and updates d magma_dcgmerge_xrbeta( dofs, d1, d2, x->val, r.val, d.val, z.val, skp ); // check stopping criterion (asynchronous copy) magma_dgetvector_async( 1 , skp+1, 1, skp_h+1, 1, stream[1] ); betanom = sqrt(MAGMA_D_REAL(skp_h[1])); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&r); magma_d_vfree(&z); magma_d_vfree(&d); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); return MAGMA_SUCCESS; } /* magma_dcg_merge */
/** Purpose ------- ZHETRD reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] nqueue INTEGER The number of GPU streams used for update. 10 >= nqueue > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d COMPLEX_16 array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e COMPLEX_16 array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_zhetrd_nb(). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_comp ********************************************************************/ extern "C" magma_int_t magma_zhetrd_mgpu( magma_int_t ngpu, magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dW[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nlocal, ldda; magma_int_t nb = magma_get_zhetrd_nb(n), ib, ib2; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const double d_one = MAGMA_D_ONE; #ifdef PROFILE_SY2RK double mv_time = 0.0; double up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i, ii, iii, j, dev, i_n; magma_int_t iinfo; magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork; magma_int_t lquery; // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails. magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } }; magma_queue_t queues0[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex *hwork = NULL; magmaDoubleComplex_ptr dwork2[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex_ptr dA[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex_ptr dW[MagmaMaxGPUs] = { NULL }; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( nqueue > 2 ) { *info = 2; // TODO fix } /* Determine the block size. */ ldwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK double times[11] = { 0 }; magma_event_t start, stop; float etime; magma_setdevice( 0 ); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = roundup( lda, 32 ); lddw = ldda; nlocal = nb*(1 + n/(nb*ngpu)); ldwork2 = ldda*( ((n - 1)/nb + 1) + 1); // i.e., ldda*(blocks + 1) for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_zmalloc( &dA[dev], nlocal*ldda + 3*lddw*nb ) || MAGMA_SUCCESS != magma_zmalloc( &dwork2[dev], ldwork2 ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dW[dev] = dA[dev] + nlocal*ldda; for( kk=0; kk < nqueue; kk++ ) { magma_queue_create( &queues[dev][kk] ); } queues0[dev] = queues[dev][0]; } lhwork = nqueue*ngpu*n; if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &hwork, lhwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // crossover point: use CPU code for last nx columns //if (n < 2048) // nx = n; //else // nx = 512; nx = min( 128, n ); // nx <= n is required if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } magma_zlatrd_mgpu( ngpu, uplo, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dW, i+ib, hwork, lhwork, dwork2, ldwork2, queues0 ); magma_zher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dW, i+ib, 0, d_one, dA, ldda, 0, nqueue, queues); /* get the next panel */ if (i-nb >= nx ) { ib2 = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*ngpu)); dev = ((i-nb)/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( (i-nb)+ib2, ib2, dA(dev, 0, ii), ldda, A(0, i-nb), lda, queues[dev][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_Z_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_Z_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( nx, ib, dA(dev, 0, ii), ldda, A(0, i), lda, queues[dev][0] ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* Use CPU code to reduce the last or only block */ lapackf77_zhetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau, work, &lwork, &iinfo ); } } else { trace_init( 1, ngpu, nqueue, (CUstream_st**)queues ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice( dev ); trace_gpu_start( dev, 0, "comm", "get" ); magma_zgetmatrix_async( n-i, ib, dA(dev, i, ii), ldda, A(i,i), lda, queues[dev][0] ); trace_gpu_end( dev, 0 ); magma_queue_sync( queues[dev][0] ); magma_setdevice( 0 ); } magma_zlatrd_mgpu( ngpu, uplo, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dW, n-i, hwork, lhwork, dwork2, ldwork2, queues0 ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( i > 0 ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_record( start, 0 ); #endif magma_zher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dW, n-i, ib, d_one, dA, ldda, i+ib, nqueue, queues); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); magma_event_record( stop, 0 ); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_Z_MAKE( e[j], 0 ); } d[j] = MAGMA_Z_REAL( *A(j, j) ); } } /* for i=... */ /* Use CPU code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( i_n, ib, dA(dev, iii, ii), ldda, A(iii, i), lda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } } lapackf77_zhetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_sync( queues[dev][kk] ); } } #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( n > nx ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "zhetrd.svg", "trace.css" ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in ZLARFG: %.2e seconds\n", times[0] ); //printf( " Time in ZHEMV : %.2e seconds\n", mv_time ); printf( " Time in ZHER2K: %.2e seconds\n", up_time ); #endif CLEANUP: for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_destroy( queues[dev][kk] ); } magma_free( dA[dev] ); magma_free( dwork2[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); return *info; } /* magma_zhetrd */
extern "C" magma_int_t magma_cgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex **dlA, magma_int_t ldda, magmaFloatComplex *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CGEQRF2_MGPU computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaFloatComplex *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_cgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think clarfb work could be smaller, max(n_local[:]). * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j==i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_cgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_csetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if cpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_csetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_csetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_cgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for cgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_csetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_cgeqrf2_mgpu */
magma_int_t magma_dbicgstab_merge2( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE2; solver_par->numiter = 0; solver_par->info = 0; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // workspace magma_d_vector q, r,rr,p,v,s,t; double *d1, *d2, *skp; magma_dmalloc( &d1, dofs*(2) ); magma_dmalloc( &d2, dofs*(2) ); // array for the parameters magma_dmalloc( &skp, 8 ); // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] magma_d_vinit( &q, Magma_DEV, dofs*6, c_zero ); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.val = NULL; rr.num_rows = rr.nnz = dofs; r.memory_location = Magma_DEV; r.val = NULL; r.num_rows = r.nnz = dofs; p.memory_location = Magma_DEV; p.val = NULL; p.num_rows = p.nnz = dofs; v.memory_location = Magma_DEV; v.val = NULL; v.num_rows = v.nnz = dofs; s.memory_location = Magma_DEV; s.val = NULL; s.num_rows = s.nnz = dofs; t.memory_location = Magma_DEV; t.val = NULL; t.num_rows = t.nnz = dofs; rr.val = q(0); r.val = q(1); p.val = q(2); v.val = q(3); s.val = q(4); t.val = q(5); // solver variables double alpha, beta, omega, rho_old, rho_new, *skp_h; double nom, nom0, betanom, r0, den; // solver setup magma_dscal( dofs, c_zero, x->val, 1) ; // x = 0 magma_dcopy( dofs, b.val, 1, q(0), 1 ); // rr = b magma_dcopy( dofs, b.val, 1, q(1), 1 ); // r = b rho_new = magma_ddot( dofs, r.val, 1, r.val, 1 ); // rho=<rr,r> nom = MAGMA_D_REAL(magma_ddot( dofs, r.val, 1, r.val, 1 )); nom0 = betanom = sqrt(nom); // nom = || r || rho_old = omega = alpha = MAGMA_D_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters magma_dmalloc_cpu( &skp_h, 8 ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); magma_dsetvector( 8, skp_h, 1, skp, 1 ); magma_d_spmv( c_one, A, r, c_zero, v ); // z = A r den = MAGMA_D_REAL( magma_ddot(dofs, v.val, 1, r.val, 1) );// den = z dot r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. (Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magmablasSetKernelStream(stream[0]); // computes p=r+beta*(p-omega*v) magma_dbicgmerge1( dofs, skp, v.val, r.val, p.val ); magma_dbicgmerge_spmv1( A, d1, d2, q(2), q(0), q(3), skp ); magma_dbicgmerge2( dofs, skp, r.val, v.val, s.val ); // s=r-alpha*v magma_dbicgmerge_spmv2( A, d1, d2, q(4), q(5), skp); magma_dbicgmerge_xrbeta( dofs, d1, d2, q(0), q(1), q(2), q(4), q(5), x->val, skp); // check stopping criterion (asynchronous copy) magma_dgetvector_async( 1 , skp+5, 1, skp_h+5, 1, stream[1] ); betanom = sqrt(MAGMA_D_REAL(skp_h[5])); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&q); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); return MAGMA_SUCCESS; } /* dbicgstab_merge2 */
extern "C" magma_int_t magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U\*\*H * D * U, if UPLO = 'U', or A = L * D * L\*\*H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U\*\*H*U or A = L*L\*\*H. Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ double zone = MAGMA_D_ONE; double mzone = MAGMA_D_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaDouble_ptr dA; magmaDouble_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = ((n+31)/32)*32; nb = magma_get_dsytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, (CUstream_st**)stream ); //if (nb <= 1 || nb >= n) //{ // lapackf77_dpotrf(uplo_, &n, a, &lda, info); //} else { /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. // copy matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // copy the matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } } trace_finalize( "dsytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_dsytrf_nopiv */
/** Purpose ------- SPOTRF_OOC computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may exceed the GPU memory. The factorization has the form A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf_m(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(d, i, j) (dwork[(d)] + (j)*lddla + (i)) #define dT(d, i, j) ( dt[(d)] + (j)*ldda + (i)) #define dAup(d, i, j) (dwork[(d)] + (j)*NB + (i)) #define dTup(d, i, j) ( dt[(d)] + (j)*nb + (i)) /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); int upper = (uplo == MagmaUpper); float *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus0 = num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; magma_timer_t time_total=0, time_sum=0, time=0; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dpotrf_nb(n); if ( num_gpus0 > n/nb ) { num_gpus = n/nb; if ( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if ( NB >= n ) { #ifdef CHECK_SPOTRF_OOC printf( " * still fits in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_SPOTRF_OOC printf( " * doesn't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_SPOTRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (float)freeMem ); fflush(stdout); #endif for (d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_smalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); timer_start( time_total ); if (nb <= 1 || nb >= n) { lapackf77_spotrf(uplo_, &n, A, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. * * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if ( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_shtodpo( num_gpus, uplo, JB, n, J, J, nb, A, lda, dwork, NB, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_ssetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_ssetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj < JB; jj += nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_sgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if ( n > J+JB ) { for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if ( d < (JB/nb)%num_gpus ) J2 += nb; n_local[d] -= J2; magma_sgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_spotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, A, lda, h, stream, event, &iinfo); // using three streams magma_spotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal (and diagonal!!!) big panel */ magma_sdtohpo(num_gpus, uplo, JB, n, J, J, nb, NB, A, lda, dwork, NB, stream, &iinfo); //magma_sdtohpo(num_gpus, uplo, JB, n, J, J, nb, 0, A, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if ( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_shtodpo( num_gpus, uplo, n, JB, J, J, nb, A, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of big panel */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_ssetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_ssetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj < JB; jj += nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_sgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if ( n > J+JB ) { /* off-diagonal */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if ( d < (JB/nb)%num_gpus ) J2 += nb; n_local[d] -= J2; magma_sgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_spotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, A, lda, h, stream, event, &iinfo); // using three streams magma_spotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal big panel */ magma_sdtohpo( num_gpus, uplo, n, JB, J, J, nb, JB, A, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ timer_stop( time_total ); if ( num_gpus0 > n/nb ) { num_gpus = n/nb; if ( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j < 5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); timer_printf( "\n n=%d NB=%d nb=%d\n", (int) n, (int) NB, (int) nb ); timer_printf( " Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_SPOTRF(n) / 1e9, time_total, FLOPS_SPOTRF(n) / 1e9 / time_total ); timer_printf( " Performance %f / %f = %f GFlop/s\n", FLOPS_SPOTRF(n) / 1e9, time_sum, FLOPS_SPOTRF(n) / 1e9 / time_sum ); return *info; } /* magma_spotrf_ooc */
/** Purpose ------- ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_zgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((double)NB/nb) ) { ngpu = (int)ceil((double)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_ZGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_ZGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_ZGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_zmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((double)N/nb) ) { ngpu = (int)ceil((double)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_zsetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_zsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ztranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_zlaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_zsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ztranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_zgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_zgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_zgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_ZGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_zgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_zgetrf_m */
extern "C" magma_int_t magma_cbicgstab_merge( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // workspace magma_c_vector q, r,rr,p,v,s,t; magmaFloatComplex *d1, *d2, *skp; d1 = NULL; d2 = NULL; skp = NULL; magma_int_t stat_dev = 0, stat_cpu = 0; stat_dev += magma_cmalloc( &d1, dofs*(2) ); stat_dev += magma_cmalloc( &d2, dofs*(2) ); // array for the parameters stat_dev += magma_cmalloc( &skp, 8 ); if( stat_dev != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); printf("error: memory allocation.\n"); return MAGMA_ERR_DEVICE_ALLOC; } // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] magma_c_vinit( &q, Magma_DEV, dofs*6, c_zero, queue ); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new, *skp_h; float nom, nom0, betanom, r0, den; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, q(0), 1 ); // rr = b magma_ccopy( dofs, b.dval, 1, q(1), 1 ); // r = b rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1 ); // rho=<rr,r> nom = MAGMA_C_REAL(magma_cdotc( dofs, r.dval, 1, r.dval, 1 )); nom0 = betanom = sqrt(nom); // nom = || r || rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters stat_cpu = magma_cmalloc_cpu( &skp_h, 8 ); if( stat_cpu != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); printf("error: memory allocation.\n"); return MAGMA_ERR_HOST_ALLOC; } skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_C_MAKE(nom, 0.0); magma_csetvector( 8, skp_h, 1, skp, 1 ); magma_c_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_C_REAL( magma_cdotc(dofs, v.dval, 1, r.dval, 1) );// den = z dot r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. (Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { magmablasSetKernelStream(stream[0]); // computes p=r+beta*(p-omega*v) magma_cbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ); magma_c_spmv( c_one, A, p, c_zero, v, queue ); // v = Ap magma_cmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue ); magma_cbicgmerge4( 1, skp, queue ); magma_cbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue ); // s=r-alpha*v magma_c_spmv( c_one, A, s, c_zero, t, queue ); // t=As magma_cmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue ); magma_cbicgmerge4( 2, skp, queue ); magma_cbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval, s.dval, t.dval, x->dval, skp, queue ); // check stopping criterion (asynchronous copy) magma_cgetvector_async( 1 , skp+5, 1, skp_h+5, 1, stream[1] ); betanom = sqrt(MAGMA_C_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* cbicgstab_merge */
/** Purpose ======= SSYTRF_nopiv_gpu computes the LDLt factorization of a real symmetric matrix A. The factorization has the form A = U^H * D * U , if UPLO = 'U', or A = L * D * L^H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_ssytrf_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Local variables */ float zone = MAGMA_S_ONE; float mzone = MAGMA_S_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, stream ); // CPU workspace float *A; if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaFloat_ptr dW; if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[1], event ); magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[0], event ); magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); magmablasSetKernelStream( orig_stream ); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
extern "C" double magma_zlatrd_mgpu(magma_int_t num_gpus, char uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, magmaDoubleComplex *a, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *w, magma_int_t ldw, magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset, magmaDoubleComplex **dw, magma_int_t lddw, magmaDoubleComplex *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, magmaDoubleComplex *dx[MagmaMaxGPUs], magmaDoubleComplex *dy[MagmaMaxGPUs], magmaDoubleComplex *work, magma_queue_t stream[][10], double *times) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) COMPLEX_16 array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_ZHEMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *dx2[MagmaMaxGPUs]; magmaDoubleComplex *f = (magmaDoubleComplex *)malloc(n*sizeof(magmaDoubleComplex )); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ magmaDoubleComplex wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_ZHEMV_MGPU magma_zsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_zhemv_mgpu(num_gpus, k, 'U', i, nb0, c_one, da, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n + 1; lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_zlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_zhemv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); magmaDoubleComplex wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_ZHEMV_MGPU magma_setdevice(idw); magma_zsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_ZHEMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_zsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_zhemv_mgpu(num_gpus, k, 'L', i_n, nb0, c_one, da, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); lapackf77_zlacgv(&i, A(ip1 ,0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_zhemv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i!=0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destory( start ); magma_event_destory( stop ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1) magma_queue_sync(stream[id][1]); } free(f); return mv_time; } /* zlatrd_ */