/* Turn N-by-N matrix A (leading dimension lda) into a symmetric
   positive definite matrix in place: boost each diagonal entry by N
   (making the matrix diagonally dominant, hence SPD) and mirror the
   strictly lower triangle into the upper triangle, conjugated.
   For this real (float) flavor MAGMA_S_CONJ is the identity. */
void magma_smake_hpd( magma_int_t N, float* A, magma_int_t lda )
{
    for( magma_int_t row = 0; row < N; ++row ) {
        // diagonal: real part + N, imaginary part forced to zero
        A(row,row) = MAGMA_S_MAKE( MAGMA_S_REAL( A(row,row) ) + N, 0. );
        // upper triangle entry (col,row) mirrors lower entry (row,col)
        for( magma_int_t col = 0; col < row; ++col ) {
            A(col,row) = MAGMA_S_CONJ( A(row,col) );
        }
    }
}
extern "C" void magmablas_ssyr2k_vbatched_max_nocheck( magma_uplo_t uplo, magma_trans_t trans, magma_int_t* n, magma_int_t* k, float alpha, float const * const * dA_array, magma_int_t* ldda, float const * const * dB_array, magma_int_t* lddb, float beta, float **dC_array, magma_int_t* lddc, magma_int_t batchCount, magma_int_t max_n, magma_int_t max_k, magma_queue_t queue ) { float cbeta = MAGMA_S_MAKE(beta, 0.); float c_one = MAGMA_S_MAKE(1., 0.); if( trans == MagmaNoTrans){ magmablas_ssyrk_internal_vbatched(uplo, MagmaNoTrans, n, k, alpha, dA_array, ldda, dB_array, lddb, cbeta, dC_array, lddc, max_n, max_k, batchCount, queue ); magmablas_ssyrk_internal_vbatched(uplo, MagmaNoTrans, n, k, MAGMA_S_CONJ(alpha), dB_array, lddb, dA_array, ldda, c_one, dC_array, lddc, max_n, max_k, batchCount, queue ); }else{ magmablas_ssyrk_internal_vbatched(uplo, MagmaTrans, n, k, alpha, dA_array, ldda, dB_array, lddb, cbeta, dC_array, lddc, max_n, max_k, batchCount, queue ); magmablas_ssyrk_internal_vbatched(uplo, MagmaTrans, n, k, MAGMA_S_CONJ(alpha), dB_array, lddb, dA_array, ldda, c_one, dC_array, lddc, max_n, max_k, batchCount, queue ); } }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssymmetrize Code is very similar to testing_stranspose.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; magmaFloat_ptr d_A; magma_int_t N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("%%====================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default size = lda*N; // load strictly lower triangle, save strictly upper triangle gbytes = sizeof(float) * 1.*N*(N-1) / 1e9; TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < N; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); //magmablas_ssymmetrize( opts.uplo, N-2, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_ssymmetrize( opts.uplo, N, d_A, ldda ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using naive in-place algorithm (LAPACK doesn't implement symmetrize) 
=================================================================== */ cpu_time = magma_wtime(); //for( int j = 1; j < N-1; ++j ) { // inset by 1 row & col // for( int i = 1; i < j; ++i ) { for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { if ( opts.uplo == MagmaLower ) { h_A[i + j*lda] = MAGMA_S_CONJ( h_A[j + i*lda] ); } else { h_A[j + i*lda] = MAGMA_S_CONJ( h_A[i + j*lda] ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/**
    Solves A*x = b with the BiConjugate Gradient method (unpreconditioned),
    which requires both A and its (conjugate) transpose AT.

    Fixes vs. previous version: the transpose-construction calls
    (magma_smtransfer / magma_smconvert / magma_smtransposeconjugate)
    previously ignored their error returns; they are now wrapped in
    CHECK(...) like every other fallible call, so a failed transpose
    no longer lets the solver run with an invalid AT. Several stale
    copy/axpy comments were also corrected.

    @param[in]     A           system matrix (device)
    @param[in]     b           right-hand side
    @param[in,out] x           solution approximation; updated in place
    @param[in,out] solver_par  solver parameters; receives residuals/timings
    @param[in]     queue       queue to execute in
    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            or an error code from a failed CHECK'd call.
*/
extern "C" magma_int_t
magma_sbicg(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    float c_zero = MAGMA_S_ZERO;
    float c_one = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // workspace vectors
    magma_s_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR}, pt={Magma_CSR},
                   z={Magma_CSR}, zt={Magma_CSR}, q={Magma_CSR}, y={Magma_CSR},
                   yt={Magma_CSR}, qt={Magma_CSR};

    // need to transpose the matrix
    magma_s_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &qt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &yt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    float alpha, rho, beta, rho_new, ptq;
    float res, nomb, nom0, r0;

    // transpose the matrix: round-trip through CPU/CSR to build AT in
    // A's storage format.  All steps are now CHECK'd (they can fail on
    // allocation or unsupported conversion).
    CHECK( magma_smtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_smconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransposeconjugate( Ah2, &Ah1, queue ));
    magma_smfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_smconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_smfree(&Ah2, queue );

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    res = nom0;
    solver_par->init_res = nom0;
    magma_scopy( dofs, r.dval, 1, rt.dval, 1, queue );               // rt = r
    rho_new = magma_sdot( dofs, rt.dval, 1, r.dval, 1, queue );      // rho = <rt,r>
    rho = alpha = MAGMA_S_MAKE( 1.0, 0. );

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // unpreconditioned: z and zt are plain copies of r and rt
        magma_scopy( dofs, r.dval, 1 , y.dval, 1, queue );           // y = r
        magma_scopy( dofs, y.dval, 1 , z.dval, 1, queue );           // z = y
        magma_scopy( dofs, rt.dval, 1 , yt.dval, 1, queue );         // yt = rt
        magma_scopy( dofs, yt.dval, 1 , zt.dval, 1, queue );         // zt = yt

        rho = rho_new;
        rho_new = magma_sdot( dofs, rt.dval, 1, z.dval, 1, queue );  // rho = <rt,z>
        if( magma_s_isnan_inf( rho_new ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if( solver_par->numiter==1 ){
            magma_scopy( dofs, z.dval, 1 , p.dval, 1, queue );       // p = z
            magma_scopy( dofs, zt.dval, 1 , pt.dval, 1, queue );     // pt = zt
        } else {
            beta = rho_new/rho;
            magma_sscal( dofs, beta, p.dval, 1, queue );                     // p = beta*p
            magma_saxpy( dofs, c_one , z.dval, 1 , p.dval, 1, queue );       // p = z + beta*p
            magma_sscal( dofs, MAGMA_S_CONJ(beta), pt.dval, 1, queue );      // pt = conj(beta)*pt
            magma_saxpy( dofs, c_one , zt.dval, 1 , pt.dval, 1, queue );     // pt = zt + conj(beta)*pt
        }
        CHECK( magma_s_spmv( c_one, A, p, c_zero, q, queue ));       // q  = A  * p
        CHECK( magma_s_spmv( c_one, AT, pt, c_zero, qt, queue ));    // qt = AT * pt
        solver_par->spmv_count++;
        solver_par->spmv_count++;
        ptq = magma_sdot( dofs, pt.dval, 1, q.dval, 1, queue );
        alpha = rho_new /ptq;

        magma_saxpy( dofs, alpha, p.dval, 1 , x->dval, 1, queue );                           // x = x + alpha*p
        magma_saxpy( dofs, c_neg_one * alpha, q.dval, 1 , r.dval, 1, queue );                // r = r - alpha*q
        magma_saxpy( dofs, c_neg_one * MAGMA_S_CONJ(alpha), qt.dval, 1 , rt.dval, 1, queue );// rt = rt - conj(alpha)*qt

        res = magma_snrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // classify outcome from iteration count and residual history
    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&p, queue );
    magma_smfree(&pt, queue );
    magma_smfree(&q, queue );
    magma_smfree(&qt, queue );
    magma_smfree(&y, queue );
    magma_smfree(&yt, queue );
    magma_smfree(&z, queue );
    magma_smfree(&zt, queue );
    magma_smfree(&AT, queue );
    magma_smfree(&Ah1, queue );
    magma_smfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_sbicg */
void magmablas_ssyr2k_mgpu2( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, float alpha, magmaFloat_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaFloat_ptr dB[], magma_int_t lddb, magma_int_t b_offset, float beta, magmaFloat_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda + (a_offset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb + (b_offset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower ) { info = -1; // upper not yet handled } else if ( trans != MagmaNoTrans ) { info = -2; // conj not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == MagmaNoTrans) && ldda < max(1,n)) || ((trans == MagmaTrans) && ldda < max(1,k)) ) { info = -7; } else if ( a_offset < 0 || a_offset > ldda ) { info = -8; } else if ( ((trans == MagmaNoTrans) && lddb < max(1,n)) || ((trans == MagmaTrans) && lddb < max(1,k)) ) { info = -10; } else if ( b_offset < 0 || b_offset > lddb ) { info = -11; } else if ( lddc < max(1,n) ) { info = -13; } else if ( c_offset < 0 || c_offset > lddc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nqueue <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const float c_one = MAGMA_S_ONE; float cbeta = MAGMA_S_MAKE( beta, 0. 
); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t orig_dev; magma_getdevice( &orig_dev ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B**H + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "sgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_sgemm( MagmaNoTrans, MagmaTrans, n-i, ib, k, alpha, dA(idev,i,0), ldda, dB(idev,i,0), lddb, cbeta, dC(idev,ioff,di), lddc, queues[idev][s] ); blockoffset = 0; } // second loop does C = conj(alpha)*B*A**H + C_hat alpha = MAGMA_S_CONJ( alpha ); blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; // C[i:n,i] += conj(alpha) * B[i:n,0] * A[i,0]' //printf( "sgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_sgemm( MagmaNoTrans, MagmaTrans, n-i, ib, k, alpha, dB(idev,i,0), lddb, dA(idev,i,0), ldda, c_one, dC(idev,ioff,di), lddc, queues[idev][s] ); blockoffset = 0; } magma_setdevice( orig_dev ); }