extern "C" magma_int_t magma_zpidr_merge( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_n_one = MAGMA_Z_NEG_ONE; // internal user parameters const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const double angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; double residual; double nrm; double nrmb; double nrmr; double nrmt; double rho; magmaDoubleComplex om; magmaDoubleComplex gamma; magmaDoubleComplex fk; // matrices and vectors magma_z_matrix dxs = {Magma_CSR}; magma_z_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_z_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_z_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_z_matrix dU = {Magma_CSR}; magma_z_matrix dM = {Magma_CSR}, hMdiag = {Magma_CSR}; magma_z_matrix df = {Magma_CSR}; magma_z_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_z_matrix dc = {Magma_CSR}; magma_z_matrix dv = {Magma_CSR}; magma_z_matrix dlu = {Magma_CSR}; magma_z_matrix dskp = {Magma_CSR}, hskp = {Magma_CSR}; magma_z_matrix dalpha = {Magma_CSR}, halpha = {Magma_CSR}; magma_z_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR}; magmaDoubleComplex *d1 = NULL, *d2 = NULL; // chronometry real_Double_t tempo1, tempo2; // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_dznrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_zscal( x->num_rows, MAGMA_Z_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_zvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_zresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_zvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_zlarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_zmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_zmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_zqr(P1), QR factorization CHECK( magma_zqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_dznrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_zdscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_zmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_zmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_zvinit( &hskp, Magma_CPU, 4, 1, c_zero, queue )); CHECK( magma_zvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_zvinit( &halpha, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_zvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_zvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_zvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_zmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_zmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_zmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_zvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_zcopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_zvinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_zvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_zvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_zvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_zvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_zvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_zvinit( &hMdiag, Magma_CPU, s, 1, c_zero, queue )); magmablas_zlaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_zvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_zvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = 0 CHECK( magma_zvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // lu = 0 CHECK( magma_zvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_Z_ONE; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r magma_zgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queue ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; // c(k:s) = M(k:s,k:s) \ f(k:s) magma_zcopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue ); magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue ); // v = r - G(:,k:s) c(k:s) magma_zcopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // U(:,k) = om * v + U(:,k:s) c(k:s) magmablas_zgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue ); magma_zcopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue ); // G(:,k) = A U(:,k) dGcol.dval = dG.dval + k * dG.ld; CHECK( magma_z_spmv( c_one, A, dv, c_zero, dGcol, queue )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) halpha.val[i] = magma_zdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // alpha = alpha / M(i,i) halpha.val[i] = halpha.val[i] / hMdiag.val[i]; // G(:,k) = G(:,k) - alpha * G(:,i) magma_zaxpy( dG.num_rows, -halpha.val[i], &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue ); } // non-first s iteration if ( k > 0 ) { // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_zsetvector( k, halpha.val, 1, dalpha.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queue ); } // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) magma_zgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], &dG.dval[k*dG.ld], d1, d2, &dM.dval[k*dM.ld+k], queue ); magma_zgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag.val[k], 1, queue ); // check M(k,k) == 0 if ( MAGMA_Z_EQUAL(hMdiag.val[k], MAGMA_Z_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) magma_zgetvector( 1, &df.dval[k], 1, &fk, 1, queue ); hbeta.val[k] = fk / hMdiag.val[k]; // check for nan if ( magma_z_isnan( hbeta.val[k] ) || magma_z_isinf( hbeta.val[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) magma_zaxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // |r| nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // x = x + beta * U(:,k) magma_zaxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue ); // smoothing operation //--------------------------------------- // t = rs - r magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_zgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) magma_zaxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue ); } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // update solution approximation x // x = x + U(:,1:s) * beta(1:s) magma_zsetvector( s, hbeta.val, 1, dbeta.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // v = r magma_zcopy( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // t = A v CHECK( magma_z_spmv( c_one, A, dv, c_zero, dt, queue )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queue )); magma_zgetvector( 2, dskp.dval, 1, hskp.val, 1, queue ); // |t| nrmt = magma_dsqrt( MAGMA_Z_REAL(hskp.val[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_Z_REAL(hskp.val[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp.val[1] / hskp.val[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_Z_EQUAL(om, MAGMA_Z_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // update approximation vector // x = x + om * v magma_zaxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queue ); // update residual vector // r = r - om * t magma_zaxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // residual norm nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_zgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_zcopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_zcopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_zresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_zmfree( &dxs, queue ); magma_zmfree( &drs, queue ); magma_zmfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_zmfree( &dr, queue ); magma_zmfree( &dP, queue ); magma_zmfree( &dP1, queue ); magma_zmfree( &dG, queue ); magma_zmfree( &dGcol, queue ); magma_zmfree( &dU, queue ); magma_zmfree( &dM, queue ); magma_zmfree( &hMdiag, queue ); magma_zmfree( &df, queue ); magma_zmfree( &dt, queue ); magma_zmfree( &dc, queue ); magma_zmfree( &dv, queue ); magma_zmfree( &dlu, queue ); magma_zmfree( &dskp, queue ); magma_zmfree( &dalpha, queue ); magma_zmfree( &dbeta, queue ); magma_zmfree( &hskp, queue ); magma_zmfree( &halpha, queue ); magma_zmfree( &hbeta, queue ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_zpidr_merge */ }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, ldda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf("%% M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf("%% M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Xm, X, incx, dX, incx, opts.queue ); magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_zgetvector( Ym, dY, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work ); double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work ); blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); bool okay = (magma_error < tol) && (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (okay ? "ok" : "failed")); #else bool okay = (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (okay ? "ok" : "failed")); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; FILE *fp ; magma_int_t i, lda, Xm, Ym; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t szeA, szeX, szeY; magma_int_t istart = 64; magma_int_t iend = 10240; magma_int_t incx = 1; magma_int_t incy = 1; char trans = MagmaNoTrans; cuDoubleComplex alpha = MAGMA_Z_MAKE(1., 0.); // MAGMA_Z_MAKE( 1.5, -2.3 ); cuDoubleComplex beta = MAGMA_Z_MAKE(0., 0.); // MAGMA_Z_MAKE( -0.6, 0.8 ); cuDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; cuDoubleComplex *dA, *dX, *dY; if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-n", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-m", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if (strcmp("-N", argv[i])==0){ trans = MagmaNoTrans; } else if (strcmp("-T", argv[i])==0){ trans = MagmaTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-C", argv[i])==0){ trans = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) ) iend = istart + 1; M = N = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; szeA = lda*N; szeX = incx*Xm; szeY = incy*Ym; TESTING_MALLOC( A, cuDoubleComplex, szeA ); TESTING_MALLOC( X, cuDoubleComplex, szeX ); TESTING_MALLOC( Y, cuDoubleComplex, szeY ); TESTING_MALLOC( Ycublas, cuDoubleComplex, szeY ); TESTING_MALLOC( Ymagma, cuDoubleComplex, szeY ); TESTING_DEVALLOC( dA, cuDoubleComplex, szeA ); TESTING_DEVALLOC( dX, cuDoubleComplex, szeX ); TESTING_DEVALLOC( dY, cuDoubleComplex, szeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &szeA, A ); lapackf77_zlarnv( &ione, ISEED, &szeX, X ); lapackf77_zlarnv( &ione, ISEED, &szeY, Y ); fp = fopen ("results_zgemv.txt", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("\nUsage: \n"); printf(" testing_zgemv [-N|T|C] [-m %d] [-n %d]\n\n", 1024, 1024); printf( " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); for( i=istart; i < iend; i = (int)((i+1)*1.1) ) { M = N = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; flops = FLOPS( (double)M, (double)N ) / 1000000; printf( "%5d %5d ", (int) M, (int) N ); fprintf( fp, "%5d %5d ", (int) M, (int) N ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); /* * Cublas Version */ start = get_current_time(); cublasZgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incy, Ycublas, incy ); cuda_perf = flops / GetTimerValue(start, end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f", cuda_perf ); /* * Magma Version */ magma_zsetvector( Ym, Y, incy, dY, incy ); start = get_current_time(); magmablas_zgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incx, Ymagma, incx ); magma_perf = flops / GetTimerValue(start, end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f", magma_perf ); /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); #if 0 printf( "\t\t %8.6e", error / (double)Ym ); fprintf( fp, "\t\t %8.6e", error / (double)Ym ); /* * Blas comparaison */ { char *blastrans = MagmaNoTransStr; if ( trans == MagmaConjTrans ) blastrans = MagmaConjTransStr; else if ( trans == MagmaTrans ) blastrans = MagmaTransStr; blasf77_zcopy( &Ym, Y, &incy, Ycublas, &incy ); blasf77_zgemv( blastrans, &M, &N, &alpha, A, &lda, X, &incx, &beta, Ycublas, &incy ); blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); } #endif printf( "\t\t %8.6e\n", error / (double)Ym ); fprintf( fp, "\t\t %8.6e\n", error / (double)Ym ); } /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); /* Free device */ TESTING_CUDA_FINALIZE(); return EXIT_SUCCESS; }
extern "C" magma_int_t magma_z_spmv( magmaDoubleComplex alpha, magma_z_sparse_matrix A, magma_z_vector x, magmaDoubleComplex beta, magma_z_vector y, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); if ( A.memory_location != x.memory_location || x.memory_location != y.memory_location ) { printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_INVALID_PTR; } // DEV case if ( A.memory_location == Magma_DEV ) { if ( A.num_cols == x.num_rows && x.num_cols == 1 ) { if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { //printf("using CSR kernel for SpMV: "); //magma_zgecsrmv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, // A.dval, A.drow, A.dcol, x.dval, beta, y.dval ); //printf("done.\n"); cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); cusparseZcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, &beta, y.dval ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); } else if ( A.storage_type == Magma_ELL ) { //printf("using ELLPACKT kernel for SpMV: "); magma_zgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLPACKT ) { //printf("using ELL kernel for SpMV: "); magma_zgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLRT ) { //printf("using ELLRT kernel for SpMV: "); magma_zgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, A.alignment, A.blocksize, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_SELLP ) { //printf("using SELLP kernel for SpMV: "); magma_zgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_zgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); } else if ( A.storage_type == Magma_SPMVFUNCTION ) { //printf("using DENSE kernel for SpMV: "); magma_zcustomspmv( alpha, x, beta, y, queue ); //printf("done.\n"); } /* else if ( A.storage_type == Magma_BCSR ) { //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); // end CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = (A.num_rows + A.blocksize-1)/A.blocksize; int nb = (A.num_cols + A.blocksize-1)/A.blocksize; cusparseZbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval, &beta, y.dval ); //printf("done.\n"); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; }*/ else { printf("error: format not supported.\n"); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_NOT_SUPPORTED; } } else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) { magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols; if ( A.storage_type == Magma_CSR ) { cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); if ( x.major == MagmaColMajor) { cusparseZcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } else if ( x.major == MagmaRowMajor) { cusparseZcsrmm2(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); } else if ( A.storage_type == Magma_ELL ) { if ( x.major == MagmaColMajor) { magma_zmgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); } else if ( x.major == MagmaRowMajor) { // transpose first to col major magma_z_vector x2; magma_zvtranspose( x, &x2, queue ); magma_zmgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); magma_z_vfree(&x2, queue ); } } else if ( A.storage_type == Magma_ELLPACKT ) { if ( x.major == MagmaColMajor) { magma_zmgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); } else if ( x.major == MagmaRowMajor) { // transpose first to col major magma_z_vector x2; magma_zvtranspose( x, &x2, queue ); magma_zmgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue ); magma_z_vfree(&x2, queue ); } } else if ( A.storage_type == Magma_SELLP ) { if ( x.major == MagmaRowMajor) { magma_zmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue ); } else if ( x.major == MagmaColMajor) { // transpose first to row major magma_z_vector x2; magma_zvtranspose( x, &x2, queue ); magma_zmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue ); magma_z_vfree(&x2, queue ); } }/* if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_zmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; }*/ else { printf("error: format not supported.\n"); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else { printf("error: CPU not yet supported.\n"); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_NOT_SUPPORTED; } magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; }
magma_int_t magma_z_spmv( magmaDoubleComplex alpha, magma_z_sparse_matrix A, magma_z_vector x, magmaDoubleComplex beta, magma_z_vector y ) { if( A.memory_location != x.memory_location || x.memory_location != y.memory_location ){ printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); return MAGMA_ERR_INVALID_PTR; } // DEV case if( A.memory_location == Magma_DEV ){ if( A.num_cols == x.num_rows ){ if( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ){ //printf("using CSR kernel for SpMV: "); //magma_zgecsrmv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, // A.val, A.row, A.col, x.val, beta, y.val ); //printf("done.\n"); cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); cusparseZcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.val, A.row, A.col, x.val, &beta, y.val ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_ELLPACK ){ //printf("using ELLPACK kernel for SpMV: "); magma_zgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.val, A.col, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_ELL ){ //printf("using ELL kernel for SpMV: "); magma_zgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.val, A.col, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_ELLRT ){ //printf("using ELLRT kernel for SpMV: "); magma_zgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.val, A.col, A.row, x.val, beta, y.val, A.alignment, A.blocksize ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_SELLC ){ //printf("using SELLC kernel for SpMV: "); magma_zgesellcmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.val, A.col, A.row, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_SELLP ){ //printf("using SELLP kernel for SpMV: "); magma_zgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.val, A.col, A.row, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_DENSE ){ //printf("using DENSE kernel for SpMV: "); magmablas_zgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.val, A.num_rows, x.val, 1, beta, y.val, 1 ); //printf("done.\n"); return MAGMA_SUCCESS; } /* else if( A.storage_type == Magma_BCSR ){ //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); // end CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = (A.num_rows + A.blocksize-1)/A.blocksize; int nb = (A.num_cols + A.blocksize-1)/A.blocksize; cusparseZbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.val, A.row, A.col, A.blocksize, x.val, &beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; }*/ else { printf("error: format not supported.\n"); return MAGMA_ERR_NOT_SUPPORTED; } } else if( A.num_cols < x.num_rows ){ magma_int_t num_vecs = x.num_rows / A.num_cols; if( A.storage_type == Magma_CSR ){ //printf("using CSR kernel for SpMV: "); magma_zmgecsrmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.val, A.row, A.col, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_ELLPACK ){ //printf("using ELLPACK kernel for SpMV: "); magma_zmgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.val, A.col, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; } else if( A.storage_type == Magma_ELL ){ //printf("using ELL kernel for SpMV: "); magma_zmgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.max_nnz_row, alpha, A.val, A.col, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; }else if( A.storage_type == Magma_SELLP ){ //printf("using SELLP kernel for SpMV: "); magma_zmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.val, A.col, A.row, x.val, beta, y.val ); //printf("done.\n"); return MAGMA_SUCCESS; }/* if( A.storage_type == Magma_DENSE ){ //printf("using DENSE kernel for SpMV: "); magmablas_zmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.val, A.num_rows, x.val, 1, beta, y.val, 1 ); //printf("done.\n"); return MAGMA_SUCCESS; }*/ else { printf("error: format not supported.\n"); return MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else{ printf("error: CPU not yet supported.\n"); return MAGMA_ERR_NOT_SUPPORTED; } return MAGMA_SUCCESS; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, sizeA ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_zgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- testing zdot */ int main( int argc, char** argv) { TESTING_INIT(); printf("#================================================================================================================================================\n"); printf("\n"); printf(" | runtime | GFLOPS\n"); printf("#n num_vecs | CUDOT CUGEMV MAGMAGEMV MDOT MDGM | CUDOT CUGEMV MAGMAGEMV MDOT MDGM \n"); printf("#------------------------------------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); for( magma_int_t num_vecs=5; num_vecs<6; num_vecs+=1 ){ for( magma_int_t n=10000; n<100000001; n=n+10000 ){ magma_z_sparse_matrix A, B, C, D, E, F, G, H, I, J, K, Z; magma_z_vector a,b,c,x, y, z, skp; int iters = 10; double computations = (2.* n * iters * num_vecs); magmaDoubleComplex one = MAGMA_Z_MAKE(1.0, 0.0); magmaDoubleComplex zero = MAGMA_Z_MAKE(0.0, 0.0); magmaDoubleComplex alpha; #define ENABLE_TIMER #ifdef ENABLE_TIMER double mdot1, mdot2, mdgm1, mdgm2, magmagemv1, magmagemv2, cugemv1, cugemv2, cudot1, cudot2; double mdot_time, mdgm_time, magmagemv_time, cugemv_time, cudot_time; #endif magma_z_vinit( &a, Magma_DEV, n*num_vecs, one ); magma_z_vinit( &b, Magma_DEV, num_vecs, one ); int min_ten = min(num_vecs, 15); magma_z_vinit( &x, Magma_DEV, min_ten*n, one ); magma_z_vinit( &y, Magma_DEV, min_ten*n, one ); magma_z_vinit( &skp, Magma_DEV, num_vecs, zero ); // warm up magma_zgemvmdot( n, num_vecs, a.val, b.val, x.val, y.val, skp.val ); // CUDOT #ifdef ENABLE_TIMER magma_device_sync(); cudot1=magma_wtime(); #endif for( int h=0; h<iters; h++){ for( int l=0; l<num_vecs; l++) alpha = magma_zdotc(n, a.val, 1, b.val, 1); } #ifdef ENABLE_TIMER magma_device_sync(); cudot2=magma_wtime(); cudot_time=cudot2-cudot1; #endif // CUGeMV #ifdef ENABLE_TIMER magma_device_sync(); cugemv1=magma_wtime(); #endif for( int h=0; h<iters; h++){ magma_zgemv(MagmaTrans, n, num_vecs, one, a.val, n, b.val, 1, zero, skp.val, 1); //h++; } #ifdef ENABLE_TIMER magma_device_sync(); cugemv2=magma_wtime(); cugemv_time=cugemv2-cugemv1; #endif // MAGMAGeMV #ifdef ENABLE_TIMER magma_device_sync(); magmagemv1=magma_wtime(); #endif for( int h=0; h<iters; h++){ magmablas_zgemv(MagmaTrans, n, num_vecs, one, a.val, n, b.val, 1, zero, skp.val, 1); //h++; } #ifdef ENABLE_TIMER magma_device_sync(); magmagemv2=magma_wtime(); magmagemv_time=magmagemv2-magmagemv1; #endif // MDOT #ifdef ENABLE_TIMER magma_device_sync(); mdot1=magma_wtime(); #endif for( int h=0; h<iters; h++){ //magma_zmdotc( n, num_vecs, a.val, b.val, x.val, y.val, skp.val ); magma_zmdotc( n, 2, a.val, b.val, x.val, y.val, skp.val ); magma_zmdotc( n, 2, a.val, b.val, x.val, y.val, skp.val ); magma_zmdotc( n, 1, a.val, b.val, x.val, y.val, skp.val ); //h++; } #ifdef ENABLE_TIMER magma_device_sync(); mdot2=magma_wtime(); mdot_time=mdot2-mdot1; #endif // MDGM #ifdef ENABLE_TIMER magma_device_sync(); mdgm1=magma_wtime(); #endif for( int h=0; h<iters; h++){ magma_zgemvmdot( n, num_vecs, a.val, b.val, x.val, y.val, skp.val ); //h++; } #ifdef ENABLE_TIMER magma_device_sync(); mdgm2=magma_wtime(); mdgm_time=mdgm2-mdgm1; #endif //magma_zprint_gpu(num_vecs,1,skp.val,num_vecs); //Chronometry #ifdef ENABLE_TIMER printf("%d %d %e %e %e %e %e %e %e %e %e %e\n", n, num_vecs, cudot_time/iters, (cugemv_time)/iters, (magmagemv_time)/iters, (mdot_time)/iters, (mdgm_time)/iters, (double)(computations)/(cudot_time*(1.e+09)), (double)(computations)/(cugemv_time*(1.e+09)), (double)(computations)/(magmagemv_time*(1.e+09)), (double)(computations)/(mdot_time*(1.e+09)), (double)(computations)/(mdgm_time*(1.e+09)) ); #endif magma_z_vfree(&a); magma_z_vfree(&b); magma_z_vfree(&x); magma_z_vfree(&y); magma_z_vfree(&skp); } // } printf("#================================================================================================================================================\n"); printf("\n"); printf("\n"); } TESTING_FINALIZE(); return 0; }