extern "C" magma_int_t magma_cbombard( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // 1=QMR, 2=CGS, 3+BiCGSTAB magma_int_t flag = 0; // prepare solver feedback solver_par->solver = Magma_BOMBARD; solver_par->numiter = 0; solver_par->spmv_count = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; // solver variables float nom0, r0, res, Q_res, T_res, C_res, B_res, nomb; //QMR magmaFloatComplex Q_rho = c_one, Q_rho1 = c_one, Q_eta = -c_one , Q_pds = c_one, Q_thet = c_one, Q_thet1 = c_one, Q_epsilon = c_one, Q_beta = c_one, Q_delta = c_one, Q_pde = c_one, Q_rde = c_one, Q_gamm = c_one, Q_gamm1 = c_one, Q_psi = c_one; //TFQMR magmaFloatComplex T_rho = c_one, T_rho_l = c_one, T_eta = c_zero , T_c = c_zero , T_theta = c_zero , T_tau = c_zero, T_alpha = c_one, T_beta = c_zero, T_sigma = c_zero; //CGS magmaFloatComplex C_rho, C_rho_l = c_one, C_alpha, C_beta = c_zero; //BiCGSTAB magmaFloatComplex B_alpha, B_beta, B_omega, B_rho_old, B_rho_new; magma_int_t dofs = A.num_rows* b.num_cols; // need to transpose the matrix // GPU workspace // QMR magma_c_matrix AT = {Magma_CSR}, Ah1 = {Magma_CSR}, Ah2 = {Magma_CSR}, Q_r={Magma_CSR}, r_tld={Magma_CSR}, Q_x={Magma_CSR}, Q_v={Magma_CSR}, Q_w={Magma_CSR}, Q_wt={Magma_CSR}, Q_d={Magma_CSR}, Q_s={Magma_CSR}, Q_z={Magma_CSR}, Q_q={Magma_CSR}, Q_p={Magma_CSR}, Q_pt={Magma_CSR}, Q_y={Magma_CSR}, d1={Magma_CSR}, d2={Magma_CSR}; //TFQMR // GPU workspace magma_c_matrix T_r={Magma_CSR}, T_pu_m={Magma_CSR}, T_x={Magma_CSR}, T_d={Magma_CSR}, T_w={Magma_CSR}, T_v={Magma_CSR}, T_u_mp1={Magma_CSR}, T_u_m={Magma_CSR}, T_Au={Magma_CSR}, T_Ad={Magma_CSR}, T_Au_new={Magma_CSR}; // CGS magma_c_matrix C_r={Magma_CSR}, C_rt={Magma_CSR}, C_x={Magma_CSR}, C_p={Magma_CSR}, C_q={Magma_CSR}, C_u={Magma_CSR}, C_v={Magma_CSR}, C_t={Magma_CSR}, C_p_hat={Magma_CSR}, C_q_hat={Magma_CSR}, C_u_hat={Magma_CSR}, C_v_hat={Magma_CSR}; //BiCGSTAB magma_c_matrix B_r={Magma_CSR}, B_x={Magma_CSR}, B_p={Magma_CSR}, B_v={Magma_CSR}, B_s={Magma_CSR}, B_t={Magma_CSR}; CHECK( magma_cvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &d1, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &d2, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // QMR CHECK( magma_cvinit( &Q_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_w, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_wt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // TFQMR CHECK( magma_cvinit( &T_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_u_mp1,Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_u_m, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_pu_m, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_w, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_Ad, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_Au_new, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_Au, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // CGS CHECK( magma_cvinit( &C_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // BiCGSTAB CHECK( magma_cvinit( &B_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver setup CHECK( magma_cresidualvec( A, b, *x, &r_tld, &nom0, queue)); solver_par->init_res = nom0; res = nom0; // QMR magma_ccopy( dofs, r_tld.dval, 1, Q_r.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_y.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_v.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_wt.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_z.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, Q_x.dval, 1, queue ); // transpose the matrix // transpose the matrix magma_cmtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_cmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_cmfree(&Ah1, queue ); magma_cmtransposeconjugate( Ah2, &Ah1, queue ); magma_cmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_cmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_cmfree(&Ah1, queue ); magma_cmtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ); magma_cmfree(&Ah2, queue ); // TFQMR solver_par->init_res = nom0; magma_ccopy( dofs, r_tld.dval, 1, T_r.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_w.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_u_m.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_u_mp1.dval, 1, queue ); magma_ccopy( dofs, T_u_m.dval, 1, T_pu_m.dval, 1, queue ); CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_v, queue )); magma_ccopy( dofs, T_v.dval, 1, T_Au.dval, 1, queue ); // CGS magma_ccopy( dofs, r_tld.dval, 1, C_r.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, C_x.dval, 1, queue ); // BiCGSTAB magma_ccopy( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, B_x.dval, 1, queue ); CHECK( magma_c_spmv( c_one, A, B_r, c_zero, B_v, queue )); nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom0 < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } T_tau = magma_csqrt( magma_cdotc( dofs, T_r.dval, 1, r_tld.dval, 1, queue) ); T_rho = magma_cdotc( dofs, T_r.dval, 1, r_tld.dval, 1, queue ); T_rho_l = T_rho; Q_psi = magma_csqrt( magma_cdotc( dofs, Q_z.dval, 1, Q_z.dval, 1, queue )); Q_rho = magma_csqrt( magma_cdotc( dofs, Q_y.dval, 1, Q_y.dval, 1, queue )); // BiCGSTAB B_rho_new = magma_cdotc( dofs, B_r.dval, 1, B_r.dval, 1, queue ); B_rho_old = B_omega = B_alpha = MAGMA_C_MAKE( 1.0, 0. ); // v = y / rho // y = y / rho // w = wt / psi // z = z / psi magma_cqmr_1( b.num_rows, b.num_cols, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, queue ); //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; //QMR: delta = z' * y; Q_delta = magma_cdotc( dofs, Q_z.dval, 1, Q_y.dval, 1, queue ); // TFQMR T_alpha = T_rho / magma_cdotc( dofs, T_v.dval, 1, r_tld.dval, 1, queue ); T_sigma = T_theta * T_theta / T_alpha * T_eta; //CGS: rho = r' * r_tld C_rho = magma_cdotc( dofs, C_r.dval, 1, r_tld.dval, 1, queue ); // BiCGSTAB B_rho_old = B_rho_new; B_rho_new = magma_cdotc( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); // rho=<rr,r> B_beta = B_rho_new/B_rho_old * B_alpha/B_omega; // beta=rho/rho_old *alpha/omega if( solver_par->numiter == 1 ){ //QMR: p = y; //QMR: q = z; magma_ccopy( dofs, Q_y.dval, 1, Q_p.dval, 1, queue ); magma_ccopy( dofs, Q_z.dval, 1, Q_q.dval, 1, queue ); //QMR: u = r; //QMR: p = r; magma_ccgs_2( b.num_rows, b.num_cols, C_r.dval, C_u.dval, C_p.dval, queue ); } else{ Q_pde = Q_psi * Q_delta / Q_epsilon; Q_rde = Q_rho * MAGMA_C_CONJ(Q_delta/Q_epsilon); C_beta = C_rho / C_rho_l; //QMR p = y - pde * p //QMR q = z - rde * q magma_cqmr_2( b.num_rows, b.num_cols, Q_pde, Q_rde, Q_y.dval, Q_z.dval, Q_p.dval, Q_q.dval, queue ); //CGS: u = r + beta*q; //CGS: p = u + beta*( q + beta*p ); magma_ccgs_1( b.num_rows, b.num_cols, C_beta, C_r.dval, C_q.dval, C_u.dval, C_p.dval, queue ); } // TFQMR magma_ctfqmr_1( b.num_rows, b.num_cols, T_alpha, T_sigma, T_v.dval, T_Au.dval, T_u_m.dval, T_pu_m.dval, T_u_mp1.dval, T_w.dval, T_d.dval, T_Ad.dval, queue ); T_theta = magma_csqrt( magma_cdotc(dofs, T_w.dval, 1, T_w.dval, 1, queue) ) / T_tau; T_c = c_one / magma_csqrt( c_one + T_theta*T_theta ); T_tau = T_tau * T_theta *T_c; T_eta = T_c * T_c * T_alpha; T_sigma = T_theta * T_theta / T_alpha * T_eta; magma_ctfqmr_2( b.num_rows, b.num_cols, T_eta, T_d.dval, T_Ad.dval, T_x.dval, T_r.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_pu_m.dval, 1, queue ); // BiCGSTAB: p = r + beta * ( p - omega * v ) magma_cbicgstab_1( b.num_rows, b.num_cols, B_beta, B_omega, B_r.dval, B_v.dval, B_p.dval, queue ); //QMR CHECK( magma_c_spmv( c_one, A, Q_p, c_zero, Q_pt, queue )); //TFQMR CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_Au_new, queue )); //CGS CHECK( magma_c_spmv( c_one, A, C_p, c_zero, C_v_hat, queue )); // BiCGSTAB CHECK( magma_c_spmv( c_one, A, B_p, c_zero, B_v, queue )); // v = Ap solver_par->spmv_count++; //QMR: epsilon = q' * pt; Q_epsilon = magma_cdotc( dofs, Q_q.dval, 1, Q_pt.dval, 1, queue ); Q_beta = Q_epsilon / Q_delta; //TFQMR magma_ccopy( dofs, T_Au_new.dval, 1, T_Au.dval, 1, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_u_m.dval, 1, queue ); //CGS: alpha = r_tld' * v_hat C_alpha = C_rho / magma_cdotc( dofs, r_tld.dval, 1, C_v_hat.dval, 1, queue ); //BiCGSTAB B_alpha = B_rho_new / magma_cdotc( dofs, r_tld.dval, 1, B_v.dval, 1, queue ); //QMR: v = pt - beta * v //QMR: y = v magma_cqmr_3( b.num_rows, b.num_cols, Q_beta, Q_pt.dval, Q_v.dval, Q_y.dval, queue ); // TFQMR magma_ctfqmr_5( b.num_rows, b.num_cols, T_alpha, T_sigma, T_v.dval, T_Au.dval, T_pu_m.dval, T_w.dval, T_d.dval, T_Ad.dval, queue ); // TFQMR T_sigma = T_theta * T_theta / T_alpha * T_eta; T_theta = magma_csqrt( magma_cdotc(dofs, T_w.dval, 1, T_w.dval, 1, queue) ) / T_tau; T_c = c_one / magma_csqrt( c_one + T_theta*T_theta ); T_tau = T_tau * T_theta *T_c; T_eta = T_c * T_c * T_alpha; // TFQMR magma_ctfqmr_2( b.num_rows, b.num_cols, T_eta, T_d.dval, T_Ad.dval, T_x.dval, T_r.dval, queue ); T_rho = magma_cdotc( dofs, T_w.dval, 1, r_tld.dval, 1, queue ); T_beta = T_rho / T_rho_l; T_rho_l = T_rho; magma_ctfqmr_3( b.num_rows, b.num_cols, T_beta, T_w.dval, T_u_m.dval, T_u_mp1.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_pu_m.dval, 1, queue ); //CGS: q = u - alpha v_hat //CGS: t = u + q magma_ccgs_3( b.num_rows, b.num_cols, C_alpha, C_v_hat.dval, C_u.dval, C_q.dval, C_t.dval, queue ); // BiCGSTAB: s = r - alpha v magma_cbicgstab_2( b.num_rows, b.num_cols, B_alpha, B_r.dval, B_v.dval, B_s.dval, queue ); Q_rho1 = Q_rho; //QMR rho = norm(y); Q_rho = magma_csqrt( magma_cdotc( dofs, Q_y.dval, 1, Q_y.dval, 1, queue ) ); //QMR wt = A' * q - beta' * w; CHECK( magma_c_spmv( c_one, AT, Q_q, c_zero, Q_wt, queue )); //TFQMR CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_Au_new, queue )); //CGS t = A u_hat CHECK( magma_c_spmv( c_one, A, C_t, c_zero, C_rt, queue )); //BiCGSTAB CHECK( magma_c_spmv( c_one, A, B_s, c_zero, B_t, queue )); // t=As solver_par->spmv_count++; //BiCGSTAB B_omega = magma_cdotc( dofs, B_t.dval, 1, B_s.dval, 1, queue ) // omega = <s,t>/<t,t> / magma_cdotc( dofs, B_t.dval, 1, B_t.dval, 1, queue ); // QMR magma_caxpy( dofs, - MAGMA_C_CONJ( Q_beta ), Q_w.dval, 1, Q_wt.dval, 1, queue ); // no precond: z = wt magma_ccopy( dofs, Q_wt.dval, 1, Q_z.dval, 1, queue ); //TFQMR magma_ctfqmr_4( b.num_rows, b.num_cols, T_beta, T_Au_new.dval, T_v.dval, T_Au.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_u_m.dval, 1, queue ); // QMR Q_thet1 = Q_thet; Q_thet = Q_rho / (Q_gamm * MAGMA_C_MAKE( MAGMA_C_ABS(Q_beta), 0.0 )); Q_gamm1 = Q_gamm; Q_gamm = c_one / magma_csqrt(c_one + Q_thet*Q_thet); Q_eta = - Q_eta * Q_rho1 * Q_gamm * Q_gamm / (Q_beta * Q_gamm1 * Q_gamm1); if ( solver_par->numiter == 1 ) { //QMR: d = eta * p + pds * d; //QMR: s = eta * pt + pds * d; //QMR: x = x + d; //QMR: r = r - s; magma_cqmr_4( b.num_rows, b.num_cols, Q_eta, Q_p.dval, Q_pt.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } else { Q_pds = (Q_thet1 * Q_gamm) * (Q_thet1 * Q_gamm); // d = eta * p + pds * d; // s = eta * pt + pds * d; // x = x + d; // r = r - s; magma_cqmr_5( b.num_rows, b.num_cols, Q_eta, Q_pds, Q_p.dval, Q_pt.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } // CGS: r = r -alpha*A u_hat // CGS: x = x + alpha u_hat magma_ccgs_4( b.num_rows, b.num_cols, C_alpha, C_t.dval, C_rt.dval, C_x.dval, C_r.dval, queue ); C_rho_l = C_rho; // BiCGSTAB: x = x + alpha * p + omega * s // BiCGSTAB: r = s - omega * t magma_cbicgstab_3( b.num_rows, b.num_cols, B_alpha, B_omega, B_p.dval, B_s.dval, B_t.dval, B_x.dval, B_r.dval, queue ); //QMR: psi = norm(z); Q_psi = magma_csqrt( magma_cdotc( dofs, Q_z.dval, 1, Q_z.dval, 1, queue ) ); //QMR: v = y / rho //QMR: y = y / rho //QMR: w = wt / psi //QMR: z = z / psi magma_cqmr_1( b.num_rows, b.num_cols, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, queue ); Q_res = magma_scnrm2( dofs, Q_r.dval, 1, queue ); T_res = magma_scnrm2( dofs, T_r.dval, 1, queue ); C_res = magma_scnrm2( dofs, C_r.dval, 1, queue ); B_res = magma_scnrm2( dofs, B_r.dval, 1, queue ); // printf(" %e %e %e\n", Q_res, C_res, B_res); if( Q_res < res ){ res = Q_res; flag = 1; } if( T_res < res ){ res = Q_res; flag = 2; } if( C_res < res ){ res = C_res; flag = 3; } if( B_res < res ){ res = B_res; flag = 4; } if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ info = MAGMA_SUCCESS; break; } if( magma_c_isnan_inf( Q_beta ) && magma_c_isnan_inf( C_beta ) && magma_c_isnan_inf( B_beta ) ){ info = MAGMA_DIVERGENCE; break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); // copy back the best solver switch ( flag ) { case 1: printf("%% QMR fastest solver.\n"); magma_ccopy( dofs, Q_x.dval, 1, x->dval, 1, queue ); break; case 2: printf("%% TFQMR fastest solver.\n"); magma_ccopy( dofs, T_x.dval, 1, x->dval, 1, queue ); break; case 3: printf("%% CGS fastest solver.\n"); magma_ccopy( dofs, C_x.dval, 1, x->dval, 1, queue ); break; case 4: printf("%% BiCGSTAB fastest solver.\n"); magma_ccopy( dofs, B_x.dval, 1, x->dval, 1, queue ); break; } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r_tld, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r_tld, queue ); magma_cmfree(&d1, queue ); magma_cmfree(&d2, queue ); magma_cmfree(&AT, queue ); // QMR magma_cmfree(&Q_r, queue ); magma_cmfree(&Q_v, queue ); magma_cmfree(&Q_w, queue ); magma_cmfree(&Q_wt, queue ); magma_cmfree(&Q_d, queue ); magma_cmfree(&Q_s, queue ); magma_cmfree(&Q_z, queue ); magma_cmfree(&Q_q, queue ); magma_cmfree(&Q_p, queue ); magma_cmfree(&Q_pt, queue ); magma_cmfree(&Q_y, queue ); magma_cmfree(&Q_x, queue ); magma_cmfree(&Ah1, queue ); magma_cmfree(&Ah2, queue ); // TFQMR magma_cmfree(&T_r, queue ); magma_cmfree(&T_x, queue ); magma_cmfree(&T_d, queue ); magma_cmfree(&T_w, queue ); magma_cmfree(&T_v, queue ); magma_cmfree(&T_u_m, queue ); magma_cmfree(&T_u_mp1, queue ); magma_cmfree(&T_pu_m, queue ); magma_cmfree(&T_d, queue ); magma_cmfree(&T_Au, queue ); magma_cmfree(&T_Au_new, queue ); magma_cmfree(&T_Ad, queue ); // CGS magma_cmfree(&C_r, queue ); magma_cmfree(&C_rt, queue ); magma_cmfree(&C_x, queue ); magma_cmfree(&C_p, queue ); magma_cmfree(&C_q, queue ); magma_cmfree(&C_u, queue ); magma_cmfree(&C_v, queue ); magma_cmfree(&C_t, queue ); magma_cmfree(&C_p_hat, queue ); magma_cmfree(&C_q_hat, queue ); magma_cmfree(&C_u_hat, queue ); magma_cmfree(&C_v_hat, queue ); // BiCGSTAB magma_cmfree(&B_r, queue ); magma_cmfree(&B_x, queue ); magma_cmfree(&B_p, queue ); magma_cmfree(&B_v, queue ); magma_cmfree(&B_s, queue ); magma_cmfree(&B_t, queue ); solver_par->info = info; return info; } /* magma_cbombard */
/** Purpose ------- CGETF2_NOPIV computes an LU factorization of a general m-by-n matrix A without pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the m by n matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value - > 0: if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_aux ********************************************************************/ extern "C" magma_int_t magma_cgetf2_nopiv( magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i_,j_) (A + (i_) + (j_)*lda) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t min_mn, i__2, i__3; magmaFloatComplex z__1; magma_int_t i, j; float sfmin; A -= 1 + lda; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Compute machine safe minimum */ sfmin = lapackf77_slamch("S"); min_mn = min(m,n); for (j = 1; j <= min_mn; ++j) { /* Test for singularity. */ if ( ! MAGMA_C_EQUAL( *A(j,j), c_zero)) { /* Compute elements J+1:M of J-th column. */ if (j < m) { if (MAGMA_C_ABS( *A(j,j) ) >= sfmin) { i__2 = m - j; z__1 = MAGMA_C_DIV(c_one, *A(j,j)); blasf77_cscal(&i__2, &z__1, A(j+1,j), &ione); } else { i__2 = m - j; for (i = 1; i <= i__2; ++i) { *A(j+i,j) = MAGMA_C_DIV( *A(j+i,j), *A(j,j) ); } } } } else if (*info == 0) { *info = j; } if (j < min_mn) { /* Update trailing submatrix. */ i__2 = m - j; i__3 = n - j; blasf77_cgeru( &i__2, &i__3, &c_neg_one, A(j+1,j), &ione, A(j,j+1), &lda, A(j+1,j+1), &lda); } } return *info; } /* magma_cgetf2_nopiv */
extern "C" magma_int_t magma_ccg_merge( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaFloatComplex alpha, beta, gamma, rho, tmp1, *skp_h={0}; float nom, nom0, betanom, den, nomb; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows*b.num_cols; magma_c_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR}, B={Magma_CSR}, C={Magma_CSR}; magmaFloatComplex *d1=NULL, *d2=NULL, *skp=NULL; // GPU workspace CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cmalloc( &d1, dofs*(1) )); CHECK( magma_cmalloc( &d2, dofs*(1) )); // array for the parameters CHECK( magma_cmalloc( &skp, 6 )); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] // solver setup magma_cscal( dofs, c_zero, x->dval, 1, queue ); // x = 0 //CHECK( magma_cresidualvec( A, b, *x, &r, nom0, queue)); magma_ccopy( dofs, b.dval, 1, r.dval, 1, queue ); // r = b magma_ccopy( dofs, r.dval, 1, d.dval, 1, queue ); // d = r nom0 = betanom = magma_scnrm2( dofs, r.dval, 1, queue ); nom = nom0 * nom0; // nom = r' * r CHECK( magma_c_spmv( c_one, A, d, c_zero, z, queue )); // z = A d den = MAGMA_C_ABS( magma_cdotc( dofs, d.dval, 1, z.dval, 1, queue ) ); // den = d'* z solver_par->init_res = nom0; nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } // array on host for the parameters CHECK( magma_cmalloc_cpu( &skp_h, 6 )); alpha = rho = gamma = tmp1 = c_one; beta = magma_cdotc( dofs, r.dval, 1, r.dval, 1, queue ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_C_MAKE(nom, 0.0); magma_csetvector( 6, skp_h, 1, skp, 1, queue ); if( nom0 < solver_par->atol || nom0/nomb < solver_par->rtol ){ info = MAGMA_SUCCESS; goto cleanup; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // check positive definite if (den <= 0.0) { info = MAGMA_NONSPD; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes SpMV and dot product CHECK( magma_ccgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue )); solver_par->spmv_count++; // updates x, r, computes scalars and updates d CHECK( magma_ccgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue )); // check stopping criterion (asynchronous copy) magma_cgetvector( 1 , skp+1, 1, skp_h+1, 1, queue ); betanom = sqrt(MAGMA_C_ABS(skp_h[1])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) { break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r, queue ); magma_cmfree(&z, queue ); magma_cmfree(&d, queue ); magma_cmfree(&B, queue ); magma_cmfree(&C, queue ); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* magma_ccg_merge */
/***************************************************************************//** Purpose ------- CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] dA COMPLEX array, dimension (LDA,N) Copy of A on the GPU. Portions of A are updated on the CPU; portions of dA are updated on the GPU. See code for details. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @param[in,out] dF COMPLEX array, dimension (LDDF,NB) Copy of F on the GPU. See code for details. @param[in] lddf INTEGER The leading dimension of the array dF. LDDF >= max(1,N). @ingroup magma_laqps *******************************************************************************/ extern "C" magma_int_t magma_claqps( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf, magmaFloatComplex_ptr dF, magma_int_t lddf) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dA + (i) + (j)*(ldda)) #define F(i, j) (F + (i) + (j)*(ldf )) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; float d__1; magmaFloatComplex z__1; magma_int_t j, k, rk; magmaFloatComplex Akk; magma_int_t pvt; float temp, temp2, tol3z; magma_int_t itemp; magma_int_t lsticc; magma_int_t lastrk; lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); lsticc = 0; k = 0; while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran isamax; pvt, k are 0-based. i__1 = n-k; pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1; if (pvt != k) { if (pvt >= nb) { /* 1. Start copy from GPU */ magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, queue ); } /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; vn1[pvt] = vn1[k]; vn2[pvt] = vn2[k]; if (pvt < nb) { /* no need of transfer if pivot is within the panel */ blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { /* 1. Finish copy from GPU */ magma_queue_sync( queue ); /* 2. Swap as usual on CPU */ blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); /* 3. Restore the GPU */ magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, queue ); } } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CONJ( *F(k,j) ); } #endif i__1 = m - rk; i__2 = k; blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CONJ( *F(k,j) ); } #endif } /* Generate elementary reflector H(k). */ if (rk < m-1) { i__1 = m - rk; lapackf77_clarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] ); } else { lapackf77_clarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] ); } Akk = *A(rk, k); *A(rk, k) = c_one; /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) magma_int_t i__3 = nb-k-1; magma_int_t i__4 = i__2 - i__3; magma_int_t i__5 = nb-k; magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, tau[k], dA(rk +i__5, k+1+i__3), ldda, dA(rk +i__5, k ), ione, c_zero, dF(k+1+i__3, k ), ione, queue ); magma_cgetmatrix_async( i__2-i__3, 1, dF(k + 1 +i__3, k), i__2, F (k + 1 +i__3, k), i__2, queue ); blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, &tau[k], A(rk, k+1), &lda, A(rk, k ), &ione, &c_zero, F(k+1, k ), &ione ); magma_queue_sync( queue ); blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, &tau[k], A(rk, k+1+i__3), &lda, A(rk, k ), &ione, &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. */ for (j = 0; j < k; ++j) { *F(j, k) = c_zero; } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */ if (k > 0) { i__1 = m - rk; i__2 = k; z__1 = MAGMA_C_NEGATE( tau[k] ); blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, &z__1, A(rk, 0), &lda, A(rk, k), &ione, &c_zero, auxv, &ione ); i__1 = k; blasf77_cgemv( MagmaNoTransStr, &n, &i__1, &c_one, F(0,0), &ldf, auxv, &ione, &c_one, F(0,k), &ione ); } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, &c_neg_one, A(rk, 0 ), &lda, F(k+1,0 ), &ldf, &c_one, A(rk, k+1), &lda ); } /* Update partial column norms. */ if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { /* NOTE: The following 4 lines follow from the analysis in Lapack Working Note 176. */ temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } } *A(rk, k) = Akk; ++k; } // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU */ magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2, queue ); magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), i__2, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) { vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione ); } else { /* Where is the data, CPU or GPU ? */ float r1, r2; r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_scnrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue ); //vn1[lsticc] = magma_scnrm2( i__1, dA(rk + 1, lsticc), ione, queue ); vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2); } /* NOTE: The computation of VN1( LSTICC ) relies on the fact that SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */ vn2[lsticc] = vn1[lsticc]; lsticc = itemp; } magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_claqps */
extern "C" magma_int_t magma_cpcgs_merge( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_c_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PCGS; solver_par->numiter = 0; solver_par->spmv_count = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; // solver variables float nom0, r0, res, nomb; magmaFloatComplex rho, rho_l = c_one, alpha, beta; magma_int_t dofs = A.num_rows* b.num_cols; // GPU workspace magma_c_matrix r={Magma_CSR}, rt={Magma_CSR}, r_tld={Magma_CSR}, p={Magma_CSR}, q={Magma_CSR}, u={Magma_CSR}, v={Magma_CSR}, t={Magma_CSR}, p_hat={Magma_CSR}, q_hat={Magma_CSR}, u_hat={Magma_CSR}, v_hat={Magma_CSR}; CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &r_tld,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver setup CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue)); magma_ccopy( dofs, r.dval, 1, r_tld.dval, 1, queue ); solver_par->init_res = nom0; nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom0 < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2, tempop1, tempop2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; rho = magma_cdotc( dofs, r.dval, 1, r_tld.dval, 1, queue ); // rho = < r,r_tld> if ( MAGMA_C_ABS(rho) == 0.0 ) { goto cleanup; } if ( solver_par->numiter > 1 ) { // direction vectors beta = rho / rho_l; magma_ccgs_1( r.num_rows, r.num_cols, beta, r.dval, q.dval, u.dval, p.dval, queue ); //u = r + beta*q; //p = u + beta*( q + beta*p ); } else{ magma_ccgs_2( r.num_rows, r.num_cols, r.dval, u.dval, p.dval, queue ); // u = r // p = r } // preconditioner tempop1 = magma_sync_wtime( queue ); CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, p, &rt, precond_par, queue )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &p_hat, precond_par, queue )); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; CHECK( magma_c_spmv( c_one, A, p_hat, c_zero, v_hat, queue )); // v = A p solver_par->spmv_count++; alpha = rho / magma_cdotc( dofs, r_tld.dval, 1, v_hat.dval, 1, queue ); magma_ccgs_3( r.num_rows, r.num_cols, alpha, v_hat.dval, u.dval, q.dval, t.dval, queue ); // q = u - alpha v_hat // t = u + q // preconditioner tempop1 = magma_sync_wtime( queue ); CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, t, &rt, precond_par, queue )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &u_hat, precond_par, queue )); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; CHECK( magma_c_spmv( c_one, A, u_hat, c_zero, t, queue )); // t = A u_hat solver_par->spmv_count++; magma_ccgs_4( r.num_rows, r.num_cols, alpha, u_hat.dval, t.dval, x->dval, r.dval, queue ); // r = r -alpha*A u_hat // x = x + alpha u_hat res = magma_scnrm2( dofs, r.dval, 1, queue ); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } rho_l = rho; } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r, queue ); magma_cmfree(&rt, queue ); magma_cmfree(&r_tld, queue ); magma_cmfree(&p, queue ); magma_cmfree(&q, queue ); magma_cmfree(&u, queue ); magma_cmfree(&v, queue ); magma_cmfree(&t, queue ); magma_cmfree(&p_hat, queue ); magma_cmfree(&q_hat, queue ); magma_cmfree(&u_hat, queue ); magma_cmfree(&v_hat, queue ); solver_par->info = info; return info; } /* magma_cpcgs_merge */
extern "C" magma_int_t magma_cpcg( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_c_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PCG; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaFloatComplex alpha, beta; float nom0, r0, res, nomb; magmaFloatComplex den, gammanew, gammaold = MAGMA_C_MAKE(1.0,0.0); // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows* b.num_cols; // GPU workspace magma_c_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR}, q={Magma_CSR}, h={Magma_CSR}; CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &h, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver setup CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue)); // preconditioner CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); magma_ccopy( dofs, h.dval, 1, p.dval, 1, queue ); // p = h CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue )); // q = A p den = magma_cdotc( dofs, p.dval, 1, q.dval, 1, queue ); // den = p dot q solver_par->init_res = nom0; nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nomb < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } // check positive definite if ( MAGMA_C_ABS(den) <= 0.0 ) { info = MAGMA_NONSPD; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // preconditioner CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); gammanew = magma_cdotc( dofs, r.dval, 1, h.dval, 1, queue ); // gn = < r,h> if ( solver_par->numiter == 1 ) { magma_ccopy( dofs, h.dval, 1, p.dval, 1, queue ); // p = h } else { beta = (gammanew/gammaold); // beta = gn/go magma_cscal( dofs, beta, p.dval, 1, queue ); // p = beta*p magma_caxpy( dofs, c_one, h.dval, 1, p.dval, 1, queue ); // p = p + h } CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue )); // q = A p den = magma_cdotc( dofs, p.dval, 1, q.dval, 1, queue ); // den = p dot q alpha = gammanew / den; magma_caxpy( dofs, alpha, p.dval, 1, x->dval, 1, queue ); // x = x + alpha p magma_caxpy( dofs, -alpha, q.dval, 1, r.dval, 1, queue ); // r = r - alpha q gammaold = gammanew; res = magma_scnrm2( dofs, r.dval, 1, queue ); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r, queue ); magma_cmfree(&rt, queue ); magma_cmfree(&p, queue ); magma_cmfree(&q, queue ); magma_cmfree(&h, queue ); solver_par->info = info; return info; } /* magma_ccg */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_T, *ddA, *dtau; magmaFloatComplex *d_A2, *d_T2, *ddA2, *dtau2; float *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); float tol = 10. * opts.tolerance * lapackf77_slamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_T, magmaFloatComplex, N*N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( d_A2, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau2, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_claset cudaMemset(ddA, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(ddA2, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T2, 0, N*N*sizeof(magmaFloatComplex)); lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); magma_csetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_cgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_cgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_cgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_cgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_cgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); magma_cgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_clange("M", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_cgetmatrix( N, N, d_T, N, h_T, N ); float terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_ssqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cmslice( magma_int_t num_slices, magma_int_t slice, magma_c_matrix A, magma_c_matrix *B, magma_c_matrix *ALOC, magma_c_matrix *ANLOC, magma_index_t *comm_i, magmaFloatComplex *comm_v, magma_int_t *start, magma_int_t *end, magma_queue_t queue ) { magma_int_t info = 0; if( A.num_rows != A.num_cols ){ printf("%% error: only supported for square matrices.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location == Magma_CPU && A.storage_type == Magma_CSR ){ CHECK( magma_cmconvert( A, B, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( B->col ); magma_free_cpu( B->val ); CHECK( magma_cmconvert( A, ALOC, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( ALOC->col ); magma_free_cpu( ALOC->row ); magma_free_cpu( ALOC->val ); CHECK( magma_cmconvert( A, ANLOC, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( ANLOC->col ); magma_free_cpu( ANLOC->row ); magma_free_cpu( ANLOC->val ); magma_int_t i,j,k, nnz, nnz_loc=0, loc_row = 0, nnz_nloc = 0; magma_index_t col; magma_int_t size = magma_ceildiv( A.num_rows, num_slices ); magma_int_t lstart = slice*size; magma_int_t lend = min( (slice+1)*size, A.num_rows ); // correct size for last slice size = lend-lstart; CHECK( magma_index_malloc_cpu( &ALOC->row, size+1 ) ); CHECK( magma_index_malloc_cpu( &ANLOC->row, size+1 ) ); // count elements for slice - identity for rest nnz = A.row[ lend ] - A.row[ lstart ] + ( A.num_rows - size ); CHECK( magma_index_malloc_cpu( &B->col, nnz ) ); CHECK( magma_cmalloc_cpu( &B->val, nnz ) ); // for the communication plan for( i=0; i<A.num_rows; i++ ) { comm_i[i] = 0; comm_v[i] = MAGMA_C_ZERO; } k=0; B->row[i] = 0; ALOC->row[0] = 0; ANLOC->row[0] = 0; // identity above slice for( i=0; i<lstart; i++ ) { B->row[i+1] = B->row[i]+1; B->val[k] = MAGMA_C_ONE; B->col[k] = i; k++; } // slice for( i=lstart; i<lend; i++ ) { B->row[i+1] = B->row[i] + (A.row[i+1]-A.row[i]); for( j=A.row[i]; j<A.row[i+1]; j++ ){ B->val[k] = A.val[j]; col = A.col[j]; B->col[k] = col; // communication plan if( col<lstart || col>=lend ){ comm_i[ col ] = 1; comm_v[ col ] = comm_v[ col ] + MAGMA_C_MAKE( MAGMA_C_ABS( A.val[j] ), 0.0 ); nnz_nloc++; } else { nnz_loc++; } k++; } loc_row++; ALOC->row[ loc_row ] = nnz_loc; ANLOC->row[ loc_row ] = nnz_nloc; } CHECK( magma_index_malloc_cpu( &ALOC->col, nnz_loc ) ); CHECK( magma_cmalloc_cpu( &ALOC->val, nnz_loc ) ); ALOC->num_rows = size; ALOC->num_cols = size; ALOC->nnz = nnz_loc; CHECK( magma_index_malloc_cpu( &ANLOC->col, nnz_nloc ) ); CHECK( magma_cmalloc_cpu( &ANLOC->val, nnz_nloc ) ); ANLOC->num_rows = size; ANLOC->num_cols = A.num_cols; ANLOC->nnz = nnz_nloc; nnz_loc = 0; nnz_nloc = 0; // local/nonlocal matrix for( i=lstart; i<lend; i++ ) { for( j=A.row[i]; j<A.row[i+1]; j++ ){ col = A.col[j]; // insert only in local part in ALOC, nonlocal in ANLOC if( col<lstart || col>=lend ){ ANLOC->val[ nnz_nloc ] = A.val[j]; ANLOC->col[ nnz_nloc ] = col; nnz_nloc++; } else { ALOC->val[ nnz_loc ] = A.val[j]; ALOC->col[ nnz_loc ] = col-lstart; nnz_loc++; } } } // identity below slice for( i=lend; i<A.num_rows; i++ ) { B->row[i+1] = B->row[i]+1; B->val[k] = MAGMA_C_ONE; B->col[k] = i; k++; } B->nnz = k; *start = lstart; *end = lend; } else { printf("error: mslice only supported for CSR matrices on the CPU: %d %d.\n", int(A.memory_location), int(A.storage_type) ); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: return info; }