Exemplos de MAGMA_Z_CNJG em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: testing_zutil.cpp Projeto: soulsheng/magma

void magma_zmake_hpd( magma_int_t N, magmaDoubleComplex* A, magma_int_t lda )
{
    magma_int_t i, j;
    for( i=0; i<N; ++i ) {
        A(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL( A(i,i) ) + N, 0. );
        for( j=0; j<i; ++j ) {
            A(j,i) = MAGMA_Z_CNJG( A(i,j) );
        }
    }
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: zbulge_kernel_v2.cpp Projeto: soulsheng/magma

extern "C" void
magma_ztrdtype2cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, 
                                magmaDoubleComplex *A, magma_int_t lda, 
                                magmaDoubleComplex *V, magma_int_t ldv, 
                                magmaDoubleComplex *TAU,
                                magma_int_t st, magma_int_t ed, 
                                magma_int_t sweep, magma_int_t Vblksiz, 
                                magmaDoubleComplex *work) 
{

    /*
     WORK (workspace) double complex array, dimension NB
    */

    magma_int_t ione = 1;
    magma_int_t vpos, taupos;

    magmaDoubleComplex conjtmp;

    magmaDoubleComplex c_one = MAGMA_Z_ONE;

    magma_int_t ldx = lda-1;
    magma_int_t len = ed - st + 1;
    magma_int_t lem = min(ed+nb, n) - ed;

    if(lem>0){
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &vpos, &taupos);
        /* apply remaining right coming from the top block */
        lapackf77_zlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(ed+1, st), &ldx, work);
    }
    if(lem>1){
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, ed, ldv, &vpos, &taupos);

        /* remove the first column of the created bulge */
        *V(vpos)  = c_one;
        //memcpy(V(vpos+1), A(ed+2, st), (lem-1)*sizeof(magmaDoubleComplex));
        cblas_zcopy(lem-1, A(ed+2, st), ione, V(vpos+1), ione);
        memset(A(ed+2, st),0,(lem-1)*sizeof(magmaDoubleComplex));

        /* Eliminate the col at st */
        lapackf77_zlarfg( &lem, A(ed+1, st), V(vpos+1), &ione, TAU(taupos) );

        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been removed;*/
        conjtmp = MAGMA_Z_CNJG(*TAU(taupos));
        lapackf77_zlarfx("L", &lem, &len, V(vpos),  &conjtmp, A(ed+1, st+1), &ldx, work);
    }

}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: zbulge_kernel_v2.cpp Projeto: soulsheng/magma

inline static void
magma_zlarfxsym_v2(magma_int_t n, 
                magmaDoubleComplex *A, magma_int_t lda, 
                magmaDoubleComplex *V, magmaDoubleComplex *TAU, 
                magmaDoubleComplex *work) 
{
/*
    WORK (workspace) double complex array, dimension N
*/

    magma_int_t ione = 1;
    magmaDoubleComplex dtmp;
    magmaDoubleComplex c_zero   =  MAGMA_Z_ZERO;
    magmaDoubleComplex c_neg_one=  MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_half   =  MAGMA_Z_HALF;

    /* X = AVtau */
    blasf77_zhemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione);

    /* compute dtmp= X'*V */
#if defined(PRECISION_z) || defined(PRECISION_c)
   dtmp = c_zero;
   for (magma_int_t j = 0; j < n ; j++)
      dtmp = dtmp + MAGMA_Z_CNJG(work[j]) * V[j];
    //cblas_zdotc_sub(n, work, ione, V, ione, &dtmp);
#else
    dtmp = cblas_zdotc(n, work, ione, V, ione);
#endif


    /* compute 1/2 X'*V*t = 1/2*dtmp*tau  */
    dtmp = -dtmp * c_half * (*TAU);

    /* compute W=X-1/2VX'Vt = X - dtmp*V */
    blasf77_zaxpy(&n, &dtmp, V, &ione, work, &ione);

    /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */
    blasf77_zher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda);

}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: zbulge_kernel.cpp Projeto: XapaJIaMnu/magma

extern "C" void magma_ztrdtype2cbHLsym_withQ(magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) {
  magma_int_t    J1, J2, len, lem, LDX;
  //magma_int_t    i, j;
  magma_int_t    IONE=1;
  magma_int_t    blkid, vpos, taupos, tpos;
  magmaDoubleComplex conjtmp;
  magmaDoubleComplex Z_ONE  =  MAGMA_Z_ONE;
  //magmaDoubleComplex WORK[NB];
  magmaDoubleComplex *WORK;
  magma_zmalloc_cpu( &WORK, NB );


  findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
  LDX    = LDA-1;
  J1     = ed+1;
  J2     = min(ed+NB,N);
  len    = ed-st+1;
  lem    = J2-J1+1;
  if (lem > 0) {
     /* apply remaining right commming from the top block */
     lapackf77_zlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
  }
  if (lem > 1) {
     findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
     /* remove the first column of the created bulge */
     *V(vpos)  = Z_ONE;
     memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(magmaDoubleComplex));
     memset(A(J1+1, st),0,(lem-1)*sizeof(magmaDoubleComplex));
     /* Eliminate the col at st */
     lapackf77_zlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
     /* apply left on A(J1:J2,st+1:ed) */
     len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */
     conjtmp = MAGMA_Z_CNJG(*TAU(taupos));
     lapackf77_zlarfx("L", &lem, &len, V(vpos),  &conjtmp, A(J1, st+1), &LDX, WORK);
  }
  magma_free_cpu(WORK);
}

Exemplo n.º 5

0

Exibir arquivo

Arquivo: zbulge_kernel.cpp Projeto: soulsheng/magma

extern "C" void 
magma_zlarfxsym(magma_int_t N, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU) {
  magma_int_t IONE=1; 
  magmaDoubleComplex dtmp;
  magmaDoubleComplex Z_ZERO =  MAGMA_Z_ZERO;
  //magmaDoubleComplex Z_ONE  =  MAGMA_Z_ONE;
  magmaDoubleComplex Z_MONE =  MAGMA_Z_NEG_ONE;
  magmaDoubleComplex Z_HALF =  MAGMA_Z_HALF;
  //magmaDoubleComplex WORK[N];
  magmaDoubleComplex *WORK  = (magmaDoubleComplex *) malloc( N * sizeof(magmaDoubleComplex) );

  /* apply left and right on A(st:ed,st:ed)*/
  //magma_zlarfxsym(len,A(st,st),LDX,V(st),TAU(st));
  /* X = AVtau */
  blasf77_zhemv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE);
  /* je calcul dtmp= X'*V */
#if defined(PRECISION_z) || defined(PRECISION_c)
   dtmp = Z_ZERO;
   for (magma_int_t j = 0; j < N ; j++)
      dtmp = dtmp + MAGMA_Z_CNJG(WORK[j]) * V[j];
   //cblas_zdotc_sub(N, WORK, IONE, V, IONE, &dtmp);
#else
  dtmp = cblas_zdotc(N, WORK, IONE, V, IONE);
#endif
  /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau  */
  dtmp = -dtmp * Z_HALF * (*TAU);
  /* je calcul W=X-1/2VX'Vt = X - dtmp*V */
  /*
  for (j = 0; j < N ; j++)
      WORK[j] = WORK[j] + (dtmp*V[j]); */
  blasf77_zaxpy(&N, &dtmp, V, &IONE, WORK, &IONE);
  /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */
  blasf77_zher2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA);
  
  magma_free_cpu(WORK);
}

Exemplo n.º 6

0

Exibir arquivo

Arquivo: zgeev.cpp Projeto: mauro-belgiovine/belgiovi-clmagma

extern "C" magma_int_t
magma_zgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
            magmaDoubleComplex *a, magma_int_t lda,
            magmaDoubleComplex *geev_w_array,
            magmaDoubleComplex *vl, magma_int_t ldvl,
            magmaDoubleComplex *vr, magma_int_t ldvr,
            magmaDoubleComplex *work, magma_int_t lwork,
            double *rwork, magma_int_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       September 2012

    Purpose   
    =======   
    ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the   
    eigenvalues and, optionally, the left and/or right eigenvectors.   

    The right eigenvector v(j) of A satisfies   
                     A * v(j) = lambda(j) * v(j)   
    where lambda(j) is its eigenvalue.   
    The left eigenvector u(j) of A satisfies   
                  u(j)**H * A = lambda(j) * u(j)**H   
    where u(j)**H denotes the conjugate transpose of u(j).   

    The computed eigenvectors are normalized to have Euclidean norm   
    equal to 1 and largest component real.   

    Arguments   
    =========   
    JOBVL   (input) CHARACTER*1   
            = 'N': left eigenvectors of A are not computed;   
            = 'V': left eigenvectors of are computed.   

    JOBVR   (input) CHARACTER*1   
            = 'N': right eigenvectors of A are not computed;   
            = 'V': right eigenvectors of A are computed.   

    N       (input) INTEGER   
            The order of the matrix A. N >= 0.   

    A       (input/output) COMPLEX*16 array, dimension (LDA,N)   
            On entry, the N-by-N matrix A.   
            On exit, A has been overwritten.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    W       (output) COMPLEX*16 array, dimension (N)   
            W contains the computed eigenvalues.   

    VL      (output) COMPLEX*16 array, dimension (LDVL,N)   
            If JOBVL = 'V', the left eigenvectors u(j) are stored one   
            after another in the columns of VL, in the same order   
            as their eigenvalues.   
            If JOBVL = 'N', VL is not referenced.   
            u(j) = VL(:,j), the j-th column of VL.   

    LDVL    (input) INTEGER   
            The leading dimension of the array VL.  LDVL >= 1; if   
            JOBVL = 'V', LDVL >= N.   

    VR      (output) COMPLEX*16 array, dimension (LDVR,N)   
            If JOBVR = 'V', the right eigenvectors v(j) are stored one   
            after another in the columns of VR, in the same order   
            as their eigenvalues.   
            If JOBVR = 'N', VR is not referenced.   
            v(j) = VR(:,j), the j-th column of VR.   

    LDVR    (input) INTEGER   
            The leading dimension of the array VR.  LDVR >= 1; if   
            JOBVR = 'V', LDVR >= N.   

    WORK    (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= (1+nb)*N.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    RWORK   (workspace) DOUBLE PRECISION array, dimension (2*N)   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   
            > 0:  if INFO = i, the QR algorithm failed to compute all the   
                  eigenvalues, and no eigenvectors have been computed;   
                  elements and i+1:N of W contain eigenvalues which have   
                  converged.   
    =====================================================================    */

    magma_int_t c__1 = 1;
    magma_int_t c__0 = 0;
    
    magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, 
            i__2, i__3;
    double d__1, d__2;
    magmaDoubleComplex z__1, z__2;

    magma_int_t i__, k, ihi;
    double scl;
    magma_int_t ilo;
    double dum[1], eps;
    magmaDoubleComplex tmp;
    magma_int_t ibal;
    double anrm;
    magma_int_t ierr, itau, iwrk, nout;
    magma_int_t scalea;
    double cscale;
    magma_int_t select[1];
    double bignum;
    magma_int_t minwrk;
    magma_int_t wantvl;
    double smlnum;
    magma_int_t irwork;
    magma_int_t lquery, wantvr;
    magma_int_t nb = 0;
    magmaDoubleComplex_ptr dT;

    //magma_timestr_t start, end;

    char side[2] = {0, 0};
    magma_vec_t jobvl_ = jobvl;
    magma_vec_t jobvr_ = jobvr;

    *info = 0;
    lquery = lwork == -1;
    wantvl = lapackf77_lsame(lapack_const(jobvl_), "V");
    wantvr = lapackf77_lsame(lapack_const(jobvr_), "V");
    if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) {
        *info = -1;
    } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -8;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -10;
    }

    /*  Compute workspace   */
    if (*info == 0) {
        nb = magma_get_zgehrd_nb(n);
        minwrk = (1+nb)*n;
        work[0] = MAGMA_Z_MAKE((double) minwrk, 0.);

        if (lwork < minwrk && ! lquery) {
            *info = -12;
        }
    }   

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
   
    // if eigenvectors are needed
#if defined(VERSION3)
    if (MAGMA_SUCCESS != magma_malloc(&dT, nb*n*sizeof(magmaDoubleComplex) )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
#endif

    a_dim1 = lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    vl_dim1 = ldvl;
    vl_offset = 1 + vl_dim1;
    vl -= vl_offset;
    vr_dim1 = ldvr;
    vr_offset = 1 + vr_dim1;
    vr -= vr_offset;
    --work;
    --rwork;

    /* Get machine constants */
    eps    = lapackf77_dlamch("P");
    smlnum = lapackf77_dlamch("S");
    bignum = 1. / smlnum;
    lapackf77_dlabad(&smlnum, &bignum);
    smlnum = magma_dsqrt(smlnum) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_zlange("M", &n, &n, &a[a_offset], &lda, dum);
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_zlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, &
                ierr);
    }

    /* Balance the matrix   
       (CWorkspace: none)   
       (RWorkspace: need N) */
    ibal = 1;
    lapackf77_zgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &rwork[ibal], &ierr);

    /* Reduce to upper Hessenberg form   
       (CWorkspace: need 2*N, prefer N+N*NB)   
       (RWorkspace: none) */
    itau = 1;
    iwrk = itau + n;
    i__1 = lwork - iwrk + 1;

    //start = get_current_time();
#if defined(VERSION1)
    /*
     * Version 1 - LAPACK
     */
    lapackf77_zgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
                     &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
    /*
     *  Version 2 - LAPACK consistent HRD
     */
    magma_zgehrd2(n, ilo, ihi, &a[a_offset], lda,
                  &work[itau], &work[iwrk], &i__1, &ierr);    
#elif defined(VERSION3)
    /*  
     * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, 
     */
    magma_zgehrd(n, ilo, ihi, &a[a_offset], lda,
                 &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue);
#endif
    //end = get_current_time();
    //printf("    Time for zgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);

    if (wantvl) {
      /*        Want left eigenvectors   
                Copy Householder vectors to VL */
        side[0] = 'L';
        lapackf77_zlacpy(MagmaLowerStr, &n, &n, 
                         &a[a_offset], &lda, &vl[vl_offset], &ldvl);

        /* Generate unitary matrix in VL   
           (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)   
           (RWorkspace: none) */
        i__1 = lwork - iwrk + 1;

        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_zunghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, 
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_zunghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], 
                     dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /* Perform QR iteration, accumulating Schur vectors in VL   
           (CWorkspace: need 1, prefer HSWORK (see comments) )   
           (RWorkspace: none) */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array,
                &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);

        if (wantvr) 
          {
            /* Want left and right eigenvectors   
               Copy Schur vectors to VR */
            side[0] = 'B';
            lapackf77_zlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
          }

    } else if (wantvr) {
        /*  Want right eigenvectors   
            Copy Householder vectors to VR */
        side[0] = 'R';
        lapackf77_zlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);

        /* Generate unitary matrix in VR   
           (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)   
           (RWorkspace: none) */
        i__1 = lwork - iwrk + 1;
        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_zunghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, 
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_zunghr(n, ilo, ihi, &vr[vr_offset], ldvr, 
                     &work[itau], dT, 0, nb, &ierr, queue);
#endif
        //end = get_current_time();
        //printf("    Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /* Perform QR iteration, accumulating Schur vectors in VR   
           (CWorkspace: need 1, prefer HSWORK (see comments) )   
           (RWorkspace: none) */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, 
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    } else {
      /*  Compute eigenvalues only   
          (CWorkspace: need 1, prefer HSWORK (see comments) )   
          (RWorkspace: none) */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_zhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array,
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    }

    /* If INFO > 0 from ZHSEQR, then quit */
    if (*info > 0) {
        goto L50;
    }

    if (wantvl || wantvr) {
        /*  Compute left and/or right eigenvectors   
            (CWorkspace: need 2*N)   
            (RWorkspace: need 2*N) */
        irwork = ibal + n;
        lapackf77_ztrevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl,
                &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], 
                &ierr);
    }

    if (wantvl) {
        /*  Undo balancing of left eigenvectors   
            (CWorkspace: none)   
            (RWorkspace: need N) */
        lapackf77_zgebak("B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, 
                         &vl[vl_offset], &ldvl, &ierr);

        /* Normalize left eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            scl = 1. / cblas_dznrm2(n, &vl[i__ * vl_dim1 + 1], 1);
            cblas_zdscal(n, scl, &vl[i__ * vl_dim1 + 1], 1);
            i__2 = n;
            for (k = 1; k <= i__2; ++k) 
            {
                i__3 = k + i__ * vl_dim1;
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL(vl[i__3]);
                /* Computing 2nd power */
                d__2 = MAGMA_Z_IMAG(vl[k + i__ * vl_dim1]);
                rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
            }
            /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_idamax */
            k = cblas_idamax(n, &rwork[irwork], 1)+1;
            z__2 = MAGMA_Z_CNJG(vl[k + i__ * vl_dim1]);
            d__1 = magma_dsqrt(rwork[irwork + k - 1]);
            MAGMA_Z_DSCALE(z__1, z__2, d__1);
            tmp = z__1;
            cblas_zscal(n, CBLAS_SADDR(tmp), &vl[i__ * vl_dim1 + 1], 1);
            i__2 = k + i__ * vl_dim1;
            i__3 = k + i__ * vl_dim1;
            d__1 = MAGMA_Z_REAL(vl[i__3]);
            MAGMA_Z_SET2REAL(z__1, d__1);
            vl[i__2] = z__1;
        }
    }

    if (wantvr) {
      /*  Undo balancing of right eigenvectors   
          (CWorkspace: none)   
          (RWorkspace: need N) */
        lapackf77_zgebak("B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, 
                         &vr[vr_offset], &ldvr, &ierr);

        /* Normalize right eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            scl = 1. / cblas_dznrm2(n, &vr[i__ * vr_dim1 + 1], 1);
            cblas_zdscal(n, scl, &vr[i__ * vr_dim1 + 1], 1);
            i__2 = n;
            for (k = 1; k <= i__2; ++k) {
                i__3 = k + i__ * vr_dim1;
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL(vr[i__3]);
                /* Computing 2nd power */
                d__2 = MAGMA_Z_IMAG(vr[k + i__ * vr_dim1]);
                rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
            }
            /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_idamax */
            k = cblas_idamax(n, &rwork[irwork], 1)+1;
            z__2 = MAGMA_Z_CNJG(vr[k + i__ * vr_dim1]);
            d__1 = magma_dsqrt(rwork[irwork + k - 1]);
            MAGMA_Z_DSCALE(z__1, z__2, d__1);
            tmp = z__1;
            cblas_zscal(n, CBLAS_SADDR(tmp), &vr[i__ * vr_dim1 + 1], 1);
            i__2 = k + i__ * vr_dim1;
            i__3 = k + i__ * vr_dim1;
            d__1 = MAGMA_Z_REAL(vr[i__3]);
            MAGMA_Z_SET2REAL(z__1, d__1);
            vr[i__2] = z__1;
        }
    }

    /*  Undo scaling if necessary */
L50:
    if (scalea) {
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, 
                geev_w_array + *info, &i__2, &ierr);
        if (*info > 0) {
            i__1 = ilo - 1;
            lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, 
                    geev_w_array, &n, &ierr);
        }
    }

#if defined(VERSION3)
    magma_free( dT );
#endif
    return *info;
} /* magma_zgeev */

Exemplo n.º 7

0

Exibir arquivo

Arquivo: ztrevc3.cpp Projeto: EmergentOrder/magma

magma_int_t magma_ztrevc3(
    magma_side_t side, magma_vec_t howmany,
    magma_int_t *select,  // logical in Fortran
    magma_int_t n,
    magmaDoubleComplex *T,  magma_int_t ldt,
    magmaDoubleComplex *VL, magma_int_t ldvl,
    magmaDoubleComplex *VR, magma_int_t ldvr,
    magma_int_t mm, magma_int_t *mout,
    magmaDoubleComplex *work, magma_int_t lwork,
    double *rwork, magma_int_t *info )
{
    #define  T(i,j)  ( T + (i) + (j)*ldt )
    #define VL(i,j)  (VL + (i) + (j)*ldvl)
    #define VR(i,j)  (VR + (i) + (j)*ldvr)
    #define work(i,j) (work + (i) + (j)*n)

    // .. Parameters ..
    const magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    const magma_int_t  nbmin = 16, nbmax = 128;
    const magma_int_t  ione = 1;
    
    // .. Local Scalars ..
    magma_int_t            allv, bothv, leftv, over, rightv, somev;
    magma_int_t            i, ii, is, j, k, ki, iv, n2, nb, nb2, version;
    double                 ovfl, remax, scale, smin, smlnum, ulp, unfl;
    
    // Decode and test the input parameters
    bothv  = (side == MagmaBothSides);
    rightv = (side == MagmaRight) || bothv;
    leftv  = (side == MagmaLeft ) || bothv;

    allv  = (howmany == MagmaAllVec);
    over  = (howmany == MagmaBacktransVec);
    somev = (howmany == MagmaSomeVec);

    // Set mout to the number of columns required to store the selected
    // eigenvectors.
    if ( somev ) {
        *mout = 0;
        for( j=0; j < n; ++j ) {
            if ( select[j] ) {
                *mout += 1;
            }
        }
    }
    else {
        *mout = n;
    }

    *info = 0;
    if ( ! rightv && ! leftv )
        *info = -1;
    else if ( ! allv && ! over && ! somev )
        *info = -2;
    else if ( n < 0 )
        *info = -4;
    else if ( ldt < max( 1, n ) )
        *info = -6;
    else if ( ldvl < 1 || ( leftv && ldvl < n ) )
        *info = -8;
    else if ( ldvr < 1 || ( rightv && ldvr < n ) )
        *info = -10;
    else if ( mm < *mout )
        *info = -11;
    else if ( lwork < max( 1, 2*n ) )
        *info = -14;
    
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // Quick return if possible.
    if ( n == 0 ) {
        return *info;
    }
    
    // Use blocked version (2) if sufficient workspace.
    // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x.
    // (Compared to dtrevc3, rwork stores 1-norms.)
    // Zero-out the workspace to avoid potential NaN propagation.
    nb = 2;
    if ( lwork >= n + 2*n*nbmin ) {
        version = 2;
        nb = (lwork - n) / (2*n);
        nb = min( nb, nbmax );
        nb2 = 1 + 2*nb;
        lapackf77_zlaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n );
    }
    else {
        version = 1;
    }

    // Set the constants to control overflow.
    unfl = lapackf77_dlamch( "Safe minimum" );
    ovfl = 1. / unfl;
    lapackf77_dlabad( &unfl, &ovfl );
    ulp = lapackf77_dlamch( "Precision" );
    smlnum = unfl*( n / ulp );

    // Store the diagonal elements of T in working array work.
    for( i=0; i < n; ++i ) {
        *work(i,0) = *T(i,i);
    }

    // Compute 1-norm of each column of strictly upper triangular
    // part of T to control overflow in triangular solver.
    rwork[0] = 0.;
    for( j=1; j < n; ++j ) {
        rwork[j] = cblas_dzasum( j, T(0,j), ione );
    }

    magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0;
    timer_start( time_total );

    if ( rightv ) {
        // ============================================================
        // Compute right eigenvectors.
        // iv is index of column in current block.
        // Non-blocked version always uses iv=1;
        // blocked     version starts with iv=nb, goes down to 1.
        // (Note the "0-th" column is used to store the original diagonal.)
        iv = 1;
        if ( version == 2 ) {
            iv = nb;
        }
        
        timer_start( time_trsv );
        is = *mout - 1;
        for( ki=n-1; ki >= 0; --ki ) {
            if ( somev ) {
                if ( ! select[ki] ) {
                    continue;
                }
            }
            smin = max( ulp*( MAGMA_Z_ABS1( *T(ki,ki) ) ), smlnum );

            // --------------------------------------------------------
            // Complex right eigenvector
            *work(ki,iv) = c_one;

            // Form right-hand side.
            for( k=0; k < ki; ++k ) {
                *work(k,iv) = -(*T(k,ki));
            }

            // Solve upper triangular system:
            // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work.
            for( k=0; k < ki; ++k ) {
                *T(k,k) -= *T(ki,ki);
                if ( MAGMA_Z_ABS1( *T(k,k) ) < smin ) {
                    *T(k,k) = MAGMA_Z_MAKE( smin, 0. );
                }
            }

            if ( ki > 0 ) {
                lapackf77_zlatrs( "Upper", "No transpose", "Non-unit", "Y",
                                  &ki, T, &ldt,
                                  work(0,iv), &scale, rwork, info );
                *work(ki,iv) = MAGMA_Z_MAKE( scale, 0. );
            }

            // Copy the vector x or Q*x to VR and normalize.
            if ( ! over ) {
                // ------------------------------
                // no back-transform: copy x to VR and normalize
                n2 = ki+1;
                blasf77_zcopy( &n2, work(0,iv), &ione, VR(0,is), &ione );

                ii = blasf77_izamax( &n2, VR(0,is), &ione ) - 1;
                remax = 1. / MAGMA_Z_ABS1( *VR(ii,is) );
                blasf77_zdscal( &n2, &remax, VR(0,is), &ione );

                for( k=ki+1; k < n; ++k ) {
                    *VR(k,is) = c_zero;
                }
            }
            else if ( version == 1 ) {
                // ------------------------------
                // version 1: back-transform each vector with GEMV, Q*x.
                time_trsv_sum += timer_stop( time_trsv );
                timer_start( time_gemv );
                if ( ki > 0 ) {
                    blasf77_zgemv( "n", &n, &ki, &c_one,
                                   VR, &ldvr,
                                   work(0, iv), &ione,
                                   work(ki,iv), VR(0,ki), &ione );
                }
                time_gemv_sum += timer_stop( time_gemv );
                ii = blasf77_izamax( &n, VR(0,ki), &ione ) - 1;
                remax = 1. / MAGMA_Z_ABS1( *VR(ii,ki) );
                blasf77_zdscal( &n, &remax, VR(0,ki), &ione );
                timer_start( time_trsv );
            }
            else if ( version == 2 ) {
                // ------------------------------
                // version 2: back-transform block of vectors with GEMM
                // zero out below vector
                for( k=ki+1; k < n; ++k ) {
                    *work(k,iv) = c_zero;
                }

                // Columns iv:nb of work are valid vectors.
                // When the number of vectors stored reaches nb,
                // or if this was last vector, do the GEMM
                if ( (iv == 1) || (ki == 0) ) {
                    time_trsv_sum += timer_stop( time_trsv );
                    timer_start( time_gemm );
                    nb2 = nb-iv+1;
                    n2  = ki+nb-iv+1;
                    blasf77_zgemm( "n", "n", &n, &nb2, &n2, &c_one,
                                   VR, &ldvr,
                                   work(0,iv   ), &n, &c_zero,
                                   work(0,nb+iv), &n );
                    time_gemm_sum += timer_stop( time_gemm );
                    
                    // normalize vectors
                    // TODO if somev, should copy vectors individually to correct location.
                    for( k = iv; k <= nb; ++k ) {
                        ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1;
                        remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) );
                        blasf77_zdscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    lapackf77_zlacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr );
                    iv = nb;
                    timer_start( time_trsv );
                }
                else {
                    iv -= 1;
                }
            } // blocked back-transform

            // Restore the original diagonal elements of T.
            for( k=0; k <= ki - 1; ++k ) {
                *T(k,k) = *work(k,0);
            }

            is -= 1;
        }
    }
    timer_stop( time_trsv );

    timer_stop( time_total );
    timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n",
                  time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total );

    if ( leftv ) {
        // ============================================================
        // Compute left eigenvectors.
        // iv is index of column in current block.
        // Non-blocked version always uses iv=1;
        // blocked     version starts with iv=1, goes up to nb.
        // (Note the "0-th" column is used to store the original diagonal.)
        iv = 1;
        is = 0;
        for( ki=0; ki < n; ++ki ) {
            if ( somev ) {
                if ( ! select[ki] ) {
                    continue;
                }
            }
            smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum );

            // --------------------------------------------------------
            // Complex left eigenvector
            *work(ki,iv) = c_one;

            // Form right-hand side.
            for( k = ki + 1; k < n; ++k ) {
                *work(k,iv) = -MAGMA_Z_CNJG( *T(ki,k) );
            }

            // Solve conjugate-transposed triangular system:
            // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work.
            for( k = ki + 1; k < n; ++k ) {
                *T(k,k) -= *T(ki,ki);
                if ( MAGMA_Z_ABS1( *T(k,k) ) < smin ) {
                    *T(k,k) = MAGMA_Z_MAKE( smin, 0. );
                }
            }

            if ( ki < n-1 ) {
                n2 = n-ki-1;
                lapackf77_zlatrs( "Upper", "Conjugate transpose", "Non-unit", "Y",
                                  &n2, T(ki+1,ki+1), &ldt,
                                  work(ki+1,iv), &scale, rwork, info );
                *work(ki,iv) = MAGMA_Z_MAKE( scale, 0. );
            }

            // Copy the vector x or Q*x to VL and normalize.
            if ( ! over ) {
                // ------------------------------
                // no back-transform: copy x to VL and normalize
                n2 = n-ki;
                blasf77_zcopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione );

                ii = blasf77_izamax( &n2, VL(ki,is), &ione ) + ki - 1;
                remax = 1. / MAGMA_Z_ABS1( *VL(ii,is) );
                blasf77_zdscal( &n2, &remax, VL(ki,is), &ione );

                for( k=0; k < ki; ++k ) {
                    *VL(k,is) = c_zero;
                }
            }
            else if ( version == 1 ) {
                // ------------------------------
                // version 1: back-transform each vector with GEMV, Q*x.
                if ( ki < n-1 ) {
                    n2 = n-ki-1;
                    blasf77_zgemv( "n", &n, &n2, &c_one,
                                   VL(0,ki+1), &ldvl,
                                   work(ki+1,iv), &ione,
                                   work(ki,  iv), VL(0,ki), &ione );
                }
                ii = blasf77_izamax( &n, VL(0,ki), &ione ) - 1;
                remax = 1. / MAGMA_Z_ABS1( *VL(ii,ki) );
                blasf77_zdscal( &n, &remax, VL(0,ki), &ione );
            }
            else if ( version == 2 ) {
                // ------------------------------
                // version 2: back-transform block of vectors with GEMM
                // zero out above vector
                // could go from (ki+1)-NV+1 to ki
                for( k=0; k < ki; ++k ) {
                    *work(k,iv) = c_zero;
                }

                // Columns 1:iv of work are valid vectors.
                // When the number of vectors stored reaches nb,
                // or if this was last vector, do the GEMM
                if ( (iv == nb) || (ki == n-1) ) {
                    n2 = n-(ki+1)+iv;
                    blasf77_zgemm( "n", "n", &n, &iv, &n2, &c_one,
                                   VL(0,ki-iv+1), &ldvl,
                                   work(ki-iv+1,1   ), &n, &c_zero,
                                   work(0,      nb+1), &n );
                    // normalize vectors
                    for( k=1; k <= iv; ++k ) {
                        ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1;
                        remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) );
                        blasf77_zdscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    lapackf77_zlacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl );
                    iv = 1;
                }
                else {
                    iv += 1;
                }
            } // blocked back-transform

            // Restore the original diagonal elements of T.
            for( k = ki + 1; k < n; ++k ) {
                *T(k,k) = *work(k,0);
            }

            is += 1;
        }
    }
    
    return *info;
}  // End of ZTREVC

Exemplo n.º 8

0

Exibir arquivo

Arquivo: zher2k_mgpu.cpp Projeto: soulsheng/magma

void magmablas_zher2k_mgpu2(
    char uplo, char trans, magma_int_t n, magma_int_t k,
    magmaDoubleComplex alpha, magmaDoubleComplex *dA[], magma_int_t lda, magma_int_t aoffset,
                           magmaDoubleComplex *dB[], magma_int_t ldb, magma_int_t boffset,
    double beta,           magmaDoubleComplex *dC[], magma_int_t ldc, magma_int_t coffset,
    magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
    
    /* Check arguments */
    magma_int_t info = 0;
    if ( ! (uplo == 'l' || uplo == 'L')) {
        info = -1;  // 'u' not yet handled
    } else if ( ! (trans == 'n' || trans == 'N')) {
        info = -2;  // 'c' not yet handled
    } else if ( n < 0 ) {
        info = -3;
    } else if ( k < 0 ) {
        info = -4;
    } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) ||
                ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) {
        info = -7;
    } else if ( aoffset < 0 || aoffset > lda ) {
        info = -8;
    } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) ||
                ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) {
        info = -10;
    } else if ( boffset < 0 || boffset > ldb ) {
        info = -11;
    } else if ( ldc < max(1,n) ) {
        info = -13;
    } else if ( coffset < 0 || coffset > ldc ) {
        info = -14;
    } else if ( ngpu <= 0 ) {
        info = -15;
    } else if ( nb <= 0 ) {
        info = -16;
    } else if ( nstream <= 0 ) {
        info = -18;
    }
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }
    
    const magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex cbeta = MAGMA_Z_MAKE( beta, 0. );
    
    magma_int_t ib, ioff, iblock, idev, di, s;
    
    magma_device_t cdev;
    magma_queue_t cqueue;
    magma_getdevice( &cdev );
    magmablasGetKernelStream( &cqueue );
    
    // loop over all blocks
    // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
    // blockoffset is offset within first block; for subsequent blocks it is 0
    magma_int_t blockoffset = coffset % nb;
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix
        
        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );
        
        // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
        //printf( "zgemm  n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k,
                     alpha, dA(idev,i,0), lda,
                            dB(idev,i,0), ldb,
                     cbeta, dC(idev,ioff,di), ldc );
        blockoffset = 0;
    }
    
    // second loop does C = conj(alpha)*B*A' + C_hat
    alpha = MAGMA_Z_CNJG( alpha );
    blockoffset = coffset % nb;
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix
        
        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );
        
        // C[i:n,i] += conj(alpha) * B[i:n,0] * A[i,0]'
        //printf( "zgemm  n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k,
                     alpha, dB(idev,i,0), ldb,
                            dA(idev,i,0), lda,
                     c_one, dC(idev,ioff,di), ldc );
        blockoffset = 0;
    }
    
    magma_setdevice( cdev );
    magmablasSetKernelStream( cqueue );
}

Exemplo n.º 9

0

Exibir arquivo

Arquivo: testing_zpotrf_gpu.cpp Projeto: mauro-belgiovine/belgiovi-clmagma

int main( int argc, char** argv) 
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    magmaDoubleComplex *hA, *hR;
    magmaDoubleComplex_ptr dA;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 };
    
    magma_int_t i, info;
    magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){        
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zpotrf_gpu -N %d\n\n", 1024);
    }

    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device;
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( &device, 1, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }

    /* Allocate memory for the largest matrix */
    N    = size[9];
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
    TESTING_MALLOC(      hA, magmaDoubleComplex, n2 );
    TESTING_MALLOC_HOST( hR, magmaDoubleComplex, n2 );
    TESTING_MALLOC_DEV(  dA, magmaDoubleComplex, ldda*N );
    
    printf("\n\n");
    printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N; 
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, hA );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            MAGMA_Z_SET2REAL( hA(i,i), MAGMA_Z_REAL(hA(i,i)) + N );
            for( int j = 0; j < i; ++j ) {
	      hA(i, j) = MAGMA_Z_CNJG( hA(j,i) );
            }
        }
        lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda );

	/* Warm up to measure the performance */
	magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue );
	magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue );

        /* ====================================================================
           Performs operation using MAGMA 
           =================================================================== */
        magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue );
        gpu_time = get_time();
        magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue );
        gpu_time = get_time() - gpu_time;
        if (info != 0)
            printf( "magma_zpotrf had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Performs operation using LAPACK 
           =================================================================== */
        cpu_time = get_time();
        lapackf77_zpotrf( MagmaUpperStr, &N, hA, &lda, &info );
        cpu_time = get_time() - cpu_time;
        if (info != 0)
            printf( "lapackf77_zpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        magma_zgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue );
        matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work);
        blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione);
        diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n", 
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE( hA );
    TESTING_FREE_HOST( hR );
    TESTING_FREE_DEV( dA );
    magma_queue_destroy( queue );
    magma_finalize();
}

Exemplo n.º 10

0

Exibir arquivo

extern "C" magma_int_t
magma_zgeev_m(
    char jobvl, char jobvr, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *W,
    magmaDoubleComplex *vl, magma_int_t ldvl,
    magmaDoubleComplex *vr, magma_int_t ldvr,
    magmaDoubleComplex *work, magma_int_t lwork,
    double *rwork, magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
                     A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
                  u(j)**H * A = lambda(j) * u(j)**H
    where u(j)**H denotes the conjugate transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    =========
    JOBVL   (input) CHARACTER*1
            = 'N': left eigenvectors of A are not computed;
            = 'V': left eigenvectors of are computed.

    JOBVR   (input) CHARACTER*1
            = 'N': right eigenvectors of A are not computed;
            = 'V': right eigenvectors of A are computed.

    N       (input) INTEGER
            The order of the matrix A. N >= 0.

    A       (input/output) COMPLEX*16 array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    W       (output) COMPLEX*16 array, dimension (N)
            W contains the computed eigenvalues.

    VL      (output) COMPLEX*16 array, dimension (LDVL,N)
            If JOBVL = 'V', the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = 'N', VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    LDVL    (input) INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = 'V', LDVL >= N.

    VR      (output) COMPLEX*16 array, dimension (LDVR,N)
            If JOBVR = 'V', the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = 'N', VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    LDVR    (input) INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = 'V', LDVR >= N.

    WORK    (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= (1+nb)*N.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    RWORK   (workspace) DOUBLE PRECISION array, dimension (2*N)

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements and i+1:N of W contain eigenvalues which have
                  converged.
    =====================================================================    */

    #define vl(i,j)  (vl + (i) + (j)*ldvl)
    #define vr(i,j)  (vr + (i) + (j)*ldvr)
    
    magma_int_t c_one  = 1;
    magma_int_t c_zero = 0;
    
    double d__1, d__2;
    magmaDoubleComplex z__1, z__2;
    magmaDoubleComplex tmp;
    double scl;
    double dum[1], eps;
    double anrm, cscale, bignum, smlnum;
    magma_int_t i, k, ilo, ihi;
    magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, i__1, i__2, nb;
    magma_int_t scalea, minwrk, irwork, lquery, wantvl, wantvr, select[1];

    char side[2]   = {0, 0};
    char jobvl_[2] = {jobvl, 0};
    char jobvr_[2] = {jobvr, 0};

    irwork = 0;
    *info = 0;
    lquery = lwork == -1;
    wantvl = lapackf77_lsame( jobvl_, "V" );
    wantvr = lapackf77_lsame( jobvr_, "V" );
    if (! wantvl && ! lapackf77_lsame( jobvl_, "N" )) {
        *info = -1;
    } else if (! wantvr && ! lapackf77_lsame( jobvr_, "N" )) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -8;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -10;
    }

    /* Compute workspace */
    nb = magma_get_zgehrd_nb( n );
    if (*info == 0) {
        minwrk = (1+nb)*n;
        work[0] = MAGMA_Z_MAKE( minwrk, 0 );

        if (lwork < minwrk && ! lquery) {
            *info = -12;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    #if defined(Version3) || defined(Version4) || defined(Version5)
    magmaDoubleComplex *dT;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT, nb*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    #endif
    #if defined(Version4) || defined(Version5)
    magmaDoubleComplex *T;
    if (MAGMA_SUCCESS != magma_zmalloc_cpu( &T, nb*n )) {
        magma_free( dT );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    #endif

    /* Get machine constants */
    eps    = lapackf77_dlamch( "P" );
    smlnum = lapackf77_dlamch( "S" );
    bignum = 1. / smlnum;
    lapackf77_dlabad( &smlnum, &bignum );
    smlnum = magma_dsqrt( smlnum ) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_zlange( "M", &n, &n, A, &lda, dum );
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_zlascl( "G", &c_zero, &c_zero, &anrm, &cscale, &n, &n, A, &lda, &ierr );
    }

    /* Balance the matrix
     * (CWorkspace: none)
     * (RWorkspace: need N) */
    ibal = 0;
    lapackf77_zgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr );

    /* Reduce to upper Hessenberg form
     * (CWorkspace: need 2*N, prefer N + N*NB)
     * (RWorkspace: none) */
    itau = 0;
    iwrk = itau + n;
    liwrk = lwork - iwrk;

    #if defined(Version1)
        // Version 1 - LAPACK
        lapackf77_zgehrd( &n, &ilo, &ihi, A, &lda,
                          &work[itau], &work[iwrk], &liwrk, &ierr );
    #elif defined(Version2)
        // Version 2 - LAPACK consistent HRD
        magma_zgehrd2( n, ilo, ihi, A, lda,
                       &work[itau], &work[iwrk], &liwrk, &ierr );
    #elif defined(Version3)
        // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored,
        magma_zgehrd( n, ilo, ihi, A, lda,
                      &work[itau], &work[iwrk], liwrk, dT, &ierr );
    #elif defined(Version4) || defined(Version5)
        // Version 4 - Multi-GPU, T on host
        magma_zgehrd_m( n, ilo, ihi, A, lda,
                        &work[itau], &work[iwrk], liwrk, T, &ierr );
        magma_zsetmatrix( nb, n, T, nb, dT, nb );
    #endif

    if (wantvl) {
        /* Want left eigenvectors
         * Copy Householder vectors to VL */
        side[0] = 'L';
        lapackf77_zlacpy( MagmaLowerStr, &n, &n, A, &lda, vl, &ldvl );

        /* Generate unitary matrix in VL
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: none) */
        #if defined(Version1) || defined(Version2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, vl, &ldvl, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(Version3) || defined(Version4)
            // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
            magma_zunghr( n, ilo, ihi, vl, ldvl, &work[itau], dT, nb, &ierr );
        #elif defined(Version5)
            // Version 5 - Multi-GPU, T on host
            magma_zunghr_m( n, ilo, ihi, vl, ldvl, &work[itau], T, nb, &ierr );
        #endif

        /* Perform QR iteration, accumulating Schur vectors in VL
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: none) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, W,
                          vl, &ldvl, &work[iwrk], &liwrk, info );

        if (wantvr) {
            /* Want left and right eigenvectors
             * Copy Schur vectors to VR */
            side[0] = 'B';
            lapackf77_zlacpy( "F", &n, &n, vl, &ldvl, vr, &ldvr );
        }
    }
    else if (wantvr) {
        /* Want right eigenvectors
         * Copy Householder vectors to VR */
        side[0] = 'R';
        lapackf77_zlacpy( "L", &n, &n, A, &lda, vr, &ldvr );

        /* Generate unitary matrix in VR
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: none) */
        #if defined(Version1) || defined(Version2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, vr, &ldvr, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(Version3) || defined(Version4)
            // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
            magma_zunghr( n, ilo, ihi, vr, ldvr, &work[itau], dT, nb, &ierr );
        #elif defined(Version5)
            // Version 5 - Multi-GPU, T on host
            magma_zunghr_m( n, ilo, ihi, vr, ldvr, &work[itau], T, nb, &ierr );
        #endif

        /* Perform QR iteration, accumulating Schur vectors in VR
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: none) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, W,
                          vr, &ldvr, &work[iwrk], &liwrk, info );
    }
    else {
        /* Compute eigenvalues only
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: none) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, W,
                          vr, &ldvr, &work[iwrk], &liwrk, info );
    }

    /* If INFO > 0 from ZHSEQR, then quit */
    if (*info > 0) {
        goto CLEANUP;
    }

    if (wantvl || wantvr) {
        /* Compute left and/or right eigenvectors
         * (CWorkspace: need 2*N)
         * (RWorkspace: need 2*N) */
        irwork = ibal + n;
        lapackf77_ztrevc( side, "B", select, &n, A, &lda, vl, &ldvl,
                          vr, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr );
    }

    if (wantvl) {
        /* Undo balancing of left eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n,
                          vl, &ldvl, &ierr );

        /* Normalize left eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / cblas_dznrm2( n, vl(0,i), 1 );
            cblas_zdscal( n, scl, vl(0,i), 1 );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *vl(k,i) );
                d__2 = MAGMA_Z_IMAG( *vl(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = cblas_idamax( n, &rwork[irwork], 1 );
            z__2 = MAGMA_Z_CNJG( *vl(k,i) );
            d__1 = magma_dsqrt( rwork[irwork + k] );
            MAGMA_Z_DSCALE( z__1, z__2, d__1 );
            tmp = z__1;
            cblas_zscal( n, CBLAS_SADDR(tmp), vl(0,i), 1 );
            d__1 = MAGMA_Z_REAL( *vl(k,i) );
            z__1 = MAGMA_Z_MAKE( d__1, 0 );
            *vl(k,i) = z__1;
        }
    }

    if (wantvr) {
        /* Undo balancing of right eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n,
                          vr, &ldvr, &ierr );

        /* Normalize right eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / cblas_dznrm2( n, vr(0,i), 1 );
            cblas_zdscal( n, scl, vr(0,i), 1 );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *vr(k,i) );
                d__2 = MAGMA_Z_IMAG( *vr(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = cblas_idamax( n, &rwork[irwork], 1 );
            z__2 = MAGMA_Z_CNJG( *vr(k,i) );
            d__1 = magma_dsqrt( rwork[irwork + k] );
            MAGMA_Z_DSCALE( z__1, z__2, d__1 );
            tmp = z__1;
            cblas_zscal( n, CBLAS_SADDR(tmp), vr(0,i), 1 );
            d__1 = MAGMA_Z_REAL( *vr(k,i) );
            z__1 = MAGMA_Z_MAKE( d__1, 0 );
            *vr(k,i) = z__1;
        }
    }

CLEANUP:
    /* Undo scaling if necessary */
    if (scalea) {
        i__1 = n - (*info);
        i__2 = max( n - (*info), 1 );
        lapackf77_zlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                          W + (*info), &i__2, &ierr );
        if (*info > 0) {
            i__1 = ilo - 1;
            lapackf77_zlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                              W, &n, &ierr );
        }
    }

    #if defined(Version3) || defined(Version4) || defined(Version5)
    magma_free( dT );
    #endif
    #if defined(Version4) || defined(Version5)
    magma_free_cpu( T );
    #endif
    
    return *info;
} /* magma_zgeev */

Exemplo n.º 11

0

Exibir arquivo

Arquivo: zgeev.cpp Projeto: cjy7117/FT-MAGMA

/**
    Purpose
    -------
    ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
        A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
        u(j)**H * A = lambda(j) * u(j)**H
    where u(j)**H denotes the conjugate transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    ---------
    @param[in]
    jobvl   magma_vec_t
      -     = MagmaNoVec:        left eigenvectors of A are not computed;
      -     = MagmaVec:          left eigenvectors of are computed.

    @param[in]
    jobvr   magma_vec_t
      -     = MagmaNoVec:        right eigenvectors of A are not computed;
      -     = MagmaVec:          right eigenvectors of A are computed.

    @param[in]
    n       INTEGER
            The order of the matrix A. N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    w       COMPLEX_16 array, dimension (N)
            w contains the computed eigenvalues.

    @param[out]
    VL      COMPLEX_16 array, dimension (LDVL,N)
            If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = MagmaNoVec, VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    @param[in]
    ldvl    INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = MagmaVec, LDVL >= N.

    @param[out]
    VR      COMPLEX_16 array, dimension (LDVR,N)
            If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = MagmaNoVec, VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    @param[in]
    ldvr    INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = MagmaVec, LDVR >= N.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= (1+nb)*N.
            For optimal performance, LWORK >= (1+2*nb)*N.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param
    rwork   (workspace) DOUBLE PRECISION array, dimension (2*N)

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements and i+1:N of w contain eigenvalues which have
                  converged.

    @ingroup magma_zgeev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zgeev(
    magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    #ifdef COMPLEX
    magmaDoubleComplex *w,
    #else
    double *wr, double *wi,
    #endif
    magmaDoubleComplex *VL, magma_int_t ldvl,
    magmaDoubleComplex *VR, magma_int_t ldvr,
    magmaDoubleComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork,
    #endif
    magma_int_t *info )
{
    #define VL(i,j)  (VL + (i) + (j)*ldvl)
    #define VR(i,j)  (VR + (i) + (j)*ldvr)
    
    const magma_int_t ione  = 1;
    const magma_int_t izero = 0;
    
    double d__1, d__2;
    magmaDoubleComplex tmp;
    double scl;
    double dum[1], eps;
    double anrm, cscale, bignum, smlnum;
    magma_int_t i, k, ilo, ihi;
    magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb;
    magma_int_t scalea, minwrk, optwrk, irwork, lquery, wantvl, wantvr, select[1];

    magma_side_t side = MagmaRight;

    magma_timer_t time_total=0, time_gehrd=0, time_unghr=0, time_hseqr=0, time_trevc=0, time_sum=0;
    magma_flops_t flop_total=0, flop_gehrd=0, flop_unghr=0, flop_hseqr=0, flop_trevc=0, flop_sum=0;
    timer_start( time_total );
    flops_start( flop_total );
    
    irwork = 0;
    *info = 0;
    lquery = (lwork == -1);
    wantvl = (jobvl == MagmaVec);
    wantvr = (jobvr == MagmaVec);
    if (! wantvl && jobvl != MagmaNoVec) {
        *info = -1;
    } else if (! wantvr && jobvr != MagmaNoVec) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -8;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -10;
    }

    /* Compute workspace */
    nb = magma_get_zgehrd_nb( n );
    if (*info == 0) {
        minwrk = (1+  nb)*n;
        optwrk = (1+2*nb)*n;
        work[0] = MAGMA_Z_MAKE( optwrk, 0 );

        if (lwork < minwrk && ! lquery) {
            *info = -12;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    #if defined(VERSION3)
    magmaDoubleComplex_ptr dT;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT, nb*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    #endif

    /* Get machine constants */
    eps    = lapackf77_dlamch( "P" );
    smlnum = lapackf77_dlamch( "S" );
    bignum = 1. / smlnum;
    lapackf77_dlabad( &smlnum, &bignum );
    smlnum = magma_dsqrt( smlnum ) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_zlange( "M", &n, &n, A, &lda, dum );
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_zlascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr );
    }

    /* Balance the matrix
     * (CWorkspace: none)
     * (RWorkspace: need N)
     *  - this space is reserved until after gebak */
    ibal = 0;
    lapackf77_zgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr );

    /* Reduce to upper Hessenberg form
     * (CWorkspace: need 2*N, prefer N + N*NB)
     * (RWorkspace: N)
     *  - including N reserved for gebal/gebak, unused by zgehrd */
    itau = 0;
    iwrk = itau + n;
    liwrk = lwork - iwrk;

    timer_start( time_gehrd );
    flops_start( flop_gehrd );
    #if defined(VERSION1)
        // Version 1 - LAPACK
        lapackf77_zgehrd( &n, &ilo, &ihi, A, &lda,
                          &work[itau], &work[iwrk], &liwrk, &ierr );
    #elif defined(VERSION2)
        // Version 2 - LAPACK consistent HRD
        magma_zgehrd2( n, ilo, ihi, A, lda,
                       &work[itau], &work[iwrk], liwrk, &ierr );
    #elif defined(VERSION3)
        // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored,
        magma_zgehrd( n, ilo, ihi, A, lda,
                      &work[itau], &work[iwrk], liwrk, dT, &ierr );
    #endif
    time_sum += timer_stop( time_gehrd );
    flop_sum += flops_stop( flop_gehrd );

    if (wantvl) {
        /* Want left eigenvectors
         * Copy Householder vectors to VL */
        side = MagmaLeft;
        lapackf77_zlacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl );

        /* Generate unitary matrix in VL
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zunghr */
        timer_start( time_unghr );
        flops_start( flop_unghr );
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_zunghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr );
        #endif
        time_sum += timer_stop( time_unghr );
        flop_sum += flops_stop( flop_unghr );
        
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        /* Perform QR iteration, accumulating Schur vectors in VL
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w,
                          VL, &ldvl, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );

        if (wantvr) {
            /* Want left and right eigenvectors
             * Copy Schur vectors to VR */
            side = MagmaBothSides;
            lapackf77_zlacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr );
        }
    }
    else if (wantvr) {
        /* Want right eigenvectors
         * Copy Householder vectors to VR */
        side = MagmaRight;
        lapackf77_zlacpy( "L", &n, &n, A, &lda, VR, &ldvr );

        /* Generate unitary matrix in VR
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zunghr */
        timer_start( time_unghr );
        flops_start( flop_unghr );
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_zunghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr );
        #endif
        time_sum += timer_stop( time_unghr );
        flop_sum += flops_stop( flop_unghr );

        /* Perform QR iteration, accumulating Schur vectors in VR
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w,
                          VR, &ldvr, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );
    }
    else {
        /* Compute eigenvalues only
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, w,
                          VR, &ldvr, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );
    }

    /* If INFO > 0 from ZHSEQR, then quit */
    if (*info > 0) {
        goto CLEANUP;
    }

    timer_start( time_trevc );
    flops_start( flop_trevc );
    if (wantvl || wantvr) {
        /* Compute left and/or right eigenvectors
         * (CWorkspace: need 2*N)
         * (RWorkspace: need 2*N)
         *  - including N reserved for gebal/gebak, unused by ztrevc */
        irwork = ibal + n;
        #if TREVC_VERSION == 1
        lapackf77_ztrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl,
                          VR, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 2
        liwrk = lwork - iwrk;
        lapackf77_ztrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl,
                           VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 3
        magma_ztrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                       VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 4
        magma_ztrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                          VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 5
        magma_ztrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                              VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #else
        #error Unknown TREVC_VERSION
        #endif
    }
    time_sum += timer_stop( time_trevc );
    flop_sum += flops_stop( flop_trevc );

    if (wantvl) {
        /* Undo balancing of left eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n,
                          VL, &ldvl, &ierr );

        /* Normalize left eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / magma_cblas_dznrm2( n, VL(0,i), 1 );
            blasf77_zdscal( &n, &scl, VL(0,i), &ione );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *VL(k,i) );
                d__2 = MAGMA_Z_IMAG( *VL(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = blasf77_idamax( &n, &rwork[irwork], &ione ) - 1;  // subtract 1; k is 0-based
            tmp = MAGMA_Z_CNJG( *VL(k,i) ) / magma_dsqrt( rwork[irwork + k] );
            blasf77_zscal( &n, &tmp, VL(0,i), &ione );
            *VL(k,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL( *VL(k,i) ), 0 );
        }
    }

    if (wantvr) {
        /* Undo balancing of right eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n,
                          VR, &ldvr, &ierr );

        /* Normalize right eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / magma_cblas_dznrm2( n, VR(0,i), 1 );
            blasf77_zdscal( &n, &scl, VR(0,i), &ione );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *VR(k,i) );
                d__2 = MAGMA_Z_IMAG( *VR(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = blasf77_idamax( &n, &rwork[irwork], &ione ) - 1;  // subtract 1; k is 0-based
            tmp = MAGMA_Z_CNJG( *VR(k,i) ) / magma_dsqrt( rwork[irwork + k] );
            blasf77_zscal( &n, &tmp, VR(0,i), &ione );
            *VR(k,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL( *VR(k,i) ), 0 );
        }
    }

CLEANUP:
    /* Undo scaling if necessary */
    if (scalea) {
        // converged eigenvalues, stored in WR[i+1:n] and WI[i+1:n] for i = INFO
        magma_int_t nval = n - (*info);
        magma_int_t ld   = max( nval, 1 );
        lapackf77_zlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w + (*info), &ld, &ierr );
        if (*info > 0) {
            // first ilo columns were already upper triangular,
            // so the corresponding eigenvalues are also valid.
            nval = ilo - 1;
            lapackf77_zlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w, &n, &ierr );
        }
    }

    #if defined(VERSION3)
    magma_free( dT );
    #endif
    
    timer_stop( time_total );
    flops_stop( flop_total );
    timer_printf( "dgeev times n %5d, gehrd %7.3f, unghr %7.3f, hseqr %7.3f, trevc %7.3f, total %7.3f, sum %7.3f\n",
                  (int) n, time_gehrd, time_unghr, time_hseqr, time_trevc, time_total, time_sum );
    timer_printf( "dgeev flops n %5d, gehrd %7lld, unghr %7lld, hseqr %7lld, trevc %7lld, total %7lld, sum %7lld\n",
                  (int) n, flop_gehrd, flop_unghr, flop_hseqr, flop_trevc, flop_total, flop_sum );

    work[0] = MAGMA_Z_MAKE( (double) optwrk, 0. );

    return *info;
} /* magma_zgeev */

Exemplo n.º 12

0

Exibir arquivo

Arquivo: testing_zpotrf_mgpu.cpp Projeto: kjbartel/clmagma

int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs];
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 };
    
    magma_int_t i, j, k, info;
    magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
   
    magma_int_t num_gpus0 = 1, num_gpus, flag = 0;
    int nb, mb, n_local, nk;

    magma_uplo_t uplo = MagmaLower;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0){
                N = atoi(argv[++i]);
                if (N>0) {
                    size[0] = size[9] = N;
                    flag = 1;
                }else exit(1);
            }
            if(strcmp("-NGPU", argv[i])==0)
                num_gpus0 = atoi(argv[++i]);
            if(strcmp("-UPLO", argv[i])==0){
                if(strcmp("L", argv[++i])==0){
                    uplo = MagmaLower;
                }else{
                    uplo = MagmaUpper;
                }            
            }
        }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0);
    }

    /* looking for max. ldda */
    ldda = 0;
    n2 = 0;
    for(i=0;i<10;i++){
        N = size[i];
        nb = magma_get_zpotrf_nb(N);
        mb = nb;
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
        }else{
            num_gpus = num_gpus0;
        }
        n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb);
        if(n_local > ldda) ldda = n_local;
        if(n2 < N*N) n2 = N*N;
        if(flag != 0) break;
    }

     /* Allocate host memory for the matrix */
    TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 );
    TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 );

    /* Initialize */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    //magma_queue_t  queues[MagmaMaxGPUs];
    magma_device_t devices[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    magma_int_t err;
    magma_init();
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    for(i=0;i<num_gpus;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
    }

    printf("each buffer size: %d\n", ldda);
    /* allocate local matrix on Buffers */
    for(i=0; i<num_gpus0; i++){
        TESTING_MALLOC_DEV( d_lA[i], magmaDoubleComplex, ldda );
    }

    
    printf("\n\n");
    printf("Using GPUs: %d\n", num_gpus0);
    if(uplo == MagmaUpper){
        printf("\n  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0);
    }else{
        printf("\n  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0);
    }
            printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            h_A(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(h_A(i,i)) + N, 0 );
            for( int j = 0; j < i; ++j ) {
                h_A(i, j) = MAGMA_Z_CNJG( h_A(j,i) );
            }
        }
        lapackf77_zlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda );

        /* Warm up to measure the performance */
        nb = magma_get_zpotrf_nb(N);
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
            printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus);
        }else{
            num_gpus = num_gpus0;
        }
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }

        magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        
        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }
    
        gpu_time = magma_wtime();
        magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_zpotrf had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
       
        /* gather matrix from gpus */
        if(uplo==MagmaUpper){
            // Upper
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zgetmatrix( N, nk,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda,
                                 &h_R[j*lda], lda, queues[2*k]);
            }
        }else{
            // Lower
            for(j=0; j<N; j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zgetmatrix( nk, N, 
                            d_lA[k], (j/(nb*num_gpus)*nb), ldda, 
                            &h_R[j], lda, queues[2*k] );
            }
        }

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        if(uplo == MagmaLower){
            lapackf77_zpotrf( MagmaLowerStr, &N, h_A, &lda, &info );
        }else{
            lapackf77_zpotrf( MagmaUpperStr, &N, h_A, &lda, &info );
        }
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_zpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work);
        blasf77_zaxpy(&n2, &mz_one, h_A, &ione, h_R, &ione);
        diffnorm = lapackf77_zlange("f", &N, &N, h_R, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n",
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (flag != 0)
            break;
    }

    /* clean up */
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_R );
    for(i=0;i<num_gpus;i++){
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy( queues[2*i]   );
        magma_queue_destroy( queues[2*i+1] );
    }
    magma_finalize();
}

Exemplo n.º 13

0

Exibir arquivo

Arquivo: zbombard_symm_merge.cpp Projeto: xulunfan/magma

extern "C" magma_int_t
magma_zbombard_merge(
    magma_z_matrix A, magma_z_matrix b, 
    magma_z_matrix *x, magma_z_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;
    
        // queue variables
    const magma_queue_t squeue = 0;    // synchronous kernel queues
    const magma_int_t nqueues = 3;     // number of queues
    magma_queue_t queues[nqueues];    
    magma_int_t q1flag = 0;
    
        // set asynchronous kernel queues
    //printf("%% Kernel queues: (orig, queue) = (%p, %p)\n", (void *)orig_queue, (void *)queue);
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    if ( queue != squeue ) {
        queues[1] = queue;
        q1flag = 0;
    } else {
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &(queues[1]) );
        queue = queues[1];
        q1flag = 1;
    }
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &(queues[2]) );
    for (int i = 0; i < nqueues; ++i ) {
        ; //printf("Kernel queue #%d = %p\n", i, (void *)queues[i]);
    }

    
    // 1=QMR, 2=CGS, 3+BiCGSTAB
    magma_int_t flag = 0;
    
    int mdot = 1;
    
    // prepare solver feedback
    solver_par->solver = Magma_BOMBARD;
    solver_par->numiter = 0;
    
    // constants
    const magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    
    // solver variables
    double nom0, r0, res, Q_res, C_res, B_res, nomb;
    
    //QMR
    magmaDoubleComplex Q_rho = c_one, Q_rho1 = c_one, Q_eta = -c_one , Q_pds = c_one, 
                        Q_thet = c_one, Q_thet1 = c_one, Q_epsilon = c_one, 
                        Q_beta = c_one, Q_delta = c_one, Q_pde = c_one, Q_rde = c_one,
                        Q_gamm = c_one, Q_gamm1 = c_one, Q_psi = c_one;
    //CGS
    magmaDoubleComplex C_rho, C_rho_l = c_one, C_alpha, C_beta;
    //BiCGSTAB
    magmaDoubleComplex B_alpha, B_beta, B_omega, B_rho_old, B_rho_new;
    
    magma_int_t dofs = A.num_rows* b.num_cols;

    // need to transpose the matrix
    
    // GPU workspace
    // overall initial residual
    magma_z_matrix r_tld={Magma_CSR};
    
    // QMR
    magma_z_matrix AT = {Magma_CSR}, Q_r={Magma_CSR},  Q_x={Magma_CSR},
                    Q_v={Magma_CSR}, Q_w={Magma_CSR}, Q_wt={Magma_CSR},
                    Q_d={Magma_CSR}, Q_s={Magma_CSR}, Q_z={Magma_CSR}, Q_q={Magma_CSR}, 
                    Q_p={Magma_CSR}, Q_pt={Magma_CSR}, Q_y={Magma_CSR};
    // CGS
    magma_z_matrix C_r={Magma_CSR}, C_rt={Magma_CSR}, C_x={Magma_CSR},
                    C_p={Magma_CSR}, C_q={Magma_CSR}, C_u={Magma_CSR}, C_v={Magma_CSR},  C_t={Magma_CSR},
                    C_p_hat={Magma_CSR}, C_q_hat={Magma_CSR}, C_u_hat={Magma_CSR}, C_v_hat={Magma_CSR};
    //BiCGSTAB
    magma_z_matrix B_r={Magma_CSR}, B_x={Magma_CSR}, B_p={Magma_CSR}, B_v={Magma_CSR}, 
                    B_s={Magma_CSR}, B_t={Magma_CSR};
                    
    // multi-vector for block-SpMV 
    magma_z_matrix SpMV_in_1={Magma_CSR}, SpMV_out_1={Magma_CSR}, 
                    SpMV_in_2={Magma_CSR}, SpMV_out_2={Magma_CSR}; 
                    
    // workspace for zmzdotc
    magma_z_matrix d1={Magma_CSR},  d2={Magma_CSR}, skp={Magma_CSR};

                    
    CHECK( magma_zvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    
    CHECK( magma_zvinit( &SpMV_in_1,  Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue ));
    CHECK( magma_zvinit( &SpMV_out_1, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue ));
    CHECK( magma_zvinit( &SpMV_in_2,  Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue ));
    CHECK( magma_zvinit( &SpMV_out_2, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue ));
    
    CHECK( magma_zvinit( &d1, Magma_DEV, A.num_rows, b.num_cols * 4, c_zero, queue ));
    CHECK( magma_zvinit( &d2, Magma_DEV, A.num_rows, b.num_cols * 4, c_zero, queue ));
    CHECK( magma_zvinit( &skp, Magma_CPU, 4, 1, c_zero, queue ));
    
    // QMR
    CHECK( magma_zvinit( &Q_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_w, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_wt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &Q_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    // QMR
    CHECK( magma_zvinit( &C_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &C_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    // BiCGSTAB
    CHECK( magma_zvinit( &B_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &B_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &B_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &B_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &B_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &B_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    
    
    
    // solver setup
    CHECK(  magma_zresidualvec( A, b, *x, &r_tld, &nom0, queue));
    solver_par->init_res = nom0;
    res = nom0;
    
    // QMR
    magma_zcopy( dofs, r_tld.dval, 1, Q_r.dval, 1, queue );   
    magma_zcopy( dofs, r_tld.dval, 1, Q_y.dval, 1, queue );   
    magma_zcopy( dofs, r_tld.dval, 1, Q_v.dval, 1, queue );  
    magma_zcopy( dofs, r_tld.dval, 1, Q_wt.dval, 1, queue );   
    magma_zcopy( dofs, r_tld.dval, 1, Q_z.dval, 1, queue ); 
    magma_zcopy( dofs, x->dval, 1, Q_x.dval, 1, queue ); 
    // transpose the matrix
    magma_zmtransposeconjugate( A, &AT, queue );
    
    // CGS
    magma_zcopy( dofs, r_tld.dval, 1, C_r.dval, 1, queue );   
    magma_zcopy( dofs, x->dval, 1, C_x.dval, 1, queue ); 
    
    // BiCGSTAB
    magma_zcopy( dofs, r_tld.dval, 1, B_r.dval, 1, queue );   
    magma_zcopy( dofs, x->dval, 1, B_x.dval, 1, queue ); 
    CHECK( magma_z_spmv( c_one, A, B_r, c_zero, B_v, queue ));     
    magma_zcopy( dofs, B_v.dval, 1, SpMV_out_1.dval+2*dofs, 1, queue );
    
    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }       
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    Q_psi = magma_zsqrt( magma_zdotc( dofs, Q_z.dval, 1, Q_z.dval, 1), queue );
    Q_rho = magma_zsqrt( magma_zdotc( dofs, Q_y.dval, 1, Q_y.dval, 1), queue );
    
    // BiCGSTAB
    B_rho_new = magma_zdotc( dofs, B_r.dval, 1, B_r.dval, 1, queue );            
    B_rho_old = B_omega = B_alpha = MAGMA_Z_MAKE( 1.0, 0. );
    
        // v = y / rho
        // y = y / rho
        // w = wt / psi
        // z = z / psi
    magma_zqmr_1(  
    b.num_rows, 
    b.num_cols, 
    Q_rho,
    Q_psi,
    Q_y.dval, 
    Q_z.dval,
    Q_v.dval,
    Q_w.dval,
    queue );
    
    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );
    
    solver_par->numiter = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        
        if(mdot == 0){   
                //QMR: delta = z' * y;
            Q_delta = magma_zdotc( dofs, Q_z.dval, 1, Q_y.dval, 1, queue );
            
                //CGS: rho = r' * r_tld
            C_rho = magma_zdotc( dofs, C_r.dval, 1, r_tld.dval, 1, queue );
            
                // BiCGSTAB
            B_rho_old = B_rho_new;    
            B_rho_new = magma_zdotc( dofs, r_tld.dval, 1, B_r.dval, 1, queue );  // rho=<rr,r>
            B_beta = B_rho_new/B_rho_old * B_alpha/B_omega;   // beta=rho/rho_old *alpha/omega

        }else{
            B_rho_old = B_rho_new; 
            magma_zmdotc4(
            b.num_rows,  
            Q_z.dval, 
            Q_y.dval,
            r_tld.dval, 
            C_r.dval,
            r_tld.dval, 
            B_r.dval,
            r_tld.dval, 
            B_r.dval,
            d1.dval,
            d2.dval,
            skp.val,
            queue );
            
            Q_delta = skp.val[0];
            C_rho = skp.val[1];
            B_rho_new = skp.val[2];
            B_beta = B_rho_new/B_rho_old * B_alpha/B_omega;
        }
        
        if( solver_par->numiter == 1 ){
                //QMR: p = y;
                //QMR: q = z;
            magma_zcopy( dofs, Q_y.dval, 1, SpMV_in_1.dval, 1, queue );
            magma_zcopy( dofs, Q_z.dval, 1, SpMV_in_2.dval, 1, queue );
            
                //QMR: u = r;
                //QMR: p = r;
            magma_zcgs_2(  
            b.num_rows, 
            b.num_cols, 
            C_r.dval,
            C_u.dval,
            SpMV_in_1.dval+dofs,
            queue );
        }
        else{
            Q_pde = Q_psi * Q_delta / Q_epsilon;
            Q_rde = Q_rho * MAGMA_Z_CNJG(Q_delta/Q_epsilon);
            
            C_beta = C_rho / C_rho_l;  
            
                //QMR p = y - pde * p
                //QMR q = z - rde * q
            magma_zqmr_2(  
            b.num_rows, 
            b.num_cols, 
            Q_pde,
            Q_rde,
            Q_y.dval,
            Q_z.dval,
            SpMV_in_1.dval, 
            SpMV_in_2.dval, 
            queue );
            
                  //CGS: u = r + beta*q;
                  //CGS: p = u + beta*( q + beta*p );
            magma_zcgs_1(  
            b.num_rows, 
            b.num_cols, 
            C_beta,
            C_r.dval,
            C_q.dval, 
            C_u.dval,
            SpMV_in_1.dval+dofs,
            queue );
        }
            // BiCGSTAB: p = r + beta * ( p - omega * v )
        magma_zbicgstab_1(  
        b.num_rows, 
        b.num_cols, 
        B_beta,
        B_omega,
        B_r.dval, 
        SpMV_out_1.dval+2*dofs,
        SpMV_in_1.dval+2*dofs,
        queue );
        /*
        //QMR
        CHECK( magma_z_spmv( c_one, A, Q_p, c_zero, Q_pt, queue ));
        //CGS
        CHECK( magma_z_spmv( c_one, A, C_p, c_zero, C_v_hat, queue ));
        // BiCGSTAB
        CHECK( magma_z_spmv( c_one, A, B_p, c_zero, B_v, queue )); 
        */
        
        // gather everything for a block-SpMV
        //magma_zcopy( dofs, Q_p.dval, 1, SpMV_in_1.dval          , 1, queue );
        //magma_zcopy( dofs, C_p.dval, 1, SpMV_in_1.dval+dofs     , 1, queue );
        //magma_zcopy( dofs, B_p.dval, 1, SpMV_in_1.dval+2*dofs   , 1, queue );
        
        // block SpMV
        CHECK( magma_z_spmv( c_one, A, SpMV_in_1, c_zero, SpMV_out_1, queue ));
        
        // scatter results
        //magma_zcopy( dofs, SpMV_out_1.dval          , 1, Q_pt.dval, 1, queue );
        //magma_zcopy( dofs, SpMV_out_1.dval+dofs     , 1, C_v_hat.dval, 1, queue );
        //magma_zcopy( dofs, SpMV_out_1.dval+2*dofs   , 1, B_v.dval, 1, queue );

        if( mdot == 0 ) {      
                //QMR: epsilon = q' * pt;
            Q_epsilon = magma_zdotc( dofs, SpMV_in_2.dval, 1, SpMV_out_1.dval, 1, queue );
            Q_beta = Q_epsilon / Q_delta;
                //CGS: alpha = r_tld' * v_hat
            C_alpha = C_rho / magma_zdotc( dofs, r_tld.dval, 1, SpMV_out_1.dval+dofs, 1, queue );
            C_rho_l = C_rho;
                //BiCGSTAB
            B_alpha = B_rho_new / magma_zdotc( dofs, r_tld.dval, 1, SpMV_out_1.dval+2*dofs, 1, queue );
        }else{
            magma_zmdotc4(
            b.num_rows,  
            SpMV_in_2.dval, 
            SpMV_out_1.dval,
            r_tld.dval, 
            SpMV_out_1.dval+dofs,
            r_tld.dval, 
            SpMV_out_1.dval+2*dofs,
            r_tld.dval, 
            SpMV_out_1.dval+2*dofs,
            d1.dval,
            d2.dval,
            skp.val,
            queue );
            
            Q_epsilon = skp.val[0];
            Q_beta = Q_epsilon / Q_delta;
            C_alpha = C_rho / skp.val[1];
            C_rho_l = C_rho;
            B_alpha = B_rho_new / skp.val[2];
        }
        
            //QMR: v = pt - beta * v
            //QMR: y = v
        magma_zqmr_3(  
        b.num_rows, 
        b.num_cols, 
        Q_beta,
        SpMV_out_1.dval,
        Q_v.dval,
        Q_y.dval,
        queue );
        
            //CGS: q = u - alpha v_hat
            //CGS: t = u + q
        magma_zcgs_3(  
        b.num_rows, 
        b.num_cols, 
        C_alpha,
        SpMV_out_1.dval+dofs,
        C_u.dval, 
        C_q.dval,
        SpMV_in_2.dval+dofs, 
        queue );
        
            // BiCGSTAB: s = r - alpha v
        magma_zbicgstab_2(  
        b.num_rows, 
        b.num_cols, 
        B_alpha,
        B_r.dval,
        SpMV_out_1.dval+2*dofs,
        SpMV_in_2.dval+2*dofs, 
        queue );
            


            // gather everything for a block-SpMV
        //magma_zcopy( dofs, Q_q.dval, 1, SpMV_in_2.dval          , 1, queue );
        //magma_zcopy( dofs, C_t.dval, 1, SpMV_in_2.dval+dofs     , 1, queue );
        //magma_zcopy( dofs, B_s.dval, 1, SpMV_in_2.dval+2*dofs   , 1, queue );

            // block SpMV
        CHECK( magma_z_spmv( c_one, A, SpMV_in_2, c_zero, SpMV_out_2, queue ));

            // scatter results
        //magma_zcopy( dofs, SpMV_out_2.dval          , 1, Q_wt.dval, 1, queue );
        //magma_zcopy( dofs, SpMV_out_2.dval+dofs     , 1, C_rt.dval, 1, queue );
        //magma_zcopy( dofs, SpMV_out_2.dval+2*dofs   , 1, B_t.dval, 1, queue );        
                
        Q_rho1 = Q_rho;     
        if( mdot == 0 ) {
                //QMR rho = norm(y);
            Q_rho = magma_zsqrt( magma_zdotc( dofs, Q_y.dval, 1, Q_y.dval, 1), queue );
            

            // BiCGSTAB
            B_omega = magma_zdotc( dofs, SpMV_out_2.dval+2*dofs, 1, SpMV_in_2.dval+2*dofs, 1 )   // omega = <s,t>/<t,t>
                       / magma_zdotc( dofs, SpMV_out_2.dval+2*dofs, 1, SpMV_out_2.dval+2*dofs, 1, queue );
        }else{
            magma_zmdotc4(
            b.num_rows,  
            Q_y.dval, 
            Q_y.dval,
            Q_y.dval, 
            Q_y.dval,
            SpMV_out_2.dval+2*dofs, 
            SpMV_in_2.dval+2*dofs,
            SpMV_out_2.dval+2*dofs, 
            SpMV_out_2.dval+2*dofs,
            d1.dval,
            d2.dval,
            skp.val,
            queue );
            
            Q_rho = magma_zsqrt(skp.val[0]);
            B_omega = skp.val[2]/skp.val[3];
        }               
                   
        
        // QMR
        Q_thet1 = Q_thet;        
        Q_thet = Q_rho / (Q_gamm * MAGMA_Z_MAKE( MAGMA_Z_ABS(Q_beta), 0.0 ));
        Q_gamm1 = Q_gamm;        
        
        Q_gamm = c_one / magma_zsqrt(c_one + Q_thet*Q_thet);        
        Q_eta = - Q_eta * Q_rho1 * Q_gamm * Q_gamm / (Q_beta * Q_gamm1 * Q_gamm1);        
        
        if( solver_par->numiter == 1 ){
            
                //QMR: d = eta * p + pds * d;
                //QMR: s = eta * pt + pds * d;
                //QMR: x = x + d;
                //QMR: r = r - s;
            magma_zqmr_4(  
            b.num_rows, 
            b.num_cols, 
            Q_eta,
            SpMV_in_1.dval,
            SpMV_out_1.dval,
            Q_d.dval, 
            Q_s.dval, 
            Q_x.dval, 
            Q_r.dval, 
            queue );
        }
        else{

            Q_pds = (Q_thet1 * Q_gamm) * (Q_thet1 * Q_gamm);
            
                //QMR: d = eta * p + pds * d;
                //QMR: s = eta * pt + pds * d;
                //QMR: x = x + d;
                //QMR: r = r - s;
            magma_zqmr_5(  
            b.num_rows, 
            b.num_cols, 
            Q_eta,
            Q_pds,
            SpMV_in_1.dval,
            SpMV_out_1.dval,
            Q_d.dval, 
            Q_s.dval, 
            Q_x.dval, 
            Q_r.dval, 
            queue );
        }
        
        
        // CGS: r = r -alpha*A u_hat
        // CGS: x = x + alpha u_hat
        magma_zcgs_4(  
        b.num_rows, 
        b.num_cols, 
        C_alpha,
        SpMV_in_2.dval+dofs,
        SpMV_out_2.dval+dofs,
        C_x.dval, 
        C_r.dval,
        queue );
        
            // BiCGSTAB: x = x + alpha * p + omega * s
            // BiCGSTAB: r = s - omega * t
        magma_zbicgstab_3(  
        b.num_rows, 
        b.num_cols, 
        B_alpha,
        B_omega,
        SpMV_in_1.dval+2*dofs,
        SpMV_in_2.dval+2*dofs,
        SpMV_out_2.dval+2*dofs,
        B_x.dval,
        B_r.dval,
        queue );
        
        

        
        if( mdot == 0 ){
            Q_res = magma_dznrm2( dofs, Q_r.dval, 1, queue );
            C_res = magma_dznrm2( dofs, C_r.dval, 1, queue );
            B_res = magma_dznrm2( dofs, B_r.dval, 1, queue );
                //QMR: psi = norm(z);
            Q_psi = magma_zsqrt( magma_zdotc( dofs, Q_z.dval, 1, Q_z.dval, 1), queue );
            
        }else{
            magma_zmdotc4(
            b.num_rows,  
            Q_r.dval, 
            Q_r.dval,
            C_r.dval, 
            C_r.dval,
            B_r.dval, 
            B_r.dval,
            Q_z.dval, 
            Q_z.dval,
            d1.dval,
            d2.dval,
            skp.val,
            queue );
        
            Q_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[0]));
            C_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[1]));
            B_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[2]));
                //QMR: psi = norm(z);
            Q_psi = magma_zsqrt(skp.val[3]);
        }
        
        
            //QMR: v = y / rho
            //QMR: y = y / rho
            //QMR: w = wt / psi
            //QMR: z = z / psi    
            //QMR: wt = A' * q - beta' * w
            //QMR: no precond: z = wt
        magma_zqmr_6(  
        b.num_rows, 
        b.num_cols, 
        Q_beta,
        Q_rho,
        Q_psi,
        Q_y.dval, 
        Q_z.dval,
        Q_v.dval,
        Q_w.dval,
        SpMV_out_2.dval,
        queue );
        
          // printf(" %e   %e   %e\n", Q_res, C_res, B_res);
        if( Q_res < res ){
            res = Q_res;
            flag = 1;
        }
        if( C_res < res ){
            res = C_res;
            flag = 2;
        }
        if( B_res < res ){
            res = B_res;
            flag = 3;
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == c_zero ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
        
                

 
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );
        
    // copy back the best solver
    switch ( flag ) {
        case 1:
            printf("%% QMR fastest solver.\n");
            magma_zcopy( dofs, Q_x.dval, 1, x->dval, 1, queue ); 
            break;
       case 2:
            printf("%% CGS fastest solver.\n");
            magma_zcopy( dofs, C_x.dval, 1, x->dval, 1, queue ); 
            break;
       case 3:
            printf("%% BiCGSTAB fastest solver.\n");
            magma_zcopy( dofs, B_x.dval, 1, x->dval, 1, queue ); 
            break;
    }


    
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK(  magma_zresidualvec( A, b, *x, &r_tld, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == c_zero ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == c_zero ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    
cleanup:
    
    magma_zmfree(&r_tld, queue );
    
    magma_zmfree(&SpMV_in_1,  queue );
    magma_zmfree(&SpMV_out_1, queue );
    magma_zmfree(&SpMV_in_2,  queue );
    magma_zmfree(&SpMV_out_2, queue );
    
    magma_zmfree(&d1, queue );
    magma_zmfree(&d2,  queue );
    magma_zmfree(&skp, queue );
    
    magma_zmfree(&AT,  queue );
    
    // QMR
    magma_zmfree(&Q_r, queue );
    magma_zmfree(&Q_v,  queue );
    magma_zmfree(&Q_w,  queue );
    magma_zmfree(&Q_wt, queue );
    magma_zmfree(&Q_d,  queue );
    magma_zmfree(&Q_s,  queue );
    magma_zmfree(&Q_z,  queue );
    magma_zmfree(&Q_q,  queue );
    magma_zmfree(&Q_p,  queue );
    magma_zmfree(&Q_pt, queue );
    magma_zmfree(&Q_y,  queue );
    magma_zmfree(&Q_x,  queue );
    // CGS
    magma_zmfree(&C_r, queue );
    magma_zmfree(&C_rt, queue );
    magma_zmfree(&C_x, queue );
    magma_zmfree(&C_p, queue );
    magma_zmfree(&C_q, queue );
    magma_zmfree(&C_u, queue );
    magma_zmfree(&C_v, queue );
    magma_zmfree(&C_t, queue );
    magma_zmfree(&C_p_hat, queue );
    magma_zmfree(&C_q_hat, queue );
    magma_zmfree(&C_u_hat, queue );
    magma_zmfree(&C_v_hat, queue );
    // BiCGSTAB
    magma_zmfree(&B_r, queue );
    magma_zmfree(&B_x, queue );
    magma_zmfree(&B_p, queue );
    magma_zmfree(&B_v, queue );
    magma_zmfree(&B_s, queue );
    magma_zmfree(&B_t, queue );
    
    
    
    // destroy asynchronous queues
    magma_queue_destroy( queues[0] );
    if ( q1flag == 1 ) {
        magma_queue_destroy( queues[1] );
    }
    magma_queue_destroy( queues[2] );

    solver_par->info = info;
    return info;
}   /* magma_zbombard_merge */