Exemplo n.º 1
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaFloatComplex *TAU       = data -> TAU;
    magmaFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    magmaFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

#ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
#endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_setlapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    int check  = 0;
    int check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else
    {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if(my_core_id==0)
    {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
#endif

        magma_csetmatrix(n, n_gpu, E, lde, dE, ldde);
        magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
        magma_device_sync();

#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);
#endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
#ifdef ENABLE_TIMER
        if(my_core_id == 1)
            timeQcpu = magma_wtime();
#endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ctile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

#ifdef ENABLE_TIMER
        if(my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
        }
#endif

    } // END if my_core_id

#ifdef MAGMA_SETAFFINITY
    // unbind threads
    if (check == 0) {
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
Exemplo n.º 2
0
extern "C" magma_int_t
magma_ssyevdx_2stage(char jobz, char range, char uplo,
                     magma_int_t n,
                     float *a, magma_int_t lda,
                     float vl, float vu, magma_int_t il, magma_int_t iu,
                     magma_int_t *m, float *w,
                     float *work, magma_int_t lwork,
                     magma_int_t *iwork, magma_int_t liwork,
                     magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
    complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    RANGE   (input) CHARACTER*1
            = 'A': all eigenvalues will be found.
            = 'V': all eigenvalues in the half-open interval (VL,VU]
                   will be found.
            = 'I': the IL-th through IU-th eigenvalues will be found.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = 'V', then if INFO = 0, the first m columns
            of A contains the required
            orthonormal eigenvectors of the matrix A.
            If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
            or the upper triangle (if UPLO='U') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    VL      (input) REAL
    VU      (input) REAL
            If RANGE='V', the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = 'A' or 'I'.

    IL      (input) INTEGER
    IU      (input) INTEGER
            If RANGE='I', the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = 'A' or 'V'.

    M       (output) INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.

    W       (output) REAL array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.
            If N <= 1,                LWORK >= 1.
            If JOBZ  = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 2).
            If JOBZ  = 'V' and N > 1, LWORK >= LQ2 + 1 + 6*N + 2*N**2.
                                      where LQ2 is the size needed to store
                                      the Q2 matrix and is returned by
                                      MAGMA_BULGE_GET_LQ2.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            If N <= 1,                LIWORK >= 1.
            If JOBZ  = 'N' and N > 1, LIWORK >= 1.
            If JOBZ  = 'V' and N > 1, LIWORK >= 3 + 5*N.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = 'V', then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ===============
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.
    =====================================================================   */

    char uplo_[2] = {uplo, 0};
    char jobz_[2] = {jobz, 0};
    char range_[2] = {range, 0};
    float d_one  = 1.;
    magma_int_t ione = 1;
    magma_int_t izero = 0;

    float d__1;

    float eps;
    float anrm;
    magma_int_t imax;
    float rmin, rmax;
    float sigma;
    magma_int_t lwmin, liwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t iscale;
    float safmin;
    float bignum;
    float smlnum;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    float* dwork;

    /* determine the number of threads */
    magma_int_t threads = magma_get_numthreads();
    magma_setlapack_numthreads(threads);

    wantz = lapackf77_lsame(jobz_, MagmaVecStr);
    lower = lapackf77_lsame(uplo_, MagmaLowerStr);

    alleig = lapackf77_lsame( range_, "A" );
    valeig = lapackf77_lsame( range_, "V" );
    indeig = lapackf77_lsame( range_, "I" );

    lquery = lwork == -1 || liwork == -1;

    *info = 0;
    if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;
            }
        }
    }

    magma_int_t nb = magma_get_sbulge_nb(n, threads);
    magma_int_t Vblksiz = magma_sbulge_get_Vblksiz(n, nb, threads);

    magma_int_t ldt = Vblksiz;
    magma_int_t ldv = nb + Vblksiz;
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    magma_int_t lq2 = magma_sbulge_get_lq2(n, threads);

    if (wantz) {
        lwmin = lq2 + 1 + 6 * n + 2 * n * n;
        liwmin = 5 * n + 3;
    } else {
        lwmin = lq2 + n * (nb + 1);
        liwmin = 1;
    }

    work[0] = lwmin * (1. + lapackf77_slamch("Epsilon"));
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -14;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -16;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }
    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    if (n == 1) {
        w[0] = a[0];
        if (wantz) {
            a[0] = MAGMA_S_ONE;
        }
        return *info;
    }

#ifdef ENABLE_TIMER
    printf("using %d threads\n", threads);
#endif
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    magma_int_t ntiles = n/nb;
    if( ( ntiles < 2 ) || ( n <= 128 ) ){
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int ) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_ssyevd(jobz_, uplo_, &n, 
                        a, &lda, w, 
                        work, &lwork, 
                        iwork, &liwork, 
                        info);
        *m = n; 
        return *info;
    }

    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_ssqrt(smlnum);
    rmax = magma_ssqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_slansy("M", uplo_, &n, a, &lda, work);
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        lapackf77_slascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a,
                         &lda, info);
    }

    magma_int_t inde    = 0;
    magma_int_t indT2   = inde + n;
    magma_int_t indTAU2 = indT2  + blkcnt*ldt*Vblksiz;
    magma_int_t indV2   = indTAU2+ blkcnt*Vblksiz;
    magma_int_t indtau1 = indV2  + blkcnt*ldv*Vblksiz;
    magma_int_t indwrk  = indtau1+ n;
    magma_int_t indwk2  = indwrk + n * n;

    magma_int_t llwork = lwork - indwrk;
    magma_int_t llwrk2 = lwork - indwk2;

#ifdef ENABLE_TIMER
    magma_timestr_t start, st1, st2, end;
    start = get_current_time();
#endif

    float *dT1;

    if (MAGMA_SUCCESS != magma_smalloc( &dT1, n*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_ssytrd_sy2sb(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, dT1, threads, info);

#ifdef ENABLE_TIMER
    st1 = get_current_time();
    printf("  time ssytrd_sy2sb = %6.2f\n" , GetTimerValue(start,st1)/1000.);
#endif

    /* copy the input matrix into WORK(INDWRK) with band storage */
    /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/
    magma_int_t lda2 = 2*nb; //nb+1+(nb-1);
    float* A2 = &work[indwrk];
    memset(A2 , 0, n*lda2*sizeof(float));

    for (magma_int_t j = 0; j < n-nb; j++)
    {
        cblas_scopy(nb+1, &a[j*(lda+1)], 1, &A2[j*lda2], 1);
        memset(&a[j*(lda+1)], 0, (nb+1)*sizeof(float));
        a[nb + j*(lda+1)] = d_one;
    }
    for (magma_int_t j = 0; j < nb; j++)
    {
        cblas_scopy(nb-j, &a[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1);
        memset(&a[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(float));
    }

#ifdef ENABLE_TIMER
    st2 = get_current_time();
    printf("  time ssytrd_convert = %6.2f\n" , GetTimerValue(st1,st2)/1000.);
#endif

    magma_ssytrd_sb2st(threads, uplo, n, nb, Vblksiz, A2, lda2, w, &work[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt);

#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("  time ssytrd_sy2st = %6.2f\n" , GetTimerValue(st2,end)/1000.);
    printf("  time ssytrd = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
     ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
     tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
     transformations represented as Householder vectors in A. */
    if (! wantz) {
#ifdef ENABLE_TIMER
        start = get_current_time();
#endif

        lapackf77_ssterf(&n, w, &work[inde], info);
        magma_smove_eig(range, n, w, &il, &iu, vl, vu, m);

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("  time sstedc = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
    } else {

#ifdef ENABLE_TIMER
        start = get_current_time();
#endif

        if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_sstedx(range, n, vl, vu, il, iu, w, &work[inde],
                     &work[indwrk], n, &work[indwk2],
                     llwrk2, iwork, liwork, dwork, info);

        magma_free( dwork );

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("  time sstedx = %6.2f\n", GetTimerValue(start,end)/1000.);
        start = get_current_time();
#endif
        float *dZ;
        magma_int_t lddz = n;

        float *da;
        magma_int_t ldda = n;

        magma_smove_eig(range, n, w, &il, &iu, vl, vu, m);

        if (MAGMA_SUCCESS != magma_smalloc( &dZ, *m*lddz)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_sbulge_back(threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz,
                          &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info);

#ifdef ENABLE_TIMER
        st1 = get_current_time();

        printf("  time sbulge_back = %6.2f\n" , GetTimerValue(start,st1)/1000.);
#endif

        magma_ssetmatrix( n, n, a, lda, da, ldda );

        magma_sormqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, da+nb, ldda,
                                 dZ+nb, n, dT1, nb, info);

        magma_sgetmatrix( n, *m, dZ, lddz, a, lda );
        magma_free(dT1);
        magma_free(dZ);
        magma_free(da);

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("  time sormqr + copy = %6.2f\n", GetTimerValue(st1,end)/1000.);

        printf("  time eigenvectors backtransf. = %6.2f\n" , GetTimerValue(start,end)/1000.);
#endif

    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_sscal(&imax, &d__1, w, &ione);
    }

    work[0] = lwmin * (1. + lapackf77_slamch("Epsilon"));
    iwork[0] = liwmin;

    return *info;
} /* magma_zheevdx_2stage */
Exemplo n.º 3
0
extern "C" magma_int_t
magma_cbulge_back(magma_int_t threads, char uplo,
                  magma_int_t n, magma_int_t nb,
                  magma_int_t ne, magma_int_t Vblksiz,
                  magmaFloatComplex *Z, magma_int_t ldz,
                  magmaFloatComplex *dZ, magma_int_t lddz,
                  magmaFloatComplex *V, magma_int_t ldv,
                  magmaFloatComplex *TAU,
                  magmaFloatComplex *T, magma_int_t ldt,
                  magma_int_t* info)
{
    magma_setlapack_numthreads(1);

    float timeaplQ2=0.0;
    float f= 1.;
    magma_int_t n_gpu = ne;

//#if defined(PRECISION_s) || defined(PRECISION_d)
    //float gpu_cpu_perf = 50;  // gpu over cpu performance  //100% ev // SandyB. - Kepler (K20c)
    //float gpu_cpu_perf = 16;  // gpu over cpu performance  //100% ev // SandyB. - Fermi (M2090)
//#else
//    float gpu_cpu_perf = 27.5;  // gpu over cpu performance  //100% ev // Westmere - Fermi (M2090)
    //float gpu_cpu_perf = 37;  // gpu over cpu performance  //100% ev // SandyB. - Kepler (K20c)
//    float gpu_cpu_perf = 130;  // gpu over cpu performance  //100% ev // Bulldozer - Kepler (K20X)
//#endif

    magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf();
    if(threads>1) {
        f = 1. / (1. + (float)(threads-1)/ ((float)gpu_cpu_perf)    );
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
     * **************************************************/
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//n_gpu=ne;
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    timeaplQ2 = magma_wtime();
    /*============================
     *  use GPU+CPU's
     *==========================*/

    if(n_gpu < ne)
    {

        // define the size of Q to be done on CPU's and the size on GPU's
        // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
#ifdef ENABLE_DEBUG
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",ne, n_gpu, ne-n_gpu);
#endif
        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg;
        magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data));

        pthread_t* thread_id;
        magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));

        pthread_attr_t thread_attr;

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch threads
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }
        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz);

        /*============================
         *  use only GPU
         *==========================*/
    } else {
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    timeaplQ2 = magma_wtime()-timeaplQ2;

    magma_setlapack_numthreads(threads);
    return MAGMA_SUCCESS;
}
Exemplo n.º 4
0
extern "C" magma_int_t magma_ssytrd_sb2st(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
                                          float *A, magma_int_t lda, float *D, float *E,
                                          float *V, magma_int_t ldv, float *TAU, magma_int_t compT, float *T, magma_int_t ldt)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======


    Arguments
    =========
    THREADS (input) INTEGER
            Specifies the number of pthreads used.
            THREADS > 0

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A is stored;
            = 'L':  Lower triangles of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NB      (input) INTEGER
            The order of the band matrix A.  N >= NB >= 0.

    VBLKSIZ (input) INTEGER
            The size of the block of householder vectors applied at once.

    A       (input/workspace) REAL array, dimension (LDA, N)
            On entry the band matrix stored in the following way:

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= 2*NB.

    D       (output) DOUBLE array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    E       (output) DOUBLE array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.

    V       (output) REAL array, dimension (BLKCNT, LDV, VBLKSIZ)
            On exit it contains the blocks of householder reflectors
            BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT.

    LDV     (input) INTEGER
            The leading dimension of V.
            LDV > NB + VBLKSIZ + 1

    TAU     (output) REAL dimension(BLKCNT, VBLKSIZ)
            ???

    COMPT   (input) INTEGER
            if COMPT = 0 T is not computed
            if COMPT = 1 T is computed

    T       (output) REAL dimension(LDT *)
            if COMPT = 1 on exit contains the matrices T needed for Q2
            if COMPT = 0 T is not referenced

    LDT     (input) INTEGER
            The leading dimension of T.
            LDT > Vblksiz

    INFO    (output) INTEGER ????????????????????????????????????????????????????????????????????????????????????
            = 0:  successful exit


    =====================================================================  */

    #ifdef ENABLE_TIMER
    real_Double_t timeblg=0.0;
    #endif

    //char uplo_[2] = {uplo, 0};
    magma_int_t mklth = threads;
    magma_int_t INgrsiz=1;
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    magma_int_t nbtiles = magma_ceildiv(n, nb);

    memset(T,   0, blkcnt*ldt*Vblksiz*sizeof(float));
    memset(TAU, 0, blkcnt*Vblksiz*sizeof(float));
    memset(V,   0, blkcnt*ldv*Vblksiz*sizeof(float));

    magma_int_t* prog;
    magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t));
    memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t));

    magma_sbulge_id_data* arg;
    magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sbulge_id_data));

    pthread_t* thread_id;
    magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
    pthread_attr_t thread_attr;

    magma_setlapack_numthreads(1);
    magma_sbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT,
                                 A, lda, V, ldv, TAU, T, ldt, prog);

    // Set one thread per core
    pthread_attr_init(&thread_attr);
    pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
    pthread_setconcurrency(threads);

    //timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime();
    #endif

    // Launch threads
    for (magma_int_t thread = 1; thread < threads; thread++)
    {
        arg[thread] = magma_sbulge_id_data(thread, &data_bulge);
        pthread_create(&thread_id[thread], &thread_attr, magma_ssytrd_sb2st_parallel_section, &arg[thread]);
    }
    arg[0] = magma_sbulge_id_data(0, &data_bulge);
    magma_ssytrd_sb2st_parallel_section(&arg[0]);

    // Wait for completion
    for (magma_int_t thread = 1; thread < threads; thread++)
    {
        void *exitcodep;
        pthread_join(thread_id[thread], &exitcodep);
    }

    // timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime()-timeblg;
    printf("  time BULGE+T = %f \n" ,timeblg);
    #endif

    magma_free_cpu(thread_id);
    magma_free_cpu(arg);
    magma_free_cpu(prog);

    magma_setlapack_numthreads(mklth);
    /*================================================
     *  store resulting diag and lower diag D and E
     *  note that D and E are always real
     *================================================*/

    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In real case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the SLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */

#if defined(PRECISION_z) || defined(PRECISION_c)
    if(uplo==MagmaLower){
        for (magma_int_t i=0; i < n-1 ; i++)
        {
            D[i] = MAGMA_S_REAL(A[i*lda  ]);
            E[i] = MAGMA_S_REAL(A[i*lda+1]);
        }
        D[n-1] = MAGMA_S_REAL(A[(n-1)*lda]);
    } else { /* MagmaUpper not tested yet */
        for (magma_int_t i=0; i<n-1; i++)
        {
            D[i]  =  MAGMA_S_REAL(A[i*lda+nb]);
            E[i] = MAGMA_S_REAL(A[i*lda+nb-1]);
        }
        D[n-1] = MAGMA_S_REAL(A[(n-1)*lda+nb]);
    } /* end MagmaUpper */
#else
    if( uplo == MagmaLower ){
        for (magma_int_t i=0; i < n-1; i++) {
            D[i] = A[i*lda];   // diag
            E[i] = A[i*lda+1]; //lower diag
        }
        D[n-1] = A[(n-1)*lda];
    } else {
        for (magma_int_t i=0; i < n-1; i++) {
            D[i] = A[i*lda+nb];   // diag
            E[i] = A[i*lda+nb-1]; //lower diag
        }
        D[n-1] = A[(n-1)*lda+nb];
    }
#endif
    return MAGMA_SUCCESS;

}
Exemplo n.º 5
0
static void *magma_ssytrd_sb2st_parallel_section(void *arg)
{
    magma_int_t my_core_id  = ((magma_sbulge_id_data*)arg) -> id;
    magma_sbulge_data* data = ((magma_sbulge_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t nb             = data -> nb;
    magma_int_t nbtiles        = data -> nbtiles;
    magma_int_t grsiz          = data -> grsiz;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magma_int_t compT          = data -> compT;
    float *A         = data -> A;
    magma_int_t lda            = data -> lda;
    float *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    float *TAU       = data -> TAU;
    float *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    volatile magma_int_t* prog = data -> prog;

    pthread_barrier_t* barrier = &(data -> barrier);

    //magma_int_t sys_corenbr    = 1;

    #ifdef ENABLE_TIMER
    real_Double_t timeB=0.0, timeT=0.0;
    #endif

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_setlapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
//#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    int check  = 0;
    int check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else
    {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if(compT==1)
    {
        /* compute the Q1 overlapped with the bulge chasing+T.
         * if all_cores_num=1 it call Q1 on GPU and then bulgechasing.
         * otherwise the first thread run Q1 on GPU and
         * the other threads run the bulgechasing.
         * */

        if(allcores_num==1)
        {

            //=========================
            //    bulge chasing
            //=========================
            #ifdef ENABLE_TIMER
            timeB = magma_wtime();
            #endif
            
            magma_stile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);

            #ifdef ENABLE_TIMER
            timeB = magma_wtime()-timeB;
            printf("  Finish BULGE   timing= %f \n" ,timeB);
            #endif
            //=========================
            // compute the T's to be used when applying Q2
            //=========================
            #ifdef ENABLE_TIMER
            timeT = magma_wtime();
            #endif

            magma_stile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz);

            #ifdef ENABLE_TIMER
            timeT = magma_wtime()-timeT;
            printf("  Finish T's     timing= %f \n" ,timeT);
            #endif

        }else{ // allcore_num > 1

            magma_int_t id  = my_core_id;
            magma_int_t tot = allcores_num;


                //=========================
                //    bulge chasing
                //=========================
                #ifdef ENABLE_TIMER
                if(id == 0)
                    timeB = magma_wtime();
                #endif

                magma_stile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);
                pthread_barrier_wait(barrier);

                #ifdef ENABLE_TIMER
                if(id == 0){
                    timeB = magma_wtime()-timeB;
                    printf("  Finish BULGE   timing= %f \n" ,timeB);
                }
                #endif

                //=========================
                // compute the T's to be used when applying Q2
                //=========================
                #ifdef ENABLE_TIMER
                if(id == 0)
                    timeT = magma_wtime();
                #endif

                magma_stile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
                pthread_barrier_wait(barrier);

                #ifdef ENABLE_TIMER
                if (id == 0){
                    timeT = magma_wtime()-timeT;
                    printf("  Finish T's     timing= %f \n" ,timeT);
                }
                #endif

        } // allcore == 1

    }else{ // WANTZ = 0

        //=========================
        //    bulge chasing
        //=========================
        #ifdef ENABLE_TIMER
        if(my_core_id == 0)
            timeB = magma_wtime();
        #endif

        magma_stile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);
        pthread_barrier_wait(barrier);

        #ifdef ENABLE_TIMER
        if(my_core_id == 0){
            timeB = magma_wtime()-timeB;
            printf("  Finish BULGE   timing= %f \n" ,timeB);
        }
        #endif

    } // WANTZ > 0

#ifdef MAGMA_SETAFFINITY
    // unbind threads
    if (check == 0){
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
Exemplo n.º 6
0
extern "C" magma_int_t 
magma_zbulge_back_m(magma_int_t nrgpu, magma_int_t threads, char uplo, 
                        magma_int_t n, magma_int_t nb, 
                        magma_int_t ne, magma_int_t Vblksiz,
                        magmaDoubleComplex *Z, magma_int_t ldz,
                        magmaDoubleComplex *V, magma_int_t ldv, 
                        magmaDoubleComplex *TAU, 
                        magmaDoubleComplex *T, magma_int_t ldt, 
                        magma_int_t* info)
{
    magma_setlapack_numthreads(1);

    double timeaplQ2=0.0;

    double f= 1.;
    magma_int_t n_gpu = ne;

//#if defined(PRECISION_s) || defined(PRECISION_d)
//    double gpu_cpu_perf = 32; //gpu over cpu performance
//#else
//    double gpu_cpu_perf = 32;  // gpu over cpu performance
//#endif

    double perf_temp= .85;
    double perf_temp2= perf_temp;
    for (magma_int_t itmp=1; itmp<nrgpu; ++itmp)
        perf_temp2*=perf_temp;
    magma_int_t gpu_cpu_perf = magma_get_zbulge_gcperf();
    if(threads>1){
        f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp)));
        n_gpu = (magma_int_t)(f*ne);
    }






    /****************************************************
     *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
     * **************************************************/

    timeaplQ2 = magma_wtime();

    /*============================
     *  use GPU+CPU's
     *==========================*/
//n_gpu = ne;
    if(n_gpu < ne)
    {

        // define the size of Q to be done on CPU's and the size on GPU's
        // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
        #ifdef ENABLE_DEBUG
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",ne, n_gpu, ne-n_gpu);
        #endif
        magma_zapplyQ_m_data data_applyQ(nrgpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt);

        magma_zapplyQ_m_id_data* arg;
        magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zapplyQ_m_id_data));

        pthread_t* thread_id;
        magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));

        pthread_attr_t thread_attr;

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch threads
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            arg[thread] = magma_zapplyQ_m_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_zapplyQ_m_parallel_section, &arg[thread]);
        }
        arg[0] = magma_zapplyQ_m_id_data(0, &data_applyQ);
        magma_zapplyQ_m_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

        /*============================
         *  use only GPU
         *==========================*/
    }else{
        magma_zbulge_applyQ_v2_m(nrgpu, 'L', ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    timeaplQ2 = magma_wtime()-timeaplQ2;

    magma_setlapack_numthreads(threads);
    return MAGMA_SUCCESS;
}
Exemplo n.º 7
0
extern "C" magma_int_t
magma_chetrd_he2hb_mgpu( char uplo, magma_int_t n, magma_int_t nb,
                    magmaFloatComplex *a, magma_int_t lda, 
                    magmaFloatComplex *tau,
                    magmaFloatComplex *work, magma_int_t lwork,
                    magmaFloatComplex *dAmgpu[], magma_int_t ldda,
                    magmaFloatComplex *dTmgpu[], magma_int_t lddt,
                    magma_int_t ngpu, magma_int_t distblk, 
                    magma_queue_t streams[][20], magma_int_t nstream, 
                    magma_int_t threads, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose   
    =======   
    CHETRD_HE2HB reduces a complex Hermitian matrix A to real symmetric   
    band-diagonal form T by an orthogonal similarity transformation:   
    Q**H * A * Q = T.   
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) COMPLEX array, dimension (LDA,N)   
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit, if UPLO = 'U', the Upper band-diagonal of A is 
            overwritten by the corresponding elements of the   
            band-diagonal matrix T, and the elements above the band   
            diagonal, with the array TAU, represent the orthogonal   
            matrix Q as a product of elementary reflectors; if UPLO   
            = 'L', the the Lower band-diagonal of A is overwritten by 
            the corresponding elements of the band-diagonal   
            matrix T, and the elements below the band-diagonal, with   
            the array TAU, represent the orthogonal matrix Q as a product   
            of elementary reflectors. See Further Details.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    TAU     (output) COMPLEX array, dimension (N-1)   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= 1.   
            For optimum performance LWORK >= N*NB, where NB is the   
            optimal blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    dT      (output) COMPLEX array on the GPU, dimension N*NB, 
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the 
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered 
            consecutively in memory one after another.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n-1) . . . H(2) H(1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with   
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in   
    A(1:i-1,i+1), and tau in TAU(i).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(n-1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a complex scalar, and v is a complex vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),   
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples   
    with n = 5:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  d   e   v2  v3  v4 )              (  d                  )   
      (      d   e   v3  v4 )              (  e   d              )   
      (          d   e   v4 )              (  v1  e   d          )   
      (              d   e  )              (  v1  v2  e   d      )   
      (                  d  )              (  v1  v2  v3  e   d  )   

    where d and e denote diagonal and off-diagonal elements of T, and vi   
    denotes an element of the vector defining H(i).   
    =====================================================================    */


    #define a_ref(a_1,a_2)  ( a  + ((a_2)-1)*( lda) + (a_1)-1)
    #define da_ref(a_1,a_2) (da  + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1)    (tau + (a_1)-1)
    #define t_ref(a_1)      (dT  + ((a_1)-1)*(lddt))

    #define Atest(a_1,a_2)  ( Atest  + ((a_2)-1)*( lda) + (a_1)-1)

    #define dttest(a_0, a_1, a_2)   (dTmgpu[a_0]  + ((a_2)-1)*(lddt))
    #define datest(a_0, a_1, a_2)   (dAmgpu[a_0]  + ((a_2)-1)*(ldda) + (a_1)-1)


    char uplo_[2] = {uplo, 0};




   
    magmaFloatComplex c_neg_one  = MAGMA_C_NEG_ONE;
    magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF;
    magmaFloatComplex c_one  = MAGMA_C_ONE ;
    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    float  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0, flipV=-1;
    magma_int_t iblock, idev, di;
    int i;
    int lwkopt;
    int lquery;

    assert (nstream>=3);
    assert (nstream>=(ngpu+1));


    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    lwkopt = n * nb;
    if (*info == 0) {
        MAGMA_C_SET2REAL( work[0], lwkopt );
    }


    if (*info != 0)
        return *info;
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_int_t mklth = min(threads,16);
    magma_setlapack_numthreads(mklth);

    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2];
    magma_int_t nbcmplx=0;
    magma_buildconnection_mgpu(gnode, &nbcmplx,  ngpu);
    #ifdef ENABLE_DEBUG
    printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n" , nbcmplx);
    #endif


    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);

    magmaFloatComplex *dspace[MagmaMaxGPUs];
    magmaFloatComplex *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs];
    magmaFloatComplex *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs];
    magmaFloatComplex *workngpu[MagmaMaxGPUs+1];
    magma_event_t     redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; 
    magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs;

    magma_int_t lddv        = ldda;
    magma_int_t lddw        = lddv;
    magma_int_t dwrk2siz    = ldda*nb*(ngpu+1);  
    magma_int_t worksiz     = n*nb;
    magma_int_t devworksiz  = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis

    // local allocation and stream creation
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_cmalloc( &dspace[dev], devworksiz );
        magma_cmalloc_pinned ( &workngpu[dev], worksiz);
        dvall[dev]    = dspace[dev];
        dw[dev]       = dvall[dev]   + 2*nb*lddv;
        dwork[dev]    = dw[dev]      + nb*lddw;
        dworkbis[dev] = dwork[dev]   + nb*ldda;
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming);
        }
    }
    magma_cmalloc_pinned ( &workngpu[ngpu], worksiz);
    magmaFloatComplex *worktest = NULL; //(magmaFloatComplex *) malloc(n*nb*sizeof(magmaFloatComplex)); // not used
    // ======================
  

    magmaFloatComplex *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(magmaFloatComplex));

    if (upper) {

        printf("CHETRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
        exit(1);

    }else {
            /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) 
        {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ){
                 // cpanel_to_q copy the upper oof diagonal part of 
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the 
                 //    lookahead panel that has been computted from GPU to CPU. 
                 cpanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);

                 // find the device who own the panel then send it to the CPU.
                // below a -1 was added and then a -1 was done on di because of the fortran indexing
                 iblock = ((i-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (i-1)%distblk;     // local index in parent matrix
                 idev   = ((i-1) / distblk) % ngpu;          // device with this block


                 //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); 
                 magma_setdevice( idev );

                 //magma_device_sync();
                 magma_cgetmatrix_async( (pm+pn), pn,
                                         datest(idev, i, di+1), ldda,
                                         a_ref ( i, i), lda, streams[ idev ][ nstream-1 ] );
               
                 /*
                 magma_device_sync();
                 cudaMemcpy2DAsync(a_ref(i,i), lda*sizeof(magmaFloatComplex),
                                  datest(idev,i,di+1), ldda*sizeof(magmaFloatComplex),
                                  (pm+pn)*sizeof(magmaFloatComplex), pn,
                                  cudaMemcpyDeviceToHost, streams[ idev ][ nstream-1 ]);

                 */

                 //magma_setdevice( 0 );
                 //printf("updating cher2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); 
                // compute CHER2K_MGPU
                 magmablas_cher2k_mgpu2(
                      MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old,
                      c_neg_one, dv, pm_old, pn_old,
                                 dw, pm_old, pn_old,
                      d_one,     dAmgpu, ldda, indi_old+pn_old-1,
                      ngpu, distblk, streams, 2 );
                 //magma_setdevice( 0 );

                 magma_setdevice( idev );
                 magma_queue_sync( streams[idev][ nstream-1 ] );
                 //magma_setdevice( 0 );
                 cq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices. 
                ==========================================================  */
             lapackf77_cgeqrf(&pm, &pn, a_ref(indi, indj), &lda, 
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
             pk=min(pm,pn);
             lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, a_ref(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             cpanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work);



             /* Send V and T from the CPU to the GPU */
             // To be able to overlap the GET with the CHER2K
             // it should be done on last stream. 
             // TO Avoid a BUG that is overwriting the old_V
             // used atthis moment by cher2k with the new_V 
             // send it now, we decide to have a flipflop 
             // vector of Vs. if step%2=0 use V[0] else use V[nb*n]
             flipV = ((i-1)/nb)%2;
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 dv[dev] = dvall[dev] + flipV*nb*lddv;                     
             }

             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                // send V
                 magma_csetmatrix_async( pm, pk,
                                     a_ref(indi, indj),  lda,
                                     dv[dev], pm, streams[dev][nstream-1] );

                // Send the triangular factor T to the GPU 
                magma_csetmatrix_async( pk, pk,
                                     hT,       nb,
                                     dttest(dev, 1, i), lddt, streams[dev][nstream-1] );
             }

             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X)) 
                ==========================================================  */
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 // dwork = V T 
                 magma_setdevice( dev );
                 magmablasSetKernelStream( streams[ dev ][ nstream-1 ] );
                 magma_queue_sync( streams[dev][nstream-1] );
                 magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, dv[dev], pm, 
                         dttest(dev, 1, i), lddt,
                         c_zero, dwork[dev], pm);
             }

             // ===============================================
             //   SYNC TO BE SURE THAT BOTH V AND T WERE 
             //   RECEIVED AND VT IS COMPUTED and SYR2K is done
             // ===============================================
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                 for( magma_int_t s = 0; s < nstream; ++s ) 
                 magma_queue_sync( streams[dev][s] );
             }


              // compute CHEMM_MGPU
              // The broadcast of the result done inside this function
              // should be done in stream [0] because i am assuming this 
              // for the GEMMs below otherwise I have to SYNC over the 
              // Broadcasting stream.
              if(ngpu==1){
                 magmablasSetKernelStream( streams[ 0 ][ 0 ] );
                 magma_chemm(MagmaLeft, uplo, pm, pk,
                         c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda,
                         dwork[0], pm,
                         c_zero, dw[0], pm);
              }else{
                 magmablas_chemm_mgpu_com(
                       MagmaLeft, uplo, pm, pk,
                       c_one, dAmgpu, ldda, indi-1,
                                   dwork, pm,
                       c_zero,     dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz,
                       ngpu, distblk, streams, nstream-1, redevents, nbevents, gnode, nbcmplx);             
             }

             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 // Here we have to wait until the broadcast of CHEMM has been done.
                 // Note that the broadcast should be done on stream[0] so in a way 
                 // we can continue here on the same stream and avoid a sync    
                 magma_setdevice( dev );
                 magmablasSetKernelStream( streams[ dev ][ 0 ] );
                 // magma_queue_sync( streams[dev][0] );
                 magma_cgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm,
                             c_one, dwork[dev], pm, 
                             dw[dev], pm,
                             c_zero, dworkbis[dev], nb);
                 
                 /* W = X - 0.5 * V * T'*V'*X
                  *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
                 magma_cgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                             c_neg_half, dv[dev], pm,
                             dworkbis[dev], nb, 
                             c_one,     dw[dev], pm);
             }
             /* restore the panel it is put here to overlap with the previous GEMM*/
             cq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             // ===============================================
             //   SYNC TO BE SURE THAT BOTH V AND W ARE DONE
             // ===============================================
             // Synchronise to be sure that W has been computed 
             // because next CHER2K use streaming and may happen 
             // that lunch a gemm on stream 2 while stream 0
             // which compute those 2 GEMM above has not been
             // computed and also used for the same reason in
             // the panel update below and also for the last HER2K
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                 magma_queue_sync( streams[dev][0] );
             }

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using   
                an update of the form:  A := A - V*W' - W*V' 
                ==========================================================  */
             if (i + nb <= n-nb){
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 // below a -1 was added and then a -1 was done on di because of the fortran indexing
                 iblock = ((indi-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (indi-1)%distblk;     // local index in parent matrix
                 idev   = ((indi-1) / distblk) % ngpu;          // device with this block
                 magma_setdevice( idev );
                 magmablasSetKernelStream( streams[ idev ][ nstream-1 ] );
                 //magma_queue_sync( streams[idev][0] ); removed because the sync has been done in the loop above
                 magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dv[idev], pm,
                             dw[idev]                , pm, c_one,
                             datest(idev, indi, di+1), ldda);
             
                 magma_cgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dw[idev]                , pm,
                             dv[idev], pm, c_one,
                             datest(idev, indi, di+1), ldda);
                 //printf("updating next panel distblk %d  idev %d  on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); 
             }
             else {
                 /* no look-ahead as this is last iteration */
                 // below a -1 was added and then a -1 was done on di because of the fortran indexing
                 iblock = ((indi-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (indi-1)%distblk;     // local index in parent matrix
                 idev   = ((indi-1) / distblk) % ngpu;          // device with this block
                 magma_setdevice( idev );
                 magmablasSetKernelStream( streams[ idev ][ 0 ] );
                 //printf("LAST CHER2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); 
                 magma_cher2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              dv[idev], pm,
                              dw[idev]                , pm, d_one,
                              datest(idev, indi, di+1), ldda);


                 /* Send the last block to the CPU */     
                 cpanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
                 magma_cgetmatrix( pk, pk,
                                   datest(idev, indi, di+1), ldda,
                                   a_ref(n-pk+1, n-pk+1),  lda );
                 cq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
             }

             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for(i)

    }// end of LOWER
    //magma_setdevice( 0 );

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_free( dspace[dev]);
        magma_free_pinned(workngpu[dev]);
        for( magma_int_t e = 0; e < nbevents; ++e ) {
            cudaEventDestroy(redevents[dev][e]);
        }
    }
    magma_free_pinned(workngpu[ngpu]);
    free(worktest);

    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );

    MAGMA_C_SET2REAL( work[0], lwkopt );
    magma_setlapack_numthreads(1);
    return *info;
} /* chetrd_he2hb_ */
Exemplo n.º 8
0
extern "C" magma_int_t
magma_ssygvdx_2stage(magma_int_t itype, char jobz, char range, char uplo, magma_int_t n,
                     float *a, magma_int_t lda, float *b, magma_int_t ldb,
                     float vl, float vu, magma_int_t il, magma_int_t iu,
                     magma_int_t *m, float *w, float *work, magma_int_t lwork,
                     magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SSYGVDX_2STAGE computes all the eigenvalues, and optionally, the eigenvectors
    of a complex generalized Hermitian-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be Hermitian and B is also positive definite.
    It uses a two-stage algorithm for the tridiagonalization.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    ITYPE   (input) INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    RANGE   (input) CHARACTER*1
            = 'A': all eigenvalues will be found.
            = 'V': all eigenvalues in the half-open interval (VL,VU]
                   will be found.
            = 'I': the IL-th through IU-th eigenvalues will be found.

    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A and B are stored;
            = 'L':  Lower triangles of A and B are stored.

    N       (input) INTEGER
            The order of the matrices A and B.  N >= 0.

    A       (input/output) DOUBLE PRECISION array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.

            On exit, if JOBZ = 'V', then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**H*B*Z = I;
            if ITYPE = 3, Z**H*inv(B)*Z = I.
            If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
            or the lower triangle (if UPLO='L') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) DOUBLE PRECISION array, dimension (LDB, N)
            On entry, the Hermitian matrix B.  If UPLO = 'U', the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = 'L',
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.

            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**H*U or B = L*L**H.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    VL      (input) DOUBLE PRECISION
    VU      (input) DOUBLE PRECISION
            If RANGE='V', the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = 'A' or 'I'.

    IL      (input) INTEGER
    IU      (input) INTEGER
            If RANGE='I', the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = 'A' or 'V'.

    M       (output) INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.

    W       (output) DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.
            If N <= 1,                LWORK >= 1.
            If JOBZ  = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 2).
            If JOBZ  = 'V' and N > 1, LWORK >= LQ2 + 1 + 6*N + 2*N**2.
                                      where LQ2 is the size needed to store
                                      the Q2 matrix and is returned by
                                      MAGMA_BULGE_GET_LQ2.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            If N <= 1,                LIWORK >= 1.
            If JOBZ  = 'N' and N > 1, LIWORK >= 1.
            If JOBZ  = 'V' and N > 1, LIWORK >= 3 + 5*N.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  ZPOTRF or ZHEEVD returned an error code:
               <= N:  if INFO = i and JOBZ = 'N', then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = 'V', then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    ===============
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if ZHEEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.
    =====================================================================  */

    char uplo_[2] = {uplo, 0};
    char jobz_[2] = {jobz, 0};
    char range_[2] = {range, 0};

    float d_one = MAGMA_S_ONE;

    float *da;
    float *db;
    magma_int_t ldda = n;
    magma_int_t lddb = n;

    magma_int_t lower;
    char trans[1];
    magma_int_t wantz;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    magma_int_t lwmin;
    magma_int_t liwmin;

    magma_queue_t stream;
    magma_queue_create( &stream );

    /* determine the number of threads */
    magma_int_t threads = magma_get_numthreads();
    magma_setlapack_numthreads(threads);

    wantz = lapackf77_lsame(jobz_, MagmaVecStr);
    lower = lapackf77_lsame(uplo_, MagmaLowerStr);
    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");
    lquery = lwork == -1 || liwork == -1;

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) {
        *info = -3;
    } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (lda < max(1,n)) {
        *info = -7;
    } else if (ldb < max(1,n)) {
        *info = -9;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -11;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -12;
            } else if (iu < min(n,il) || iu > n) {
                *info = -13;
            }
        }
    }

    magma_int_t nb = magma_get_sbulge_nb(n, threads);
    magma_int_t lq2 = magma_sbulge_get_lq2(n, threads);

    if (wantz) {
        lwmin = lq2 + 1 + 6*n + 2*n*n;
        liwmin = 3 + 5*n;
    } else {
        lwmin = n * (nb + 2);
        liwmin = 1;
    }

    work[0] = lwmin * (1. + lapackf77_slamch("Epsilon"));
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -17;
    } else if (liwork < liwmin && ! lquery) {
        *info = -19;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    } else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128){
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_ssygvd(&itype, jobz_, uplo_,
                         &n, a, &lda, b, &ldb,
                         w, work, &lwork,
                         iwork, &liwork, info);
        *m = n;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) ||
        MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Form a Cholesky factorization of B. */
    magma_ssetmatrix( n, n, b, ldb, db, lddb );
    magma_ssetmatrix_async( n, n,
                            a,  lda,
                            da, ldda, stream );

#ifdef ENABLE_TIMER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

    magma_spotrf_gpu(uplo_[0], n, db, lddb, info);
    if (*info != 0) {
        *info = n + *info;
        return *info;
    }

#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time spotrf_gpu = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    magma_queue_sync( stream );
    magma_sgetmatrix_async( n, n,
                            db, lddb,
                            b,  ldb, stream );

#ifdef ENABLE_TIMER
    start = get_current_time();
#endif

    /* Transform problem to standard eigenvalue problem and solve. */
    magma_ssygst_gpu(itype, uplo, n, da, ldda, db, lddb, info);

#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time ssygst_gpu = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    magma_sgetmatrix( n, n, da, ldda, a, lda );
    magma_queue_sync( stream );
    magma_free( da );
    magma_free( db );

#ifdef ENABLE_TIMER
    start = get_current_time();
#endif

    magma_ssyevdx_2stage(jobz, range, uplo, n, a, lda, vl, vu, il, iu, m, w, work, lwork, iwork, liwork, info);

#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time ssyevdx_2stage = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    if (wantz && *info == 0) {

        if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda ) ||
            MAGMA_SUCCESS != magma_smalloc( &db, n*lddb )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

#ifdef ENABLE_TIMER
        start = get_current_time();
#endif

        magma_ssetmatrix( n, *m, a, lda, da, ldda );
        magma_ssetmatrix( n,  n, b, ldb, db, lddb );

        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                *(unsigned char *)trans = MagmaConjTrans;
            } else {
                *(unsigned char *)trans = MagmaNoTrans;
            }

            magma_strsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, d_one, db, lddb, da, ldda);
        }
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                *(unsigned char *)trans = MagmaNoTrans;
            } else {
                *(unsigned char *)trans = MagmaConjTrans;
            }

            magma_strmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, d_one, db, lddb, da, ldda);
        }

        magma_sgetmatrix( n, *m, da, ldda, a, lda );

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time strsm/mm + getmatrix = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

        magma_free( da );
        magma_free( db );
    }

    magma_queue_destroy( stream );

    work[0] = lwmin * (1. + lapackf77_slamch("Epsilon"));
    iwork[0] = liwmin;

    return *info;
} /* ssygvdx_2stage */
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgetrf_mgpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    real_Double_t   gpu_perf1, gpu_time1, gpu_perf2, gpu_time2, gpu_perf3, gpu_time3, alloc_time, free_time;
    double           error;
    double *h_A;
    double *d_lA[ MagmaMaxGPUs ];
    magma_int_t *ipiv;
    magma_int_t M, N, n2, lda, ldda, n_local, ngpu, NB;
    magma_int_t info, min_mn, nb, ldn_local;
    magma_int_t status = 0;

    magma_int_t P=-1;    /*Number of threads in the CPU part*/
    double d_cpu=-1;    /*pourcentgae of the matrix to allocate in the cpu part*/
    magma_int_t Pr=-1;  /*Number of threads for the panel*/
    magma_int_t async_nb; /*Block size*/
    
    double *WORK;
    magma_int_t WORK_LD, WORK_n;

    double **dlpanelT;
    magma_int_t dlpanelT_m, dlpanelT_n;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    P =  opts.nthread;
    async_nb = opts.nb;
    Pr = opts.panel_nthread;
    
    d_cpu = 0.0;
    #if defined(CPU_PEAK) && defined(GPU_PEAK)
    d_cpu = magma_amc_recommanded_dcpu(opts.nthread, CPU_PEAK, opts.ngpu, GPU_PEAK);
    #endif
    if(opts.fraction_dcpu!=0){ /*Overwrite the one computed with the model*/
    d_cpu = opts.fraction_dcpu;
    }
    magma_assert(d_cpu > 0 && d_cpu<=1.0,
    "error: The cpu fraction is invalid. Ensure you use --fraction_dcpu with fraction_dcpu in [0.0, 1.0] or compile with both -DCPU_PEAK=<cpu peak performance> and -DGPU_PEAK=<gpu peak performance> set.\n");
    
    
    printf("Asynchronous recursif LU... nb:%d, nbcores:%d, dcpu:%f, panel_nbcores:%d, ngpu: %d\n", async_nb, P, d_cpu, Pr, opts.ngpu);
    printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   GPU_Async_v2 GFlop/s (sec)  GPU_Async_work_v2 GFlop/s (sec)");
    if ( opts.check == 2 ) {
        printf("   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");

    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            //nb     = magma_get_dgetrf_nb( M );
            gflops = FLOPS_DGETRF( M, N ) / 1e9;
            
            
            
            // Allocate host memory for the matrix
            TESTING_MALLOC_CPU(    ipiv, magma_int_t,        min_mn );
            TESTING_MALLOC_CPU(    h_A,  double, n2     );
            
            /*set default number of threads for lapack*/
            magma_setlapack_numthreads(P);
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            
            nb = magma_get_dgetrf_nb( M );            

            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }

            // Allocate device memory
            for( int dev=0; dev < ngpu; dev++){
                n_local = ((N/nb)/ngpu)*nb;
                if (dev < (N/nb) % ngpu)
                    n_local += nb;
                else if (dev == (N/nb) % ngpu)
                    n_local += N % nb;
                ldn_local = ((n_local+31)/32)*32;  // TODO why?
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local );
            }

            init_matrix( M, N, h_A, lda );
            magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb );
    
            gpu_time1 = magma_wtime();
            magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info );
            gpu_time1 = magma_wtime() - gpu_time1;
            gpu_perf1 = gflops / gpu_time1;
            if (info != 0)
                printf("magma_dgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
                       
            magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb );

            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            
            /* ====================================================================
               Performs operation using MAGMA_Async: This interface allocate workspace internally
               =================================================================== */

            /*For the benchmark we have 2 approaches*/
            /*1. use directly magma_amc */
            /*2. use magma_amc_work and add pinned memory time*/
            /*We choose approach 2*/
            /*
            nb = async_nb;

            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }

            // Allocate device memory
            n_local = numcols2p(0, N, nb, ngpu);
            ldn_local = n_local;

            //ldn_local = ((n_local+31)/32)*32;
            for( int dev=0; dev < ngpu; dev++){
  
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local );
            }

            init_matrix( M, N, h_A, lda );
            magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb );
            
            // Switch to the sequential version of BLAS
            magma_setlapack_numthreads(1);

            magma_amc_init(P, d_cpu, Pr, nb);
            gpu_time2 = magma_wtime();
            magma_dgetrf_async_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info );
            gpu_time2 = magma_wtime() - gpu_time2;
            gpu_perf2 = gflops / gpu_time2;
            magma_amc_finalize();
            if (info != 0)
                printf("magma_dgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
                       
            magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb );

            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            */

            /* ====================================================================
               Performs operation using MAGMA_Async_Work
               =================================================================== */
            
            nb = async_nb;

            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }

            // Allocate device memory
            n_local = numcols2p(0, N, nb, ngpu);
            ldn_local = n_local;

            //ldn_local = ((n_local+31)/32)*32;
            for( int dev=0; dev < ngpu; dev++){
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local );
            }

            init_matrix( M, N, h_A, lda );
            magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb );
            
            // Switch to the sequential version of BLAS
            magma_setlapack_numthreads(1);

            //Compute workspace dimension
            WORK_LD = M;

          
            NB  = (int) ceil( (double) N / nb);

            WORK_n = (int) ceil(N*d_cpu)+nb; /*TODO:remove +nb replace with A_N*/
            //WORK_n = NSplit(NB, d_cpu)*nb;

            if(WORK_n<nb) WORK_n = nb;//make sure workspace has at least one block column

            //Make LD and n multiple of 32
            //if(WORK_LD%32!=0) WORK_LD = ((WORK_LD + 31)/32)*32;
            //if(WORK_n%32!=0) WORK_n = ((WORK_n + 31)/32)*32;
            //Allocate workspace
            alloc_time = magma_wtime();
            if (MAGMA_SUCCESS != magma_dmalloc_pinned(&WORK, WORK_LD*WORK_n)) { 
            //if (MAGMA_SUCCESS != magma_dmalloc_cpu(&WORK, WORK_LD*WORK_n)) {
                info = MAGMA_ERR_HOST_ALLOC;
                printf("magma_dmalloc_pinned returned error %d: %s.\n     ", (int) info);
            }

            /* Workspace for the panels on the GPU*/
            dlpanelT_m = WORK_n; /*assume that the cpu and gpu use the same buffer size*/
            dlpanelT_n = M;
             dlpanelT = (double **)    malloc(ngpu*sizeof(double*));
              for(int dev=0;dev<ngpu;dev++){
                 magma_setdevice(dev);

                 if (MAGMA_SUCCESS != magma_dmalloc(&dlpanelT[dev], dlpanelT_m*dlpanelT_n)) { 

                
                        info = MAGMA_ERR_DEVICE_ALLOC; 
                        printf("magma_dmalloc returned error %d: %s.\n     ", (int) info);
                }
              }

            alloc_time = magma_wtime() - alloc_time;

            //First touch the workspace with each thread. This may be needed to avoid using numactl --interleave
            //magma_amc_dmemset(WORK, 0.0, WORK_LD*WORK_n, 256, P); //nb
            //#pragma omp parallel for  private(info) schedule(static,nb)
            //for(info=0;info<WORK_LD*WORK_n;info++) WORK[info] = 0.0; //alternative first touch by the thread

            magma_amc_init(P, d_cpu, Pr, nb);
            gpu_time3 = magma_wtime();
            magma_dgetrf_mgpu_work_amc_v3(ngpu, M, N, d_lA, ldda, ipiv, &info, WORK, WORK_LD, WORK_n);
            gpu_time3 = magma_wtime() - gpu_time3;
            gpu_perf3 = gflops / gpu_time3;
            magma_amc_finalize();
            if (info != 0)
                printf("magma_dgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
                       
            magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb );

            

            //Free workspace
            free_time = magma_wtime();
            magma_free_pinned(WORK);
            
            for(int dev=0;dev<ngpu;dev++){
                magma_setdevice(dev);
                magma_free(dlpanelT[dev]);
            }

            free(dlpanelT);
            free_time = magma_wtime() - free_time;

            /*DEDUCE t2, JUST FOR THE BENCHMARK*/
             gpu_time2 =  gpu_time3 + alloc_time +  free_time;
             gpu_perf2 = gflops / gpu_time2;

            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            /* =====================================================================
               Check the factorization
               =================================================================== */
            /*
            if ( opts.lapack ) {
                printf("%5d %5d  %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d    ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            */
            printf("%5d %5d", (int) M, (int) N);
            if(cpu_perf!=0.0){
                printf("   %7.2f (%7.2f)", cpu_perf, cpu_time);
            }
            else{
                printf("   ---   (  ---  )");
            }
            if(gpu_perf1!=0.0){
                printf("   %7.2f (%7.2f)", gpu_perf1, gpu_time1);
            }
            else{
                printf("   ---   (  ---  )");
            }
            if(gpu_perf2!=0.0){
                printf("   %7.2f (%7.2f)", gpu_perf2, gpu_time2);
            }
            else{
                printf("   ---   (  ---  )");
            }
            if(gpu_perf3!=0.0){
                printf("   %7.2f (%7.2f)", gpu_perf3, gpu_time3);
            }
            else{
                printf("   ---   (  ---  )");
            }

            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else {
                printf( "     ---\n" );
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A );
  
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Exemplo n.º 10
0
extern "C" magma_int_t
magma_sstedx_m(magma_int_t nrgpu, char range, magma_int_t n, float vl, float vu,
               magma_int_t il, magma_int_t iu, float* d, float* e, float* z, magma_int_t ldz,
               float* work, magma_int_t lwork, magma_int_t* iwork, magma_int_t liwork,
               magma_int_t* info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

       .. Scalar Arguments ..
      CHARACTER          RANGE
      INTEGER            IL, IU, INFO, LDZ, LIWORK, LWORK, N
      REAL   VL, VU
       ..
       .. Array Arguments ..
      INTEGER            IWORK( * )
      REAL   D( * ), E( * ), WORK( * ), Z( LDZ, * ),
     $                   DWORK ( * )
       ..

    Purpose
    =======
    SSTEDX computes some eigenvalues and, optionally, eigenvectors of a
    symmetric tridiagonal matrix using the divide and conquer method.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.  See SLAEX3 for details.

    Arguments
    =========
    RANGE   (input) CHARACTER*1
            = 'A': all eigenvalues will be found.
            = 'V': all eigenvalues in the half-open interval (VL,VU]
                   will be found.
            = 'I': the IL-th through IU-th eigenvalues will be found.

    N       (input) INTEGER
            The dimension of the symmetric tridiagonal matrix.  N >= 0.

    VL      (input) REAL
    VU      (input) REAL
            If RANGE='V', the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = 'A' or 'I'.

    IL      (input) INTEGER
    IU      (input) INTEGER
            If RANGE='I', the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = 'A' or 'V'.

    D       (input/output) REAL array, dimension (N)
            On entry, the diagonal elements of the tridiagonal matrix.
            On exit, if INFO = 0, the eigenvalues in ascending order.

    E       (input/output) REAL array, dimension (N-1)
            On entry, the subdiagonal elements of the tridiagonal matrix.
            On exit, E has been destroyed.

    Z       (input/output) REAL array, dimension (LDZ,N)
            On exit, if INFO = 0, Z contains the orthonormal eigenvectors
            of the symmetric tridiagonal matrix.

    LDZ     (input) INTEGER
            The leading dimension of the array Z. LDZ >= max(1,N).

    WORK    (workspace/output) REAL array,
                                           dimension (LWORK)
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If N > 1 then LWORK >= ( 1 + 4*N + N**2 ).
            Note that  if N is less than or
            equal to the minimum divide size, usually 25, then LWORK need
            only be max(1,2*(N-1)).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            LIWORK >= ( 3 + 5*N ).
            Note that if N is less than or
            equal to the minimum divide size, usually 25, then LIWORK
            need only be 1.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal size of the IWORK array,
            returns this value as the first entry of the IWORK array, and
            no error message related to LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit.
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  The algorithm failed to compute an eigenvalue while
                  working on the submatrix lying in rows and columns
                  INFO/(N+1) through mod(INFO,N+1).

    Further Details
    ===============
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    =====================================================================
*/
    char range_[2] = {range, 0};

    float d_zero = 0.;
    float d_one  = 1.;
    magma_int_t izero = 0;
    magma_int_t ione = 1;


    magma_int_t alleig, indeig, valeig, lquery;
    magma_int_t i, j, k, m;
    magma_int_t liwmin, lwmin;
    magma_int_t start, end, smlsiz;
    float eps, orgnrm, p, tiny;

    // Test the input parameters.

    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");
    lquery = lwork == -1 || liwork == -1;

    *info = 0;

    if (! (alleig || valeig || indeig)) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldz < max(1,n)) {
        *info = -10;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -4;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -5;
            } else if (iu < min(n,il) || iu > n) {
                *info = -6;
            }
        }
    }

    if (*info == 0) {
        // Compute the workspace requirements

        smlsiz = magma_get_smlsize_divideconquer();
        if( n <= 1 ){
            lwmin = 1;
            liwmin = 1;
        } else {
            lwmin = 1 + 4*n + n*n;
            liwmin = 3 + 5*n;
        }

        work[0] = lwmin;
        iwork[0] = liwmin;

        if (lwork < lwmin && ! lquery) {
            *info = -12;
        } else if (liwork < liwmin && ! lquery) {
            *info = -14;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    } else if (lquery) {
        return MAGMA_SUCCESS;
    }

    // Quick return if possible

    if(n==0)
        return MAGMA_SUCCESS;
    if(n==1){
        *z = 1.;
        return MAGMA_SUCCESS;
    }

    /* determine the number of threads */
    magma_int_t threads = magma_get_numthreads();
    magma_setlapack_numthreads(threads);

#ifdef ENABLE_DEBUG
    printf("  D&C_m is using %d threads\n", threads);
#endif

    // If N is smaller than the minimum divide size (SMLSIZ+1), then
    // solve the problem with another solver.

    if (n < smlsiz){
        char char_I[]= {'I', 0};
        lapackf77_ssteqr(char_I, &n, d, e, z, &ldz, work, info);
    } else {
        char char_F[]= {'F', 0};
        lapackf77_slaset(char_F, &n, &n, &d_zero, &d_one, z, &ldz);

        //Scale.
        char char_M[]= {'M', 0};

        orgnrm = lapackf77_slanst(char_M, &n, d, e);

        if (orgnrm == 0){
            work[0]  = lwmin;
            iwork[0] = liwmin;
            return MAGMA_SUCCESS;
        }

        eps = lapackf77_slamch( "Epsilon" );

        if (alleig){
            start = 0;
            while ( start < n ){

                // Let FINISH be the position of the next subdiagonal entry
                // such that E( END ) <= TINY or FINISH = N if no such
                // subdiagonal exists.  The matrix identified by the elements
                // between START and END constitutes an independent
                // sub-problem.

                for(end = start+1; end < n; ++end){
                    tiny = eps * sqrt( MAGMA_S_ABS(d[end-1]*d[end]));
                    if (MAGMA_S_ABS(e[end-1]) <= tiny)
                        break;
                }

                // (Sub) Problem determined.  Compute its size and solve it.

                m = end - start;
                if (m==1){
                    start = end;
                    continue;
                }
                if (m > smlsiz){

                    // Scale
                    char char_G[] = {'G', 0};
                    orgnrm = lapackf77_slanst(char_M, &m, &d[start], &e[start]);
                    lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &m, &ione, &d[start], &m, info);
                    magma_int_t mm = m-1;
                    lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &mm, &ione, &e[start], &mm, info);

                    magma_slaex0_m( nrgpu, m, &d[start], &e[start], Z(start, start), ldz, work, iwork, 'A', vl, vu, il, iu, info);

                    if( *info != 0) {
                        return MAGMA_SUCCESS;
                    }

                    // Scale Back
                    lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &m, &ione, &d[start], &m, info);

                } else {

                    char char_I[]= {'I', 0};
                    lapackf77_ssteqr( char_I, &m, &d[start], &e[start], Z(start, start), &ldz, work, info);
                    if (*info != 0){
                        *info = (start+1) *(n+1) + end;
                    }
                }

                start = end;
            }


            // If the problem split any number of times, then the eigenvalues
            // will not be properly ordered.  Here we permute the eigenvalues
            // (and the associated eigenvectors) into ascending order.

            if (m < n){

                // Use Selection Sort to minimize swaps of eigenvectors
                for (i = 1; i < n; ++i){
                    k = i-1;
                    p = d[i-1];
                    for (j = i; j < n; ++j){
                        if (d[j] < p){
                            k = j;
                            p = d[j];
                        }
                    }
                    if(k != i-1) {
                        d[k] = d[i-1];
                        d[i-1] = p;
                        blasf77_sswap(&n, Z(0,i-1), &ione, Z(0,k), &ione);
                    }
                }
            }

        } else {

            // Scale
            char char_G[] = {'G', 0};
            lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &n, &ione, d, &n, info);
            magma_int_t nm = n-1;
            lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &nm, &ione, e, &nm, info);

            magma_slaex0_m(nrgpu, n, d, e, z, ldz, work, iwork, range, vl, vu, il, iu, info);

            if( *info != 0) {
                return MAGMA_SUCCESS;
            }

            // Scale Back
            lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &n, &ione, d, &n, info);

        }
    }

    work[0]  = lwmin;
    iwork[0] = liwmin;

    return MAGMA_SUCCESS;

} /* magma_sstedx_m */