コード例 #1
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return Vblksiz for dbulge: nb capped at 64.
      m and nbthreads are not used. Both architecture branches currently
      produce the same value; they are kept separate so each can be tuned.
*/
magma_int_t magma_dbulge_get_Vblksiz( magma_int_t m, magma_int_t nb, magma_int_t nbthreads  )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t vblksiz;
    if ( arch < 300 ) {        // 2.x Fermi or 1.x
        vblksiz = min(nb,64);
    }
    else {                     // 3.x Kepler + SB
        vblksiz = min(nb,64);
    }
    return vblksiz;
}
コード例 #2
0
ファイル: get_nb.cpp プロジェクト: cjy7117/FT-MAGMA
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zgebrd
      The (unnamed) m argument is ignored; every architecture currently
      gets nb = 32.
*/
magma_int_t magma_get_zgebrd_nb( magma_int_t /*m*/ )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;
    if ( arch < 200 ) {        // 1.x
        nb = 32;
    }
    else {                     // 2.x Fermi
        nb = 32;
    }
    return nb;
}
コード例 #3
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for sgehrd based on m
      m <  1024 : 32 on every architecture
      otherwise : 96 on 2.x Fermi and newer, 64 on 1.x
*/
magma_int_t magma_get_sgehrd_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // Small problems use the same nb regardless of architecture.
    if ( m < 1024 ) {
        return 32;
    }

    // Large problems: only here does the architecture matter.
    return ( arch >= 200 ) ? 96 : 64;
}
コード例 #4
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for dgelqf based on m
      On 2.x Fermi and newer this defers to magma_get_dgeqrf_nb( m );
      on 1.x it uses its own table (32 / 64 / 128).
*/
magma_int_t magma_get_dgelqf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        if ( m < 2048 ) {
            return 32;
        }
        if ( m < 4032 ) {
            return 64;
        }
        return 128;
    }

    // 2.x Fermi: reuse the dgeqrf choice
    return magma_get_dgeqrf_nb( m );
}
コード例 #5
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return the zbulge "gcperf" tuning value for the current device
      architecture: 50 on 3.x, 15000 on 2.x, 10000 on 1.x.
*/
magma_int_t magma_get_zbulge_gcperf( )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t gcperf;

    if ( arch < 200 ) {        // 1.x
        gcperf = 10000;
    }
    else if ( arch < 300 ) {   // 2.x Fermi
        gcperf = 15000;
    }
    else {                     // 3.x Kepler + SB
        gcperf = 50;
    }
    return gcperf;
}
コード例 #6
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for dbulge
      m and nbthreads are not used; the value depends only on the device
      architecture (64 on 1.x, 128 otherwise).
*/
magma_int_t magma_get_dbulge_nb( magma_int_t m, magma_int_t nbthreads  )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        return 64;
    }
    if ( arch < 300 ) {        // 2.x Fermi
        return 128;
    }
    return 128;                // 3.x Kepler + SB
}
コード例 #7
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for cgeqrf based on m
      3.x Kepler       : 64 (m < 4096), else 128
      1.x and 2.x Fermi: 32 (m < 2048), 64 (m < 4096), else 128
*/
magma_int_t magma_get_cgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 300 ) {        // 1.x and 2.x Fermi
        if ( m >= 4096 ) {
            nb = 128;
        }
        else if ( m >= 2048 ) {
            nb = 64;
        }
        else {
            nb = 32;
        }
    }
    else {                     // 3.x Kepler
        nb = ( m < 4096 ) ? 64 : 128;
    }
    return nb;
}
コード例 #8
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for dgeqrf based on m
      3.x Kepler       : 64 (m < 3072), 128 (m < 10240), else 256
      1.x and 2.x Fermi: 64 (m < 4096), else 128
*/
magma_int_t magma_get_dgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 300 ) {        // 1.x and 2.x Fermi
        return ( m < 4096 ) ? 64 : 128;
    }

    // 3.x Kepler: test the thresholds from largest to smallest
    if ( m >= 10240 ) {
        return 256;
    }
    if ( m >= 3072 ) {
        return 128;
    }
    return 64;
}
コード例 #9
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zbulge (mgpu variant)
      m is not used. All three architecture branches currently yield 64;
      they are kept separate so each can be tuned.
*/
magma_int_t magma_get_zbulge_nb_mgpu( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 200 ) {        // 1.x
        nb = 64;
    }
    else if ( arch < 300 ) {   // 2.x Fermi
        nb = 64;
    }
    else {                     // 3.x Kepler + SB
        nb = 64;
    }
    return nb;
}
コード例 #10
0
ファイル: get_nb.cpp プロジェクト: cjy7117/FT-MAGMA
/* ////////////////////////////////////////////////////////////////////////////
   -- Return Vblksiz for zbulge: nb capped by an architecture-dependent limit
      3.x Kepler + SB  : cap is 64 when nbthreads > 14, else 32
      2.x Fermi or 1.x : cap is 48
      The (unnamed) m argument is ignored.
*/
magma_int_t magma_zbulge_get_Vblksiz( magma_int_t /*m*/, magma_int_t nb, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 300 ) {        // 2.x Fermi or 1.x
        return min(nb, 48);
    }

    // 3.x Kepler + SB
    if ( nbthreads <= 14 ) {
        return min(nb, 32);
    }
    return min(nb, 64);
}
コード例 #11
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zhegst
      m is not used; the value depends only on the device architecture.
*/
magma_int_t magma_get_zhegst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    return ( arch >= 300 ) ? 384     // 3.x Kepler
         : ( arch >= 200 ) ? 256     // 2.x Fermi
         :                   64;     // 1.x
}
コード例 #12
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zgebrd
      m is currently not used; nb is fixed at 32 on every architecture.
*/
magma_int_t magma_get_zgebrd_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 200 ) {        // 1.x
        nb = 32;
    }
    else {                     // 2.x Fermi
        nb = 32;
    }
    return nb;
}
コード例 #13
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for chegst based on m
      3.x Kepler: 384 (m < 2048), else 768
      2.x Fermi : 512
      1.x       : 64
*/
magma_int_t magma_get_chegst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        return 64;
    }
    if ( arch < 300 ) {        // 2.x Fermi
        return 512;
    }
    // 3.x Kepler
    return ( m < 2048 ) ? 384 : 768;
}
コード例 #14
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zpotrf based on m
      3.x Kepler: 256
      2.x Fermi : 192 (m < 1500), else 256
      1.x       : 64
*/
magma_int_t magma_get_zpotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 200 ) {        // 1.x
        nb = 64;
    }
    else if ( arch < 300 ) {   // 2.x Fermi
        nb = ( m < 1500 ) ? 192 : 256;
    }
    else {                     // 3.x Kepler
        nb = 256;
    }
    return nb;
}
コード例 #15
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for ssygst based on m
      3.x Kepler: 768 (m < 4096), else 1536
      2.x Fermi : 512 (m < 2048), else 1024
      1.x       : 64
*/
magma_int_t magma_get_ssygst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        return 64;
    }
    if ( arch < 300 ) {        // 2.x Fermi
        return ( m < 2048 ) ? 512 : 1024;
    }
    // 3.x Kepler
    return ( m < 4096 ) ? 768 : 1536;
}
コード例 #16
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zgetrf based on m
      3.x Kepler: 64 (m < 4096), 256 (m < 8192), else 512
      2.x Fermi : 64 (m < 4096), else 128
      1.x       : 128
*/
magma_int_t magma_get_zgetrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 200 ) {        // 1.x
        nb = 128;
    }
    else if ( arch < 300 ) {   // 2.x Fermi
        nb = ( m < 4096 ) ? 64 : 128;
    }
    else {                     // 3.x Kepler
        if ( m >= 8192 ) {
            nb = 512;
        }
        else if ( m >= 4096 ) {
            nb = 256;
        }
        else {
            nb = 64;
        }
    }
    return nb;
}
コード例 #17
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return the dsygst_nb_m value
      Currently a fixed placeholder of 256, independent of m and of the
      device architecture.

      An architecture/m-dependent table used to follow the return statement
      but could never execute; it has been removed. For reference, its
      values were: 3.x Kepler 384 (m < 2048) else 768; 2.x Fermi 512;
      1.x 64.
      TODO: replace the placeholder with a tuned table.
*/
magma_int_t magma_get_dsygst_nb_m( magma_int_t m )
{
    (void) m;    // not used until a tuned table is introduced
    return 256; //to be updated
}
コード例 #18
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for dpotrf based on m
      3.x Kepler: 256 (m < 3072), else 512
      2.x Fermi : 256
      1.x       : 128 (m < 4256), else 256
*/
magma_int_t magma_get_dpotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        // Everything below 4256 maps to 128.
        return ( m < 4256 ) ? 128 : 256;
    }
    if ( arch < 300 ) {        // 2.x Fermi
        return 256;
    }
    // 3.x Kepler
    return ( m < 3072 ) ? 256 : 512;
}
コード例 #19
0
ファイル: get_nb.cpp プロジェクト: cjy7117/FT-MAGMA
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for zbulge
      3.x Kepler + SB: 128 when nbthreads > 14, else 64
      2.x Fermi, 1.x : 64
      The (unnamed) m argument is ignored.
*/
magma_int_t magma_get_zbulge_nb( magma_int_t /*m*/, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        return 64;
    }
    if ( arch < 300 ) {        // 2.x Fermi
        return 64;
    }
    // 3.x Kepler + SB
    return ( nbthreads > 14 ) ? 128 : 64;
}
コード例 #20
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for spotrf based on m
      3.x Kepler: 256 (m < 1500), else 512
      2.x Fermi : 256 (m < 2048), else 512
      1.x       : 128 (m < 3328), 224 (m < 4256), else 288
*/
magma_int_t magma_get_spotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();
    magma_int_t nb;

    if ( arch < 200 ) {        // 1.x
        if ( m >= 4256 ) {
            nb = 288;
        }
        else if ( m >= 3328 ) {
            nb = 224;
        }
        else {
            nb = 128;
        }
    }
    else if ( arch < 300 ) {   // 2.x Fermi
        nb = ( m < 2048 ) ? 256 : 512;
    }
    else {                     // 3.x Kepler
        nb = ( m < 1500 ) ? 256 : 512;
    }
    return nb;
}
コード例 #21
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for dgetrf based on m
      3.x Kepler: 128 (m < 3072), 256 (m <  8192), else 512
      2.x Fermi : 128 (m < 3072), 256 (m < 10240), else 512
      1.x       :  64 (m < 2048), else 128
*/
magma_int_t magma_get_dgetrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        return ( m < 2048 ) ? 64 : 128;
    }

    // 2.x Fermi and 3.x Kepler share the lowest tier and differ only in
    // where the 256 -> 512 switch happens.
    if ( m < 3072 ) {
        return 128;
    }
    const magma_int_t upper = ( arch >= 300 ) ? 8192 : 10240;
    return ( m < upper ) ? 256 : 512;
}
コード例 #22
0
ファイル: get_nb.cpp プロジェクト: soulsheng/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for sgeqrf based on m
      3.x Kepler: 96 (m < 4096), 128 (m < 7168), 256 (m < 18432), else 512
      2.x Fermi : 64 (m < 3072), 128 (m < 8192), else 256
      1.x       : 32 (m < 2048),  64 (m < 4096), else 128
*/
magma_int_t magma_get_sgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( arch < 200 ) {        // 1.x
        if ( m < 2048 ) {
            return 32;
        }
        return ( m < 4096 ) ? 64 : 128;
    }

    if ( arch < 300 ) {        // 2.x Fermi
        if ( m < 3072 ) {
            return 64;
        }
        return ( m < 8192 ) ? 128 : 256;
    }

    // 3.x Kepler
    if ( m < 4096 ) {
        return 96;
    }
    if ( m < 7168 ) {
        return 128;
    }
    return ( m < 18432 ) ? 256 : 512;
}
コード例 #23
0
ファイル: testing_zlanhe.cpp プロジェクト: xulunfan/magma
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zlanhe
      For every size in opts.nsize, for the inf, one and max norms, and for
      both uplo values, this driver:
        1. compares magmablas_zlanhe against lapackf77_zlanhe on a random
           matrix (relative error must be <= tol2);
        2. plants a NAN, then an INF, in the referenced triangle and checks
           that each propagates to the computed norm.
      The return value is non-zero if any MAGMA check failed. LAPACK failing
      the NAN/INF checks only produces a warning.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time;
    magmaDoubleComplex *h_A;
    double *h_work;
    magmaDoubleComplex_ptr d_A;
    magmaDouble_ptr d_work;
    magma_int_t i, j, N, n2, lda, ldda;
    magma_int_t idist    = 3;  // normal distribution (otherwise max norm is always ~ 1)
    magma_int_t ISEED[4] = {0,0,0,1};
    double      error, norm_magma, norm_lapack;
    magma_int_t status = 0;
    magma_int_t lapack_nan_fail = 0;
    magma_int_t lapack_inf_fail = 0;
    bool mkl_warning = false;

    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    double tol2;
    
    magma_uplo_t uplo[] = { MagmaLower, MagmaUpper };
    magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm };
    
    // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x)
#if defined(PRECISION_z)
    magma_int_t arch = magma_getdevice_arch();
    if ( arch < 200 ) {
        printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n"
               "!!!! on CUDA architecture %d; requires arch >= 200.\n"
               "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n",
               MagmaInfNormStr, MagmaOneNormStr, (int) arch );
        // Only norm[0] (inf) and norm[1] (one) are expected to be rejected.
        for( int inorm = 0; inorm < 2; ++inorm ) {
        for( int iuplo = 0; iuplo < 2; ++iuplo ) {
            printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n",
                    lapack_norm_const( norm[inorm] ),
                    lapack_uplo_const( uplo[iuplo] ));
            norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 );
            if ( norm_magma != -1 ) {
                printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma );
                status = 1;
            }
        }}
        printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") );
    }
#endif

    #ifdef MAGMA_WITH_MKL
    // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around.
    // MKL 11.2 corrects it for inf, one, max norm.
    // MKL 11.2 still segfaults for Frobenius norm, which is not tested here
    // because MAGMA doesn't implement Frobenius norm yet.
    MKLVersion mkl_version;
    mkl_get_version( &mkl_version );
    magma_int_t la_threads = magma_get_lapack_numthreads();
    // Apply the workaround to every version older than 11.2. (major, minor)
    // is compared as a pair so that, e.g., 10.3 counts as older than 11.2;
    // testing "major <= 11 && minor < 2" would miss it.
    bool mkl_single_thread = (mkl_version.MajorVersion < 11
                              || (mkl_version.MajorVersion == 11 && mkl_version.MinorVersion < 2));
    if ( mkl_single_thread ) {
        printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" );
    }
    #endif
    
    printf("%%   N   norm   uplo   CPU GByte/s (ms)    GPU GByte/s (ms)        error               nan      inf\n");
    printf("%%=================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      for( int inorm = 0; inorm < 3; ++inorm ) {  /* < 4 for Frobenius */
      for( int iuplo = 0; iuplo < 2; ++iuplo ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N   = opts.nsize[itest];
            lda = N;
            n2  = lda*N;
            ldda = magma_roundup( N, opts.align );
            // read upper or lower triangle
            gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9;
            
            // Buffers are allocated per iteration and released at the
            // "cleanup" label below.
            TESTING_MALLOC_CPU( h_A,    magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_work, double, N );
            
            TESTING_MALLOC_DEV( d_A,    magmaDoubleComplex, ldda*N );
            TESTING_MALLOC_DEV( d_work, double, N );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &idist, ISEED, &n2, h_A );
            
            magma_zsetmatrix( N, N, h_A, lda, d_A, ldda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gbytes / gpu_time;
            if (norm_magma == -1) {
                // -1 is treated as "this norm is unsupported": skip the
                // comparison but still free this iteration's buffers.
                printf( "%5d   %4c   skipped because %s norm isn't supported\n",
                        (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] ));
                goto cleanup;
            }
            else if (norm_magma < 0) {
                printf("magmablas_zlanhe returned error %f: %s.\n",
                       norm_magma, magma_strerror( (int) norm_magma ));
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            #ifdef MAGMA_WITH_MKL
            if ( mkl_single_thread ) {
                // work around MKL bug in multi-threaded zlanhe
                magma_set_lapack_numthreads( 1 );
            }
            #endif
            
            cpu_time = magma_wtime();
            norm_lapack = lapackf77_zlanhe(
                lapack_norm_const( norm[inorm] ),
                lapack_uplo_const( uplo[iuplo] ),
                &N, h_A, &lda, h_work );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            if (norm_lapack < 0) {
                printf("lapackf77_zlanhe returned error %f: %s.\n",
                       norm_lapack, magma_strerror( (int) norm_lapack ));
            }
            
            /* =====================================================================
               Check the result compared to LAPACK
               =================================================================== */
            error = fabs( norm_magma - norm_lapack ) / norm_lapack;
            tol2 = tol;
            if ( norm[inorm] == MagmaMaxNorm ) {
                // max-norm depends on only one element, so for Real precisions,
                // MAGMA and LAPACK should exactly agree (tol2 = 0),
                // while Complex precisions incur roundoff in cuCabs.
                #ifdef REAL
                tol2 = 0;
                #endif
            }
            
            // Declarations below are deliberately split from their first
            // assignment: "goto cleanup" above jumps over them, and C++ does
            // not allow a jump past a declaration with an initializer.
            bool okay; okay = (error <= tol2);
            status += ! okay;
            mkl_warning |= ! okay;
            
            /* ====================================================================
               Check for NAN and INF propagation
               =================================================================== */
            // Function-like macros giving the address of element (i_, j_) in
            // the host and device matrices (column-major indexing with lda / ldda).
            #define h_A(i_, j_) (h_A + (i_) + (j_)*lda)
            #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda)
            
            i = rand() % N;
            j = rand() % N;
            // Swap if needed so that (i,j) satisfies i >= j for MagmaLower
            // and i <= j for MagmaUpper, i.e. lies in the selected triangle.
            magma_int_t tmp;
            if ( uplo[iuplo] == MagmaLower && i < j ) {
                tmp = i;
                i = j;
                j = tmp;
            }
            else if ( uplo[iuplo] == MagmaUpper && i > j ) {
                tmp = i;
                i = j;
                j = tmp;
            }
            
            // Plant a NAN on the host, mirror that single element to the
            // device, and recompute both norms.
            *h_A(i,j) = MAGMA_Z_NAN;
            magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 );
            norm_magma  = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N );
            norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ),
                                            lapack_uplo_const( uplo[iuplo] ),
                                            &N, h_A, &lda, h_work );
            bool nan_okay;    nan_okay    = isnan(norm_magma);
            bool la_nan_okay; la_nan_okay = isnan(norm_lapack);
            lapack_nan_fail += ! la_nan_okay;
            status          += !    nan_okay;
            
            // Same check with an INF overwriting the NAN at the same position.
            *h_A(i,j) = MAGMA_Z_INF;
            magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 );
            norm_magma  = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N );
            norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ),
                                            lapack_uplo_const( uplo[iuplo] ),
                                            &N, h_A, &lda, h_work );
            bool inf_okay;    inf_okay    = isinf(norm_magma);
            bool la_inf_okay; la_inf_okay = isinf(norm_lapack);
            lapack_inf_fail += ! la_inf_okay;
            status          += !    inf_okay;
            
            #ifdef MAGMA_WITH_MKL
            if ( mkl_single_thread ) {
                // end single thread to work around MKL bug
                magma_set_lapack_numthreads( la_threads );
            }
            #endif
            
            // A "*" after the nan/inf column marks that LAPACK itself did
            // not propagate the value.
            printf("%5d   %4c   %4c   %7.2f (%7.2f)   %7.2f (%7.2f)   %#9.3g   %-6s   %6s%1s  %6s%1s\n",
                   (int) N,
                   lapacke_norm_const( norm[inorm] ),
                   lapacke_uplo_const( uplo[iuplo] ),
                   cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                   error,
                   (okay     ? "ok" : "failed"),
                   (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"),
                   (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*"));
            
        cleanup:
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_DEV( d_A    );
            TESTING_FREE_DEV( d_work );
            fflush( stdout );
        } // end iter
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }} // end iuplo, inorm
      printf( "\n" );
    }
    
    // don't print "failed" here because then run_tests.py thinks MAGMA failed
    if ( lapack_nan_fail ) {
        printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" );
    }
    if ( lapack_inf_fail ) {
        printf( "* Warning: LAPACK did not pass INF propagation test\n" );
    }
    if ( mkl_warning ) {
        printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n"
               "  corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n"
               "  Try again with MKL_NUM_THREADS=1.\n" );
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
コード例 #24
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dlansy
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double *h_A;
    double *h_work;
    double *d_A;
    double *d_work;
    magma_int_t N, n2, lda, ldda;
    magma_int_t idist    = 3;  // normal distribution (otherwise max norm is always ~ 1)
    magma_int_t ISEED[4] = {0,0,0,1};
    double      error, norm_magma, norm_lapack;
    magma_int_t status = 0;
    bool mkl_warning = false;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    double tol = opts.tolerance * lapackf77_dlamch("E");

    magma_uplo_t uplo[] = { MagmaLower, MagmaUpper };
    magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm };

    // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x)
#if defined(PRECISION_z)
    magma_int_t arch = magma_getdevice_arch();
    if ( arch < 200 ) {
        printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n"
               "!!!! on CUDA architecture %d; requires arch >= 200.\n"
               "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n",
               MagmaInfNormStr, MagmaOneNormStr, (int) arch );
        for( int inorm = 0; inorm < 2; ++inorm ) {
            for( int iuplo = 0; iuplo < 2; ++iuplo ) {
                printf( "Testing that magmablas_dlansy( %s, %s, ... ) returns -1 error...\n",
                        lapack_norm_const( norm[inorm] ),
                        lapack_uplo_const( uplo[iuplo] ));
                norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL );
                if ( norm_magma != -1 ) {
                    printf( "expected magmablas_dlansy to return -1 error, but got %f\n", norm_magma );
                    status = 1;
                }
            }
        }
        printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") );
    }
#endif

    printf("    N   norm   uplo   CPU GByte/s (ms)    GPU GByte/s (ms)    error   \n");
    printf("=======================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int inorm = 0; inorm < 3; ++inorm ) {
            for( int iuplo = 0; iuplo < 2; ++iuplo ) {
                for( int iter = 0; iter < opts.niter; ++iter ) {
                    N   = opts.nsize[itest];
                    lda = N;
                    n2  = lda*N;
                    ldda = roundup( N, opts.pad );
                    // read upper or lower triangle
                    gbytes = 0.5*(N+1)*N*sizeof(double) / 1e9;

                    TESTING_MALLOC_CPU( h_A,    double, n2 );
                    TESTING_MALLOC_CPU( h_work, double, N );

                    TESTING_MALLOC_DEV( d_A,    double, ldda*N );
                    TESTING_MALLOC_DEV( d_work, double, N );

                    /* Initialize the matrix */
                    lapackf77_dlarnv( &idist, ISEED, &n2, h_A );
                    //magma_dmake_symmetric( N, h_A, lda );
                    // make diagonal real -- according to docs, should NOT be necesary
                    //for( int i=0; i < N; ++i ) {
                    //    h_A[i + i*lda] = MAGMA_D_MAKE( MAGMA_D_REAL( h_A[i + i*lda] ), 0 );
                    //}
                    magma_dsetmatrix( N, N, h_A, lda, d_A, ldda );

                    /* ====================================================================
                       Performs operation using MAGMA
                       =================================================================== */
                    gpu_time = magma_wtime();
                    norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work );
                    gpu_time = magma_wtime() - gpu_time;
                    gpu_perf = gbytes / gpu_time;
                    if (norm_magma == -1) {
                        printf( "%5d   %4c   skipped because it isn't supported on this GPU\n",
                                (int) N, lapacke_norm_const( norm[inorm] ));
                        continue;
                    }
                    if (norm_magma < 0)
                        printf("magmablas_dlansy returned error %f: %s.\n",
                               norm_magma, magma_strerror( (int) norm_magma ));

                    /* =====================================================================
                       Performs operation using LAPACK
                       =================================================================== */
                    cpu_time = magma_wtime();
                    norm_lapack = lapackf77_dlansy(
                                      lapack_norm_const( norm[inorm] ),
                                      lapack_uplo_const( uplo[iuplo] ),
                                      &N, h_A, &lda, h_work );
                    cpu_time = magma_wtime() - cpu_time;
                    cpu_perf = gbytes / cpu_time;
                    if (norm_lapack < 0)
                        printf("lapackf77_dlansy returned error %f: %s.\n",
                               norm_lapack, magma_strerror( (int) norm_lapack ));

                    /* =====================================================================
                       Check the result compared to LAPACK
                       Note: MKL (11.1.0) has bug for uplo=Lower with multiple threads.
                       Try with $MKL_NUM_THREADS = 1.
                       =================================================================== */
                    error = fabs( norm_magma - norm_lapack ) / norm_lapack;
                    double tol2 = tol;
                    if ( norm[inorm] == MagmaMaxNorm ) {
                        // max-norm depends on only one element, so for Real precisions,
                        // MAGMA and LAPACK should exactly agree (tol2 = 0),
                        // while Complex precisions incur roundoff in fabs.
#if defined(PRECISION_s) || defined(PRECISION_d)
                        tol2 = 0;
#endif
                    }

                    if ( error > tol2 && norm[inorm] == MagmaInfNorm && uplo[iuplo] == MagmaLower ) {
                        mkl_warning = true;
                    }

                    printf("%5d   %4c   %4c   %7.2f (%7.2f)   %7.2f (%7.2f)   %#9.3g   %s\n",
                           (int) N,
                           lapacke_norm_const( norm[inorm] ),
                           lapacke_uplo_const( uplo[iuplo] ),
                           cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                           error, (error <= tol2 ? "ok" : "failed") );
                    status += ! (error <= tol2);

                    TESTING_FREE_CPU( h_A    );
                    TESTING_FREE_CPU( h_work );

                    TESTING_FREE_DEV( d_A    );
                    TESTING_FREE_DEV( d_work );
                    fflush( stdout );
                }
                if ( opts.niter > 1 ) {
                    printf( "\n" );
                }
            }
        } // end iuplo, inorm, iter
        printf( "\n" );
    }

    if ( mkl_warning ) {
        printf("* Some versions of MKL (e.g., 11.1.0) have a bug in dlansy with uplo=L\n"
               "  and multiple threads. Try again with MKL_NUM_THREADS=1.\n" );
    }

    TESTING_FINALIZE();
    return status;
}