예제 #1
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cpotrf
*/
int main( int argc, char** argv) 
{
    TESTING_CUDA_INIT();

    magma_timestr_t  start, end;
    float      flops, gpu_perf, cpu_perf;
    cuFloatComplex *h_A, *h_R;
    cuFloatComplex *d_A;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
    
    magma_int_t i, info;
    const char *uplo     = MagmaUpperStr;
    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    float      work[1], matnorm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){        
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_cpotri_gpu -N %d\n\n", 1024);
    }

    /* Allocate host memory for the matrix */
    n2   = size[9] * size[9];
    ldda = ((size[9]+31)/32) * 32;
    TESTING_MALLOC(    h_A, cuFloatComplex, n2);
    TESTING_HOSTALLOC( h_R, cuFloatComplex, n2);
    TESTING_DEVALLOC(  d_A, cuFloatComplex, ldda*size[9] );

    printf("  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F\n");
    printf("========================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N; 
        n2  = lda*N;
        flops = FLOPS_CPOTRI( (float)N ) / 1000000;
        
        ldda = ((N+31)/32)*32;

        /* Initialize the matrix */
        lapackf77_clarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        {
            magma_int_t i, j;
            for(i=0; i<N; i++) {
                MAGMA_C_SET2REAL( h_A[i*lda+i], ( MAGMA_C_REAL(h_A[i*lda+i]) + 1.*N ) );
                for(j=0; j<i; j++)
                    h_A[i*lda+j] = cuConjf(h_A[j*lda+i]);
            }
        }
        lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );

        /* ====================================================================
           Performs operation using MAGMA 
           =================================================================== */
        //cublasSetMatrix( N, N, sizeof(cuFloatComplex), h_A, lda, d_A, ldda);
        //magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info);

        /* factorize matrix */
        magma_csetmatrix( N, N, h_A, lda, d_A, ldda );
        magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info);
        
        // check for exact singularity
        //magma_cgetmatrix( N, N, d_A, ldda, h_R, lda );
        //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 );
        //magma_csetmatrix( N, N, h_R, lda, d_A, ldda );
        
        start = get_current_time();
        magma_cpotri_gpu(uplo[0], N, d_A, ldda, &info);
        end = get_current_time();
        if (info != 0)
            printf("magma_cpotri_gpu returned error %d\n", (int) info);

        gpu_perf = flops / GetTimerValue(start, end);
        
        /* =====================================================================
           Performs operation using LAPACK 
           =================================================================== */
        lapackf77_cpotrf(uplo, &N, h_A, &lda, &info);
        
        start = get_current_time();
        lapackf77_cpotri(uplo, &N, h_A, &lda, &info);
        end = get_current_time();
        if (info != 0)
            printf("lapackf77_cpotri returned error %d\n", (int) info);
        
        cpu_perf = flops / GetTimerValue(start, end);
      
        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        magma_cgetmatrix( N, N, d_A, ldda, h_R, lda );
        matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work);
        blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
        printf("%5d    %6.2f         %6.2f        %e\n", 
               (int) size[i], cpu_perf, gpu_perf,
               lapackf77_clange("f", &N, &N, h_R, &lda, work) / matnorm);
        
        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE( h_A );
    TESTING_HOSTFREE( h_R );
    TESTING_DEVFREE( d_A );

    /* Shutdown */
    TESTING_CUDA_FINALIZE();
}
예제 #2
0
__SDH__ cfloat  conj(cfloat  c) { return cuConjf(c);}
예제 #3
0
int main(int argc, char **argv)
{        
#if (GPUSHMEM >= 200)
    TESTING_CUDA_INIT();
    cudaSetDevice(0);

    magma_timestr_t  start, end;
    float      flops, magma_perf, cuda_perf, error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t n_local[4];

    FILE        *fp ; 
    magma_int_t N, m, i, j, lda, LDA, M;
    magma_int_t matsize;
    magma_int_t vecsize;
    magma_int_t istart = 64;
    magma_int_t incx = 1;
    char        uplo = MagmaLower;

    cuFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE(  1.5, -2.3 );
    cuFloatComplex beta  = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6,  0.8 );
    cuFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma;
    cuFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ;

    cudaStream_t stream[4][10];
    cuFloatComplex *C_work;
    cuFloatComplex *dC_work[4];

    magma_int_t num_gpus = 1, max_num_gpus, nb;
    magma_int_t blocks, workspace;
    magma_int_t offset;
    
//    offset = 257;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
            {
                N = atoi(argv[++i]);
                istart = N;
            }
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-NGPU", argv[i])==0)
              num_gpus = atoi(argv[++i]);
            else if (strcmp("-offset", argv[i])==0)
              offset = atoi(argv[++i]);
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
        if (M>0 && N>0)
        {    printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", M, N, num_gpus);
            printf("  in %c side \n", uplo);
        }
        else
            {
                printf("\nUsage: \n");
                printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 
                       1024, 1024, 1);
                exit(1);
            }
    }
    else {
#if defined(PRECISION_z)
        M = N = 8000;
      
#else
        M = N = 12480;

#endif 
        num_gpus = 2;
        offset = 0;
        printf("\nUsage: \n");
        printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", M, N, num_gpus);
    }
         

///////////////////////////////////////////////////////////////////////////////////////
    cudaGetDeviceCount(&max_num_gpus);
    if (num_gpus > max_num_gpus){
      printf("More GPUs requested than available. Have to change it.\n");
      num_gpus = max_num_gpus;
    }
    printf("Number of GPUs to be used = %d\n", num_gpus);
    for(int i=0; i< num_gpus; i++)
    {
      cudaStreamCreate(&stream[i][0]);
    }
    

    LDA = ((N+31)/32)*32;
    matsize = N*LDA;
    vecsize = N*incx;
    nb = 32;
    //nb = 64;

    printf("block size = %d\n", nb);
   
    TESTING_MALLOC( A, cuFloatComplex, matsize );
    TESTING_MALLOC( X, cuFloatComplex, vecsize );
    TESTING_MALLOC( Ycublas, cuFloatComplex, vecsize );
    TESTING_MALLOC( Ymagma,  cuFloatComplex, vecsize );
    for(i=0; i<num_gpus; i++)
    {     
    TESTING_MALLOC( Y[i], cuFloatComplex, vecsize );
    }

    cudaSetDevice(0);
    TESTING_DEVALLOC( dA, cuFloatComplex, matsize );
    TESTING_DEVALLOC( dYcublas, cuFloatComplex, vecsize );

    for(i=0; i<num_gpus; i++)
    {      
      n_local[i] = ((N/nb)/num_gpus)*nb;
      if (i < (N/nb)%num_gpus)
        n_local[i] += nb;
      else if (i == (N/nb)%num_gpus)
        n_local[i] += N%nb;

      cudaSetDevice(i);

      TESTING_DEVALLOC( d_lA[i], cuFloatComplex, LDA*n_local[i] );// potentially bugged 
      TESTING_DEVALLOC( dX[i], cuFloatComplex, vecsize );
      TESTING_DEVALLOC( dY[i], cuFloatComplex, vecsize );
      
      printf("device %2d n_local = %4d\n", i, n_local[i]); 

    }
    cudaSetDevice(0);

      

///////////////////////////////////////////////////////////////////////

    /* Initialize the matrix */
    lapackf77_clarnv( &ione, ISEED, &matsize, A );
    /* Make A hermitian */
    { 
        magma_int_t i, j;
        for(i=0; i<N; i++) {
            A[i*LDA+i] = MAGMA_C_MAKE( MAGMA_C_REAL(A[i*LDA+i]), 0. );
            for(j=0; j<i; j++)
                A[i*LDA+j] = cuConjf(A[j*LDA+i]);
        }
    }
        

      blocks    = N / nb + (N % nb != 0);
      workspace = LDA * (blocks + 1);
      TESTING_MALLOC(    C_work, cuFloatComplex, workspace );
      for(i=0; i<num_gpus; i++){
             cudaSetDevice(i);  
             TESTING_DEVALLOC( dC_work[i], cuFloatComplex, workspace );
             //fillZero(dC_work[i], workspace);
      }
      
     cudaSetDevice(0);


//////////////////////////////////////////////////////////////////////////////////////////////
   
    fp = fopen ("results_chemv_mgpu.csv", "w") ;
    if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);}

    printf("HEMV cuFloatComplex Precision\n\n");

    printf( "   n   CUBLAS,Gflop/s   MAGMABLAS,Gflop/s      \"error\"\n" 
            "==============================================================\n");
    fprintf(fp, "   n   CUBLAS,Gflop/s   MAGMABLAS,Gflop/s      \"error\"\n" 
            "==============================================================\n");


//    for( offset = 0; offset< N; offset ++ )
    
    for(int size = istart ; size <= N ; size += 128)
    {
    //    printf("offset = %d ", offset);
        m = size ;
    //    m = N;
        // lda = ((m+31)/32)*32;// 
        lda = LDA; 
        flops = FLOPS( (float)m ) / 1e6;

        printf(      "N %5d ", m );
        fprintf( fp, "%5d, ", m );

        vecsize = m * incx;
        lapackf77_clarnv( &ione, ISEED, &vecsize, X );
        lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] );

        /* =====================================================================
           Performs operation using CUDA-BLAS
           =================================================================== */
        cudaSetDevice(0);
        magmablas_csetmatrix_1D_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); 
        cudaSetDevice(0);

    
    
    magma_csetmatrix( m, m, A, LDA, dA, lda );
        magma_csetvector( m, Y[0], incx, dYcublas, incx );
        
        for(i=0; i<num_gpus; i++){
            cudaSetDevice(i);
            magma_csetvector( m, X, incx, dX[i], incx );
            magma_csetvector( m, Y[0], incx, dY[i], incx );


            blocks    = m / nb + (m % nb != 0);
            magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda );
            
            //cudaMemset(dC_work[i], 0, sizeof( cuFloatComplex) * lda * blocks);
           
       }

        cudaSetDevice(0);
        start = get_current_time();
        cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx );
         
        end = get_current_time();

        magma_cgetvector( m, dYcublas, incx, Ycublas, incx );
                
        
        cuda_perf = flops / GetTimerValue(start,end);
        printf(     "%11.2f", cuda_perf );
        fprintf(fp, "%11.2f,", cuda_perf );
       
        
        cudaSetDevice(0);

        
        start = get_current_time();
        

        if(nb == 32)
       { 

        magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, 
                dC_work, workspace, num_gpus, nb, offset);
 
        }
        else // nb = 64
       { 

        magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, 
                dC_work, workspace, num_gpus, nb, offset);
 
        }
    
            
        for(i=1; i<num_gpus; i++)
        {
           cudaSetDevice(i);
           cudaDeviceSynchronize();
        }
      
        end = get_current_time();
        magma_perf = flops / GetTimerValue(start,end); 
        printf(     "%11.2f", magma_perf );
        fprintf(fp, "%11.2f,", magma_perf );
       

        for(i=0; i<num_gpus; i++)
        {        
            cudaSetDevice(i);
            magma_cgetvector( m, dY[i], incx, Y[i], incx );
        }
        cudaSetDevice(0);

        
#ifdef validate        

        for( j= offset;j<m;j++)
        {
            for(i=1; i<num_gpus; i++)
            {

//            printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x);
#if defined(PRECISION_z) || defined(PRECISION_c)
            Y[0][j].x = Y[0][j].x + Y[i][j].x;
                        Y[0][j].y = Y[0][j].y + Y[i][j].y;
#else 
            Y[0][j] = Y[0][j] + Y[i][j];
            
#endif 

            }
        }

/*

#if defined(PRECISION_z) || defined(PRECISION_c)
        
        for( j=offset;j<m;j++)
        {
            if(Y[0][j].x != Ycublas[j].x)
            {
                     printf("Y-multi[%d] = %f, %f\n",  j, Y[0][j].x, Y[0][j].y );
                     printf("Ycublas[%d] = %f, %f\n",  j, Ycublas[j].x, Ycublas[j].y);
            }
        }

#else 

        for( j=offset;j<m;j++)
        {
            if(Y[0][j] != Ycublas[j])
            {
                     printf("Y-multi[%d] = %f\n",  j, Y[0][j] );
                     printf("Ycublas[%d] = %f\n",  j, Ycublas[j]);
            }
        }

#endif

*/        
        /* =====================================================================
           Computing the Difference Cublas VS Magma
           =================================================================== */
       
        int nw = m - offset ;
        blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx);
        error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work );
            
#if  0
        printf(      "\t\t %8.6e", error / m );
        fprintf( fp, "\t\t %8.6e", error / m );

        /*
         * Extra check with cblas vs magma
         */
        cblas_ccopy( m, Y, incx, Ycublas, incx );
        cblas_chemv( CblasColMajor, CblasLower, m, 
                     CBLAS_SADDR(alpha), A, LDA, X, incx, 
                     CBLAS_SADDR(beta), Ycublas, incx );
 
        blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx);
        error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work );
#endif

        printf(      "\t\t %8.6e", error / m );
        fprintf( fp, "\t\t %8.6e", error / m );
 
#endif 
        printf("\n");        
        fprintf(fp, "\n");        
    }
    
    fclose( fp ) ; 

    /* Free Memory */
    TESTING_FREE( A );
    TESTING_FREE( X );
    TESTING_FREE( Ycublas );
    TESTING_FREE( Ymagma );
    TESTING_FREE( C_work );

    TESTING_DEVFREE( dA );
    TESTING_DEVFREE( dYcublas );
    
    for(i=0; i<num_gpus; i++)
    { 
        TESTING_FREE( Y[i] );
        cudaSetDevice(i);

        TESTING_DEVFREE( d_lA[i] )
        TESTING_DEVFREE( dX[i] );
        TESTING_DEVFREE( dY[i] );


        TESTING_DEVFREE( dC_work[i] );

    }

    cudaSetDevice(0);
 ///////////////////////////////////////////////////////////   
      

    /* Free device */
    TESTING_CUDA_FINALIZE();
#endif
    return 0;
}