Example #1
0
extern "C" magma_context *
magma_init( void *params, void* (*func)(void *a), magma_int_t nthread, magma_int_t ncpu, magma_int_t ngpu,
        magma_int_t argc, char **argv)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    This function initializes the hardware context to be used for
    subsequent calls to routines in the MAGMA library.

    Arguments
    =========
    PARAMS  (input) pointer
            Opaque pointer copied into the t_params record that is handed
            to every thread started by this call.

    FUNC    (input) function pointer
            Thread entry point. Each thread receives its own t_params*
            (passed as void*) carrying PARAMS and the thread index (tid).

    NTHREAD (input) INTEGER
            Number of threads to start with FUNC. Nothing is started when
            NTHREAD <= 0.

    NCPU    (input) INTEGER
            Number of CPU cores to be used in the computations.
            Values below 1 are treated as 1. The QUARK scheduler is
            created only when NCPU > 1.

    NGPU    (input) INTEGER
            Number of GPUs to be used in the computations.
            Negative values are treated as 0; values above 1 are reduced
            to 1 (with a message), and device 0 is then initialized.

    ARGC    (input) INTEGER
    ARGV    (input) array of strings
            Command line; scanned for "-b <value>", which sets the nb
            field of the context. nb is -1 when the option is absent.

    Return value
    ============
    A newly allocated magma_context. Allocation and CUDA/CUBLAS
    initialization failures print a message to stderr and call exit(-1).
    ===================================================================== */

    magma_int_t i;
    magma_context *context;

    context = (magma_context *)malloc(sizeof(magma_context));
    if (context == NULL) {
        fprintf(stderr, "MAGMA: Cannot allocate the context\n");
        exit(-1);
    }

    if (nthread > 0) {
        /* Both arrays are only needed while the threads are being started. */
        t_params **tp     = (t_params**)malloc(sizeof(t_params*)*nthread);
        pthread_t *thread = (pthread_t*)malloc(sizeof(pthread_t)*nthread);

        if (tp == NULL || thread == NULL) {
            fprintf(stderr, "MAGMA: Cannot allocate the thread tables\n");
            exit(-1);
        }

        for (i = 0; i < nthread; i++) {
            tp[i] = (t_params*)malloc(sizeof(t_params));
            if (tp[i] == NULL) {
                fprintf(stderr, "MAGMA: Cannot allocate thread parameters\n");
                exit(-1);
            }
            tp[i]->params = params;
            tp[i]->tid = i;
            if (pthread_create(&thread[i], NULL, func, (void *)tp[i]) != 0) {
                /* No thread took ownership of the record, so release it here. */
                fprintf(stderr, "MAGMA: Cannot create thread %d\n", (int)i);
                free(tp[i]);
            }
        }

        /* Each started thread keeps its own tp[i]; the handles are never
           joined from here, so the tables themselves can be released. */
        free(thread);
        free(tp);
    }

    if (ncpu <= 1)
        ncpu = 1;

    if (ngpu <= 0)
        ngpu = 0;

    context->num_cores   = ncpu;
    context->num_gpus    = ngpu;
    /* Give the optional members a defined value when they are not set up. */
    context->quark       = NULL;
    context->gpu_context = NULL;

    if (ncpu > 1) {
        /* Initialize the QUARK scheduler */
        context->quark = QUARK_New(ncpu);
    }

    if (ngpu > 1) {
        printf("The requested number of GPUs is not yet supported.\n\n");
        printf("The number of GPUs set to one.\n\n");
        /* Clamp the local count too, so the single-GPU setup below runs. */
        ngpu = 1;
        context->num_gpus = 1;
    }

    if (ngpu == 1) {
        CUdevice dev;

        context->gpu_context = (CUcontext *)malloc(ngpu * sizeof(CUcontext));
        if (context->gpu_context == NULL) {
            fprintf(stderr, "CUDA: Cannot allocate the context table\n");
            exit(-1);
        }

        /* For now we use by default device 0, always */
        if( CUDA_SUCCESS != cuInit( 0 ) ) {
            fprintf(stderr, "CUDA: Not initialized\n" );
            exit(-1);
        }
        if( CUDA_SUCCESS != cuDeviceGet( &dev, 0 ) ) {
            fprintf(stderr, "CUDA: Cannot get the device\n");
            exit(-1);
        }
        if( CUDA_SUCCESS != cuCtxCreate( &context->gpu_context[0], 0, dev ) ) {
            fprintf(stderr, "CUDA: Cannot create the context\n");
            exit(-1);
        }
        if( CUDA_SUCCESS != cublasInit( ) ) {
            fprintf(stderr, "CUBLAS: Not initialized\n");
            exit(-1);
        }
        printout_devices( );
    }

    context->nb = -1;
    for (i = 1; i < argc; i++) {
        /* Only consume a value when one actually follows "-b". */
        if (strcmp("-b", argv[i]) == 0 && i + 1 < argc)
            context->nb = atoi(argv[++i]);
    }

    return context;
}
Example #2
0
/*
 * Test driver for the QUARK-based Cholesky factorization.
 *
 * Usage: <program> N NB memsize(MB)
 *   N        matrix order
 *   NB       tile size
 *   memsize  memory budget in megabytes, passed to Cholesky() in bytes
 *
 * The driver sets up a 1x1 BLACS grid, generates the matrix with pdmatgen,
 * runs Cholesky() and prints the time it returns together with a rate
 * derived from it.  When built with VERIFY, the factor is checked by
 * solving against a regenerated copy of the matrix with dpotrs and
 * comparing the solution with the identity.
 *
 * All three arguments must be positive integers; otherwise the usage
 * message is printed and the program exits with status 1.
 */
int main(int argc, char **argv)
{
    #define test_A(i,j) test_A[(size_t)(j)*N+(i)]
    #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N,NB,w,LDA,BB;
    int memsize_mb;
    size_t memsize; //bytes
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_zero = 0, i_negone = -1;
    int IASEED = 100;
/*  printf("N=?\n");
    scanf("%ld",&N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%ld",&w);
*/
    if(argc < 4){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize_mb = atoi(argv[3]);
    /* atoi yields 0 for non-numeric text; zero or negative values would
       divide by zero below or wrap around when converted to size_t. */
    if(N <= 0 || NB <= 0 || memsize_mb <= 0){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    memsize = (size_t)memsize_mb*1024*1024;
    BB = (N + NB - 1) / NB;   /* number of tiles per dimension, rounded up */
    w = memsize/sizeof(double)/BB/NB/NB - 1;
    assert(w > 0);
    LDA = N + 0; //padding

    int do_io = (N <= NSIZE);   /* print matrices only for small problems */
    double llttime;
    double gflops;
    
    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);
    #ifdef USE_MIC
        #ifdef __INTEL_OFFLOAD
            printf("offload compilation enabled\ninitialize each MIC\n");
            offload_init(&iam, &mydevice);
            #pragma offload target(mic:0)
            {
                mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
            }
        #else
            if(isroot)
                printf("offload compilation not enabled\n");
            exit(0);
        #endif
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){
                cuStatus = cublasCreate(&worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasInit();
        #endif
    #endif

    double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol
    if(test_A == NULL){
        fprintf(stderr, "cannot allocate test_A\n");
        exit(1);
    }
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify
    if(test_A2 == NULL){
        fprintf(stderr, "cannot allocate test_A2\n");
        exit(1);
    }
#endif
    
    /*Initialize A */
    int i,j;
    printf("Initialize A ... "); fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A, &LDA, &i_zero, &i_zero,
         &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol); 
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);
              
    /*print test_A*/
    if(do_io){
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /*Use directed unblocked Cholesky factorization*/    
    /*
    t1 = clock();
    Test_dpotrf(test_A2,N);
    t2 = clock();
    printf ("time for unblocked Cholesky factorization on host %f \n",
        ((float) (t2 - t1)) / CLOCKS_PER_SEC);
    */
    
    /*print test_A*/
    /*
    if(do_io){
        printf("Unblocked result:\n\n");
        matprint(test_A2,N,'L');   
    }
    */ 

    /*Use tile algorithm*/
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
    #ifdef USE_MIC
//      mklmem(NB);
        printf("QUARK MIC affinity binding\n");
        QUARK_bind(quark);
        printf("offload warm up\n");
        warmup(quark);
    #endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark,test_A,N,NB,LDA,memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);
    #ifdef USE_MIC
        offload_destroy();
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){ 
                cuStatus = cublasDestroy(worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasShutdown();
        #endif
    #endif

    /* Operation count N^3/3 + N^2/2, divided by the time returned by
       Cholesky().  NOTE(review): the scale factor is 1024^3, not 1e9 -
       confirm this is the intended unit before comparing with other tools. */
    gflops = (double) N;
    gflops = gflops/3.0 + 0.5;
    gflops = gflops*(double)(N)*(double)(N);
    gflops = gflops/llttime/1024.0/1024.0/1024.0;
    printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
        N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops);
    #ifdef USE_MIC
        #pragma offload target(mic:0)
        {
            memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
        }
        printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0);
    #endif

    /*Update and print L*/             
    if(do_io){
        printf("L:\n\n");
        matprint(test_A,N,LDA,'L');
    }
#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
  /*
   * ------------------------
   * check difference between
   * test_A and test_A2
   * ------------------------
   */
    /*
    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = j; i < N; i++)
          {
            double err = (test_A (i, j) - test_A2 (i, j));
            err = ABS (err);
            maxerr = MAX (err, maxerr);
            maxerr2 = maxerr2 + err * err;
          };
      };
    maxerr2 = sqrt (ABS (maxerr2));
    printf ("max difference between test_A and test_A2 %lf \n", maxerr);
    printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    };
    */

  /*
   * ------------------
   * over-write test_A2
   * ------------------
   */
   
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A2, &LDA, &i_zero,
         &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol);

  /*
   * ---------------------------------------
   * after solve, test_A2 should be identity
   * ---------------------------------------
   */
  // test_A = chol(B) = L;
  // test_A2 = B
  // solve L*L'*X = B
  // if L is correct, X is identity */
     
    {
    int uplo = 'L';
    const char *uplo_char = ((uplo == (int) 'U')
                    || (uplo == (int) 'u')) ? "U" : "L";
    int info = 0;
    int nrhs = N;
    /* Shadows the outer LDA; both hold N while no padding is applied. */
    int LDA = N;
    int ldb = N;
    dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
    assert (info == 0);
    }

    {
    double maxerr = 0;
    double maxerr2 = 0;

    /* Accumulate the largest and the root-sum-square deviation of the
       solution from the identity matrix. */
    for (j = 0; j < N; j++)
      {
        for (i = 0; i < N; i++)
          {
            double eyeij = (i == j) ? 1.0 : 0.0;
            double err = (test_A2 (i, j) - eyeij);
            err = ABS (err);
            maxerr = MAX (maxerr, err);
            maxerr2 = maxerr2 + err * err;
          };
      };

    maxerr2 = sqrt (ABS (maxerr2));
    printf("time %lf\n", MPI_Wtime() - llttime);
    printf ("max error %lf \n", maxerr);
    printf ("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A);test_A=NULL;
#ifdef VERIFY
    free(test_A2);test_A2=NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
    #undef test_A
    #undef test_A2
}
Example #3
0
/* Try various ways to do matmul and time them.  Tiled algorithms
 * running serially; multi-threaded QUARK runtime with tiled
 * algorithms; and direct serial computation over standard layout.
 *
 * NB       tile size (must be positive)
 * N        matrix order (must be positive)
 * THREADS  thread count handed to QUARK_New
 *
 * The tiled loops run over N/NB tiles per dimension.
 * NOTE(review): this presumably expects N to be a multiple of NB; any
 * remainder is not visited by the tile loops here - confirm against
 * std_to_bdl/bdl_to_std.
 *
 * Returns 0 after the three variants have run, or 1 when the arguments
 * are invalid or the work matrices cannot be allocated. */
int main_algorithm(int NB, int N, int THREADS)
{
    int i, j, k, nerr=0;

    if (NB <= 0 || N <= 0) {
        fprintf(stderr, "main_algorithm: N and NB must be positive (N=%d, NB=%d)\n", N, NB);
        return 1;
    }

    const int BB = N/NB;
    /* All sizes and offsets are computed in size_t; the int products
       N*N and NB*NB*BB*k overflow once N exceeds about 46340. */
    const size_t n = (size_t)N;
    const size_t nelem = n*n;
    const size_t tile = (size_t)NB*(size_t)NB;   /* elements in one tile */
    const size_t tilecol = tile*(size_t)BB;      /* elements in one column of tiles */

    if (nelem > (size_t)-1/sizeof(double)) {
        fprintf(stderr, "main_algorithm: N=%d is too large\n", N);
        return 1;
    }
    const size_t nbytes = nelem*sizeof(double);

    double *A = (double*)malloc(nbytes);
    double *Ablk = (double*)malloc(nbytes);
    double *B = (double*)malloc(nbytes);
    double *Bblk = (double*)malloc(nbytes);
    double *C_direct = (double*)malloc(nbytes);
    double *C = (double*)malloc(nbytes);
    double *Cblk = (double*)malloc(nbytes);
    double *C_quark = (double*)malloc(nbytes);
    double *C_quark_blk = (double*)malloc(nbytes);
    struct timeval tstart, tend, tdiff;
    double t_blk=0, t_quark=0, t_direct=0;

    if (A == NULL || Ablk == NULL || B == NULL || Bblk == NULL || C_direct == NULL ||
        C == NULL || Cblk == NULL || C_quark == NULL || C_quark_blk == NULL) {
        fprintf(stderr, "main_algorithm: cannot allocate the work matrices\n");
        /* free(NULL) is a no-op, so the buffers can be released unconditionally. */
        free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk);
        return 1;
    }

    // Initialize
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            const size_t idx = (size_t)i + (size_t)j*n;
            A[idx] = (double)1.0+i;
            B[idx] = (double)2.0+i+j;
            C_quark[idx] = C_direct[idx] = C[idx] = 3.0;
        }
    }

    matrix_print("Printing A", A, N);
    matrix_print("Printing B", B, N);
    matrix_print("Printing C before computation", C, N);

    // Move from F77 to BDL
    std_to_bdl( A, Ablk, N, NB );
    std_to_bdl( B, Bblk, N, NB );
    std_to_bdl( C, Cblk, N, NB );
    std_to_bdl( C_quark, C_quark_blk, N, NB );

    /* ORIGINAL TILED ROUTINE */
    /* This is the code for the serial tile-by-tile multiplication */
    printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul ( &Ablk[tile*i + tilecol*k], &Bblk[tile*k + tilecol*j], &Cblk[tile*i + tilecol*j], NB);
    gettimeofday( &tend, NULL );
    t_blk = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    bdl_to_std( C, Cblk, N, NB );
    matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
    printf("\n");

    /* QUARK PARALLEL TILED ROUTINE */
    /* This is the code for the QUARK runtime to do the parallel multi-threaded tile-by-tile algorithm */
    printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
    Quark *quark = QUARK_New(THREADS);
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul_quark_call ( quark, &Ablk[tile*i + tilecol*k], &Bblk[tile*k + tilecol*j], &C_quark_blk[tile*i + tilecol*j], NB);
    /* Wait for every inserted task before stopping the clock. */
    QUARK_Barrier( quark );
    gettimeofday( &tend, NULL );
    t_quark = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    QUARK_Delete(quark);
    bdl_to_std( C_quark, C_quark_blk, N, NB );
    matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
    printf("\n");

    /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
    /* Compute direct C if desired */
    printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
    gettimeofday( &tstart, NULL );
    matmul ( A, B, C_direct, N );
    gettimeofday( &tend, NULL );
    t_direct = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) );
    matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
    printf("\n");

    /* Check for errors */
    printf("Comparing result matrices (direct versus QUARK)\n");
    nerr = matrix_compare( C_direct, C_quark, N );
    printf("Number of differences: %d\n", nerr);
    printf("\n");

    printf("Summary of time taken\n");
    printf("Direct       SerialBlock  QUARK(%d threads)\n", THREADS);
    printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);

    free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk);
    return 0;
}