コード例 #1
0
ファイル: oocdag.cpp プロジェクト: efdazedo/oocdag
int main(int argc, char **argv)
{
    #define test_A(i,j) test_A[(size_t)(j)*N+(i)]
    #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N,NB,w,LDA,BB;
    size_t memsize; //bytes
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_one = 1, i_zero = 0, i_negone = -1;
    double d_one = 1.0, d_zero = 0.0, d_negone = -1.0;
    int IASEED = 100;
/*  printf("N=?\n");
    scanf("%ld",&N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%ld",&w);
*/
    if(argc < 4){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize = (size_t)atoi(argv[3])*1024*1024;
    BB = (N + NB - 1) / NB;
    w = memsize/sizeof(double)/BB/NB/NB - 1;
    assert(w > 0);
    LDA = N + 0; //padding

    int do_io = (N <= NSIZE);
    double llttime;
    double gflops;
    
    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);
    #ifdef USE_MIC
        #ifdef __INTEL_OFFLOAD
            printf("offload compilation enabled\ninitialize each MIC\n");
            offload_init(&iam, &mydevice);
            #pragma offload target(mic:0)
            {
                mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
            }
        #else
            if(isroot)
                printf("offload compilation not enabled\n");
            exit(0);
        #endif
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){
                cuStatus = cublasCreate(&worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasInit();
        #endif
    #endif

    double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify
#endif
    
    /*Initialize A */
    int i,j;
    printf("Initialize A ... "); fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A, &LDA, &i_zero, &i_zero,
         &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol); 
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);
              
    /*print test_A*/
    if(do_io){
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /*Use directed unblocked Cholesky factorization*/    
    /*
    t1 = clock();
    Test_dpotrf(test_A2,N);
    t2 = clock();
    printf ("time for unblocked Cholesky factorization on host %f \n",
        ((float) (t2 - t1)) / CLOCKS_PER_SEC);
    */
    
    /*print test_A*/
    /*
    if(do_io){
        printf("Unblocked result:\n\n");
        matprint(test_A2,N,'L');   
    }
    */ 

    /*Use tile algorithm*/
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
    #ifdef USE_MIC
//      mklmem(NB);
        printf("QUARK MIC affinity binding\n");
        QUARK_bind(quark);
        printf("offload warm up\n");
        warmup(quark);
    #endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark,test_A,N,NB,LDA,memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);
    #ifdef USE_MIC
        offload_destroy();
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){ 
                cuStatus = cublasDestroy(worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasShutdown();
        #endif
    #endif

    gflops = (double) N;
    gflops = gflops/3.0 + 0.5;
    gflops = gflops*(double)(N)*(double)(N);
    gflops = gflops/llttime/1024.0/1024.0/1024.0;
    printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
        N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops);
    #ifdef USE_MIC
        #pragma offload target(mic:0)
        {
            memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
        }
        printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0);
    #endif

    /*Update and print L*/             
    if(do_io){
        printf("L:\n\n");
        matprint(test_A,N,LDA,'L');
    }
#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
  /*
   * ------------------------
   * check difference betwen 
   * test_A and test_A2
   * ------------------------
   */
    /*
    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = j; i < N; i++)
          {
            double err = (test_A (i, j) - test_A2 (i, j));
            err = ABS (err);
            maxerr = MAX (err, maxerr);
            maxerr2 = maxerr2 + err * err;
          };
      };
    maxerr2 = sqrt (ABS (maxerr2));
    printf ("max difference between test_A and test_A2 %lf \n", maxerr);
    printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    };
    */

  /*
   * ------------------
   * over-write test_A2
   * ------------------
   */
   
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A2, &LDA, &i_zero,
         &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol);

  /*
   * ---------------------------------------
   * after solve, test_A2 should be identity
   * ---------------------------------------
   */
  // test_A = chol(B) = L;
  // test_A2 = B
  // solve L*L'*X = B
  // if L is correct, X is identity */
     
    {
    int uplo = 'L';
    const char *uplo_char = ((uplo == (int) 'U')
                    || (uplo == (int) 'u')) ? "U" : "L";
    int info = 0;
    int nrhs = N;
    int LDA = N;
    int ldb = N;
    dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
    assert (info == 0);
    }

    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = 0; i < N; i++)
          {
            double eyeij = (i == j) ? 1.0 : 0.0;
            double err = (test_A2 (i, j) - eyeij);
            err = ABS (err);
            maxerr = MAX (maxerr, err);
            maxerr2 = maxerr2 + err * err;
          };
      };

    maxerr2 = sqrt (ABS (maxerr2));
    printf("time %lf\n", MPI_Wtime() - llttime);
    printf ("max error %lf \n", maxerr);
    printf ("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A);test_A=NULL;
#ifdef VERIFY
    free(test_A2);test_A2=NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
    #undef test_A
    #undef test_A2
}
コード例 #2
0
extern "C" void offload_destroy_(){
    offload_destroy();    
}