extern "C" magma_context * magma_init( void *params, void* (*func)(void *a), magma_int_t nthread, magma_int_t ncpu, magma_int_t ngpu, magma_int_t argc, char **argv) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= This function initializes the hardware context to be used for subsequent calls to routines in the MAGMA library. Arguments ========= NCPU (input) INTEGER Number of CPU cores to be used in the computations. NGPU (input) INTEGER Number of GPU cores to be used in the computations. ===================================================================== */ t_params **tp = (t_params**)malloc(sizeof(t_params*)*nthread); pthread_t *thread; magma_int_t i; magma_context *context; context = (magma_context *)malloc(sizeof(magma_context)); if (nthread > 0) { thread = (pthread_t*)malloc(sizeof(pthread_t)*nthread); for (i = 0; i < nthread; i++){ tp[i] = (t_params*)malloc(sizeof(t_params)); tp[i]->params = params; tp[i]->tid = i; pthread_create(&thread[i], NULL, func, (void *)tp[i]); } } if (ncpu <= 1) ncpu = 1; if (ngpu <= 0) ngpu = 0; context->num_cores = ncpu; context->num_gpus = ngpu; if (ncpu > 1) { /* Initialize the QUARK scheduler */ context->quark = QUARK_New(ncpu); } if (ngpu > 1) { printf("The requested number of GPUs is not yet supported.\n\n"); printf("The number of GPUs set to one.\n\n"); context->num_gpus = 1; } if (ngpu == 1) { CUdevice dev; context->gpu_context = (CUcontext *)malloc(ngpu * sizeof(CUcontext)); /* For now we use by default device 0, always */ if( CUDA_SUCCESS != cuInit( 0 ) ) { fprintf(stderr, "CUDA: Not initialized\n" ); exit(-1); } if( CUDA_SUCCESS != cuDeviceGet( &dev, 0 ) ) { fprintf(stderr, "CUDA: Cannot get the device\n"); exit(-1); } if( CUDA_SUCCESS != cuCtxCreate( &context->gpu_context[0], 0, dev ) ) { fprintf(stderr, "CUDA: Cannot create the context\n"); exit(-1); } if( CUDA_SUCCESS != cublasInit( ) ) { fprintf(stderr, "CUBLAS: 
Not initialized\n"); exit(-1); } printout_devices( ); } context->nb = -1; for(i = 1; i<argc; i++) if (strcmp("-b", argv[i])==0) context->nb = atoi(argv[++i]); return context; }
int main(int argc, char **argv) { #define test_A(i,j) test_A[(size_t)(j)*N+(i)] #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)] int N,NB,w,LDA,BB; size_t memsize; //bytes int iam, nprocs, mydevice; int ICTXT, nprow, npcol, myprow, mypcol; int i_one = 1, i_zero = 0, i_negone = -1; double d_one = 1.0, d_zero = 0.0, d_negone = -1.0; int IASEED = 100; /* printf("N=?\n"); scanf("%ld",&N); printf("NB=?\n"); scanf("%d", &NB); printf("width of Y panel=?\n"); scanf("%ld",&w); */ if(argc < 4){ printf("invalid arguments N NB memsize(M)\n"); exit(1); } N = atoi(argv[1]); NB = atoi(argv[2]); memsize = (size_t)atoi(argv[3])*1024*1024; BB = (N + NB - 1) / NB; w = memsize/sizeof(double)/BB/NB/NB - 1; assert(w > 0); LDA = N + 0; //padding int do_io = (N <= NSIZE); double llttime; double gflops; nprow = npcol = 1; blacs_pinfo_(&iam, &nprocs); blacs_get_(&i_negone, &i_zero, &ICTXT); blacs_gridinit_(&ICTXT, "R", &nprow, &npcol); blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol); #ifdef USE_MIC #ifdef __INTEL_OFFLOAD printf("offload compilation enabled\ninitialize each MIC\n"); offload_init(&iam, &mydevice); #pragma offload target(mic:0) { mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE); } #else if(isroot) printf("offload compilation not enabled\n"); exit(0); #endif #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasCreate(&worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasInit(); #endif #endif double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol #ifdef VERIFY double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify #endif /*Initialize A */ int i,j; printf("Initialize A ... 
"); fflush(stdout); llttime = MPI_Wtime(); pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); llttime = MPI_Wtime() - llttime; printf("time %lf\n", llttime); /*print test_A*/ if(do_io){ printf("Original A=\n\n"); matprint(test_A, N, LDA, 'A'); } /*Use directed unblocked Cholesky factorization*/ /* t1 = clock(); Test_dpotrf(test_A2,N); t2 = clock(); printf ("time for unblocked Cholesky factorization on host %f \n", ((float) (t2 - t1)) / CLOCKS_PER_SEC); */ /*print test_A*/ /* if(do_io){ printf("Unblocked result:\n\n"); matprint(test_A2,N,'L'); } */ /*Use tile algorithm*/ Quark *quark = QUARK_New(OOC_NTHREADS); QUARK_DOT_DAG_Enable(quark, 0); #ifdef USE_MIC // mklmem(NB); printf("QUARK MIC affinity binding\n"); QUARK_bind(quark); printf("offload warm up\n"); warmup(quark); #endif QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0)); printf("LLT start %lf\n", MPI_Wtime()); llttime = Cholesky(quark,test_A,N,NB,LDA,memsize); printf("LLT end %lf\n", MPI_Wtime()); QUARK_Delete(quark); #ifdef USE_MIC offload_destroy(); #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasDestroy(worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasShutdown(); #endif #endif gflops = (double) N; gflops = gflops/3.0 + 0.5; gflops = gflops*(double)(N)*(double)(N); gflops = gflops/llttime/1024.0/1024.0/1024.0; printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n", N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops); #ifdef USE_MIC #pragma offload target(mic:0) { memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET); } printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0); #endif /*Update and print L*/ if(do_io){ printf("L:\n\n"); matprint(test_A,N,LDA,'L'); } #ifdef VERIFY printf("Verify... 
"); llttime = MPI_Wtime(); /* * ------------------------ * check difference betwen * test_A and test_A2 * ------------------------ */ /* { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = j; i < N; i++) { double err = (test_A (i, j) - test_A2 (i, j)); err = ABS (err); maxerr = MAX (err, maxerr); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf ("max difference between test_A and test_A2 %lf \n", maxerr); printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2); }; */ /* * ------------------ * over-write test_A2 * ------------------ */ pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A2, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); /* * --------------------------------------- * after solve, test_A2 should be identity * --------------------------------------- */ // test_A = chol(B) = L; // test_A2 = B // solve L*L'*X = B // if L is correct, X is identity */ { int uplo = 'L'; const char *uplo_char = ((uplo == (int) 'U') || (uplo == (int) 'u')) ? "U" : "L"; int info = 0; int nrhs = N; int LDA = N; int ldb = N; dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info); assert (info == 0); } { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = 0; i < N; i++) { double eyeij = (i == j) ? 1.0 : 0.0; double err = (test_A2 (i, j) - eyeij); err = ABS (err); maxerr = MAX (maxerr, err); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf("time %lf\n", MPI_Wtime() - llttime); printf ("max error %lf \n", maxerr); printf ("max L2 error %lf \n", maxerr2); } #endif free(test_A);test_A=NULL; #ifdef VERIFY free(test_A2);test_A2=NULL; #endif blacs_gridexit_(&ICTXT); blacs_exit_(&i_zero); return 0; #undef test_A #undef test_A2 }
/* Try various ways to do matmul and time them.  Tiled algorithms
 * running serially; multi-threaded QUARK runtime with tiled
 * algorithms; and direct serial computation over standard layout.
 *
 * NB      - tile size (must be positive)
 * N       - matrix dimension (must be positive); N/NB tiles are used per
 *           dimension
 * THREADS - number of threads handed to QUARK_New
 *
 * Returns 0 when all three variants ran, -1 when the arguments are invalid
 * or the work matrices cannot be allocated.
 */
int main_algorithm(int NB, int N, int THREADS)
{
    int i, j, k, nerr=0;
    int rc = -1;
    int BB;
    size_t nelem;       /* elements in one N x N matrix */
    size_t tile;        /* elements in one NB x NB tile */
    double *A = NULL, *Ablk = NULL;
    double *B = NULL, *Bblk = NULL;
    double *C_direct = NULL;
    double *C = NULL, *Cblk = NULL;
    double *C_quark = NULL, *C_quark_blk = NULL;
    Quark *quark = NULL;
    struct timeval tstart, tend, tdiff;
    double t_blk=0, t_quark=0, t_direct=0;

    /* NB == 0 would divide by zero below; negative sizes are meaningless. */
    if (N <= 0 || NB <= 0) {
        fprintf(stderr, "main_algorithm: N and NB must be positive (N=%d, NB=%d)\n", N, NB);
        return -1;
    }
    BB = N/NB;

    /* Do the size arithmetic in size_t so it cannot overflow int. */
    nelem = (size_t)N * (size_t)N;
    tile = (size_t)NB * (size_t)NB;
    if (nelem > (size_t)-1 / sizeof(double)) {
        fprintf(stderr, "main_algorithm: N=%d is too large\n", N);
        return -1;
    }

    A = (double*)malloc(nelem*sizeof(double));
    Ablk = (double*)malloc(nelem*sizeof(double));
    B = (double*)malloc(nelem*sizeof(double));
    Bblk = (double*)malloc(nelem*sizeof(double));
    C_direct = (double*)malloc(nelem*sizeof(double));
    C = (double*)malloc(nelem*sizeof(double));
    Cblk = (double*)malloc(nelem*sizeof(double));
    C_quark = (double*)malloc(nelem*sizeof(double));
    C_quark_blk = (double*)malloc(nelem*sizeof(double));
    if (!A || !Ablk || !B || !Bblk || !C_direct || !C || !Cblk || !C_quark || !C_quark_blk) {
        fprintf(stderr, "main_algorithm: cannot allocate the work matrices\n");
        goto cleanup;
    }

    // Initialize
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            size_t idx = (size_t)i + (size_t)j*N;
            A[idx] = (double)1.0+i;
            B[idx] = (double)2.0+i+j;
            C_quark[idx] = C_direct[idx] = C[idx] = 3.0;
        }
    }

    matrix_print("Printing A", A, N);
    matrix_print("Printing B", B, N);
    matrix_print("Printing C before computation", C, N);

    // Move from F77 to BDL
    std_to_bdl( A, Ablk, N, NB );
    std_to_bdl( B, Bblk, N, NB );
    std_to_bdl( C, Cblk, N, NB );
    std_to_bdl( C_quark, C_quark_blk, N, NB );

    /* ORIGINAL TILED ROUTINE */
    /* This is the code for the serial tile-by-tile multiplication */
    printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul ( &Ablk[tile*i + tile*BB*k],
                         &Bblk[tile*k + tile*BB*j],
                         &Cblk[tile*i + tile*BB*j], NB);
    gettimeofday( &tend, NULL );
    t_blk = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    bdl_to_std( C, Cblk, N, NB );
    matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
    printf("\n");

    /* QUARK PARALLEL TILED ROUTINE */
    /* This is the code for the QUARK runtime to do the parallel
     * multi-threaded tile-by-tile algorithm */
    printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
    quark = QUARK_New(THREADS);
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul_quark_call ( quark,
                                    &Ablk[tile*i + tile*BB*k],
                                    &Bblk[tile*k + tile*BB*j],
                                    &C_quark_blk[tile*i + tile*BB*j], NB);
    /* Wait for every submitted task before stopping the clock. */
    QUARK_Barrier( quark );
    gettimeofday( &tend, NULL );
    t_quark = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    QUARK_Delete(quark);
    bdl_to_std( C_quark, C_quark_blk, N, NB );
    matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
    printf("\n");

    /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
    /* Compute direct C if desired */
    printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
    gettimeofday( &tstart, NULL );
    matmul ( A, B, C_direct, N );
    gettimeofday( &tend, NULL );
    t_direct = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) );
    matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
    printf("\n");

    /* Check for errors */
    printf("Comparing result matrices (direct versus QUARK)\n");
    nerr = matrix_compare( C_direct, C_quark, N );
    printf("Number of differences: %d\n", nerr);
    printf("\n");

    printf("Summary of time taken\n");
    printf("Direct SerialBlock QUARK(%d threads)\n", THREADS);
    printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);

    rc = 0;

cleanup:
    /* free(NULL) is a no-op, so a partially failed allocation is fine. */
    free(A);
    free(Ablk);
    free(B);
    free(Bblk);
    free(C);
    free(Cblk);
    free(C_direct);
    free(C_quark);
    free(C_quark_blk);
    return rc;
}