void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){ int i,j,k; /* BLACS_PARAMS: [0] = ictxt; [1] = my_proc; [2] = total_procs; [3] = Nproc_row; [4] = Nproc_col; [5] = my_row; [6] = my_col; [7] = Global_rows; [8] = Global_columns; [9] = Block_Size_row; [10] = Block_Size_col; [11] = LOC_rows; [12] = LOC_columns; */ blacs_pinfo_( i, j); BLACS_PARAMS[1] = i; BLACS_PARAMS[2] = j; blacs_get_( -1, 0, k); // <- LG check the arguments of this call BLACS_PARAMS[0] = k; i = BLACS_PARAMS[3]; j = BLACS_PARAMS[4]; blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j); blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j); BLACS_PARAMS[5] = i; BLACS_PARAMS[6] = j; i = 0; BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]); BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]); i = 0; j = 0; descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8], BLACS_PARAMS[9], BLACS_PARAMS[10], i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); descinit_(DESCB, BLACS_PARAMS[7], 1, BLACS_PARAMS[9], 1, i, j, BLACS_PARAMS[0],BLACS_PARAMS[11], k); if (k != 0) fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k); }
/// TODO: I think there is a version of this in scalapackTools.h to use instead /// /// /// for a given context ICTXT, return the parameters of the ScaLAPACK /// /// This is slated to be re-worked during Cheshire m4. It will probably /// become a method on ScaLAPACK operator. /// /// static void getSlInfo(const slpp::int_t ICTXT, slpp::int_t& NPROW, slpp::int_t& NPCOL, slpp::int_t& MYPROW, slpp::int_t& MYPCOL, slpp::int_t& MYPNUM) { if(DBG) std::cerr << "getSlInfo: ICTXT: " << ICTXT << std::endl; NPROW=-1 ; NPCOL=-1 ; MYPROW=-1 ; MYPCOL=-1 ; MYPNUM= -1; blacs_gridinfo_(ICTXT, NPROW, NPCOL, MYPROW, MYPCOL); if(DBG) std::cerr << "getSlInfo: blacs_gridinfo_(ICTXT: "<<ICTXT<<") -> " << "NPROW: " << NPROW << ", NPCOL: " << NPCOL << ", MYPROW: " << MYPROW << ", MYPCOL: " << MYPCOL << std::endl; if(NPROW < 1 || NPCOL < 1) { std::cerr << "getSlInfo: blacs_gridinfo_ error -- aborting" << std::endl; ::blacs_abort_(ICTXT, 99); // something that does not look like a signal } if(MYPROW < 0 || MYPCOL < 0) { std::cerr << "getSlInfo: blacs_gridinfo_ error -- aborting" << std::endl; ::blacs_abort_(ICTXT, 99); // something that does not look like a signal } MYPNUM = blacs_pnum_(ICTXT, MYPROW, MYPCOL); if(DBG) std::cerr << "getSlInfo: blacs_pnum() -> MYPNUM: " << MYPNUM <<std::endl; }
int main(int argc, char **argv) { #define test_A(i,j) test_A[(size_t)(j)*N+(i)] #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)] int N,NB,w,LDA,BB; size_t memsize; //bytes int iam, nprocs, mydevice; int ICTXT, nprow, npcol, myprow, mypcol; int i_one = 1, i_zero = 0, i_negone = -1; double d_one = 1.0, d_zero = 0.0, d_negone = -1.0; int IASEED = 100; /* printf("N=?\n"); scanf("%ld",&N); printf("NB=?\n"); scanf("%d", &NB); printf("width of Y panel=?\n"); scanf("%ld",&w); */ if(argc < 4){ printf("invalid arguments N NB memsize(M)\n"); exit(1); } N = atoi(argv[1]); NB = atoi(argv[2]); memsize = (size_t)atoi(argv[3])*1024*1024; BB = (N + NB - 1) / NB; w = memsize/sizeof(double)/BB/NB/NB - 1; assert(w > 0); LDA = N + 0; //padding int do_io = (N <= NSIZE); double llttime; double gflops; nprow = npcol = 1; blacs_pinfo_(&iam, &nprocs); blacs_get_(&i_negone, &i_zero, &ICTXT); blacs_gridinit_(&ICTXT, "R", &nprow, &npcol); blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol); #ifdef USE_MIC #ifdef __INTEL_OFFLOAD printf("offload compilation enabled\ninitialize each MIC\n"); offload_init(&iam, &mydevice); #pragma offload target(mic:0) { mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE); } #else if(isroot) printf("offload compilation not enabled\n"); exit(0); #endif #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasCreate(&worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasInit(); #endif #endif double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol #ifdef VERIFY double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify #endif /*Initialize A */ int i,j; printf("Initialize A ... "); fflush(stdout); llttime = MPI_Wtime(); pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); llttime = MPI_Wtime() - llttime; printf("time %lf\n", llttime); /*print test_A*/ if(do_io){ printf("Original A=\n\n"); matprint(test_A, N, LDA, 'A'); } /*Use directed unblocked Cholesky factorization*/ /* t1 = clock(); Test_dpotrf(test_A2,N); t2 = clock(); printf ("time for unblocked Cholesky factorization on host %f \n", ((float) (t2 - t1)) / CLOCKS_PER_SEC); */ /*print test_A*/ /* if(do_io){ printf("Unblocked result:\n\n"); matprint(test_A2,N,'L'); } */ /*Use tile algorithm*/ Quark *quark = QUARK_New(OOC_NTHREADS); QUARK_DOT_DAG_Enable(quark, 0); #ifdef USE_MIC // mklmem(NB); printf("QUARK MIC affinity binding\n"); QUARK_bind(quark); printf("offload warm up\n"); warmup(quark); #endif QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0)); printf("LLT start %lf\n", MPI_Wtime()); llttime = Cholesky(quark,test_A,N,NB,LDA,memsize); printf("LLT end %lf\n", MPI_Wtime()); QUARK_Delete(quark); #ifdef USE_MIC offload_destroy(); #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasDestroy(worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasShutdown(); #endif #endif gflops = (double) N; gflops = gflops/3.0 + 0.5; gflops = gflops*(double)(N)*(double)(N); gflops = gflops/llttime/1024.0/1024.0/1024.0; printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n", N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops); #ifdef USE_MIC #pragma offload target(mic:0) { memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET); } printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0); #endif /*Update and print L*/ if(do_io){ printf("L:\n\n"); matprint(test_A,N,LDA,'L'); } #ifdef VERIFY printf("Verify... "); llttime = MPI_Wtime(); /* * ------------------------ * check difference betwen * test_A and test_A2 * ------------------------ */ /* { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = j; i < N; i++) { double err = (test_A (i, j) - test_A2 (i, j)); err = ABS (err); maxerr = MAX (err, maxerr); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf ("max difference between test_A and test_A2 %lf \n", maxerr); printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2); }; */ /* * ------------------ * over-write test_A2 * ------------------ */ pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A2, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); /* * --------------------------------------- * after solve, test_A2 should be identity * --------------------------------------- */ // test_A = chol(B) = L; // test_A2 = B // solve L*L'*X = B // if L is correct, X is identity */ { int uplo = 'L'; const char *uplo_char = ((uplo == (int) 'U') || (uplo == (int) 'u')) ? "U" : "L"; int info = 0; int nrhs = N; int LDA = N; int ldb = N; dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info); assert (info == 0); } { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = 0; i < N; i++) { double eyeij = (i == j) ? 1.0 : 0.0; double err = (test_A2 (i, j) - eyeij); err = ABS (err); maxerr = MAX (maxerr, err); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf("time %lf\n", MPI_Wtime() - llttime); printf ("max error %lf \n", maxerr); printf ("max L2 error %lf \n", maxerr2); } #endif free(test_A);test_A=NULL; #ifdef VERIFY free(test_A2);test_A2=NULL; #endif blacs_gridexit_(&ICTXT); blacs_exit_(&i_zero); return 0; #undef test_A #undef test_A2 }
int main() { const MKL_INT m = 1000; const MKL_INT k = 100000; const MKL_INT n = 1000; const MKL_INT nb = 100; const MKL_INT nprow = 2; const MKL_INT npcol = 2; MKL_INT iam, nprocs, ictxt, myrow, mycol; MDESC descA, descB, descC, descA_local, descB_local, descC_local; MKL_INT info; MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local; MKL_INT a_lld, b_lld, c_lld; blacs_pinfo_( &iam, &nprocs ); blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); double *a = 0; double *b = 0; double *c = 0; if (iam==0) { a = gen_a(m, k); b = gen_b(k, n); c = (double*)calloc(m*n, sizeof(double)); puts("a="); print(a, m, k); puts("b="); print(b, k, n); } a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol ); b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow ); b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) ); double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) ); double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) ); a_lld = MAX( a_m_local, 1 ); b_lld = MAX( b_m_local, 1 ); c_lld = MAX( c_m_local, 1 ); if (iam==0) { printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local); printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld); } descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info ); descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info ); descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info ); descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info ); printf("Rank %d: start distribute data\n", iam); pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); printf("Rank %d: finished distribute data\n", iam); if (iam==0) { puts("a"); print(A, a_m_local, a_n_local); puts("b"); print(B, b_m_local, b_n_local); } pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); printf("Rank %d: finished dgemm\n", iam); if (iam == 0) { puts("c"); print(C, c_m_local, c_n_local); } pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local); if (iam==0) { puts("global c"); print(c, m, n); } free(A); free(B); free(C); if (iam==0) { free(a); free(b); free(c); } blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); }
/*==== MAIN FUNCTION =================================================*/ int main( int argc, char *argv[] ){ /* ==== Declarations =================================================== */ /* File variables */ FILE *fin; /* Matrix descriptors */ MDESC descA, descB, descC, descA_local, descB_local; /* Local scalars */ MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol; MKL_INT n, nb, mp, nq, lld, lld_local; MKL_INT i, j, info; int n_int, nb_int, nprow_int, npcol_int; double thresh, diffnorm, anorm, bnorm, residual, eps; /* Local arrays */ double *A_local, *B_local, *A, *B, *C, *work; MKL_INT iwork[ 4 ]; /* ==== Executable statements ========================================== */ /* Get information about how many processes are used for program execution and number of current process */ blacs_pinfo_( &iam, &nprocs ); /* Init temporary 1D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "C", &nprocs, &i_one ); /* Open input file */ if ( iam == 0 ) { fin = fopen( "../in/pblas3ex.in", "r" ); if ( fin == NULL ) { printf( "Error while open input file." ); return 2; } } /* Read data and send it to all processes */ if ( iam == 0 ) { /* Read parameters */ fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int ); fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int ); fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int ); fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int ); fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh ); n = (MKL_INT) n_int; nb = (MKL_INT) nb_int; nprow = (MKL_INT) nprow_int; npcol = (MKL_INT) npcol_int; /* Check if all parameters are correct */ if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) { printf( "One or several input parameters has incorrect value. Limitations:\n" ); printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" ); printf( "p*q = number of processes\n" ); printf( "threshold - double (set to negative to swicth off check)\n"); return 2; } /* Pack data into array and send it to other processes */ iwork[ 0 ] = n; iwork[ 1 ] = nb; iwork[ 2 ] = nprow; iwork[ 3 ] = npcol; igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four ); dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one ); } else { /* Recieve and unpack data */ igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero ); dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero ); n = iwork[ 0 ]; nb = iwork[ 1 ]; nprow = iwork[ 2 ]; npcol = iwork[ 3 ]; } if ( iam == 0 ) { fclose( fin ); } /* Destroy temporary process grid */ blacs_gridexit_( &ictxt ); /* Init workind 2D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* Create on process 0 two matrices: A - orthonormal, B -random */ if ( ( myrow == 0 ) && ( mycol == 0 ) ){ /* Allocate arrays */ A_local = (double*) calloc( n*n, sizeof( double ) ); B_local = (double*) calloc( n*n, sizeof( double ) ); /* Set arrays */ for ( i=0; i<n; i++ ){ for ( j=0; j<n; j++ ){ B_local[ i+n*j ] = one*rand()/RAND_MAX; } B_local[ i+n*i ] += two; } for ( j=0; j<n; j++ ){ for ( i=0; i<n; i++ ){ if ( j < n-1 ){ if ( i <= j ){ A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) ); } else if ( i == j+1 ) { A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) ); } else { A_local[ i+n*j ] = zero; } } else { A_local[ i+n*(n-1) ] = one / sqrt( ( double )n ); } } } /* Print information of task */ printf( "=== START OF EXAMPLE ===================\n" ); printf( "Matrix-matrix multiplication: A*B = C\n\n" ); printf( "/ 1/q_1 ........ 1/q_n-1 1/q_n \\ \n" ); printf( "| . | \n" ); printf( "| `. : : | \n" ); printf( "| -1/q_1 `. : : | \n" ); printf( "| . `. : : | = A \n" ); printf( "| 0 `. ` | \n" ); printf( "| : `. `. 1/q_n-1 1/q_n | \n" ); printf( "| : `. `. | \n" ); printf( "\\ 0 .... 0 -(n-1)/q_n-1 1/q_n / \n\n" ); printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" ); printf( "A - n*n real matrix (orthonormal) \n" ); printf( "B - random n*n real matrix\n\n" ); printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol ); printf( "=== PROGRESS ===========================\n" ); } else { /* Other processes don't contain parts of initial arrays */ A_local = NULL; B_local = NULL; } /* Compute precise length of local pieces and allocate array on each process for parts of distributed vectors */ mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow ); nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); A = (double*) calloc( mp*nq, sizeof( double ) ); B = (double*) calloc( mp*nq, sizeof( double ) ); C = (double*) calloc( mp*nq, sizeof( double ) ); /* Compute leading dimensions */ lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 ); lld = MAX( mp, 1 ); /* Initialize descriptors for initial arrays located on 0 process */ descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); /* Initialize descriptors for distributed arrays */ descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); /* Distribute matrices from 0 process over process grid */ pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); } /* Destroy arrays on 0 process - they are not necessary anymore */ if( ( myrow == 0 ) && ( mycol == 0 ) ){ free( A_local ); free( B_local ); } /* Compute norm of A and B */ work = (double*) calloc( mp, sizeof( double ) ); anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work ); bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); } /* Compute product C = A*B */ pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); } /* Compute difference B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */ pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC, &negone, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); } /* Compute norm of B - inv_A*C (which is contained in B) */ diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); free( work ); if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); } /* Print results */ if( iam == 0 ){ printf( ".. Solutions are compared ..\n" ); printf( "== Results ==\n" ); printf( "||A|| = %03.11f\n", anorm ); printf( "||B|| = %03.11f\n", bnorm ); printf( "=== END OF EXAMPLE =====================\n" ); } /* Compute machine epsilon */ eps = pdlamch_( &ictxt, "e" ); /* Compute residual */ residual = diffnorm /( two*anorm*bnorm*eps ); /* Destroy arrays */ free( A ); free( B ); free( C ); /* Destroy process grid */ blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); /* Check if residual passed or failed the threshold */ if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){ printf( "FAILED. Residual = %05.16f\n", residual ); return 1; } else { return 0; } /*======================================================================== ====== End of PBLAS Level 3 example ==================================== ======================================================================*/ }
int main (int argc, char *argv[]) { myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL; myscalar elem; int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE]; int n; int nrhs; int nb; int locr, locc; int i, j, ii, jj; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal rdummy, res; n=1024; /* Size of the problem */ nrhs=3; /* Number of RHS */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); myreal pi=3.1416, d=0.1; for(i=1;i<=locr;i++) { for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol); dummy=std::max(1,locr); descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); Xtrue=new myscalar[locr*locc](); for(i=0;i<locr*locc;i++) Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX)); } else { descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initializing solution and intermediate vector space */ if(myid<nprow*npcol) { X=new myscalar[locr*locc](); Y=new myscalar[locr*locc](); } /* Initializing RHS as A*Xtrue */ if(myid<nprow*npcol) { B=new myscalar[locr*locc](); pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-12; sdp.split_HSS=768; /* Size of A11 */ /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Factorization */ sdp.partially_factor(A,descA,sdp.split_HSS); /* Schur complement update */ sdp.compute_schur(); /* Extracting a random element from the Schur complement */ if(myid<nprow*npcol) { descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else { descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } i=1+rand()%(n-sdp.split_HSS); j=1+rand()%(n-sdp.split_HSS); MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE); if(!myid) std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl; /* Condensation */ sdp.reduce_RHS(Y,descVec,B,descVec); /* Solve the Schur complement system (touches only the bottom part of X)*/ sdp.verbose=false; if(!myid) std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl; CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14); sdp.verbose=true; /* Expansion (touches only the top part of X) */ sdp.expand_solution(X,descVec,Y,descVec); /* Accuracy checking */ sdp.check_solution(A,descA,X,descVec,B,descVec); /* Forward error */ if(myid<nprow*npcol) { for(i=0;i<locr*locc;i++) Y[i]=X[i]-Xtrue[i]; res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy); res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy); if(!myid) std::cout << "Forward error = " << res << std::endl; } /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] X; delete[] Y; delete[] Xtrue; /* The end */ MPI_Finalize(); return 0; }
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) { /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */ int printit=250; int IA; int ctxt; int nprow, npcol; int myrow, mycol; int rsrc, csrc; int nb; int locr, locc; int i, j; int idummy, ierr; int neff; int it; int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE]; bool ingrid; myreal res; myscalar *x=NULL, *b=NULL; myscalar *r=NULL, *p=NULL, *Ap=NULL; myscalar alpha, rrprev, rrnext; IA=sdp->split_HSS+1; neff=n-IA+1; ctxt=descVec[BLACSctxt]; rsrc=descVec[BLACSrsrc]; csrc=descVec[BLACScsrc]; nb=descVec[BLACSmb]; blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); ingrid=myrow>=0 && mycol>=0; /* X and B have n rows. We create versions with * the last neff rows only that we pass to SDP. */ if(ingrid) { locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow); locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol); idummy=locr>1?locr:1; descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr); x=new myscalar[locr*locc](); b=new myscalar[locr*locc](); r=new myscalar[locr*locc](); p=new myscalar[locr*locc](); Ap=new myscalar[locr*locc](); } else { locr=0; locc=0; descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Descriptor for the scalar containing the result of the dot product */ if(ingrid) descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr); else descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); for(j=1;j<=nrhs;j++) { /* One RHS at a time */ if(ingrid) { pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc); pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc); } /* r = b - A x * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc); * */ if(ingrid) placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc); sdp->schur_product('N',NONE,x,desc,ONE,r,desc); /* p = r */ if(ingrid) placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc); /* "Previous" r' * r */ rrprev=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal); MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); it=1; while(it<=niter) { /* Ap = A * p * If the Schur was explitly in matrix A, the BLAS call would be * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc); * */ sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc); /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */ alpha=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal); MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); alpha=rrprev/alpha; /* x = x + alpha * p */ for(i=0;i<locr*locc;i++) x[i]+=alpha*p[i]; /* r = r - alpha * Ap */ for(i=0;i<locr*locc;i++) r[i]-=alpha*Ap[i]; /* "Next" r' * r */ rrnext=ZERO; if(ingrid) pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal); MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD); /* Residual */ res=sqrt(rrnext.real()); if(it%printit==0) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl; if(res<threshold) break; /* p = r + rrnext/rrprev * p */ for(i=0;i<locr*locc;i++) p[i]=r[i]+rrnext/rrprev*p[i]; /* "Previous" r' * r */ rrprev=rrnext; it++; } if(it>niter) it=niter; if(it%printit) if(!myrow && !mycol) std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl; /* Back to n-sized vector */ if(ingrid) pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec); } delete[] b; delete[] x; delete[] r; delete[] p; delete[] Ap; }
int main (int argc, char *argv[]) { myscalar *A=NULL, *B=NULL, *Btrue=NULL; int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE]; int n; int nb; int locr, locc; int i, j, ii, jj; int *I, *J; int nI, nJ; int ierr; int dummy; int myid, np; int myrow, mycol, nprow, npcol; int ctxt; myreal err; n=1024; /* Size of the problem */ nb=16; /* Blocksize for the 2D block-cyclic distribution */ /* Initialize MPI */ if((ierr=MPI_Init(&argc,&argv))) return 1; myid=-1; if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid))) return 1; np=-1; if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np))) return 1; /* Initialize the BLACS grid */ nprow=floor(sqrt((float)np)); npcol=np/nprow; blacs_get_(&IZERO,&IZERO,&ctxt); blacs_gridinit_(&ctxt,"R",&nprow,&npcol); blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol); /* A is a dense n x n distributed Toeplitz matrix */ if(myid<nprow*npcol) { locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol); A=new myscalar[locr*locc]; dummy=std::max(1,locr); descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); for(i=1;i<=locr;i++) for(j=1;j<=locc;j++) { ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow); jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol); // Toeplitz matrix from Quantum Chemistry. myreal pi=3.1416, d=0.1; A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2); } } else { descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); } /* Initialize the solver and set parameters */ StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD); sdp.use_HSS=true; sdp.levels_HSS=4; sdp.min_rand_HSS=64; sdp.lim_rand_HSS=0; sdp.tol_HSS=1e-6; /* Compression */ sdp.compress(A,descA); /* Accuracy checking */ sdp.check_compression(A,descA); /* Element extraction: a bunch of random indices. * Not that duplicates do not matter (the code works). */ nI=1+rand()%n; MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); I=new int[nI]; if(!myid) for(i=0;i<nI;i++) I[i]=1+rand()%n; MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD); nJ=1+rand()%n; MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD); J=new int[nJ]; if(!myid) for(j=0;j<nJ;j++) J[j]=1+rand()%n; MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD); /* Extraction for the HSS form */ if(myid<nprow*npcol) { locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow); locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol); B=new myscalar[locr*locc](); dummy=std::max(1,locr); descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr); } else descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE); sdp.extract(A,descA,B,descB,I,nI,J,nJ); /* Extraction from the original matrix just to compare */ sdp.use_HSS=false; if(myid<nprow*npcol) Btrue=new myscalar[locr*locc]; sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ); /* Comparison with elements of input matrix */ if(myid<nprow*npcol){ err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL); for(i=0;i<locr*locc;i++) Btrue[i]-=B[i]; err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err; } if(!myid) std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl; /* Statistics */ sdp.print_statistics(); /* Clean-up */ delete[] A; delete[] B; delete[] Btrue; delete[] I; delete[] J; /* The end */ MPI_Finalize(); return 0; }