int main(int argc, char **argv) { #define test_A(i,j) test_A[(size_t)(j)*N+(i)] #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)] int N,NB,w,LDA,BB; size_t memsize; //bytes int iam, nprocs, mydevice; int ICTXT, nprow, npcol, myprow, mypcol; int i_one = 1, i_zero = 0, i_negone = -1; double d_one = 1.0, d_zero = 0.0, d_negone = -1.0; int IASEED = 100; /* printf("N=?\n"); scanf("%ld",&N); printf("NB=?\n"); scanf("%d", &NB); printf("width of Y panel=?\n"); scanf("%ld",&w); */ if(argc < 4){ printf("invalid arguments N NB memsize(M)\n"); exit(1); } N = atoi(argv[1]); NB = atoi(argv[2]); memsize = (size_t)atoi(argv[3])*1024*1024; BB = (N + NB - 1) / NB; w = memsize/sizeof(double)/BB/NB/NB - 1; assert(w > 0); LDA = N + 0; //padding int do_io = (N <= NSIZE); double llttime; double gflops; nprow = npcol = 1; blacs_pinfo_(&iam, &nprocs); blacs_get_(&i_negone, &i_zero, &ICTXT); blacs_gridinit_(&ICTXT, "R", &nprow, &npcol); blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol); #ifdef USE_MIC #ifdef __INTEL_OFFLOAD printf("offload compilation enabled\ninitialize each MIC\n"); offload_init(&iam, &mydevice); #pragma offload target(mic:0) { mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE); } #else if(isroot) printf("offload compilation not enabled\n"); exit(0); #endif #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasCreate(&worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasInit(); #endif #endif double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol #ifdef VERIFY double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify #endif /*Initialize A */ int i,j; printf("Initialize A ... "); fflush(stdout); llttime = MPI_Wtime(); pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); llttime = MPI_Wtime() - llttime; printf("time %lf\n", llttime); /*print test_A*/ if(do_io){ printf("Original A=\n\n"); matprint(test_A, N, LDA, 'A'); } /*Use directed unblocked Cholesky factorization*/ /* t1 = clock(); Test_dpotrf(test_A2,N); t2 = clock(); printf ("time for unblocked Cholesky factorization on host %f \n", ((float) (t2 - t1)) / CLOCKS_PER_SEC); */ /*print test_A*/ /* if(do_io){ printf("Unblocked result:\n\n"); matprint(test_A2,N,'L'); } */ /*Use tile algorithm*/ Quark *quark = QUARK_New(OOC_NTHREADS); QUARK_DOT_DAG_Enable(quark, 0); #ifdef USE_MIC // mklmem(NB); printf("QUARK MIC affinity binding\n"); QUARK_bind(quark); printf("offload warm up\n"); warmup(quark); #endif QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0)); printf("LLT start %lf\n", MPI_Wtime()); llttime = Cholesky(quark,test_A,N,NB,LDA,memsize); printf("LLT end %lf\n", MPI_Wtime()); QUARK_Delete(quark); #ifdef USE_MIC offload_destroy(); #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasDestroy(worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasShutdown(); #endif #endif gflops = (double) N; gflops = gflops/3.0 + 0.5; gflops = gflops*(double)(N)*(double)(N); gflops = gflops/llttime/1024.0/1024.0/1024.0; printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n", N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops); #ifdef USE_MIC #pragma offload target(mic:0) { memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET); } printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0); #endif /*Update and print L*/ if(do_io){ printf("L:\n\n"); matprint(test_A,N,LDA,'L'); } #ifdef VERIFY printf("Verify... "); llttime = MPI_Wtime(); /* * ------------------------ * check difference betwen * test_A and test_A2 * ------------------------ */ /* { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = j; i < N; i++) { double err = (test_A (i, j) - test_A2 (i, j)); err = ABS (err); maxerr = MAX (err, maxerr); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf ("max difference between test_A and test_A2 %lf \n", maxerr); printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2); }; */ /* * ------------------ * over-write test_A2 * ------------------ */ pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A2, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); /* * --------------------------------------- * after solve, test_A2 should be identity * --------------------------------------- */ // test_A = chol(B) = L; // test_A2 = B // solve L*L'*X = B // if L is correct, X is identity */ { int uplo = 'L'; const char *uplo_char = ((uplo == (int) 'U') || (uplo == (int) 'u')) ? "U" : "L"; int info = 0; int nrhs = N; int LDA = N; int ldb = N; dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info); assert (info == 0); } { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = 0; i < N; i++) { double eyeij = (i == j) ? 1.0 : 0.0; double err = (test_A2 (i, j) - eyeij); err = ABS (err); maxerr = MAX (maxerr, err); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf("time %lf\n", MPI_Wtime() - llttime); printf ("max error %lf \n", maxerr); printf ("max L2 error %lf \n", maxerr2); } #endif free(test_A);test_A=NULL; #ifdef VERIFY free(test_A2);test_A2=NULL; #endif blacs_gridexit_(&ICTXT); blacs_exit_(&i_zero); return 0; #undef test_A #undef test_A2 }
/*==== MAIN FUNCTION =================================================*/ int main( int argc, char *argv[] ){ /* ==== Declarations =================================================== */ /* File variables */ FILE *fin; /* Matrix descriptors */ MDESC descA, descB, descC, descA_local, descB_local; /* Local scalars */ MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol; MKL_INT n, nb, mp, nq, lld, lld_local; MKL_INT i, j, info; int n_int, nb_int, nprow_int, npcol_int; double thresh, diffnorm, anorm, bnorm, residual, eps; /* Local arrays */ double *A_local, *B_local, *A, *B, *C, *work; MKL_INT iwork[ 4 ]; /* ==== Executable statements ========================================== */ /* Get information about how many processes are used for program execution and number of current process */ blacs_pinfo_( &iam, &nprocs ); /* Init temporary 1D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "C", &nprocs, &i_one ); /* Open input file */ if ( iam == 0 ) { fin = fopen( "../in/pblas3ex.in", "r" ); if ( fin == NULL ) { printf( "Error while open input file." ); return 2; } } /* Read data and send it to all processes */ if ( iam == 0 ) { /* Read parameters */ fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int ); fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int ); fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int ); fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int ); fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh ); n = (MKL_INT) n_int; nb = (MKL_INT) nb_int; nprow = (MKL_INT) nprow_int; npcol = (MKL_INT) npcol_int; /* Check if all parameters are correct */ if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) { printf( "One or several input parameters has incorrect value. Limitations:\n" ); printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" ); printf( "p*q = number of processes\n" ); printf( "threshold - double (set to negative to swicth off check)\n"); return 2; } /* Pack data into array and send it to other processes */ iwork[ 0 ] = n; iwork[ 1 ] = nb; iwork[ 2 ] = nprow; iwork[ 3 ] = npcol; igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four ); dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one ); } else { /* Recieve and unpack data */ igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero ); dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero ); n = iwork[ 0 ]; nb = iwork[ 1 ]; nprow = iwork[ 2 ]; npcol = iwork[ 3 ]; } if ( iam == 0 ) { fclose( fin ); } /* Destroy temporary process grid */ blacs_gridexit_( &ictxt ); /* Init workind 2D process grid */ blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* Create on process 0 two matrices: A - orthonormal, B -random */ if ( ( myrow == 0 ) && ( mycol == 0 ) ){ /* Allocate arrays */ A_local = (double*) calloc( n*n, sizeof( double ) ); B_local = (double*) calloc( n*n, sizeof( double ) ); /* Set arrays */ for ( i=0; i<n; i++ ){ for ( j=0; j<n; j++ ){ B_local[ i+n*j ] = one*rand()/RAND_MAX; } B_local[ i+n*i ] += two; } for ( j=0; j<n; j++ ){ for ( i=0; i<n; i++ ){ if ( j < n-1 ){ if ( i <= j ){ A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) ); } else if ( i == j+1 ) { A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) ); } else { A_local[ i+n*j ] = zero; } } else { A_local[ i+n*(n-1) ] = one / sqrt( ( double )n ); } } } /* Print information of task */ printf( "=== START OF EXAMPLE ===================\n" ); printf( "Matrix-matrix multiplication: A*B = C\n\n" ); printf( "/ 1/q_1 ........ 1/q_n-1 1/q_n \\ \n" ); printf( "| . | \n" ); printf( "| `. : : | \n" ); printf( "| -1/q_1 `. : : | \n" ); printf( "| . `. : : | = A \n" ); printf( "| 0 `. ` | \n" ); printf( "| : `. `. 1/q_n-1 1/q_n | \n" ); printf( "| : `. `. | \n" ); printf( "\\ 0 .... 0 -(n-1)/q_n-1 1/q_n / \n\n" ); printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" ); printf( "A - n*n real matrix (orthonormal) \n" ); printf( "B - random n*n real matrix\n\n" ); printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol ); printf( "=== PROGRESS ===========================\n" ); } else { /* Other processes don't contain parts of initial arrays */ A_local = NULL; B_local = NULL; } /* Compute precise length of local pieces and allocate array on each process for parts of distributed vectors */ mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow ); nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); A = (double*) calloc( mp*nq, sizeof( double ) ); B = (double*) calloc( mp*nq, sizeof( double ) ); C = (double*) calloc( mp*nq, sizeof( double ) ); /* Compute leading dimensions */ lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 ); lld = MAX( mp, 1 ); /* Initialize descriptors for initial arrays located on 0 process */ descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info ); /* Initialize descriptors for distributed arrays */ descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info ); /* Distribute matrices from 0 process over process grid */ pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); } /* Destroy arrays on 0 process - they are not necessary anymore */ if( ( myrow == 0 ) && ( mycol == 0 ) ){ free( A_local ); free( B_local ); } /* Compute norm of A and B */ work = (double*) calloc( mp, sizeof( double ) ); anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work ); bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); } /* Compute product C = A*B */ pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); } /* Compute difference B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */ pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC, &negone, B, &i_one, &i_one, descB ); if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); } /* Compute norm of B - inv_A*C (which is contained in B) */ diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work ); free( work ); if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); } /* Print results */ if( iam == 0 ){ printf( ".. Solutions are compared ..\n" ); printf( "== Results ==\n" ); printf( "||A|| = %03.11f\n", anorm ); printf( "||B|| = %03.11f\n", bnorm ); printf( "=== END OF EXAMPLE =====================\n" ); } /* Compute machine epsilon */ eps = pdlamch_( &ictxt, "e" ); /* Compute residual */ residual = diffnorm /( two*anorm*bnorm*eps ); /* Destroy arrays */ free( A ); free( B ); free( C ); /* Destroy process grid */ blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); /* Check if residual passed or failed the threshold */ if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){ printf( "FAILED. Residual = %05.16f\n", residual ); return 1; } else { return 0; } /*======================================================================== ====== End of PBLAS Level 3 example ==================================== ======================================================================*/ }
int main() { const MKL_INT m = 1000; const MKL_INT k = 100000; const MKL_INT n = 1000; const MKL_INT nb = 100; const MKL_INT nprow = 2; const MKL_INT npcol = 2; MKL_INT iam, nprocs, ictxt, myrow, mycol; MDESC descA, descB, descC, descA_local, descB_local, descC_local; MKL_INT info; MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local; MKL_INT a_lld, b_lld, c_lld; blacs_pinfo_( &iam, &nprocs ); blacs_get_( &i_negone, &i_zero, &ictxt ); blacs_gridinit_( &ictxt, "R", &nprow, &npcol ); blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); double *a = 0; double *b = 0; double *c = 0; if (iam==0) { a = gen_a(m, k); b = gen_b(k, n); c = (double*)calloc(m*n, sizeof(double)); puts("a="); print(a, m, k); puts("b="); print(b, k, n); } a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol ); b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow ); b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow ); c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol ); double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) ); double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) ); double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) ); a_lld = MAX( a_m_local, 1 ); b_lld = MAX( b_m_local, 1 ); c_lld = MAX( c_m_local, 1 ); if (iam==0) { printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local); printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld); } descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info ); descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info ); descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info ); descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info ); descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info ); printf("Rank %d: start distribute data\n", iam); pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA ); pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB ); printf("Rank %d: finished distribute data\n", iam); if (iam==0) { puts("a"); print(A, a_m_local, a_n_local); puts("b"); print(B, b_m_local, b_n_local); } pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB, &zero, C, &i_one, &i_one, descC ); printf("Rank %d: finished dgemm\n", iam); if (iam == 0) { puts("c"); print(C, c_m_local, c_n_local); } pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local); if (iam==0) { puts("global c"); print(c, m, n); } free(A); free(B); free(C); if (iam==0) { free(a); free(b); free(c); } blacs_gridexit_( &ictxt ); blacs_exit_( &i_zero ); }
int MAIN__(int argc, char** argv) { int num; // number of data int dim; // dimension of each data int nprow=4; // number of row int npcol=1; // number of columnn int zero=0, one=1; // constant value int ictxt,myrow,mycol,pnum,pdim,info; char ifilename[LEN_FILENAME]; char ofilename[LEN_FILENAME]; int myproc, nprocs; Cblacs_pinfo(&myproc, &nprocs); Cblacs_setup(&myproc, &nprocs); Cblacs_get(-1,0,&ictxt); nprow = nprocs; npcol = 1; // fixed char order[] = "Row"; Cblacs_gridinit(&ictxt, order, nprow, npcol); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); if (DEBUG_MODE) { printf("ConTxt = %d\n", ictxt); printf("nprocs=%d, nprow=%d, npcol=%d\n", nprocs, nprow, npcol); printf("nprocs=%d, myrow=%d, mycol=%d\n", nprocs, myrow, mycol); } get_option(argc, argv, ifilename, ofilename, &num, &dim); // 0. cosinedist(ij) = 1 - V(i)V(j)/(Length(V(i))*Length(V(j))) // 1. calculate submatrix size int bsize = num / nprow; // blocking factor pnum = num / nprow; pdim = dim; if ( myrow < (num/bsize)%nprow) { pnum += bsize; } else if ( myrow == (num/bsize)%nprow) { pnum += (num % bsize); } else { } if(DEBUG_MODE) printf("myproc=%d: pnum=%d, pdim=%d, bsize=%d\n", myproc, pnum, pdim, bsize); int desc_input[9], desc_v[9], desc_ip[9], desc_n[9], desc_result[9]; descinit_(desc_input, &num, &dim, &num, &dim, &zero, &zero, &ictxt, &num, &info); descinit_(desc_v, &num, &dim, &bsize, &pdim, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_ip, &num, &num, &bsize, &num, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_n, &num, &one, &bsize, &one, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_result, &num, &num, &num, &num, &zero, &zero, &ictxt, &num, &info); // 2. read input data double* input; if (myproc == 0) { input = (double*)malloc(sizeof(double)*num*dim); memset(input, 0, sizeof(double)*num*dim); read_data(ifilename, num, dim, input); printArray("input", myproc, input, num, dim); } // 3. distribute input data array double* V = (double*)malloc(sizeof(double)*pnum*pdim); memset(V, 0, sizeof(double)*pnum*pdim); Cpdgemr2d(num, dim, input, 1, 1, desc_input, V, 1, 1, desc_v, ictxt); printArray("V", myproc, V, pnum, pdim); // 4. InnerProduct = VV' double* InnerProduct = (double*)malloc(sizeof(double)*pnum*num); memset(InnerProduct, 0, sizeof(double)*pnum*num); char transa = 'N', transb = 'T'; int m = num, n = num, k = dim; int lda = num, ldb = num, ldc = num; double alpha = 1.0f, beta = 0.0f; pdgemm_(&transa, &transb, &m, &n, &k, &alpha, V, &one, &one, desc_v, V, &one, &one, desc_v, &beta, InnerProduct, &one, &one, desc_ip); printArray("InnerProduct", myproc, InnerProduct, pnum, num); // 5. Norm of each vector double* Norm = (double*)malloc(sizeof(double)*pnum); for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+(i/bsize)*(nprocs-1)*bsize+i)*pnum + i; Norm[i] = sqrt(InnerProduct[n]); } printArray("Norm", myproc, Norm, 1, pnum); // 6. Norm product matrix double* NormProduct = (double*)malloc(sizeof(double)*pnum*num); memset(NormProduct, 0, sizeof(double)*pnum*num); char uplo = 'U'; n = num; alpha = 1.0f; int incx = 1; lda = num; pdsyr_(&uplo, &n, &alpha, Norm, &one, &one, desc_n, &incx, NormProduct, &one, &one, desc_ip); printArray("NormProduct", myproc, NormProduct, pnum, num); // 7. CosineDistance(ij) = 1-InnerProduct(ij)/NormProduct(ij) double* CosineDistance = (double*)malloc(sizeof(double)*pnum*num); memset(CosineDistance, 0, sizeof(double)*pnum*num); for (int j = 0; j < num; j++) { for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+i+(i/bsize)*(nprocs-1)*bsize)*pnum+i; int p = i+j*pnum; if (p<=n) { CosineDistance[p] = 0.0; } else { CosineDistance[p] = 1 - InnerProduct[p]/NormProduct[p]; } } } printArray("CosineDistance", myproc, CosineDistance, pnum, num); // 8. gather result double* result; if ( myproc == 0 ) { result = (double*)malloc(sizeof(double)*num*num); memset(result, 0, sizeof(double)*num*num); } Cpdgemr2d(num, num, CosineDistance, 1, 1, desc_ip, result, 1, 1, desc_result, ictxt); // 9. output to file if ( myproc == 0 ) { output_results(ofilename, result, num, num); } // a. cleanup memory free(V); free(InnerProduct); free(Norm); free(NormProduct); free(CosineDistance); if ( myproc == 0 ) { free(input); free(result); } blacs_exit_(&zero); return 0; }