void fastSineTransform(ColumnMatrix localMatrix, Vector fstBuffer)
{
  int problemSize = localMatrix->globalSize + 1;
  for (int column = 0; column < localMatrix->localSize; column++) {
    fst_(&localMatrix->data[column * localMatrix->globalSize], &problemSize,
         fstBuffer->data, &fstBuffer->localSize);
  }
}
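/* All of the listings below call the Fortran kernels fst_ and fstinv_ without
 * showing a declaration. A minimal sketch of the C prototypes they appear to
 * assume, inferred from the call sites (a data vector of length n-1 transformed
 * in place, the problem size n, and a work buffer of length nn = 4n); the exact
 * header they come from is not shown here. */
void fst_(double *v, int *n, double *w, int *nn);
void fstinv_(double *v, int *n, double *w, int *nn);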
int main(int argc, char **argv)
{
  double wall_start = MPI_Wtime();
  Real *diag, **b, **bt, **z;
  Real pi, h, omp_local_max, local_max, global_max;
  int i, j, omp_id;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  omp_tot_threads = omp_get_max_threads();

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees of freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */
  if (argc < 2) {
    if (mpi_rank == 0) {
      printf("need a problem size\n");
    }
    MPI_Finalize();
    return 0;
  }

  n = atoi(argv[1]);
  m = n - 1;
  // mpi_work is the amount of work done by each MPI rank. The last rank may do
  // slightly less work than the others, but that is the closest we get to
  // proper load balancing.
  mpi_work = 1 + ((m - 1) / mpi_size);
  nn = 4 * n;

  diag = createRealArray(m);
  b  = createReal2DArray(mpi_work, mpi_size * mpi_work);
  bt = createReal2DArray(mpi_work, mpi_size * mpi_work);
  z  = createReal2DArray(omp_tot_threads, nn);

  h = 1. / (Real)n;
  pi = 4. * atan(1.);

  #pragma omp parallel for private(i)
  for (i = 0; i < m; i++) {
    diag[i] = 2. * (1. - cos((i + 1) * pi / (Real)n));
  }

  #pragma omp parallel for private(j, i)
  for (j = 0; j < mpi_work; j++) {                              // MPI
    for (i = 0; j + mpi_work * mpi_rank < m && i < m; i++) {    // OMP
      int k = j + mpi_work * mpi_rank;
      b[j][i] = exp((Real)k) * sin(2.0 * pi * k) * sin(2.0 * i);
    }
  }

  #pragma omp parallel for private(omp_id, i)
  for (j = 0; j < mpi_work; j++) {                              // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fst_(b[j], &n, z[omp_id], &nn);
  }

  transpose(bt, b);

  #pragma omp parallel for private(i, omp_id) schedule(static)
  for (i = 0; i < mpi_work; i++) {                              // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fstinv_(bt[i], &n, z[omp_id], &nn);
  }

  #pragma omp parallel for private(j, i)
  for (j = 0; j < mpi_work; j++) {                              // MPI
    for (i = 0; i < m; i++) {
      bt[j][i] = bt[j][i] / (diag[i] + diag[j + mpi_work * mpi_rank]);
    }
  }

  #pragma omp parallel for private(i, omp_id) schedule(static)
  for (i = 0; i < mpi_work; i++) {                              // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fst_(bt[i], &n, z[omp_id], &nn);
  }

  transpose(b, bt);

  #pragma omp parallel for private(j, omp_id)
  for (j = 0; j < mpi_work; j++) {                              // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fstinv_(b[j], &n, z[omp_id], &nn);
  }

  local_max = 0.0;
  omp_local_max = 0.0;
  #pragma omp parallel shared(local_max) private(j, i) firstprivate(omp_local_max)
  {
    // MPI: only work in range (and handle the last rank's overflow)
    #pragma omp for nowait
    for (j = 0; j < mpi_work; j++) {
      for (i = 0; j + mpi_work * mpi_rank < m && i < m; i++) {
        if (b[j][i] > omp_local_max) omp_local_max = b[j][i];
      }
    }
    #pragma omp critical
    {
      if (omp_local_max > local_max) {
        local_max = omp_local_max;
      }
    }
  }
  MPI_Reduce(&local_max, &global_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  free(diag);
  free(b[0]);  free(b);
  free(bt[0]); free(bt);
  free(z[0]);  free(z);

  double wall_end = MPI_Wtime();   // take the timestamp before MPI is finalized
  MPI_Finalize();

  if (mpi_rank == 0) {
    printf(" umax = %e, time = %.3fs \n", global_max, wall_end - wall_start);
    printf(" mpi_size = %d, omp_max_threads = %d, n = %d\n", mpi_size, omp_tot_threads, n);
  }
}
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int nprocs, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  int numthreads = omp_get_max_threads();

  if (argc < 2) {
    printf("Usage:\n");
    printf(" poisson n\n\n");
    printf("Arguments:\n");
    printf(" n: the problem size (must be a power of 2)\n");
    MPI_Finalize();
    return 1;
  }

  double time_start;
  if (rank == 0) {
    time_start = MPI_Wtime();
  }

  // The number of grid points in each direction is n+1
  // The number of degrees of freedom in each direction is n-1 = m
  int n = atoi(argv[1]);
  int m = n - 1;
  int nn = 4 * n;
  real h = 1.0 / n;

  // Splitting the matrix into columns:
  int exact = n / nprocs;
  int rem = m - (nprocs - 1) * exact;
  // Each process owns a column strip of size m*exact (or m*rem for the last
  // process). Each strip is viewed as 'nprocs' blocks stacked vertically.
  int block_col = exact;
  int block_uk = exact * exact;
  int rem_uk = exact * rem;
  // The last strip has rem columns instead:
  if (rank == nprocs - 1) {
    block_col = rem;
    block_uk = rem * exact;
    rem_uk = rem * rem;
  }

  // Grid points
  real *grid = mk_1D_array(n + 1, false);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < n + 1; i++) {
    grid[i] = i * h;
  }

  // The diagonal of the eigenvalue matrix of T
  real *diag = mk_1D_array(m, false);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < m; i++) {
    diag[i] = 2.0 * (1.0 - cos((i + 1) * PI / n));
  }

  // Initialize the right-hand-side data.
  // B is the column strip owned by this process.
  real **B = mk_2D_array(block_col, m, false);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    for (size_t j = 0; j < m; j++) {
      B[i][j] = h * h * rhs(grid[i + 1 + (rank * exact)], grid[j + 1]);
    }
  }

  // Work buffers for the sine transform (one per thread):
  real **z = mk_2D_array(numthreads, nn, false);

  // Calculate Btilde^T = S^-1 * (S * B)^T
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    fst_(B[i], &n, z[omp_get_thread_num()], &nn);
  }
  transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
  }

  // Solve Lambda * Xtilde = Btilde
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    for (size_t j = 0; j < m; j++) {
      B[i][j] = B[i][j] / (diag[i + (rank * exact)] + diag[j]);
    }
  }

  // Calculate X = S^-1 * (S * Xtilde^T)^T
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    fst_(B[i], &n, z[omp_get_thread_num()], &nn);
  }
  transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < block_col; i++) {
    fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
  }

  // Calculate the maximal value of the solution and the maximal error
  double U_max = 0.0, e_max = 0.0, global_max, global_emax, error;
  for (size_t i = 0; i < block_col; i++) {
    for (size_t j = 0; j < m; j++) {
      error = fabs(B[i][j] - sin(PI * (i + 1 + (rank * exact)) * h) * sin(2 * PI * (j + 1) * h));
      U_max = U_max > B[i][j] ? U_max : B[i][j];
      e_max = e_max > error ? e_max : error;
    }
  }

  // MPI_MAX reductions to find the true maxima:
  MPI_Reduce(&U_max, &global_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&e_max, &global_emax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  // Print the global maxima on process 0:
  if (rank == 0) {
    printf("Problem Size = %d\tNumprocs = %d\tNumthreads = %d\n", n, nprocs, numthreads);
    printf("U_max = %0.16f\n", global_max);
    printf("E_max = %0.16f\n", global_emax);
    double duration = MPI_Wtime() - time_start;
    printf("Execution Time: %0.16f \n", duration);
  }

  MPI_Finalize();
  return 0;
}
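/* The listing above builds the right-hand side through rhs(x, y) and checks the
 * result against u(x, y) = sin(PI*x) * sin(2*PI*y). A minimal sketch of an rhs
 * consistent with that check, since -Laplace(u) = 5*PI^2 * sin(PI*x) * sin(2*PI*y);
 * the body is hypothetical, and PI and the 'real' typedef are assumed to come
 * from the same headers the listing already uses. */
real rhs(real x, real y)
{
  return 5.0 * PI * PI * sin(PI * x) * sin(2.0 * PI * y);
}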
void runPoisson(int rank, int size, int n)
{
  double time = MPI_Wtime();
  Real **b, *diag, *RecvBuf, *z, h, maxError;
  int i, j, m, nn, *len, *disp;

  m = n - 1;
  nn = 4 * n;

  splitVector(m, size, &len, &disp);
  diag = createRealArray(m);
  b = createReal2DArray(len[rank], m);
  z = createRealArray(nn);
  h = 1. / (Real)n;

  #pragma omp parallel for schedule(static)
  for (i = 0; i < m; i++) {
    diag[i] = 2. * (1. - cos((i + 1) * M_PI / (Real)n));
  }

  #pragma omp for
  for (j = 0; j < len[rank]; j++) {
    #pragma omp parallel for schedule(static)
    for (i = 0; i < m; i++) {
      Real x = (Real)(j + 1 + disp[rank]) / n;
      Real y = (Real)(i + 1) / n;
      b[j][i] = h * h * funcf(x, y);
    }
  }

  #pragma omp parallel for schedule(static)
  for (j = 0; j < len[rank]; j++) {
    Real *zt = createRealArray(nn);
    fst_(b[j], &n, zt, &nn);
    free(zt);
  }

  transpose(b, size, len, disp, rank, m);

  #pragma omp parallel for schedule(static)
  for (i = 0; i < len[rank]; i++) {
    Real *zt = createRealArray(nn);
    fstinv_(b[i], &n, zt, &nn);
    free(zt);
  }

  #pragma omp for
  for (j = 0; j < len[rank]; j++) {
    #pragma omp parallel for schedule(static)
    for (i = 0; i < m; i++) {
      b[j][i] = b[j][i] / (diag[i] + diag[j + disp[rank]]);
    }
  }

  #pragma omp parallel for schedule(static)
  for (i = 0; i < len[rank]; i++) {
    Real *zt = createRealArray(nn);
    fst_(b[i], &n, zt, &nn);
    free(zt);
  }

  transpose(b, size, len, disp, rank, m);

  #pragma omp parallel for schedule(static)
  for (j = 0; j < len[rank]; j++) {
    Real *zt = createRealArray(nn);
    fstinv_(b[j], &n, zt, &nn);
    free(zt);
  }

  if (rank == 0) {
    RecvBuf = createRealArray(m * m);
  }
  gatherMatrix(b, m, RecvBuf, len, disp, 0);

  if (rank == 0) {
    for (int j = 0; j < m; j++) {
      for (int i = 0; i < m; i++) {
        printf("%e %e %e \n", (Real)i / m, (Real)j / m, RecvBuf[j * m + i]);
      }
    }
  }
}
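/* runPoisson relies on a splitVector helper that is not shown. A minimal sketch
 * of what the call site implies (split m items as evenly as possible over 'size'
 * ranks and return freshly allocated count and displacement arrays); the real
 * helper may differ in details. */
void splitVector(int m, int size, int **len, int **disp)
{
  *len  = (int *) calloc(size, sizeof(int));
  *disp = (int *) calloc(size, sizeof(int));
  for (int p = 0; p < size; p++) {
    (*len)[p] = m / size + (p < m % size ? 1 : 0);           /* spread the remainder */
    if (p > 0) (*disp)[p] = (*disp)[p - 1] + (*len)[p - 1];  /* prefix sum of counts */
  }
}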
int main(int argc, char **argv)
{
  Real *diag, **b, **bt, *z;
  Real pi, h, umax;
  int i, j, n, m, nn;

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees of freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */
  if (argc < 2) {
    printf("need a problem size\n");
    return 1;
  }

  n = atoi(argv[1]);
  m = n - 1;
  nn = 4 * n;

  diag = createRealArray(m);
  b  = createReal2DArray(m, m);
  bt = createReal2DArray(m, m);
  z  = createRealArray(nn);

  h = 1. / (Real)n;
  pi = 4. * atan(1.);

  for (i = 0; i < m; i++) {
    diag[i] = 2. * (1. - cos((i + 1) * pi / (Real)n));
  }
  for (j = 0; j < m; j++) {
    for (i = 0; i < m; i++) {
      b[j][i] = h * h;
    }
  }

  for (j = 0; j < m; j++) {
    fst_(b[j], &n, z, &nn);
  }
  transpose(bt, b, m);
  for (i = 0; i < m; i++) {
    fstinv_(bt[i], &n, z, &nn);
  }

  for (j = 0; j < m; j++) {
    for (i = 0; i < m; i++) {
      bt[j][i] = bt[j][i] / (diag[i] + diag[j]);
    }
  }

  for (i = 0; i < m; i++) {
    fst_(bt[i], &n, z, &nn);
  }
  transpose(b, bt, m);
  for (j = 0; j < m; j++) {
    fstinv_(b[j], &n, z, &nn);
  }

  umax = 0.0;
  for (j = 0; j < m; j++) {
    for (i = 0; i < m; i++) {
      if (b[j][i] > umax) umax = b[j][i];
    }
  }
  printf(" umax = %e \n", umax);
  return 0;
}
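/* The serial reference above (and several of the MPI versions) allocates its
 * arrays with createRealArray and createReal2DArray, which are not shown. The
 * pattern free(b[0]); free(b); used elsewhere implies one contiguous data block
 * plus a table of row pointers. A minimal sketch under that assumption (Real is
 * assumed to be a typedef for double in the shared header): */
Real *createRealArray(int n)
{
  return (Real *) calloc(n, sizeof(Real));                        /* zero-initialized */
}

Real **createReal2DArray(int rows, int cols)
{
  Real **a = (Real **) malloc(rows * sizeof(Real *));
  a[0] = (Real *) calloc((size_t)rows * cols, sizeof(Real));      /* contiguous block */
  for (int i = 1; i < rows; i++) a[i] = a[0] + (size_t)i * cols;  /* row pointers */
  return a;
}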
int main(int argc, char **argv)
{
  init_mpi(argc, argv, rank, grid_size, dims, coords, periods);
  if (grid_size == 0) grid_size = 1;   /* for debugging with only 1 processor */

  if (argc < 2) {
    printf("need a problem size\n");
    return 1;
  }

  double pi, h, l_umax;
  int n, m, nn, i, j, cols, l_diag_rows;
  n = atoi(argv[1]);
  m = n - 1;
  nn = 4 * n;

  struct Array *len = newArray(grid_size, 1);
  struct Array *displ = newArray(grid_size, 1);
  cols = m / grid_size;            /* number of columns of b per processor */
  l_diag_rows = m / grid_size;

  struct Array *diag = newArray(m, 1);
  struct Array *l_diag = newArray(l_diag_rows, 1);
  struct Array *b  = newArray(m, cols);
  struct Array *bt = newArray(m, cols);
  struct Array *z  = newArray(nn, 1);
  createDatatype(b, &grid_size);

  h = 1. / (double)n;
  pi = 4. * atan(1.);

  /*struct timeval start, end;
  gettimeofday(&start,NULL); */

  for (i = 0; i < diag->rows; i++) {
    diag->data[i] = 2. * (1. - cos((i + 1) * pi / (double)n));
  }

  /*
  #pragma omp for schedule(static)
  for (i = 0 + rank*l_diag->rows; i < ((rank+1)*l_diag->rows); i++) {
    *(l_diag->data+i) = 2.*(1.-cos((i+1)*pi/(double)n));
  }
  MPI_Allgather(l_diag->data, l_diag->rows, MPI_DOUBLE, diag->data, diag->size, MPI_DOUBLE, cart_comm);
  */

  #pragma omp parallel for schedule(static)
  for (i = 0; i < m; i++) {
    b->data[i] = h * h;
  }

  #pragma omp parallel for schedule(static)
  for (i = 0; i < b->cols; i++) {
    fst_(b->data + b->rows * i, &b->rows, z->data, &nn);
  }

  /* Transpose b by sending/receiving rows of the columns in each local b;
     e.g. (l_b_0 = {1,2,a,b}) + (l_b_1 = {3,4,c,d}) -> b_all = {1,a,3,c,2,b,4,d} */
  MPI_Alltoall(b->data, grid_size, transpose_select_t, bt->data, 1, transpose_insert_t, cart_comm);

  #pragma omp parallel for schedule(static)
  for (i = 0; i < m; i++) {
    fstinv_(bt->data + i * bt->rows, &bt->rows, z->data, &nn);
  }

  #pragma omp parallel for schedule(static) private(i)
  for (j = 0; j < m; j++) {
    for (i = 0; i < cols; i++) {
      *(bt->data + j * bt->rows + i) =
          *(bt->data + j * bt->rows + i) / (*(diag->data + i) + *(diag->data + j));
    }
  }

  #pragma omp parallel for schedule(static)
  for (i = 0; i < cols; i++) {
    fst_(bt->data + i * b->rows, &bt->rows, z->data, &nn);
  }

  MPI_Alltoall(bt->data, grid_size, transpose_select_t, b->data, 1, transpose_insert_t, cart_comm);

  #pragma omp parallel for schedule(static)
  for (i = 0; i < m; i++) {
    fstinv_(b->data + i * b->rows, &b->rows, z->data, &nn);
  }

  l_umax = 0.0;
  #pragma omp parallel for schedule(static) reduction(max:l_umax)
  for (i = 0; i < b->size; i++) {
    if (l_umax < *(b->data + i)) l_umax = *(b->data + i);
  }

  /* the gather is a collective call, so every rank must take part in it */
  struct Array *umaxArray = newArray(grid_size, 1);
  MPI_Gather(&l_umax, 1, MPI_DOUBLE, umaxArray->data, 1, MPI_DOUBLE, 0, cart_comm);
  if (rank == 0) {
    double umax = 0.0;
    #pragma omp parallel for schedule(static) reduction(max:umax)
    for (i = 0; i < grid_size; i++) {
      if (umax < *(umaxArray->data + i)) umax = *(umaxArray->data + i);
    }
    printf(" umax = %e \n", umax);
    /*gettimeofday(&end,NULL);
    print_time(start,end); */
  }

  freeArray(b);
  freeArray(bt);
  freeArray(l_diag);
  freeArray(diag);
  freeArray(z);
  freeDatatype();
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  int size, rank, number;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  double start = MPI_Wtime();      // start the timer
  double umaxglob = 0;             // maximum error over all ranks

  if (argc < 2) {
    printf("Usage:\n");
    printf(" poisson n\n\n");
    printf("Arguments:\n");
    printf(" n: the problem size (must be a power of 2)\n");
    MPI_Finalize();
    return 1;
  }

  // The number of grid points in each direction is n+1
  // The number of degrees of freedom in each direction is n-1
  int n = atoi(argv[1]);
  int m = n - 1;                                            // number of points in each direction of B

  int *cnt = (int *) malloc(size * sizeof(int));            // local number of columns per process
  int *displs = (int *) malloc((size+1) * sizeof(int));     // displacement of each process's points in sendbuf
  displs[size] = m;
  displs[0] = 0;                                            // the first process always starts at 0
  int overflow = m % size;                                  // columns left over after an even split
  for (int i = 0; i < size; i++) {
    cnt[i] = m / size;                                      // columns per process
    if (overflow != 0) {
      cnt[i]++;                                             // distribute the extra columns
      overflow--;
    }
    if (i < size-1) {
      displs[i+1] = displs[i] + cnt[i];
    }
  }
  int nrColon = cnt[rank];                                  // number of columns owned by this rank
  int pros_dof = nrColon * m;                               // number of elements owned by this rank

  int nn = 4 * n;
  double h = 1.0 / n;

  // Grid points
  double *grid = mk_1D_array(n+1, false);
  double **b  = mk_2D_array(nrColon, m, false);
  double **bt = mk_2D_array(nrColon, m, false);
  int trad = omp_get_max_threads();                         // number of threads
  double **z = mk_2D_array(trad, nn, false);                // z is 2D so each OpenMP thread gets its own FST work buffer
  double *diag = mk_1D_array(m, false);
  double *sendbuf = mk_1D_array(nrColon * m, false);
  double *recbuf  = mk_1D_array(nrColon * m, false);
  int *sendcnt = (int *) malloc((size+1) * sizeof(int));    // number of elements to send to each process
  int *sdispls = (int *) malloc((size+1) * sizeof(int));    // offset into sendbuf for each process
  sdispls[0] = 0;                                           // process 0 always starts at index 0
  for (int i = 0; i < size; i++) {
    sendcnt[i] = cnt[i] * cnt[rank];                        // elements I own * elements it owns
    sdispls[i] = displs[i] * cnt[rank];                     // displacement for each process
  }

  // GRID
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < n+1; i++) {
    grid[i] = i * h;
  }

  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < m; i++) {
    diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n));            // eigenvalues
  }

  // Initialize the right hand side data
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < nrColon; i++) {
    for (size_t j = 0; j < m; j++) {
      // b[i][j] = h * h;
      b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]);  // evaluate the RHS function, scaled by h*h
    }
  }

  // Calculate Btilde^T = S^-1 * (S * B)^T
  #pragma omp parallel for schedule(guided, 5)
  for (size_t i = 0; i < nrColon; i++) {
    fst_(b[i], &n, z[omp_get_thread_num()], &nn);
  }
  MPItranspose(b, bt, nrColon, m, sendbuf, recbuf, sendcnt, sdispls, size, rank, displs);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < nrColon; i++) {
    fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
  }

  // Solve Lambda * Xtilde = Btilde
  #pragma omp parallel for schedule(static)
  for (int j = 0; j < nrColon; j++) {
    for (int i = 0; i < m; i++) {
      bt[j][i] /= (diag[j+displs[rank]] + diag[i]);
    }
  }

  // Calculate X = S^-1 * (S * Xtilde^T)
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < nrColon; i++) {
    fst_(bt[i], &n, z[omp_get_thread_num()], &nn);
  }
  MPItranspose(bt, b, nrColon, m, sendbuf, recbuf, sendcnt, sdispls, size, rank, displs);
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < nrColon; i++) {
    fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
  }

  // Calculate maximal value of solution
  double u_max = 0.0, temp;
  #pragma omp parallel for schedule(static) private(temp) reduction(max:u_max)
  for (size_t i = 0; i < nrColon; i++) {
    for (size_t j = 0; j < m; j++) {
      temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]);  // compare against the known solution; should be 0
      if (temp > u_max) {
        u_max = temp;
      }
    }
  }
  MPI_Reduce(&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);  // largest u_max over all processes

  if (rank == 0) {
    printf("Nodes = %d \n", size);
    printf("Threads per node = %d \n", omp_get_max_threads());
    printf("u_max = %e\n", umaxglob);                        // print the maximum error
    double times = MPI_Wtime() - start;                      // stop the timer
    printf("Time elapsed = %1.16f \n", times);               // print elapsed time
  }
  MPI_Finalize();
  return 0;
}
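/* MPItranspose is not shown in the listing above. A minimal sketch of a
 * pack / MPI_Alltoallv / unpack transpose consistent with the arguments used at
 * the call site (the buffer and count names are taken from the caller; the
 * packing order is an assumption, so the author's version may differ): */
void MPItranspose(double **b, double **bt, int nrColon, int m,
                  double *sendbuf, double *recbuf, int *sendcnt, int *sdispls,
                  int size, int rank, int *displs)
{
  /* pack: the rows of my strip that belong to rank p go into block p of sendbuf */
  for (int p = 0; p < size; p++) {
    int cols_p = displs[p+1] - displs[p];        /* columns owned by rank p */
    for (int i = 0; i < nrColon; i++)
      for (int j = 0; j < cols_p; j++)
        sendbuf[sdispls[p] + i*cols_p + j] = b[i][displs[p] + j];
  }
  /* the exchange is symmetric, so the same counts/displacements work both ways */
  MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPI_DOUBLE,
                recbuf,  sendcnt, sdispls, MPI_DOUBLE, MPI_COMM_WORLD);
  /* unpack: block p now holds rank p's columns of the transposed strip */
  for (int p = 0; p < size; p++) {
    int cols_p = displs[p+1] - displs[p];
    for (int i = 0; i < cols_p; i++)
      for (int j = 0; j < nrColon; j++)
        bt[j][displs[p] + i] = recbuf[sdispls[p] + i*nrColon + j];
  }
}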
int main(int argc, char **argv)
{
  Real *diag, **b, **bt, **z;
  Real pi, h, umax, f;
  int i, j, n, m, nn, numthreads, currentthread;

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees of freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */
  if (argc < 2) {
    printf("need a problem size\n");
    return 1;
  }

  Real starttime, endtime, runtime, maxtime, mintime, temptime, avgruntime;
  Real timer1, timer2, timer3, timer4, timer5, timer6, timer7, timer8;

  int size, rank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  n = atoi(argv[1]);
  m = n - 1;
  nn = 4 * n;

#ifdef HAVE_OPENMP
  numthreads = omp_get_max_threads();
#else
  numthreads = 1;
#endif

  // --- Distribute work ---
  int *gsize;
  int *ssize;
  gsize = (int *)malloc(size * sizeof(int));
  ssize = (int *)malloc(size * sizeof(int));
  int remain = m % size;
  ssize[0] = 0;
  for (int q = 0; q < size; ++q) {
    gsize[q] = m / size;
    if (q < remain) gsize[q]++;
    if (q < (size-1)) ssize[q+1] = ssize[q] + gsize[q];
  }
  // -----------------------

  diag = createRealArray(m);
  b  = createReal2DArray(gsize[rank], m);
  bt = createReal2DArray(gsize[rank], m);
  z  = createReal2DArray(numthreads, nn);   // TODO: do something here

  h = 1. / (Real)n;
  pi = 4. * atan(1.);

  starttime = MPI_Wtime();

  #pragma omp parallel for schedule(static)
  for (i = 0; i < m; i++) {
    diag[i] = 2. * (1. - cos((i+1) * pi / (Real)n));
  }

  #pragma omp parallel for private(i, f) schedule(static)
  for (j = 0; j < gsize[rank]; j++) {
    for (i = 0; i < m; i++) {
      f = 5*pi*pi*sin(pi*h*(j+ssize[rank]+1))*sin(2*pi*h*(i+1));
      //f = 1.0;
      //f = 2*( ((j+1+ssize[rank])*h)*((j+1+ssize[rank])*h) + (i+1)*(i+1)*h*h );
      b[j][i] = h*h*f;
      //b[j][i] = (j+ssize[rank])*m+i;   // transpose test
    }
  }
  timer1 = MPI_Wtime();

  // ---------- DEBUG PART ---------------
  /*if (rank == 0) { printVector(b[0],m*gsize[rank]); } */
  /*
  if (rank == 0) { printf("\n\n\n"); printMatrix(b,m,gsize[rank]); printf("\n\n\n"); }
  transpose (bt,b,&m,m,size,rank,gsize,ssize);
  if (rank == 0) { printMatrix(bt,m,gsize[rank]); printf("\n\n\n"); }
  */
  // ----------- DEBUG END ---------------

  // Alternatively, place the #ifdef outside the loop and use two separate loops
  #pragma omp parallel for private(currentthread) schedule(static)
  for (j = 0; j < gsize[rank]; j++) {
    currentthread = 0;
#ifdef HAVE_OPENMP
    currentthread = omp_get_thread_num();
#endif
    fst_(b[j], &n, z[currentthread], &nn);
  }
  timer2 = MPI_Wtime();

  /* if (rank == 0) { printf("\n\n\n"); printMatrix(b,m); } */
  transpose(bt, b, &m, m, size, rank, gsize, ssize);
  timer3 = MPI_Wtime();
  /* if (rank == 0) { printf("\n\n\n"); printMatrix(bt,m); printf("\n\n\n"); } */

  #pragma omp parallel for private(currentthread) schedule(static)
  for (i = 0; i < gsize[rank]; i++) {
    currentthread = 0;
#ifdef HAVE_OPENMP
    currentthread = omp_get_thread_num();
#endif
    fstinv_(bt[i], &n, z[currentthread], &nn);
  }
  timer4 = MPI_Wtime();

  #pragma omp parallel for private(i) schedule(static)
  for (j = 0; j < gsize[rank]; j++) {
    for (i = 0; i < m; i++) {
      bt[j][i] = bt[j][i] / (diag[i] + diag[j+ssize[rank]]);   // offset implemented
    }
  }
  timer5 = MPI_Wtime();

  #pragma omp parallel for private(currentthread) schedule(static)
  for (i = 0; i < gsize[rank]; i++) {
    currentthread = 0;
#ifdef HAVE_OPENMP
    currentthread = omp_get_thread_num();
#endif
    fst_(bt[i], &n, z[currentthread], &nn);
  }
  timer6 = MPI_Wtime();

  transpose(b, bt, &m, m, size, rank, gsize, ssize);
  timer7 = MPI_Wtime();

  #pragma omp parallel for private(currentthread) schedule(static)
  for (j = 0; j < gsize[rank]; j++) {
    currentthread = 0;
#ifdef HAVE_OPENMP
    currentthread = omp_get_thread_num();
#endif
    fstinv_(b[j], &n, z[currentthread], &nn);
  }
  timer8 = MPI_Wtime();

  endtime = MPI_Wtime();
  runtime = endtime - starttime;
  timer8 -= timer7; timer7 -= timer6; timer6 -= timer5; timer5 -= timer4;
  timer4 -= timer3; timer3 -= timer2; timer2 -= timer1; timer1 -= starttime;

  MPI_Allreduce(&runtime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&runtime, &mintime, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&runtime, &avgruntime, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  avgruntime /= (Real) size;
  // use MPI_IN_PLACE when the send and receive buffers are the same variable
  MPI_Allreduce(MPI_IN_PLACE, &timer1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer3, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer4, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer5, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer6, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer7, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer8, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  timer1 /= (Real)size; timer2 /= (Real)size; timer3 /= (Real)size; timer4 /= (Real)size;
  timer5 /= (Real)size; timer6 /= (Real)size; timer7 /= (Real)size; timer8 /= (Real)size;

  if (rank == 0) printf("Slowest runtime: %e s\n", maxtime);
  if (rank == 0) printf("Fastest runtime: %e s\n", mintime);
  if (rank == 0) printf("Average runtime: %e s\n", avgruntime);
  if (rank == 0) printf("Init: \t\t %f p.c.\n", timer1/avgruntime);
  if (rank == 0) printf("FST1: \t\t %f p.c.\n", timer2/avgruntime);
  if (rank == 0) printf("Transpose1: \t %f p.c.\n", timer3/avgruntime);
  if (rank == 0) printf("FSTINV1: \t %f p.c.\n", timer4/avgruntime);
  if (rank == 0) printf("Diag: \t\t %f p.c.\n", timer5/avgruntime);
  if (rank == 0) printf("FST2: \t\t %f p.c.\n", timer6/avgruntime);
  if (rank == 0) printf("Transpose2: \t %f p.c.\n", timer7/avgruntime);
  if (rank == 0) printf("FSTINV2: \t %f p.c.\n", timer8/avgruntime);
  if (rank == 0) printf("\nFST TOTAL: \t\t %f p.c.\t%es\n", (timer2+timer4+timer6+timer8)/avgruntime, timer2+timer4+timer6+timer8);
  if (rank == 0) printf("\nTranspose TOTAL: \t %f p.c.\t%es\n", (timer3+timer7)/avgruntime, timer3+timer7);

  umax = 0.0;
  for (j = 0; j < gsize[rank]; j++) {
    for (i = 0; i < m; i++) {
      if (b[j][i] > umax) umax = b[j][i];
    }
  }
  MPI_Allreduce(MPI_IN_PLACE, &umax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  if (rank == 0) printf(" umax = %e \n", umax);
  //printVector(b[m/4],m);

  Real locerr, err;
  locerr = checkError(b, m, h, size, rank, gsize, ssize);
  MPI_Allreduce(&locerr, &err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  if (rank == 0) printf("\nMaximal error: %e\n", err);
  //printMatrix(b,m,m);

  /* free memory */
  free(b[0]);  free(bt[0]);
  free(b);     free(bt);
  free(z[0]);  free(z);
  free(diag);
  free(gsize); free(ssize);

  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  Real *diag, **A, *z;
  Real pi, h, umax, globalumax, emax, globalemax, error, time;
  int i, j, n, m, nn, b, re, l, bb, bre, rank, size;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees of freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */
  if (argc < 2) {
    printf("need a problem size\n");
    return 0;
  }

  n = atoi(argv[1]);
  m = n - 1;
  nn = 4 * n;
  h = 1. / (Real)n;
  pi = 4. * atan(1.);

  b = floor(m / size);
  re = m - (size-1) * b;
  l = b;
  bb = b * b;
  bre = b * re;
  if (rank + 1 == size) {
    l = re;
    bb = bre;
    bre = re * re;
  }

  diag = createRealArray(m);
  A = createReal2DArray(l, m);
  z = createRealArray(nn);

  time = MPI_Wtime();

  for (i = 0; i < m; i++) {
    diag[i] = 2. * (1. - cos((i+1) * pi / (Real)n));
  }
  for (j = 0; j < l; j++) {
    for (i = 0; i < m; i++) {
      // h^2 * f(x,y)
      A[j][i] = h*h*5*pi*pi*sin(pi*i*h)*sin(2*pi*(j + rank*b)*h);
    }
  }

  for (j = 0; j < l; j++) {
    fst_(A[j], &n, z, &nn);
  }
  transpose(A, l, m, size, bb, bre);
  for (i = 0; i < l; i++) {
    fstinv_(A[i], &n, z, &nn);
  }

  for (j = 0; j < l; j++) {
    for (i = 0; i < m; i++) {
      A[j][i] = A[j][i] / (diag[i] + diag[j + rank*b]);
    }
  }

  for (i = 0; i < l; i++) {
    fst_(A[i], &n, z, &nn);
  }
  transpose(A, l, m, size, bb, bre);
  for (j = 0; j < l; j++) {
    fstinv_(A[j], &n, z, &nn);
  }

  umax = 0.0;
  emax = 0.0;
  for (j = 0; j < l; j++) {
    for (i = 0; i < m; i++) {
      // error = | numerical u(x,y) - exact u(x,y) |
      error = fabs(A[j][i] - sin(pi*i*h)*sin(2*pi*(j + rank*b)*h));
      if (A[j][i] > umax) umax = A[j][i];
      if (error > emax) emax = error;
    }
  }

  MPI_Reduce(&umax, &globalumax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&emax, &globalemax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if (rank == 0) {
    printf("elapsed: %f\n", MPI_Wtime() - time);
    printf("umax = %e \n", globalumax);
    printf("emax = %e \n", globalemax);
  }

  MPI_Finalize();
  return 0;
}