int main(int argc, char *argv[])
{
    // First, initialize MPI
    int rank, nb_nodes;
    int return_code;

    return_code = MPI_Init(&argc, &argv);
    if (return_code != MPI_SUCCESS) {
        printf("Failed to initialize MPI\n");
        MPI_Abort(MPI_COMM_WORLD, return_code);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);

    // Check the arguments
    if (!rank && argc != 4) {
        printf("Usage: %s <N> <init-value> <error>\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // Set up the arguments (matrix size, initial value, error tolerance).
    // n and m are the dimensions of the local subdomain; the size of the
    // global matrix is kept in N.
    int n = atoi(argv[1]);
    int N = n;
    int m = n;
    n = (n - 2) / nb_nodes + 2;
    double w = atof(argv[2]);
    double e = atof(argv[3]);

    // Check that the number of processes divides N-2 ('-2' because the
    // fixed top and bottom boundary rows are not distributed).
    if (!rank && (N - 2) % nb_nodes != 0) {
        printf("The number of processors must divide the size of the matrix - 2\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // Allocate the matrices
    double **prev_m = NULL;
    double **new_m = init_matrix(n, m, w, rank, nb_nodes);

    // Local and global error
    double it_error = 0.0;
    double global_error = 0.0;

    double t1, t2;
    t1 = MPI_Wtime();

    int itnb = 0;
    do {
        itnb++;
        if (prev_m != NULL)
            free_matrix(prev_m, n);
        prev_m = new_m;
        new_m = copy_matrix(prev_m, n, m);

        compute(new_m, n, m);
        exchange_halo(new_m, n, m, rank, nb_nodes);

        /* compute_red(new_m, n, m); */
        /* exchange_red(new_m, n, m, rank, nb_nodes); */
        /* compute_black(new_m, n, m); */
        /* exchange_black(new_m, n, m, rank, nb_nodes); */

        // Every 30 iterations, check for convergence
        if (itnb % 30 == 0) {
            it_error = error(prev_m, new_m, n, m);
            MPI_Reduce(&it_error, &global_error, 1, MPI_DOUBLE, MPI_SUM, 0,
                       MPI_COMM_WORLD);
            MPI_Bcast(&global_error, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
            if (global_error <= e)
                break;
        }
    } while (1);

    // Gather all the data on node 0.
    // Send size = (number of rows - halo rows) * number of columns.
    int send_size = (n - 2) * m;
    int result_size = send_size * nb_nodes;
    double *send = (double *)malloc(sizeof(double) * send_size);
    double *result = NULL;
    if (!rank) {
        printf("Number of iterations: %d\n", itnb);
        result = (double *)malloc(sizeof(double) * result_size);
    }

    // Some DEBUG prints
    /* printf("End partial matrix of %d\n", rank); */
    /* print_matrix(new_m, n, m); */

    // Pack the interior rows (without the halo rows) into a contiguous buffer
    for (int i = 1; i < n - 1; i++)
        for (int j = 0; j < m; j++)
            send[(i - 1) * m + j] = new_m[i][j];

    t2 = MPI_Wtime();

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Gather(send, send_size, MPI_DOUBLE, result, send_size, MPI_DOUBLE, 0,
               MPI_COMM_WORLD);
    free(send);

    // Node 0 rebuilds the global matrix and prints the data
    if (!rank) {
        printf("Execution Time: %1.2lf\n", t2 - t1);
        /* printf("Result\n"); */
        /* print_array(result, result_size); */
        double **global_matrix = init_matrix_from_array(result, N);
        free(result);
        print_data_matrix(argv[0], nb_nodes, global_matrix, N);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}
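/*
 * The helper routines called above (init_matrix, compute, exchange_halo,
 * error, ...) are not part of this listing. As an illustration, here is a
 * minimal sketch of what exchange_halo could look like for this 1-D row
 * decomposition; the signature and the MPI_Sendrecv-based scheme are
 * assumptions, not the actual implementation.
 */
void exchange_halo(double **mat, int n, int m, int rank, int nb_nodes)
{
    // Neighbours in the 1-D decomposition; MPI_PROC_NULL turns the
    // exchanges at the domain boundaries into no-ops.
    int up   = (rank == 0)            ? MPI_PROC_NULL : rank - 1;
    int down = (rank == nb_nodes - 1) ? MPI_PROC_NULL : rank + 1;

    // Send the first interior row up, receive the bottom halo row from below
    MPI_Sendrecv(mat[1],     m, MPI_DOUBLE, up,   0,
                 mat[n - 1], m, MPI_DOUBLE, down, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    // Send the last interior row down, receive the top halo row from above
    MPI_Sendrecv(mat[n - 2], m, MPI_DOUBLE, down, 1,
                 mat[0],     m, MPI_DOUBLE, up,   1,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}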
void op_par_loop_adt_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2,
                          op_arg arg3, op_arg arg4, op_arg arg5)
{
    int nargs = 6;
    op_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
    int ninds = 1;
    int inds[6] = {0, 0, 0, 0, -1, -1};
    int sent[6] = {0, 0, 0, 0, 0, 0};

    if (ninds > 0) // indirect loop
    {
        for (int i = 0; i < nargs; i++) {
            if (args[i].argtype == OP_ARG_DAT) {
                if (OP_diags == 1)
                    reset_halo(args[i]);
                sent[i] = exchange_halo(args[i]);
                if (sent[i] == 1)
                    wait_all(args[i]);
            }
        }
    }

    if (OP_diags > 2) {
        printf(" kernel routine with indirection: adt_calc \n");
    }

    // get plan
#ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
#else
    int part_size = OP_part_size;
#endif

    op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    // initialise timers
    double cpu_t1, cpu_t2, wall_t1, wall_t2;
    op_timers(&cpu_t1, &wall_t1);

    // set number of threads
#ifdef _OPENMP
    int nthreads = omp_get_max_threads();
#else
    int nthreads = 1;
#endif

    // execute plan, one block colour at a time; blocks of the same colour
    // share no indirectly accessed data and can run in parallel
    int block_offset = 0;
    for (int col = 0; col < Plan->ncolors; col++) {
        int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
        for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
            op_x86_adt_calc(blockIdx,
                            (double *)arg0.data,
                            Plan->ind_maps[0],
                            Plan->loc_maps[0],
                            Plan->loc_maps[1],
                            Plan->loc_maps[2],
                            Plan->loc_maps[3],
                            (double *)arg4.data,
                            (double *)arg5.data,
                            Plan->ind_sizes,
                            Plan->ind_offs,
                            block_offset,
                            Plan->blkmap,
                            Plan->offset,
                            Plan->nelems,
                            Plan->nthrcol,
                            Plan->thrcol);
        block_offset += nblocks;
    }

    // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
    for (int i = 0; i < nargs; i++)
        if (args[i].argtype == OP_ARG_DAT)
            set_dirtybit(args[i]);

    // perform any global operations
    // - NONE

    // update kernel record
    op_timers(&cpu_t2, &wall_t2);
    op_timing_realloc(1);
    OP_kernels[1].name = name;
    OP_kernels[1].count += 1;
    OP_kernels[1].time += wall_t2 - wall_t1;
    OP_kernels[1].transfer += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
}
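/*
 * For context, a hedged example of how this generated wrapper could be
 * invoked from the application. The set, map and dat names (cells, pcell,
 * p_x, p_q, p_adt) follow the standard OP2 airfoil demo and are assumptions
 * with respect to this listing; the argument pattern matches
 * inds[6] = {0,0,0,0,-1,-1}: four reads indirect through one map, then
 * two direct accesses.
 */
void example_adt_calc_call(op_set cells, op_map pcell,
                           op_dat p_x, op_dat p_q, op_dat p_adt)
{
    op_par_loop_adt_calc("adt_calc", cells,
                         op_arg_dat(p_x,   0, pcell, 2, "double", OP_READ),
                         op_arg_dat(p_x,   1, pcell, 2, "double", OP_READ),
                         op_arg_dat(p_x,   2, pcell, 2, "double", OP_READ),
                         op_arg_dat(p_x,   3, pcell, 2, "double", OP_READ),
                         op_arg_dat(p_q,  -1, OP_ID, 4, "double", OP_READ),
                         op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_WRITE));
}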
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
                          op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7)
{
    int nargs = 8;
    op_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};
    int ninds = 4;
    int inds[8] = {0, 0, 1, 1, 2, 2, 3, 3};
    int sent[8] = {0, 0, 0, 0, 0, 0, 0, 0}; // records whether a halo exchange was started

    if (ninds > 0) // indirect loop
    {
        for (int i = 0; i < nargs; i++) {
            if (args[i].argtype == OP_ARG_DAT) {
                if (OP_diags == 1)
                    reset_halo(args[i]);
                // Start the halo exchange but do not wait for it yet: the
                // communication overlaps with computation on the core set below
                sent[i] = exchange_halo(args[i]);
                //if(sent[i] == 1) wait_all(args[i]);
            }
        }
    }

    if (OP_diags > 2) {
        printf(" kernel routine with indirection: res_calc \n");
    }

    // get plan
    op_plan *Plan;

#ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
#else
    int part_size = OP_part_size;
#endif

    // get offsets: core elements touch no halo data, so they can be processed
    // while the halo exchanges are still in flight
    int core_len = core_num[set->index];
    int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len;

    double cpu_t1, cpu_t2, wall_t1, wall_t2;

    // process the core set
    if (core_len > 0) {
        if (OP_latency_sets[set->index].core_set == NULL) {
            op_set core_set = (op_set)malloc(sizeof(op_set_core));
            core_set->index = set->index;
            core_set->name = set->name;
            core_set->size = core_len;
            core_set->exec_size = 0;
            core_set->nonexec_size = 0;
            OP_latency_sets[set->index].core_set = core_set;
        }

        Plan = op_plan_get_offset(name, OP_latency_sets[set->index].core_set,
                                  0, part_size, nargs, args, ninds, inds);

        op_timers_core(&cpu_t1, &wall_t1);

        // set number of threads
#ifdef _OPENMP
        int nthreads = omp_get_max_threads();
#else
        int nthreads = 1;
#endif

        // execute plan
        int block_offset = 0;
        for (int col = 0; col < Plan->ncolors; col++) {
            int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
            for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
                op_x86_res_calc(blockIdx,
                                (double *)arg0.data, Plan->ind_maps[0],
                                (double *)arg2.data, Plan->ind_maps[1],
                                (double *)arg4.data, Plan->ind_maps[2],
                                (double *)arg6.data, Plan->ind_maps[3],
                                Plan->loc_maps[0], Plan->loc_maps[1],
                                Plan->loc_maps[2], Plan->loc_maps[3],
                                Plan->loc_maps[4], Plan->loc_maps[5],
                                Plan->loc_maps[6], Plan->loc_maps[7],
                                Plan->ind_sizes,
                                Plan->ind_offs,
                                block_offset,
                                Plan->blkmap,
                                Plan->offset,
                                Plan->nelems,
                                Plan->nthrcol,
                                Plan->thrcol);
            block_offset += nblocks;
        }

        op_timers_core(&cpu_t2, &wall_t2);
        OP_kernels[2].time += wall_t2 - wall_t1;
        OP_kernels[2].transfer += Plan->transfer;
        OP_kernels[2].transfer2 += Plan->transfer2;
    }

    // wait for the halo exchanges started above to complete
    if (ninds > 0) // indirect loop
    {
        for (int i = 0; i < nargs; i++) {
            if (args[i].argtype == OP_ARG_DAT) {
                if (sent[i] == 1)
                    wait_all(args[i]);
            }
        }
    }

    // process the non-core set (elements that touch halo data)
    if (noncore_len > 0) {
        if (OP_latency_sets[set->index].noncore_set == NULL) {
            op_set noncore_set = (op_set)malloc(sizeof(op_set_core));
            noncore_set->size = noncore_len;
            noncore_set->name = set->name;
            noncore_set->index = set->index;
            noncore_set->exec_size = 0;
            noncore_set->nonexec_size = 0;
            OP_latency_sets[set->index].noncore_set = noncore_set;
        }

        Plan = op_plan_get_offset(name, OP_latency_sets[set->index].noncore_set,
                                  core_len, part_size, nargs, args, ninds, inds);

        op_timers_core(&cpu_t1, &wall_t1);

        // set number of threads
#ifdef _OPENMP
        int nthreads = omp_get_max_threads();
#else
        int nthreads = 1;
#endif

        // execute plan
        int block_offset = 0;
        for (int col = 0; col < Plan->ncolors; col++) {
            int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
            for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
                op_x86_res_calc(blockIdx,
                                (double *)arg0.data, Plan->ind_maps[0],
                                (double *)arg2.data, Plan->ind_maps[1],
                                (double *)arg4.data, Plan->ind_maps[2],
                                (double *)arg6.data, Plan->ind_maps[3],
                                Plan->loc_maps[0], Plan->loc_maps[1],
                                Plan->loc_maps[2], Plan->loc_maps[3],
                                Plan->loc_maps[4], Plan->loc_maps[5],
                                Plan->loc_maps[6], Plan->loc_maps[7],
                                Plan->ind_sizes,
                                Plan->ind_offs,
                                block_offset,
                                Plan->blkmap,
                                Plan->offset,
                                Plan->nelems,
                                Plan->nthrcol,
                                Plan->thrcol);
            block_offset += nblocks;
        }

        op_timers_core(&cpu_t2, &wall_t2);
        OP_kernels[2].time += wall_t2 - wall_t1;
        OP_kernels[2].transfer += Plan->transfer;
        OP_kernels[2].transfer2 += Plan->transfer2;
    }

    // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
    for (int i = 0; i < nargs; i++)
        if (args[i].argtype == OP_ARG_DAT)
            set_dirtybit(args[i]);

    // perform any global operations
    // - NONE

    // update kernel record
    op_timing_realloc(2);
    OP_kernels[2].name = name;
    OP_kernels[2].count += 1;
}
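/*
 * Similarly, a hedged example of a res_calc invocation consistent with
 * inds[8] = {0,0,1,1,2,2,3,3} (four datasets, each accessed through two
 * indirections); the names again follow the OP2 airfoil demo and are
 * assumptions with respect to this listing. Note the OP_INC accesses on
 * p_res, which is why the non-core elements must wait for the halo
 * exchange before being processed.
 */
void example_res_calc_call(op_set edges, op_map pedge, op_map pecell,
                           op_dat p_x, op_dat p_q, op_dat p_adt, op_dat p_res)
{
    op_par_loop_res_calc("res_calc", edges,
                         op_arg_dat(p_x,   0, pedge,  2, "double", OP_READ),
                         op_arg_dat(p_x,   1, pedge,  2, "double", OP_READ),
                         op_arg_dat(p_q,   0, pecell, 4, "double", OP_READ),
                         op_arg_dat(p_q,   1, pecell, 4, "double", OP_READ),
                         op_arg_dat(p_adt, 0, pecell, 1, "double", OP_READ),
                         op_arg_dat(p_adt, 1, pecell, 1, "double", OP_READ),
                         op_arg_dat(p_res, 0, pecell, 4, "double", OP_INC),
                         op_arg_dat(p_res, 1, pecell, 4, "double", OP_INC));
}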