int main(int argc, char *argv[]) { int i, j; // grid indexes int max_iterations; // number of iterations int iteration=1; // current iteration double dt=100; // largest change in t struct timeval start_time, stop_time, elapsed_time; // timers printf("Maximum iterations [100-4000]?\n"); scanf("%d", &max_iterations); gettimeofday(&start_time,NULL); // Unix timer initialize(); // initialize Temp_last including boundary conditions // do until error is minimal or until max steps while ( dt > MAX_TEMP_ERROR && iteration <= max_iterations ) { // main calculation: average my four neighbors // do parellelization here #pragma acc kernels #pragma omp parallel for private (i,j) for(i = 1; i <= ROWS; i++) { for(j = 1; j <= COLUMNS; j++) { Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] + Temperature_last[i][j+1] + Temperature_last[i][j-1]); } } dt = 0.0; // reset largest temperature change // copy grid to old grid for next iteration and find latest dt // try parallelization here #pragma acc kernels #pragma omp parallel for private (i,j) reduction(max:dt) for(i = 1; i <= ROWS; i++){ for(j = 1; j <= COLUMNS; j++){ dt = fmax( fabs(Temperature[i][j]-Temperature_last[i][j]), dt); Temperature_last[i][j] = Temperature[i][j]; } } // periodically print test values if((iteration % 100) == 0) { track_progress(iteration); } iteration++; } gettimeofday(&stop_time,NULL); timersub(&stop_time, &start_time, &elapsed_time); // Unix time subtract routine printf("\nMax error at iteration %d was %f\n", iteration-1, dt); printf("Total time was %f seconds.\n", elapsed_time.tv_sec+elapsed_time.tv_usec/1000000.0); }
int main(int argc, char *argv[]) { int i, j; int max_iterations; int iteration=1; double dt; struct timeval start_time, stop_time, elapsed_time; int npes; // number of PEs int my_PE_num; // my PE number double dt_global=100; // delta t across all PEs MPI_Status status; // status returned by MPI calls // the usual MPI startup routines MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_PE_num); MPI_Comm_size(MPI_COMM_WORLD, &npes); // verify only NPES PEs are being used if (npes != NPES) { if (my_PE_num == 0) printf("This code must be run with %d PEs\n", NPES); MPI_Finalize(); exit(1); } // PE 0 asks for input // `max_iterations` undefined on all other PEs! if (my_PE_num == 0) { printf("Maximum iterations [100-4000]?\n"); fflush(stdout); scanf("%d", &max_iterations); } // bcast max iterations to other PEs MPI_Bcast(&max_iterations, 1, MPI_INT, 0, MPI_COMM_WORLD); if (my_PE_num==0) gettimeofday(&start_time,NULL); initialize(npes, my_PE_num); while ( dt_global > MAX_TEMP_ERROR && iteration <= max_iterations ) { // main calculation: average my four neighbors for(i = 1; i <= ROWS; i++) { for(j = 1; j <= COLUMNS; j++) { Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] + Temperature_last[i][j+1] + Temperature_last[i][j-1]); } } // COMMUNICATION PHASE: send and receive ghost rows for next iteration /* There is a top and a bottom ghost row for all PEs, except for: 0: only a bottom npes-1: only a top send Temperature, receive Temperature_last COLUMNS, not COLUMNS+2 all the indexing issues seem to stem from (mis)understanding that the boundary conditions don't change */ if (my_PE_num != 0) { // send second row to my_PE_num-1 // receive first row from my_PE_num-1 MPI_Send(&Temperature[1][1], COLUMNS, MPI_DOUBLE, my_PE_num-1, 99, MPI_COMM_WORLD); MPI_Recv(&Temperature_last[0][1], COLUMNS, MPI_DOUBLE, my_PE_num-1, MPI_ANY_TAG, MPI_COMM_WORLD, &status); } if (my_PE_num != npes-1) { // send second to last row to my_PE_num+1 // receive last row 
from my_PE_num+1 MPI_Send(&Temperature[ROWS][1], COLUMNS, MPI_DOUBLE, my_PE_num+1, 99, MPI_COMM_WORLD); MPI_Recv(&Temperature_last[ROWS+1][1], COLUMNS, MPI_DOUBLE, my_PE_num+1, MPI_ANY_TAG, MPI_COMM_WORLD, &status); } dt = 0.0; for(i = 1; i <= ROWS; i++){ for(j = 1; j <= COLUMNS; j++){ dt = fmax( fabs(Temperature[i][j]-Temperature_last[i][j]), dt); Temperature_last[i][j] = Temperature[i][j]; } } // find global dt MPI_Reduce(&dt, &dt_global, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Bcast(&dt_global, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); // periodically print test values - only for PE in lower corner if((iteration % 100) == 0) { if (my_PE_num == npes-1){ track_progress(iteration); } if (my_PE_num == 2) { printf(" Global coord [750, 900] is: %f\n", Temperature[250][900]); fflush(stdout); } output(my_PE_num, iteration); } iteration++; } // Slightly more accurate timing and cleaner output MPI_Barrier(MPI_COMM_WORLD); // PE 0 finish timing and output values if (my_PE_num==0){ gettimeofday(&stop_time,NULL); timersub(&stop_time, &start_time, &elapsed_time); printf("\nMax error at iteration %d was %f\n", iteration-1, dt_global); printf("Total time was %f seconds.\n", elapsed_time.tv_sec+elapsed_time.tv_usec/1000000.0); } MPI_Finalize(); }
/**
 * Builds the IR for a comparison stage: a nested loop that calls the user
 * function for pairs of elements drawn from the two stage inputs.
 *
 * Steps, in the order they appear below:
 *   1. Create the stage and user functions and a "start" block that
 *      initialises the return index.
 *   2. Build the loop nest, either as two loops (no tiling) or as four loops
 *      (tiling), optionally wrapped in a timer.
 *   3. Fill the innermost body: user-function arguments, the call itself,
 *      processing of the call result, and any per-iteration extras.
 *   4. Optionally split the outer loop for progress tracking, and optionally
 *      parallelize it.
 *   5. Append field deletion and the stage-finish block after the loop, then
 *      insert the start block into the stage function body.
 *
 * Preconditions (asserted): progress tracking is not combined with tiling,
 * and loop timing is not combined with parallelization.
 */
void ComparisonStageIR::build_stage() {

    // tiling and progress tracking cannot be enabled together
    assert(!is_tiled() || (is_tiled() && !track_progress()));
    // timer is only allowed for serial loops (just use it to get avg iterations per second or something like that)
    assert(!time_loop() || (time_loop() && !is_parallelized()));

    set_stage_function(create_stage_function());
    set_user_function(create_user_function());

    // stuff before the loop
    // build the return idx
    MVar *loop_start = new MVar(MScalarType::get_long_type()); // don't make a constant b/c it should be updateable
    loop_start->register_for_delete();
    MStatement *set_loop_start = new MStatement(loop_start, MVar::create_constant<long>(0));
    set_loop_start->register_for_delete();
    MStatement *set_result = new MStatement(get_return_idx(), loop_start);
    set_result->register_for_delete();

    set_start_block(new MBlock("start"));
    get_start_block()->register_for_delete();
    get_start_block()->add_expr(set_loop_start);
    get_start_block()->add_expr(set_result);

    // When we don't parallelize, then make the inner loop's index outside of both the loops rather than within
    // the outer loop. This is a hack for llvm because if we have an alloca call within each iteration of the outer loop,
    // we will be "leaking" stack space each time that is called, so moving it outside of the loop prevents that.
    // However, it makes it hard to work with when we then parallelize because the code sees that inner loop index as a
    // free variable that needs to be added to the closure. This is not fun because our index is now a pointer to an index
    // and then we would need to update the index by going through the pointer, etc. Basically, it would cause some hacks on the
    // LLVM side (and unless this becomes something that is needed in the future, I don't want to deal with it).
    // So instead, it is dealt with below. Without parallelization, the inner loop index is initialized outside of the
    // nested loop, and then updated to the correct start right before the inner loop begins execution.
    // When parallelization is turned on, the inner loop index is made INSIDE the outer loop. This is because the
    // parallelized outer loop calls a function every iteration which is the outer loop body, and then within that the
    // inner loop is created. alloca is scoped at the function level, so the inner loop index gets a single alloca
    // in this function call, and then the inner loop is created.
    // This may not be required of other possible back-end languages that we choose, but it will depend on their scoping rules.
    //
    // TL;DR LLVM has function scoping for allocainst, so if we create the inner loop index as so
    //   val outer_index...
    //   for outer_index...
    //     val inner_index...
    //     for inner_index...
    // every iteration of the outer loop adds space to the stack which isn't released until the function ends. So we want
    //   val outer_index...
    //   val inner_index...
    //   for outer_index...
    //     for inner_index...
    MVar *inner_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());

    MBlock *preallocation_block = create_preallocator();
    get_start_block()->add_expr(preallocation_block);

    // The timer is always allocated; it is only attached to the start block when time_loop() is true.
    MTimer *timer = nullptr;
    timer = new MTimer();
    timer->register_for_delete();

    // Loop skeletons. The *_2 loops are only created in the tiled branch and stay nullptr otherwise.
    MFor *outer_loop_skeleton_1 = nullptr;
    MFor *inner_loop_skeleton_1 = nullptr;
    MFor *outer_loop_skeleton_2 = nullptr;
    MFor *inner_loop_skeleton_2 = nullptr;
    MBlock *inner_loop_body = nullptr;

    // think of all comparisons as being in an NxM matrix where N is the left input and M is the right input.
    // N is the outermost iteration
    // NOTE(review): both tile sizes are hard-coded to 2 here.
    tile_size_N = MVar::create_constant<long>(2);
    tile_size_M = MVar::create_constant<long>(2);

    // Assigned in both branches of the track_progress() check further down.
    MVar *final_loop_bound;

    if (!is_tiled() || !is_tileable()) { // No tiling

        // To make sure that the inner loop doesn't get replaced with a different bound if parallelizing, copy
        // the bound to a different variable and use that
        MVar *bound_copy = new MVar(MScalarType::get_long_type());
        bound_copy->register_for_delete();
        MStatement *set_copy = new MStatement(bound_copy, get_stage_function()->/*get_args()*/get_loaded_args()[3]);
        set_copy->register_for_delete();
        get_start_block()->add_expr(set_copy);

        // loop components
        MVar *outer_loop_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_skeleton_1 = create_stage_for_loop(outer_loop_start, MVar::create_constant<long>(1),
                                                      get_stage_function()->/*get_args()*/get_loaded_args()[1],
                                                      false, get_start_block());
        if (is_parallelizable() && is_parallelized()) {
            outer_loop_skeleton_1->set_exec_type(PARALLEL);
        }

        // Choose where the inner loop starts: at 0 when there is a left/right input and commutativity
        // is not forced, otherwise at (outer index + 1).
        MVar *_inner_start = nullptr;
        if ((left_input || right_input) && !_force_commutative) {
            _inner_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        } else {
            MAdd *add = new MAdd(outer_loop_skeleton_1->get_loop_index(), MVar::create_constant<long>(1));
            outer_loop_skeleton_1->get_body_block()->add_expr(add);
            add->register_for_delete();
            _inner_start = add->get_result();
        }

        // attach the outer loop directly, or inside the timer when timing is requested
        if (!time_loop()) {
            get_start_block()->add_expr(outer_loop_skeleton_1);
        } else {
            get_start_block()->add_expr(timer);
            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
        }

        // reset the shared inner index to its start value at the top of every outer iteration
        MStatement *set_inner_start = new MStatement(inner_start, _inner_start);
        set_inner_start->register_for_delete();
        outer_loop_skeleton_1->get_body_block()->add_expr(set_inner_start);

        MBlock *temp_block = new MBlock();
        temp_block->register_for_delete();
        inner_loop_skeleton_1 = create_stage_for_loop(inner_start, MVar::create_constant<long>(1), bound_copy, true,
                                                      temp_block);
        // TODO hack, need to add the loop index initialization before the outer loop, but we have to add the outer loop before this since
        // the inner_start depends on the outer loop
        // NOTE(review): the index is size() - 2; confirm against insert_at's semantics that this lands
        // directly before the outer loop (or timer) as intended.
        get_start_block()->insert_at(temp_block, get_start_block()->get_exprs().size() - 2); // insert right before the outer loop

        // stuff for calling the user function in the loop
        inner_loop_body = inner_loop_skeleton_1->get_body_block();

    } else if (is_tiled() && is_tileable()) { // tiling

        // loop components
        MDiv *_outer_1_bound = new MDiv(get_stage_function()->/*get_args()*/get_loaded_args()[1], tile_size_N);
        _outer_1_bound->register_for_delete();
        MDiv *_inner_1_bound = new MDiv(get_stage_function()->/*get_args()*/get_loaded_args()[3], tile_size_M);
        _inner_1_bound->register_for_delete();
        // compensate for when the number of elements isn't a multiple of the tile size
        MAdd *outer_1_bound = new MAdd(_outer_1_bound->get_result(), MVar::create_constant<long>(1));
        outer_1_bound->register_for_delete();
        MAdd *inner_1_bound = new MAdd(_inner_1_bound->get_result(), MVar::create_constant<long>(1));
        inner_1_bound->register_for_delete();
        get_start_block()->add_expr(_outer_1_bound);
        get_start_block()->add_expr(_inner_1_bound);
        get_start_block()->add_expr(outer_1_bound);
        get_start_block()->add_expr(inner_1_bound);

        // one start index per loop level, all initialised to 0 in the start block
        MVar *outer_loop_start_1 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_start_1->override_name("outer_loop_start_1");
        MVar *inner_loop_start_1 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        inner_loop_start_1->override_name("inner_loop_start_1");
        MVar *outer_loop_start_2 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_start_2->override_name("outer_loop_start_2");
        MVar *inner_loop_start_2 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        inner_loop_start_2->override_name("inner_loop_start_2");

        // n = 0 to N/tile_size_N + 1
        outer_loop_skeleton_1 = create_stage_for_loop(outer_loop_start_1, MVar::create_constant<long>(1),
                                                      outer_1_bound->get_result(), true, get_start_block());
        outer_loop_skeleton_1->override_name("outer_loop_skeleton1");
        // Disabled: the outer loop is attached after all four skeletons exist (see below).
        //        if (!time_loop()) {
        //            get_start_block()->add_expr(outer_loop_skeleton_1);
        //        } else {
        //            get_start_block()->add_expr(timer);
        //            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
        //        }

        // m = 0 to M/tile_size_M + 1
        inner_loop_skeleton_1 = create_stage_for_loop(inner_loop_start_1, MVar::create_constant<long>(1),
                                                      inner_1_bound->get_result(), true, get_start_block());
        inner_loop_skeleton_1->override_name("inner_loop_skeleton1");

        // nn = 0 to tile_size_N
        outer_loop_skeleton_2 = create_stage_for_loop(outer_loop_start_2, MVar::create_constant<long>(1),
                                                      tile_size_N, true, get_start_block());
        outer_loop_skeleton_2->override_name("outer_loop_skeleton2");

        // mm = 0 to tile_size_M
        inner_loop_skeleton_2 = create_stage_for_loop(inner_loop_start_2, MVar::create_constant<long>(1),
                                                      tile_size_M, true, get_start_block());
        inner_loop_skeleton_2->override_name("inner_loop_skeleton2");

        // attach the outermost loop directly, or inside the timer when timing is requested
        if (!time_loop()) {
            get_start_block()->add_expr(outer_loop_skeleton_1);
        } else {
            get_start_block()->add_expr(timer);
            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
        }

        // nest the tile loops: inner_1 -> outer_2 -> inner_2.
        // (inner_1 is placed inside outer_1 later, in the progress-tracking if/else.)
        inner_loop_skeleton_1->get_body_block()->add_expr(outer_loop_skeleton_2);
        outer_loop_skeleton_2->get_body_block()->add_expr(inner_loop_skeleton_2);

        inner_loop_body = inner_loop_skeleton_2->get_body_block();
    }

    // Build the arguments for the user function; user_arg_block is filled in through the out-parameter.
    MBlock *user_arg_block;
    std::vector<MVar *> args = create_user_function_inputs(&user_arg_block, outer_loop_skeleton_1, outer_loop_skeleton_2,
                                                           inner_loop_skeleton_1, inner_loop_skeleton_2,
                                                           nullptr, false, nullptr, nullptr,
                                                           get_stage_function()->/*get_args()*/get_loaded_args()[1],
                                                           get_stage_function()->/*get_args()*/get_loaded_args()[3]);
    if (!is_tiled() || !is_tileable()) {
        inner_loop_body->add_expr(user_arg_block);
    } // if tiled, this is already added in the create_user_function_inputs
    // from here on, everything for the innermost body is appended to the argument block
    inner_loop_body = user_arg_block;

    // remember where the bucket check has to be inserted (just before the user call)
    int bucket_idx = inner_loop_body->get_exprs().size();
    MFunctionCall *call = call_user_function(get_user_function(), args);
    inner_loop_body->add_expr(call);

    // handle the output of the user call
    MBlock *processed_call = process_user_function_call(call, NULL, false);
    inner_loop_body->add_expr(processed_call);

    // do any other postprocessing needed in the loop before the next iteration
    MBlock *extra = loop_extras();
    inner_loop_body->add_expr(extra);

    if (track_progress() && !is_parallelized()) {
        // still return the original loop bound
        MBlock *temp = new MBlock();
        temp->register_for_delete();
        final_loop_bound = outer_loop_skeleton_1->get_loop_bound();
        outer_loop_skeleton_1->get_body_block()->add_expr(inner_loop_skeleton_1);
        inner_loop_body->insert_at(apply_buckets(args[0], args[1],
                                                 inner_loop_skeleton_2 ? inner_loop_skeleton_2 : inner_loop_skeleton_1),
                                   bucket_idx);
        std::pair<MFor *, MFor *> splits =
                ProgressTracker::create_progress_tracker(outer_loop_skeleton_1, inner_loop_skeleton_1,
                                                         get_num_tracking_splits(), temp, true);

        // find the original outer_loop_skeleton_1 in the block and remove it. Then replace with the new one in splits.first
        // NOTE(review): if the loop is not found, idx equals the expression count when remove_at is called.
        int idx = 0;
        if (!time_loop()) {
            for (std::vector<MExpr *>::const_iterator iter = get_start_block()->get_exprs().cbegin();
                 iter != get_start_block()->get_exprs().cend(); iter++) {
                if (*iter == outer_loop_skeleton_1) {
                    break;
                }
                idx++;
            }
            get_start_block()->remove_at(idx);
        } else {
            for (std::vector<MExpr *>::const_iterator iter = timer->get_timer_block()->get_exprs().cbegin();
                 iter != timer->get_timer_block()->get_exprs().cend(); iter++) {
                if (*iter == outer_loop_skeleton_1) {
                    break;
                }
                idx++;
            }
            timer->get_timer_block()->remove_at(idx);
        }
        outer_loop_skeleton_1 = splits.first; // do the replacement
        // outer_loop_skeleton_1 added to temp block in the progress tracker function
        if (!time_loop()) {
            get_stage_function()->add_body_block(temp);
        } else {
            timer->get_timer_block()->insert_at(temp, idx);
        }
    } else {
        outer_loop_skeleton_1->get_body_block()->add_expr(inner_loop_skeleton_1);
        final_loop_bound = outer_loop_skeleton_1->get_loop_bound();
        inner_loop_body->insert_at(apply_buckets(args[0], args[1],
                                                 inner_loop_skeleton_2 ? inner_loop_skeleton_2 : inner_loop_skeleton_1),
                                   bucket_idx);
    }

    // modify this loop if it needs to be parallelized
    if (is_parallelizable() && is_parallelized()) {
        parallelize_main_loop(get_start_block(), outer_loop_skeleton_1, inner_loop_skeleton_1);
    }
    // Disabled: the tile loops are already nested inside the tiled branch above.
    //    if (is_tiled() && is_tileable()) {
    //        inner_loop_skeleton_1->get_body_block()->add_expr(outer_loop_skeleton_2);
    //        outer_loop_skeleton_2->get_body_block()->add_expr(inner_loop_skeleton_2);
    //    }

    // postprocessing after the outer loop is done (no postprocessing needed after the inner loop since it just goes back to the outer loop)
    MBlock *after_loop = time_loop() ? timer->get_after_timer_block() : outer_loop_skeleton_1->get_end_block();
    MBlock *finished = finish_stage(nullptr, final_loop_bound);
    MBlock *deletion = delete_fields();
    after_loop->add_expr(deletion);
    after_loop->add_expr(finished);

    get_stage_function()->insert_body_block_at(get_start_block(), 1); // insert before the temp block, which would have been added if doing tracking. Insert after the stage arg loading though.
    // the temp block has the loop now, so it can't come before everything else
}