int main(int argc, char *argv[]) { gaspi_rank_t nprocs, n; gaspi_number_t max_groups, gsize; gaspi_rank_t *partners; TSUITE_INIT(argc, argv); gaspi_group_max(&max_groups); ASSERT (gaspi_proc_init(GASPI_BLOCK)); ASSERT(gaspi_proc_num(&nprocs)); ASSERT(gaspi_group_size(GASPI_GROUP_ALL, &gsize)); partners = malloc(gsize * sizeof(gaspi_rank_t)); ASSERT(gaspi_group_ranks(GASPI_GROUP_ALL, partners)); for(n = 0; n < gsize; n++) { assert(partners[n] == n); } ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); ASSERT (gaspi_proc_term(GASPI_BLOCK)); return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { int i, j; gaspi_number_t gsize; int comm_state = WORKING; int num_failures = 0; int timesteps = 0; ASSERT (gaspi_proc_init(GASPI_BLOCK)); ASSERT (gaspi_proc_rank(&myrank)); ASSERT (gaspi_proc_num(&numprocs)); read_params(argc, argv, ×teps, &numprocs_idle); numprocs_working = numprocs - numprocs_idle; numprocs_working_and_idle = numprocs_working + numprocs_idle; gaspi_rank_t *comm_main_ranks = malloc( numprocs_idle * sizeof(gaspi_rank_t)); init_array_2(comm_main_ranks, numprocs_working); /* contains info of all processes: which are working(0), broken(1) and idle(2). keeps updated all the time(iterations) */ int * status_processes = (int *) malloc(numprocs * sizeof(int)); init_array_3(status_processes, numprocs, WORKING); for(i = numprocs-1, j=0; j < numprocs_idle;--i,++j) { status_processes[i] = IDLE; // putting last processes to IDLE } // ===== GASPI group creation ===== if(status_processes[myrank]==WORKING) { ASSERT(gaspi_group_create(&COMM_MAIN)); gaspi_number_t i; for(i=0; i<numprocs; i++) { if(status_processes[i]==WORKING) { ASSERT(gaspi_group_add(COMM_MAIN, i)); ASSERT(gaspi_group_size(COMM_MAIN, &gsize)); } } ASSERT(gaspi_group_ranks (COMM_MAIN, comm_main_ranks)); ASSERT(gaspi_group_commit (COMM_MAIN, GASPI_BLOCK)); } /* ====== Init a SYNC FLAGS Segment ====== */ /* used to communicate the WORKING, BROKEN, or FINISHED_WORK status between the working and idle processes. */ gaspi_size_t SYNC_global_mem_size; SYNC_global_mem_size = numprocs * sizeof(int); gaspi_pointer_t gm_ptr_sync=NULL; ASSERT(init_segment (gm_seg_sync_flags_id, SYNC_global_mem_size)); ASSERT(gaspi_segment_ptr (gm_seg_sync_flags_id, &gm_ptr_sync)); int * sync_flags = (int *) gm_ptr_sync; init_array_3(sync_flags, numprocs, WORKING); /* ====== Init a health check write FLAGS Segment ====== */ /* This array is used to send the gaspi_write message write before health_chk routine, which will then update the gaspi internal health vector */ gaspi_size_t health_chk_global_mem_size; health_chk_global_mem_size = numprocs*sizeof(int); gaspi_pointer_t gm_ptr_health_chk=NULL; ASSERT(init_segment (gm_seg_health_chk_array_id, health_chk_global_mem_size)); ASSERT(gaspi_segment_ptr (gm_seg_health_chk_array_id, &gm_ptr_health_chk)); gaspi_state_vector_t health_vec = (gaspi_state_vector_t) malloc(numprocs); ASSERT(gaspi_state_vec_get(health_vec)); gaspi_rank_t * avoid_list= (gaspi_rank_t *) malloc(numprocs * sizeof(gaspi_rank_t)); for(i = 0;i < numprocs; ++i) avoid_list[i] = (gaspi_rank_t) 0; gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK); /* ===== TIME-STEP LOOP ===== */ if(status_processes[myrank]==IDLE) { /* IDLE processes remain in this loop */ while(1) { gaspi_printf("%d.", myrank); if(sync_flags[0] == WORKING) { /* NO FAILURE REPORTED */ usleep(1000000); } if(sync_flags[0] == BROKEN) { /* FAILURE REPORTED */ gaspi_printf("myrank: %d Broken reported\n", myrank); comm_state=BROKEN; break; } if(sync_flags[0] == WORKFINISHED) { /* WORKFINISHED REPORTED */ gaspi_printf("myrank: %d WorkFinished reported\n", myrank); comm_state = WORKFINISHED; break; } } } int time_step; for(time_step=1; time_step <= timesteps && comm_state!=WORKFINISHED; time_step++) { gaspi_printf("== time_step: %d ==\n", time_step); if(comm_state==WORKING && status_processes[myrank]==WORKING) { gaspi_barrier(COMM_MAIN, GASPI_TIMEOUT_TIME); sleep(1); // NOTE: this is the work section. if(time_step == 5 && myrank== 1) { exit (-1); } } if(time_step<timesteps ) { send_global_msg_to_check_state(health_vec, avoid_list); num_failures = check_comm_health(status_processes, health_vec); gaspi_printf("%d NUM_FAILURES at timestep %d = %d\n", myrank, time_step, num_failures); if( num_failures != 0 ) { rescue_process = numprocs_working; if(myrank==0) { // message the IDLE process sync_flags[0]=BROKEN; for(i = 0 ; i < num_failures ; ++i) { /* TODO: multiple failures at the same time. */ gaspi_printf("messaging rescue_process: %d\n", rescue_process); ASSERT(gaspi_write(gm_seg_sync_flags_id, 0, rescue_process, gm_seg_sync_flags_id, 0, sizeof(int), 0, GASPI_BLOCK)); rescue_process++; } } if(myrank==0 || myrank==rescue_process) gaspi_printf("%d REPAIRING COMM_MAIN FLAG 1\n", myrank); update_status_processes_array(status_processes, health_vec); numprocs_working_and_idle = refresh_numprocs_working_and_idle(status_processes); if(myrank != rescue_process) { ASSERT(gaspi_group_delete(COMM_MAIN)); ASSERT(recover()); } ASSERT(gaspi_group_create(&COMM_MAIN_NEW)); for(i = 0; i < numprocs; i++) { if(status_processes[i]==WORKING) { ASSERT(gaspi_group_add(COMM_MAIN_NEW, i)); ASSERT(gaspi_group_size(COMM_MAIN_NEW, &gsize)); if(gsize == numprocs_working) break; } } gaspi_printf("%d: COMM_MAIN_NEW size is: %hi\n", myrank, gsize); ASSERT(gaspi_group_commit (COMM_MAIN_NEW, GASPI_BLOCK)); init_array_2(comm_main_ranks, numprocs_working); ASSERT(gaspi_group_ranks (COMM_MAIN_NEW, comm_main_ranks)); gaspi_printf("printing group_ranks_main: \n"); gaspi_printf_array(comm_main_ranks, numprocs_working); comm_state = WORKING; gaspi_printf("%d REPAIRING COMM_MAIN_NEW FLAG 2\n", myrank); if(status_processes[myrank] == WORKING) { ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK)); ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK)); } /* set things to work again */ COMM_MAIN = COMM_MAIN_NEW; time_step = 5; } } } if(myrank == 0) { gaspi_printf("finished successfully\n"); } gaspi_proc_term(10000); return EXIT_SUCCESS; }