int main(int argc, char *argv[]) { TSUITE_INIT(argc, argv); ASSERT (gaspi_proc_init(GASPI_BLOCK)); gaspi_rank_t nprocs; ASSERT(gaspi_proc_num(&nprocs)); ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); gaspi_state_vector_t vec = NULL; EXPECT_FAIL(gaspi_state_vec_get(vec)); vec = (gaspi_state_vector_t) malloc(nprocs); ASSERT(gaspi_state_vec_get(vec)); int i; for(i = 0; i < nprocs; i++) { assert(vec[i] == GASPI_STATE_HEALTHY); } ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); ASSERT (gaspi_proc_term(GASPI_BLOCK)); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { gaspi_rank_t nprocs, i; TSUITE_INIT(argc, argv); ASSERT (gaspi_proc_init(GASPI_BLOCK)); ASSERT(gaspi_proc_num(&nprocs)); ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); gaspi_state_vector_t vec = (gaspi_state_vector_t) malloc(nprocs); gaspi_printf("vec out %p\n", vec); ASSERT(gaspi_state_vec_get(vec)); gaspi_printf("vec out %p\n", vec); for(i = 0; i < nprocs; i++) { assert(vec[i] == 0); } ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); ASSERT (gaspi_proc_term(GASPI_BLOCK)); return EXIT_SUCCESS; }
/*
 * Refreshes health_vec through gaspi_state_vec_get and logs every entry.
 *
 * health_vec: caller-provided buffer; numprocs entries are read from it
 *             (numprocs and myrank are file-level variables).
 *
 * Each log line is prefixed with the calling process's own rank (myrank),
 * not with the index of the entry being printed.
 */
void print_health_vec(gaspi_state_vector_t health_vec)
{
  int entry = 0;

  ASSERT(gaspi_state_vec_get(health_vec));

  while (entry < numprocs)
  {
    gaspi_printf("%d_health is: %d \n", myrank, health_vec[entry]);
    ++entry;
  }
}
/*
 * Issues a gaspi_write to every rank not marked in avoid_list, waits on the
 * queue, refreshes health_vec and then marks every rank whose state entry
 * equals 1 in avoid_list, so that later calls skip it. Finally prints the
 * health vector.
 *
 * health_vec: caller-provided buffer, refreshed via gaspi_state_vec_get.
 * avoid_list: one entry per rank; an entry equal to 1 means "do not write
 *             to this rank". Updated in place.
 *
 * The return value of gaspi_wait is not checked here.
 */
void send_global_msg_to_check_state(gaspi_state_vector_t health_vec, gaspi_rank_t *avoid_list)
{
  /* Detecting several simultaneous failures would need more than one
   * round of health checks; currently a single round is performed. */
  const int check_rounds = 1;
  const gaspi_timeout_t wait_timeout = GASPI_BLOCK;
  int round;
  int rank;

  gaspi_printf("Checking global health state\n");

  for (round = 0; round < check_rounds; ++round)
  {
    /* Write to every rank that is not already marked for avoidance. */
    for (rank = 0; rank < numprocs; ++rank)
    {
      if (avoid_list[rank] == 1)
      {
        continue;
      }

      ASSERT(gaspi_write(gm_seg_health_chk_array_id, myrank, rank,
                         gm_seg_health_chk_array_id, myrank, sizeof(int),
                         queue_id, wait_timeout));
    }

    gaspi_wait(queue_id, wait_timeout);

    ASSERT(gaspi_state_vec_get(health_vec));

    /* Remember ranks flagged in the state vector so that no health-check
     * message is sent to them next time. */
    for (rank = 0; rank < numprocs; ++rank)
    {
      if (health_vec[rank] == 1)
      {
        avoid_list[rank] = 1;
      }
    }
  }

  print_health_vec(health_vec);
}
int main(int argc, char *argv[]) { gaspi_rank_t nprocs, myrank, i; int j, n; gaspi_rank_t *avoid_list; gaspi_group_t survivors; TSUITE_INIT(argc, argv); ASSERT (gaspi_proc_init(GASPI_BLOCK)); ASSERT(gaspi_proc_num(&nprocs)); ASSERT(gaspi_proc_rank(&myrank)); ASSERT(gaspi_segment_create(0, _4MB, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED)); avoid_list = (gaspi_rank_t *) malloc(nprocs * sizeof(gaspi_rank_t)); assert (avoid_list != NULL); memset(avoid_list, 0, nprocs * sizeof(gaspi_rank_t)); gaspi_state_vector_t vec = (gaspi_state_vector_t) malloc(nprocs); ASSERT(gaspi_state_vec_get(vec)); //check that everyone is healthy for(i = 0; i < nprocs; i++) { assert(vec[i] == 0); } //sync ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK)); //now last rank dies if(myrank == nprocs - 1) exit(-1); else { //create group of survivors ASSERT(gaspi_group_create(&survivors)); for(i = 0; i < nprocs - 1; i++) ASSERT(gaspi_group_add(survivors, i)); ASSERT(gaspi_group_commit(survivors, GASPI_BLOCK)); gaspi_printf("Done with groups\n"); sleep(2); } //the others communicate gaspi_return_t retval; for(j = 0; j < 10; j++) { gaspi_printf("Iteration %d\n", j); for(i = 0; i < nprocs; i++) { if( avoid_list[i] != 1 ) ASSERT(gaspi_write(0, 0, i, 0, 0, sizeof(int), 0, GASPI_BLOCK)); } retval = gaspi_wait(0, GASPI_BLOCK); //problem found -> recover if(retval != GASPI_SUCCESS) { ASSERT(gaspi_state_vec_get(vec)); for(n = 0; n < nprocs; n++) { if(vec[n] != GASPI_STATE_HEALTHY) { gaspi_printf("Problem with node %d detected\n", n); assert(n == (nprocs - 1)); ASSERT(recover()); avoid_list[n] = 1; } } } } ASSERT (gaspi_barrier(survivors, GASPI_BLOCK)); ASSERT (gaspi_proc_term(GASPI_BLOCK)); gaspi_printf("exiting\n"); return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { int i, j; gaspi_number_t gsize; int comm_state = WORKING; int num_failures = 0; int timesteps = 0; ASSERT (gaspi_proc_init(GASPI_BLOCK)); ASSERT (gaspi_proc_rank(&myrank)); ASSERT (gaspi_proc_num(&numprocs)); read_params(argc, argv, ×teps, &numprocs_idle); numprocs_working = numprocs - numprocs_idle; numprocs_working_and_idle = numprocs_working + numprocs_idle; gaspi_rank_t *comm_main_ranks = malloc( numprocs_idle * sizeof(gaspi_rank_t)); init_array_2(comm_main_ranks, numprocs_working); /* contains info of all processes: which are working(0), broken(1) and idle(2). keeps updated all the time(iterations) */ int * status_processes = (int *) malloc(numprocs * sizeof(int)); init_array_3(status_processes, numprocs, WORKING); for(i = numprocs-1, j=0; j < numprocs_idle;--i,++j) { status_processes[i] = IDLE; // putting last processes to IDLE } // ===== GASPI group creation ===== if(status_processes[myrank]==WORKING) { ASSERT(gaspi_group_create(&COMM_MAIN)); gaspi_number_t i; for(i=0; i<numprocs; i++) { if(status_processes[i]==WORKING) { ASSERT(gaspi_group_add(COMM_MAIN, i)); ASSERT(gaspi_group_size(COMM_MAIN, &gsize)); } } ASSERT(gaspi_group_ranks (COMM_MAIN, comm_main_ranks)); ASSERT(gaspi_group_commit (COMM_MAIN, GASPI_BLOCK)); } /* ====== Init a SYNC FLAGS Segment ====== */ /* used to communicate the WORKING, BROKEN, or FINISHED_WORK status between the working and idle processes. 
*/ gaspi_size_t SYNC_global_mem_size; SYNC_global_mem_size = numprocs * sizeof(int); gaspi_pointer_t gm_ptr_sync=NULL; ASSERT(init_segment (gm_seg_sync_flags_id, SYNC_global_mem_size)); ASSERT(gaspi_segment_ptr (gm_seg_sync_flags_id, &gm_ptr_sync)); int * sync_flags = (int *) gm_ptr_sync; init_array_3(sync_flags, numprocs, WORKING); /* ====== Init a health check write FLAGS Segment ====== */ /* This array is used to send the gaspi_write message write before health_chk routine, which will then update the gaspi internal health vector */ gaspi_size_t health_chk_global_mem_size; health_chk_global_mem_size = numprocs*sizeof(int); gaspi_pointer_t gm_ptr_health_chk=NULL; ASSERT(init_segment (gm_seg_health_chk_array_id, health_chk_global_mem_size)); ASSERT(gaspi_segment_ptr (gm_seg_health_chk_array_id, &gm_ptr_health_chk)); gaspi_state_vector_t health_vec = (gaspi_state_vector_t) malloc(numprocs); ASSERT(gaspi_state_vec_get(health_vec)); gaspi_rank_t * avoid_list= (gaspi_rank_t *) malloc(numprocs * sizeof(gaspi_rank_t)); for(i = 0;i < numprocs; ++i) avoid_list[i] = (gaspi_rank_t) 0; gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK); /* ===== TIME-STEP LOOP ===== */ if(status_processes[myrank]==IDLE) { /* IDLE processes remain in this loop */ while(1) { gaspi_printf("%d.", myrank); if(sync_flags[0] == WORKING) { /* NO FAILURE REPORTED */ usleep(1000000); } if(sync_flags[0] == BROKEN) { /* FAILURE REPORTED */ gaspi_printf("myrank: %d Broken reported\n", myrank); comm_state=BROKEN; break; } if(sync_flags[0] == WORKFINISHED) { /* WORKFINISHED REPORTED */ gaspi_printf("myrank: %d WorkFinished reported\n", myrank); comm_state = WORKFINISHED; break; } } } int time_step; for(time_step=1; time_step <= timesteps && comm_state!=WORKFINISHED; time_step++) { gaspi_printf("== time_step: %d ==\n", time_step); if(comm_state==WORKING && status_processes[myrank]==WORKING) { gaspi_barrier(COMM_MAIN, GASPI_TIMEOUT_TIME); sleep(1); // NOTE: this is the work section. 
if(time_step == 5 && myrank== 1) { exit (-1); } } if(time_step<timesteps ) { send_global_msg_to_check_state(health_vec, avoid_list); num_failures = check_comm_health(status_processes, health_vec); gaspi_printf("%d NUM_FAILURES at timestep %d = %d\n", myrank, time_step, num_failures); if( num_failures != 0 ) { rescue_process = numprocs_working; if(myrank==0) { // message the IDLE process sync_flags[0]=BROKEN; for(i = 0 ; i < num_failures ; ++i) { /* TODO: multiple failures at the same time. */ gaspi_printf("messaging rescue_process: %d\n", rescue_process); ASSERT(gaspi_write(gm_seg_sync_flags_id, 0, rescue_process, gm_seg_sync_flags_id, 0, sizeof(int), 0, GASPI_BLOCK)); rescue_process++; } } if(myrank==0 || myrank==rescue_process) gaspi_printf("%d REPAIRING COMM_MAIN FLAG 1\n", myrank); update_status_processes_array(status_processes, health_vec); numprocs_working_and_idle = refresh_numprocs_working_and_idle(status_processes); if(myrank != rescue_process) { ASSERT(gaspi_group_delete(COMM_MAIN)); ASSERT(recover()); } ASSERT(gaspi_group_create(&COMM_MAIN_NEW)); for(i = 0; i < numprocs; i++) { if(status_processes[i]==WORKING) { ASSERT(gaspi_group_add(COMM_MAIN_NEW, i)); ASSERT(gaspi_group_size(COMM_MAIN_NEW, &gsize)); if(gsize == numprocs_working) break; } } gaspi_printf("%d: COMM_MAIN_NEW size is: %hi\n", myrank, gsize); ASSERT(gaspi_group_commit (COMM_MAIN_NEW, GASPI_BLOCK)); init_array_2(comm_main_ranks, numprocs_working); ASSERT(gaspi_group_ranks (COMM_MAIN_NEW, comm_main_ranks)); gaspi_printf("printing group_ranks_main: \n"); gaspi_printf_array(comm_main_ranks, numprocs_working); comm_state = WORKING; gaspi_printf("%d REPAIRING COMM_MAIN_NEW FLAG 2\n", myrank); if(status_processes[myrank] == WORKING) { ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK)); ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK)); } /* set things to work again */ COMM_MAIN = COMM_MAIN_NEW; time_step = 5; } } } if(myrank == 0) { gaspi_printf("finished successfully\n"); } gaspi_proc_term(10000); 
return EXIT_SUCCESS; }