Beispiel #1
0
/*
 * Test: query the size and the rank list of GASPI_GROUP_ALL and verify
 * that the group has one entry per process and that entry n is rank n.
 *
 * Returns EXIT_SUCCESS when every check passed; a failing GASPI call is
 * handled by the ASSERT macro and a failing comparison by assert().
 */
int main(int argc, char *argv[])
{
  gaspi_rank_t nprocs;
  /* The index shares the type of the bound (gsize) so it cannot wrap
     around before reaching it. */
  gaspi_number_t max_groups, gsize, n;
  gaspi_rank_t *partners;

  TSUITE_INIT(argc, argv);

  /* Called before gaspi_proc_init; the result is not inspected. */
  gaspi_group_max(&max_groups);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT(gaspi_proc_num(&nprocs));

  ASSERT(gaspi_group_size(GASPI_GROUP_ALL, &gsize));

  /* The group of all processes must contain exactly nprocs members. */
  assert(gsize == nprocs);

  /* One slot per group member; checked before the library writes to it. */
  partners = malloc(gsize * sizeof *partners);
  assert(partners != NULL);

  ASSERT(gaspi_group_ranks(GASPI_GROUP_ALL, partners));

  for(n = 0; n < gsize; n++)
    {
      assert(partners[n] == n);
    }

  free(partners);
  partners = NULL;

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
Beispiel #2
0
/*
 * Fault-tolerance example built on GASPI groups.
 *
 * The last numprocs_idle ranks are marked IDLE and wait on sync flag 0;
 * the remaining ranks are WORKING, form the group COMM_MAIN and run a
 * time-step loop.  One rank terminates itself at a fixed time step to
 * simulate a failure.  After every step (except the last) the health
 * vector is checked; when failures are reported, rank 0 writes BROKEN to
 * the rescue process(es), the status array is refreshed and a new group
 * COMM_MAIN_NEW is created, committed and installed as COMM_MAIN.
 *
 * Command-line arguments are handed to read_params(), which fills in the
 * number of time steps and the number of idle processes.
 *
 * Returns EXIT_SUCCESS at the end of the time-step loop.
 */
int main(int argc, char* argv[])
{
  /* Failure injection: this rank calls exit() at this time step. */
  const int failing_rank = 1;
  const int failure_time_step = 5;

  int i, j;
  gaspi_number_t gsize = 0; /* initialised: it is printed even if no rank is added */
  int comm_state = WORKING;
  int num_failures = 0;
  int timesteps = 0;

  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_proc_num(&numprocs));

  read_params(argc, argv, &timesteps, &numprocs_idle);

  numprocs_working = numprocs - numprocs_idle;
  numprocs_working_and_idle = numprocs_working + numprocs_idle;

  /* Holds the ranks of the working group.  It is initialised, filled by
     gaspi_group_ranks() and printed with numprocs_working entries, so it
     must be sized by numprocs_working (not numprocs_idle). */
  gaspi_rank_t *comm_main_ranks = malloc(numprocs_working * sizeof *comm_main_ranks);
  if(comm_main_ranks == NULL)
    {
      gaspi_printf("%d: out of memory (comm_main_ranks)\n", myrank);
      exit(EXIT_FAILURE);
    }
  init_array_2(comm_main_ranks, numprocs_working);

  /* contains info of all processes:
     which are working(0), broken(1) and idle(2).
     keeps updated all the time(iterations) */
  int *status_processes = malloc(numprocs * sizeof *status_processes);
  if(status_processes == NULL)
    {
      gaspi_printf("%d: out of memory (status_processes)\n", myrank);
      exit(EXIT_FAILURE);
    }

  init_array_3(status_processes, numprocs, WORKING);
  for(i = numprocs-1, j=0; j < numprocs_idle;--i,++j)
    {
      status_processes[i] = IDLE; // putting last processes to IDLE
    }

  // ===== GASPI group creation =====
  if(status_processes[myrank]==WORKING)
    {
      ASSERT(gaspi_group_create(&COMM_MAIN));

      gaspi_number_t rank;
      for(rank=0; rank<numprocs; rank++)
        {
          if(status_processes[rank]==WORKING)
            {
              ASSERT(gaspi_group_add(COMM_MAIN, rank));
              ASSERT(gaspi_group_size(COMM_MAIN, &gsize));
            }
        }
      ASSERT(gaspi_group_ranks (COMM_MAIN, comm_main_ranks));
      ASSERT(gaspi_group_commit (COMM_MAIN, GASPI_BLOCK));
    }

  /* ====== Init a SYNC FLAGS Segment ====== */
  /* used to communicate the WORKING, BROKEN, or FINISHED_WORK status between the working and idle processes. */

  gaspi_size_t SYNC_global_mem_size;
  SYNC_global_mem_size = numprocs * sizeof(int);

  gaspi_pointer_t gm_ptr_sync=NULL;
  ASSERT(init_segment (gm_seg_sync_flags_id, SYNC_global_mem_size));
  ASSERT(gaspi_segment_ptr (gm_seg_sync_flags_id, &gm_ptr_sync));

  int * sync_flags = (int *) gm_ptr_sync;
  init_array_3(sync_flags, numprocs, WORKING);

  /* ====== Init a health check write FLAGS Segment ====== */
  /* This array is used to send the gaspi_write message write before health_chk routine,
     which will then update the gaspi internal health vector */

  gaspi_size_t health_chk_global_mem_size;
  health_chk_global_mem_size = numprocs*sizeof(int);
  gaspi_pointer_t gm_ptr_health_chk=NULL;
  ASSERT(init_segment (gm_seg_health_chk_array_id, health_chk_global_mem_size));
  ASSERT(gaspi_segment_ptr (gm_seg_health_chk_array_id, &gm_ptr_health_chk));

  /* One byte per process. */
  gaspi_state_vector_t health_vec = (gaspi_state_vector_t) malloc(numprocs);
  if(health_vec == NULL)
    {
      gaspi_printf("%d: out of memory (health_vec)\n", myrank);
      exit(EXIT_FAILURE);
    }
  ASSERT(gaspi_state_vec_get(health_vec));

  gaspi_rank_t *avoid_list = malloc(numprocs * sizeof *avoid_list);
  if(avoid_list == NULL)
    {
      gaspi_printf("%d: out of memory (avoid_list)\n", myrank);
      exit(EXIT_FAILURE);
    }
  for(i = 0;i < numprocs; ++i)
    {
      avoid_list[i] = (gaspi_rank_t) 0;
    }

  gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK);

  /* ===== TIME-STEP LOOP =====  */
  if(status_processes[myrank]==IDLE)
    {
      /* IDLE processes remain in this loop until sync flag 0 reports
         BROKEN or WORKFINISHED */
      while(1)
        {
          gaspi_printf("%d.", myrank);
          if(sync_flags[0] == WORKING)
            {
              /*  NO FAILURE REPORTED  */
              usleep(1000000);
            }
          if(sync_flags[0] == BROKEN)
            {
              /* FAILURE REPORTED */
              gaspi_printf("myrank: %d Broken reported\n", myrank);
              comm_state=BROKEN;
              break;
            }
          if(sync_flags[0] == WORKFINISHED)
            {
              /* WORKFINISHED REPORTED */
              gaspi_printf("myrank: %d WorkFinished reported\n", myrank);
              comm_state = WORKFINISHED;
              break;
            }
        }
    }

  int time_step;
  for(time_step=1; time_step <= timesteps && comm_state!=WORKFINISHED; time_step++)
    {
      gaspi_printf("== time_step: %d ==\n", time_step);
      if(comm_state==WORKING && status_processes[myrank]==WORKING)
        {
          /* The result of this barrier is not checked here; failures are
             picked up by the health check below. */
          gaspi_barrier(COMM_MAIN, GASPI_TIMEOUT_TIME);
          sleep(1); // NOTE: this is the work section.
          if(time_step == failure_time_step && myrank == failing_rank)
            {
              /* simulated process failure */
              exit (-1);
            }
        }

      if(time_step<timesteps )
        {
          send_global_msg_to_check_state(health_vec, avoid_list);
          num_failures = check_comm_health(status_processes, health_vec);

          gaspi_printf("%d NUM_FAILURES at timestep %d = %d\n", myrank, time_step, num_failures);

          if( num_failures != 0 )
            {
              /* The first idle rank follows directly after the working ranks. */
              rescue_process = numprocs_working;
              if(myrank==0)
                {
                  // message the IDLE process
                  sync_flags[0]=BROKEN;

                  for(i = 0 ; i < num_failures ; ++i)
                    {
                      /* TODO: multiple failures at the same time. */
                      gaspi_printf("messaging rescue_process: %d\n", rescue_process);
                      ASSERT(gaspi_write(gm_seg_sync_flags_id, 0, rescue_process, gm_seg_sync_flags_id, 0, sizeof(int), 0, GASPI_BLOCK));
                      rescue_process++;
                    }
                }

              if(myrank==0 || myrank==rescue_process)
                {
                  gaspi_printf("%d REPAIRING COMM_MAIN FLAG 1\n", myrank);
                }

              update_status_processes_array(status_processes, health_vec);
              numprocs_working_and_idle = refresh_numprocs_working_and_idle(status_processes);

              /* Only processes other than the rescue process drop the
                 old group and run recover(). */
              if(myrank != rescue_process)
                {
                  ASSERT(gaspi_group_delete(COMM_MAIN));
                  ASSERT(recover());
                }

              ASSERT(gaspi_group_create(&COMM_MAIN_NEW));

              /* Add WORKING ranks in ascending order until the new group
                 has numprocs_working members. */
              for(i = 0; i < numprocs; i++)
                {
                  if(status_processes[i]==WORKING)
                    {
                      ASSERT(gaspi_group_add(COMM_MAIN_NEW, i));
                      ASSERT(gaspi_group_size(COMM_MAIN_NEW, &gsize));
                      if(gsize == numprocs_working)
                        {
                          break;
                        }
                    }
                }

              gaspi_printf("%d: COMM_MAIN_NEW size is: %u\n", myrank, (unsigned) gsize);

              ASSERT(gaspi_group_commit (COMM_MAIN_NEW, GASPI_BLOCK));

              init_array_2(comm_main_ranks, numprocs_working);

              ASSERT(gaspi_group_ranks (COMM_MAIN_NEW, comm_main_ranks));

              gaspi_printf("printing group_ranks_main: \n");
              gaspi_printf_array(comm_main_ranks, numprocs_working);

              comm_state = WORKING;
              gaspi_printf("%d REPAIRING COMM_MAIN_NEW FLAG 2\n", myrank);

              if(status_processes[myrank] == WORKING)
                {
                  ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK));
                  ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK));
                }

              /* set things to work again */
              COMM_MAIN = COMM_MAIN_NEW;
              /* Continue from the step at which the failure was injected;
                 the loop increment moves on to the following step. */
              time_step = failure_time_step;
            }
        }
    }

  if(myrank == 0)
    {
      gaspi_printf("finished successfully\n");
    }

  /* Release the heap buffers owned by main. */
  free(avoid_list);
  free(health_vec);
  free(status_processes);
  free(comm_main_ranks);

  gaspi_proc_term(10000);

  return EXIT_SUCCESS;
}