/**
 * Callback to signal successful startup of the controller process.
 *
 * Connects to the freshly started delegated controller and reports the
 * outcome of the link operation back to the client.  On startup failure
 * the slave is torn down and the whole service shuts down.
 *
 * @param cls the handle to the slave whose status is to be found here
 * @param cfg the configuration with which the controller has been started;
 *          NULL if status is not #GNUNET_OK
 * @param status #GNUNET_OK if the startup is successful; #GNUNET_SYSERR if not,
 *          GNUNET_TESTBED_controller_stop() shouldn't be called in this case
 */
static void
slave_status_cb (void *cls,
                 const struct GNUNET_CONFIGURATION_Handle *cfg,
                 int status)
{
  struct Slave *slave = cls;
  struct LinkControllersContext *lcc;

  /* Save the link context first: 'slave' (and with it slave->lcc) may be
     freed below, but we must still resume/free the client context. */
  lcc = slave->lcc;
  if (GNUNET_SYSERR == status)
  {
    slave->controller_proc = NULL;
    /* Stop all link controller forwarding tasks since we shutdown here anyway
       and as these tasks they depend on the operation queues which are created
       through GNUNET_TESTBED_controller_connect() and in kill_slave() we call
       the destructor function GNUNET_TESTBED_controller_disconnect() */
    GST_free_lcf ();
    kill_slave (slave);
    destroy_slave (slave);
    slave = NULL;               /* mark freed so the tail code skips it */
    LOG (GNUNET_ERROR_TYPE_WARNING, "Unexpected slave shutdown\n");
    GNUNET_SCHEDULER_shutdown ();       /* We too shutdown */
    goto clean_lcc;
  }
  slave->controller =
      GNUNET_TESTBED_controller_connect (GST_host_list[slave->host_id],
                                         EVENT_MASK, &slave_event_cb,
                                         slave);
  if (NULL != slave->controller)
  {
    /* Success: report the delegated controller's configuration back. */
    send_controller_link_response (lcc->client, lcc->operation_id, cfg, NULL);
  }
  else
  {
    send_controller_link_response (lcc->client, lcc->operation_id, NULL,
                                   "Could not connect to delegated controller");
    kill_slave (slave);
    destroy_slave (slave);
    slave = NULL;
  }

  /* Common cleanup: resume the client and release the link context. */
 clean_lcc:
  if (NULL != lcc)
  {
    if (NULL != lcc->client)
    {
      GNUNET_SERVICE_client_continue (lcc->client);
      lcc->client = NULL;
    }
    GNUNET_free (lcc);
  }
  /* Only clear the back-pointer if the slave still exists. */
  if (NULL != slave)
    slave->lcc = NULL;
}
/**
 * Normal outcome of the escaped-slaves event: a fixed percentage of the
 * raided slaves escapes (is removed from the raid).
 *
 * @return EO_NORMAL event outcome code.
 */
size_t EscapedSlavesEvent::normal() {
  // following constants may be tweaked to change game mechanics
  const size_t ESCAPED_PERCENTAGE = 10;
  // BUGFIX: numerator/denominator were swapped; the original
  // `size() * MAX_STAT_VALUE / ESCAPED_PERCENTAGE` computed far MORE
  // escapees than there are slaves instead of 10% of them.
  // NOTE(review): assumes MAX_STAT_VALUE represents 100% — confirm.
  size_t escaped_count = _raid_slaves.size() * ESCAPED_PERCENTAGE / MAX_STAT_VALUE;
  for (size_t i = 0; i < escaped_count; ++i) {
    kill_slave();
  }
  return EO_NORMAL;
}
Exemple #3
0
/*
 * Report a fatal error via syslog, take the slave process down and
 * terminate.  Never returns.
 */
void
fatalx(char *fmt, ...)
{
	va_list args;

	/* Forward the variadic arguments straight to syslog. */
	va_start(args, fmt);
	vsyslog(LOG_ERR, fmt, args);
	va_end(args);

	/* Make sure the slave does not outlive us. */
	kill_slave("fatal error");

	_exit(0);
}
Exemple #4
0
/*
 * Send data over a socket and exit if something fails.
 *
 * Loops until all 'len' bytes have been written, retrying transient
 * errors (EINTR/EAGAIN) and treating everything else as fatal.
 */
void
send_data(int sock, void *buf, size_t len)
{
	char *p = buf;
	size_t off = 0;
	ssize_t nw;

	while (off < len) {
		nw = write(sock, p + off, len - off);
		if (nw == 0) {
			/* Peer vanished: tear everything down. */
			kill_slave("write failure");
			_exit(0);
			/* NOTREACHED */
		} else if (nw == -1) {
			/* Retry transient errors, die on anything else. */
			if (errno != EINTR && errno != EAGAIN)
				fatalx("send_data: %m");
		} else {
			off += nw;
		}
	}
}
/**
 * Cleans up the slave list.
 *
 * Runs two passes over the global list: first every controller process is
 * stopped, then every slave structure is released.  The two-pass order is
 * deliberate — all kills happen before any destroy.
 */
void
GST_slave_list_clear ()
{
  unsigned int i;
  struct Slave *s;

  /* Pass 1: stop all controller processes. */
  for (i = 0; i < GST_slave_list_size; i++)
  {
    s = GST_slave_list[i];
    if (NULL != s)
      kill_slave (s);
  }
  /* Pass 2: free the slave structures themselves. */
  for (i = 0; i < GST_slave_list_size; i++)
  {
    s = GST_slave_list[i];
    if (NULL != s)
      destroy_slave (s);
  }
  GNUNET_free_non_null (GST_slave_list);
  GST_slave_list = NULL;
}
Exemple #6
0
/*
 * Master main loop: builds the work list via f->create(), distributes units
 * to the slave ranks, heart-beats a supervisor (rank 1), reacts to failure
 * notifications by re-queueing the failed slave's unit, hands work to idle
 * slaves, and collects results until all units are done; finally kills the
 * slaves and aggregates results via f->result().
 *
 * Ranks 0..number_of_nonslaves-1 are master/supervisor processes; slaves
 * start at rank number_of_nonslaves.
 */
void do_master_stuff(int argc, char ** argv, struct mw_api_spec *f)
{
  DEBUG_PRINT(("master starting"));

  int number_of_nonslaves = 3;

  int number_of_slaves;
  MPI_Comm_size(MPI_COMM_WORLD, &number_of_slaves);
  number_of_slaves = number_of_slaves - number_of_nonslaves;

  // needed for F_Send
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  DEBUG_PRINT(("Seeded srand with %u", (unsigned) time(NULL) + rank));
  srand((unsigned)time(NULL) + rank);

  LinkedList * work_list;

  double start, end, start_create, end_create, start_results, end_results;

  start = MPI_Wtime();

  DEBUG_PRINT(("creating work list..."));
  start_create = MPI_Wtime();
  // save work_array separately so we can find index later on
  mw_work_t ** work_array = f->create(argc, argv);
  work_list = listFromArray(work_array);
  end_create = MPI_Wtime();
  DEBUG_PRINT(("created work in %f seconds!", end_create - start_create));

  int slave=1, num_work_units=0;

  num_work_units = get_total_units(work_array);

  /* NOTE(review): assumes f->res_sz == sizeof(mw_result_t) — the raw
     MPI_CHAR receives below index this array by element.  Also this buffer
     is never freed before return — confirm intentional. */
  mw_result_t * received_results = malloc(f->res_sz * num_work_units);
  if (received_results == NULL)
  {
    fprintf(stderr, "ERROR: insufficient memory to allocate received_results\n");
    exit(0);
  }

  int num_results_received = 0;

  // make array keeping track of pointers for work that's active
  LinkedList* assignment_ptrs[number_of_slaves];

  // create array of start times
  double assignment_time[number_of_slaves];

  // create array indicating if slaves are down
  int are_you_down[number_of_slaves];

  // current pointer
  LinkedList
    * next_work_node = work_list,
    * list_end = NULL;

  /* Initial distribution: one unit per slave (or until the list runs out). */
  // have supervisor so starting at number_of_nonslaves
  for(slave=number_of_nonslaves; slave<(number_of_slaves+number_of_nonslaves); ++slave)
  {
    are_you_down[slave-number_of_nonslaves] = 0; //slaves are all working in the beginning
    DEBUG_PRINT(("assigning work to slave"));

    if(next_work_node == NULL)
    {
      DEBUG_PRINT(("reached the end of the work, breaking!"));
      break;
    }

    mw_work_t * work_unit = next_work_node->data;

    send_to_slave(work_unit, f->work_sz, MPI_CHAR, slave, WORK_TAG, MPI_COMM_WORLD);

    // save next_work_node to assigned work
    assignment_ptrs[slave-number_of_nonslaves] = next_work_node;
    assert(assignment_ptrs[slave-number_of_nonslaves] != NULL);

    // save start time
    assignment_time[slave-number_of_nonslaves] = MPI_Wtime();

    // update next_work_node
    if(next_work_node->next == NULL)
    {
        list_end = next_work_node;
    }
    next_work_node=next_work_node->next;

    DEBUG_PRINT(("work sent to slave"));
  }

  // send time array to supervisor
  DEBUG_PRINT(("Sending supervisor first time update"));
  MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);

  // failure id
  int failure_id, kill_signal;

  MPI_Status status_fail, status_res, status_kill;
  MPI_Request request_fail, request_res, request_kill;
  int flag_fail = 0, flag_res = 0, flag_kill = 0;

  // receive failure from supervisor as non-blocking recv
  MPI_Irecv(&failure_id, 1, MPI_INT, 1, FAIL_TAG, MPI_COMM_WORLD, &request_fail);

  // receive result from workers as non-blocking recv
  MPI_Irecv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &request_res);

  // receive kill from supervisor as non-blocking recv
  MPI_Irecv(&kill_signal, 1, MPI_INT, 1, KILL_TAG, MPI_COMM_WORLD, &request_kill);

  int ping_sup = 0;

  // send units of work while haven't received all results
  while(num_results_received < num_work_units)
  {
    // send ping to supervisor
    MPI_Send(&ping_sup, 1, MPI_INT, 1, M_PING_TAG, MPI_COMM_WORLD);

    // check for flag_fail
    MPI_Test(&request_fail, &flag_fail, &status_fail);

    // check for flag_res
    MPI_Test(&request_res, &flag_res, &status_res);

    // check for flag_kill
    MPI_Test(&request_kill, &flag_kill, &status_kill);

    // send work if have failures or got results
    if (flag_fail)
    {
        DEBUG_PRINT(("received failure from supervisor, process %d", failure_id));

        // get work_unit that needs to be reassigned
        LinkedList * work_unit = assignment_ptrs[failure_id];

        if(work_unit != NULL)
        {
            DEBUG_PRINT(("Moving assignment at %p to end of the queue", work_unit));
            move_node_to_end(work_unit);
            /* if we had exhausted the list, the re-queued node becomes the
               new head of pending work */
            if(next_work_node == NULL)
            {
                next_work_node = work_unit;
            }
            assert(next_work_node != NULL);
        }
        if(assignment_time[failure_id] == 0.0)
        {
            DEBUG_PRINT(("Failure on idle process %d. WTF??", failure_id));
        }
        if(are_you_down[failure_id] == 1)
        {
            DEBUG_PRINT(("Failure on a process which is already failed. WTF??"));
        }
        are_you_down[failure_id] = 1; //this slave is considered dead :(
        assignment_ptrs[failure_id] = NULL;
        assignment_time[failure_id] = 0.0;
        MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
        flag_fail = 0;
        // continue to receive failures from supervisor as non-blocking recv
        MPI_Irecv(&failure_id, 1, MPI_INT, 1, FAIL_TAG, MPI_COMM_WORLD, &request_fail);
    }

    /* Find an idle, alive slave (assignment_time == 0.0 means idle). */
    int idle_process = -1, i;
    for(i=0; i<number_of_slaves; ++i)
    {
        if(assignment_time[i] == 0.0 && !are_you_down[i])
        {
            idle_process = i;
            break;
        }
    }

    if(next_work_node != NULL && idle_process > -1)
    {
        send_to_slave(next_work_node->data, f->work_sz, MPI_CHAR, idle_process+number_of_nonslaves, WORK_TAG, MPI_COMM_WORLD);
        assignment_ptrs[idle_process] = next_work_node;
        assignment_time[idle_process] = MPI_Wtime();
        MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
        DEBUG_PRINT(("Gave an assignment to previously idle process %d, assignment at %p", idle_process, next_work_node));
        if(next_work_node->next == NULL)
        {
            list_end = next_work_node;
        }
        next_work_node = next_work_node->next;
    }

    if (flag_res)
    {
      int worker_number = status_res.MPI_SOURCE-number_of_nonslaves;
      if(!are_you_down[worker_number]) //If this slave is marked dead, just ignore him
      {
        // update number of results received
        num_results_received++;

        /* re-queued failed units were appended past list_end; pick them up */
        if(next_work_node == NULL && list_end != NULL && list_end->next != NULL)
        {
            DEBUG_PRINT(("Found more work to do, now an idle process can get an assignment"));
            next_work_node = list_end->next;
            list_end = NULL;
        }
        if(next_work_node != NULL)
        {
          // get work_unit
          mw_work_t* work_unit = next_work_node->data;

          // send new unit of work
          send_to_slave(work_unit, f->work_sz, MPI_CHAR, status_res.MPI_SOURCE, WORK_TAG, MPI_COMM_WORLD);

          // update pointer
          if(next_work_node->next == NULL)
          {
              list_end = next_work_node;
          }

          // update work index for new_pid
          assignment_ptrs[status_res.MPI_SOURCE-number_of_nonslaves] = next_work_node;
          assert(assignment_ptrs[status_res.MPI_SOURCE-number_of_nonslaves] != NULL);
          assignment_time[status_res.MPI_SOURCE-number_of_nonslaves] = MPI_Wtime();
          // send updated array of times to supervisor
          MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
          DEBUG_PRINT(("SENT TIME TO SUP"));
          next_work_node = next_work_node->next;
          if(next_work_node == NULL)
          {
              DEBUG_PRINT(("Reached the end of the work list, should get idle processors after this"));
          }
        }
        else
        {
            DEBUG_PRINT(("Worker %d is now idle, I ain't got shit for him to do", worker_number));
            assignment_time[worker_number] = 0.0;
            assignment_ptrs[worker_number] = NULL;
            assert(!are_you_down[worker_number]);
            MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
        }
      }
      /* NOTE(review): this re-post uses WORK_TAG, but the initial result
         recv above used MPI_ANY_TAG — confirm workers reply on WORK_TAG. */
      // continue to receive results from workers as non-blocking recv
      MPI_Irecv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, WORK_TAG, MPI_COMM_WORLD, &request_res);
    }

    /* Supervisor told us to stand down (another master takes over). */
    if (flag_kill)
    {
      return;
    }
  }

  // send kill signal to other processes, including supervisor
  for(slave=1; slave<number_of_slaves+number_of_nonslaves; ++slave)
  {
    DEBUG_PRINT(("Murdering slave"));
    kill_slave(slave);
  }

  start_results = MPI_Wtime();
  int err_code = f->result(num_results_received, received_results);
  end_results = MPI_Wtime();

  end = MPI_Wtime();

  DEBUG_PRINT(("all %f s\n", end-start));
  DEBUG_PRINT(("create %f s\n", end_create-start_create));
  DEBUG_PRINT(("process %f s\n", end_results-start_results));

}
Exemple #7
0
/*
 * Supervisor promoted to master after the original master failed: replays
 * already-completed results from "recovery.txt", rebuilds the list of still
 * missing work units, redistributes them to the slaves, detects slow/failed
 * slaves via an adaptive timeout threshold, and appends each new result to
 * the recovery file so another takeover can resume again.
 *
 * Fixes over the original:
 *  - stray ';' after an if() made a list advance unconditional (NULL deref risk)
 *  - the recovery FILE* was never fclose()d
 *  - fscanf used an unbounded %s into a 1000-byte buffer
 *  - heap buffers were leaked on return
 */
void do_supervisor_as_master_stuff(int argc, char ** argv, struct mw_api_spec *f)
{
  DEBUG_PRINT(("supervisor taking over"));

  int number_of_nonslaves = 2;

  int number_of_slaves;
  MPI_Comm_size(MPI_COMM_WORLD, &number_of_slaves);
  number_of_slaves = number_of_slaves - number_of_nonslaves;

  DEBUG_PRINT(("NUMBER OF SLAVES %d", number_of_slaves));

  /** slave failure detection **/
  // keep track of start times
  //if (assignment_time1 == NULL)
  //double * assignment_time2 = malloc(sizeof(double)*number_of_slaves);

  // determine how long each worker took
  //if (complete_time == NULL)
  double * complete_time = malloc(sizeof(double)*number_of_slaves);

  // initialize threshold to 0.1
  double threshold = 0.1, tot_time = 0.0, sq_err = 0.0, mean = 0.0, stddev = 0.0;

  /** end slave failure detection **/

  double start, end, start_create, end_create, start_results, end_results;

  start = MPI_Wtime();

  DEBUG_PRINT(("creating work list..."));
  start_create = MPI_Wtime();
  // save work_array separately so we can find index later on
  mw_work_t ** work_array = f->create(argc, argv);
  // create work_list later
  end_create = MPI_Wtime();
  DEBUG_PRINT(("created work in %f seconds!", end_create - start_create));

  int num_work_units=0;

  num_work_units = get_total_units(work_array);

  DEBUG_PRINT(("num_work_units %d\n", num_work_units));

  mw_result_t * received_results = calloc(num_work_units, f->res_sz);
  if (received_results == NULL)
  {
    fprintf(stderr, "ERROR: insufficient memory to allocate received_results\n");
    exit(0);
  }

  int * has_result_array = calloc(num_work_units, sizeof(int));

  int num_results_received = 0;

  /** read through contents of file **/
  FILE *file = fopen("recovery.txt","r");
  if (file != NULL) //there are results to process
  {
    int result_index = 0;
    char str[1000];
    /* BUGFIX: bound the %s conversion so a long line cannot overflow str */
    while(fscanf(file, "%d %999s", &result_index, str) != EOF)
    {
      //printf("%d %s\n", result_index, str);

      // update received results
      mw_result_t * result = f->from_str(str);

      //printf("here\n");
      received_results[result_index] = *result;
      //printf("now here\n");
      // update has_results_array
      has_result_array[result_index] = 1;
      // update num_results_received
      num_results_received++;
    }
    /* BUGFIX: the recovery file was never closed */
    fclose(file);
  }

  DEBUG_PRINT(("num_results_received %d\n", num_results_received));

  // create linked list of indices not in the results array
  LinkedList * work_list = new_linkedlist_node();
  LinkedList * next_work_node = work_list;
  LinkedList * head = work_list;

  // cycle through has_result_array to find indices not in results array
  int i;
  int num_results_needed = 0;
  for (i = 0; i < num_work_units; i++)
  {
    if (has_result_array[i] == 0)
    {
      next_work_node->index = i;
      next_work_node->data = work_array[i];
      if (num_results_needed < (num_work_units-num_results_received)-1)
        addNode(next_work_node);
      /* BUGFIX: the original read "if (next_work_node->next == NULL);" — the
         stray ';' made the advance unconditional, so next_work_node could
         become NULL and be dereferenced on a later iteration.  Only advance
         while a successor exists. */
      if (next_work_node->next != NULL)
        next_work_node = next_work_node->next;
      num_results_needed++;
    }
  }
  DEBUG_PRINT(("num_results_needed %d", num_results_needed));

  // reset next_work_node to head
  next_work_node = head;

  // tell slaves to send to supervisor now
  int slave;
  for(slave=number_of_nonslaves; slave<(number_of_slaves+number_of_nonslaves); ++slave) {
    DEBUG_PRINT(("Telling slave"));
    MPI_Send(0, 0, MPI_CHAR, slave, M_FAIL_TAG, MPI_COMM_WORLD);
  }

  // make array keeping track of pointers for work that's active
  LinkedList* assignment_ptrs[number_of_slaves];

  // create array of start times
  double assignment_time[number_of_slaves];

  // create array of start times
  int assignment_indices[number_of_slaves];

  // create array indicating if slaves are down
  int are_you_down[number_of_slaves];

  // pointer for end of list
  LinkedList * list_end = NULL;

  // have supervisor so starting at number_of_nonslaves
  for(slave=number_of_nonslaves; slave<(number_of_slaves+number_of_nonslaves); ++slave)
  {
    are_you_down[slave-number_of_nonslaves] = 0; //slaves are all working in the beginning
    DEBUG_PRINT(("assigning work to slave"));

    if(next_work_node == NULL)
    {
      DEBUG_PRINT(("reached the end of the work, breaking!"));
      break;
    }

    mw_work_t * work_unit = next_work_node->data;

    send_to_slave(work_unit, f->work_sz, MPI_CHAR, slave, WORK_TAG, MPI_COMM_WORLD);

    // save next_work_node to assigned work
    assignment_ptrs[slave-number_of_nonslaves] = next_work_node;
    assert(assignment_ptrs[slave-number_of_nonslaves] != NULL);

    // save start time
    assignment_time[slave-number_of_nonslaves] = MPI_Wtime();

    // save assignment indices
    assignment_indices[slave-number_of_nonslaves] = next_work_node->index;

    // update next_work_node
    if(next_work_node->next == NULL)
    {
        list_end = next_work_node;
    }
    next_work_node=next_work_node->next;

    DEBUG_PRINT(("work %d sent to slave %d", assignment_indices[slave-number_of_nonslaves], slave));
  }

  // no need to send time array to supervisor

  MPI_Status status_res;
  MPI_Request request_res;
  int flag_res = 0;

  // receive result from workers as non-blocking recv
  MPI_Irecv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &request_res);

  // don't clear out file; will append new results to recovery.txt
  FILE * fptr;

  // send units of work while haven't received all results
  while(num_results_received < num_work_units)
  {
    // check for flag_res
    MPI_Test(&request_res, &flag_res, &status_res);

    // send work if have failures or got results

      /** slave failure detection **/
      // check if slave has not responded for a long time
      for(i=0; i<number_of_slaves; i++)
      {
        // not failed and not idle
        if (!are_you_down[i] && assignment_time[i] != 0.0)
        {
          if (i == 4)
            DEBUG_PRINT(("NOT FAILED NOT IDLE rank %d %f",i+2, MPI_Wtime()-assignment_time[i]));
          if(threshold>0 && MPI_Wtime() - assignment_time[i] > threshold)
          {
            DEBUG_PRINT(("methinks someone is slacking of rank %d", i+2));
            are_you_down[i] = 1;
            assignment_time[i] = 0.0;
            assignment_indices[i] = -1;

            // get work_unit that needs to be reassigned
            LinkedList * work_unit = assignment_ptrs[i];

            if (work_unit == NULL)
              DEBUG_PRINT(("work_unit is NULL"));

            if(work_unit != NULL)
            {
                DEBUG_PRINT(("Moving assignment at %p to end of the queue", work_unit));
                move_node_to_end(work_unit);
                if(next_work_node == NULL)
                {
                    next_work_node = work_unit;
                }
                assert(next_work_node != NULL);
            }

          }
        }
      }
      /** end slave failure detection **/


    // find an idle process to assign work to
    int idle_process = -1, i;
    for(i=0; i<number_of_slaves; ++i)
    {
        if(assignment_time[i] == 0.0 && !are_you_down[i])
        {
            idle_process = i;
            break;
        }
    }

    // assign idle process unit of work
    if(next_work_node != NULL && idle_process > -1)
    {
        send_to_slave(next_work_node->data, f->work_sz, MPI_CHAR, idle_process+number_of_nonslaves, WORK_TAG, MPI_COMM_WORLD);

        /** slave failure detection **/
        //a previously idle worker got assigned something
        DEBUG_PRINT(("Worker of rank %d just got off his lazy ass", i+2));
        /** end slave failure detection **/

        assignment_ptrs[idle_process] = next_work_node;
        if (idle_process == 4)
          DEBUG_PRINT(("changing rank 6 time idle"));
        assignment_time[idle_process] = MPI_Wtime();
        assignment_indices[idle_process] = next_work_node->index;
        //MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
        DEBUG_PRINT(("Gave an assignment to previously idle process rank %d, assignment at %p", idle_process+number_of_nonslaves, next_work_node));
        if(next_work_node->next == NULL)
        {
            list_end = next_work_node;
        }
        next_work_node = next_work_node->next;
    }

    if (flag_res)
    {

      int worker_number = status_res.MPI_SOURCE-number_of_nonslaves;

      DEBUG_PRINT(("Got result from rank %d", worker_number+number_of_nonslaves));

      if(!are_you_down[worker_number]) //If this slave is marked dead, just ignore him
      {
        // save index and result received to file
        char * str = f->to_str(received_results[num_results_received]);
        fptr = fopen("recovery.txt", "a");
        fprintf(fptr, "%d %s\n", assignment_indices[worker_number], str);
        fclose(fptr);

        // update number of results received
        num_results_received++;

        /** slave failure detection **/
        //DEBUG_PRINT(("supervisor is impressed by his good worker %d", i));
        int i = worker_number;
        complete_time[i] = MPI_Wtime() - assignment_time[i];
        tot_time += complete_time[i];
        mean = tot_time/num_results_received;
        sq_err += pow(complete_time[i] - mean, 2);
        stddev = sqrt(sq_err/num_results_received);
        //we have enough data to update threshold
        if(num_results_received >= number_of_slaves/2)
        {
          //DEBUG_PRINT(("the stddev is %f", stddev));
          threshold = mean + 10*stddev + 0.1;
          //DEBUG_PRINT(("the threshold is %f", threshold));
        }
        //assignment_time1[i] = assignment_time2[i];
        //found_change = 1;
        /** end slave failure detection **/

        //DEBUG_PRINT(("num results received %d\n", num_results_received));

        if(next_work_node == NULL && list_end != NULL && list_end->next != NULL)
        {
            DEBUG_PRINT(("Found more work to do, now an idle process can get an assignment"));
            next_work_node = list_end->next;
            list_end = NULL;
        }
        if(next_work_node != NULL)
        {
          // get work_unit
          mw_work_t* work_unit = next_work_node->data;

          //DEBUG_PRINT(("Sending new unit of work"));

          // send new unit of work
          send_to_slave(work_unit, f->work_sz, MPI_CHAR, status_res.MPI_SOURCE, WORK_TAG, MPI_COMM_WORLD);
          // update pointer
          if(next_work_node->next == NULL)
          {
              list_end = next_work_node;
          }

          // update work index for new_pid
          assignment_ptrs[status_res.MPI_SOURCE-number_of_nonslaves] = next_work_node;
          assert(assignment_ptrs[status_res.MPI_SOURCE-number_of_nonslaves] != NULL);
          if (status_res.MPI_SOURCE == 4)
            DEBUG_PRINT(("changing process of rank 6 time in res recv"));

          assignment_time[status_res.MPI_SOURCE-number_of_nonslaves] = MPI_Wtime();
          assignment_indices[status_res.MPI_SOURCE-number_of_nonslaves] = next_work_node->index;
          // send updated array of times to supervisor
          //MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
          //DEBUG_PRINT(("SENT TIME TO SUP"));
          next_work_node = next_work_node->next;
          if(next_work_node == NULL)
          {
              DEBUG_PRINT(("Reached the end of the work list, should get idle processors after this"));
          }
        }
        else
        {
            DEBUG_PRINT(("Worker of rank %d is now idle, I ain't got shit for him to do", worker_number+2));
            if (worker_number == 4)
              DEBUG_PRINT(("changing processof rank 6 time in else"));
            assignment_time[worker_number] = 0.0;
            assignment_ptrs[worker_number] = NULL;
            assignment_indices[worker_number] = -1;
            assert(!are_you_down[worker_number]);

            //MPI_Send(assignment_time, number_of_slaves, MPI_DOUBLE, 1, SUPERVISOR_TAG, MPI_COMM_WORLD);
        }
      }


      // continue to receive results from workers as non-blocking recv
      MPI_Irecv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, WORK_TAG, MPI_COMM_WORLD, &request_res);
    }
  }

  // send kill signal to other processes
  for(slave=number_of_nonslaves; slave<number_of_slaves+number_of_nonslaves; ++slave)
  {
    DEBUG_PRINT(("Murdering slave"));
    kill_slave(slave);
  }

  start_results = MPI_Wtime();
  int err_code = f->result(num_results_received, received_results);
  end_results = MPI_Wtime();

  end = MPI_Wtime();

  DEBUG_PRINT(("all %f s\n", end-start));
  DEBUG_PRINT(("create %f s\n", end_create-start_create));
  DEBUG_PRINT(("process %f s\n", end_results-start_results));

  /* BUGFIX: release heap buffers (were leaked) */
  free(received_results);
  free(has_result_array);
  free(complete_time);

  // remove recovery file since it is no longer useful
  remove("recovery.txt");

}
Exemple #8
0
/*
 * Master process: builds the work list, hands one unit to each slave, then
 * keeps feeding units to whichever slave returns a result until the list is
 * exhausted, drains the remaining results, and finally kills the slaves and
 * aggregates via f->result().
 *
 * Fixes over the original:
 *  - 'i' was left at argc by the argv-printing loop and then used as the
 *    work-list index, skipping the first argc units
 *  - the malloc-failure path fell through and dereferenced NULL
 *  - received_results was leaked
 */
void do_master_stuff(int argc, char ** argv, struct mw_api_spec *f)
{

	DEBUG_PRINT("master starting");

	int number_of_slaves;

	MPI_Comm_size(MPI_COMM_WORLD, &number_of_slaves);
	
	mw_work_t ** work_list;

	printf("argc: %d\n", argc);
	int i;
	for(i=0;  i<argc; ++i)
	{
		printf("arg[%d]: %s\n",i, argv[i]);
	}
	DEBUG_PRINT("creating work list...");
	work_list = f->create(argc, argv);
	DEBUG_PRINT("created work!");

	int slave=1, num_work_units=0;

	num_work_units = get_total_units(work_list);

	mw_result_t * received_results =  malloc(f->res_sz * num_work_units);
	if (received_results == NULL)
	{
	  fprintf(stderr, "ERROR: insufficient memory to allocate received_results\n");
	  /* BUGFIX: bail out instead of falling through and writing via NULL */
	  return;
	}

	int num_results_received = 0;

	/* BUGFIX: 'i' was left at argc by the argv loop above; work
	   distribution must start at the first work unit */
	i = 0;

	for(slave=1; slave<number_of_slaves; ++slave)
	{
		DEBUG_PRINT("assigning work to slave");
		mw_work_t * work_unit = work_list[i];
		i++;
		if(work_unit == NULL)
		{
			DEBUG_PRINT("reached the end of the work, breaking!");
			break;
		}
		send_to_slave(work_unit, f->work_sz, MPI_CHAR, slave, WORK_TAG, MPI_COMM_WORLD);
		//MPI_Send(work_unit, f->work_sz, MPI_CHAR, slave, WORK_TAG, MPI_COMM_WORLD);
		DEBUG_PRINT("work sent to slave");
	}

	/* Keep one unit in flight per returning slave until the list ends. */
	while(work_list[i] != NULL)
	{
		DEBUG_PRINT("Waiting to receive a result...");
		MPI_Status status;
		MPI_Recv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		DEBUG_PRINT("Received a result!");
		num_results_received++;
		send_to_slave(work_list[i], f->work_sz, MPI_CHAR, status.MPI_SOURCE, WORK_TAG, MPI_COMM_WORLD);
		i++;
	}

	/* Drain the results still owed for the last in-flight units. */
	while(num_results_received < num_work_units)
	{
		DEBUG_PRINT("Waiting to receive a result...");
		MPI_Status status;
		MPI_Recv(&received_results[num_results_received], f->res_sz, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		DEBUG_PRINT("Received a result!");
		num_results_received++;
	}

	DEBUG_PRINT("Received all results!");

	for(slave=1; slave<number_of_slaves; ++slave)
	{
		DEBUG_PRINT("Murdering slave");
		kill_slave(slave);
	}

	int err_code = f->result(num_results_received, received_results);
	/* BUGFIX: plug the leak */
	free(received_results);
}