Exemple #1
0
/*legacy, not sure how this fits into the own scheme of things*/
void KNCube::InsertRandomFaults( const Configuration &config )
{
  int num_fails;
  unsigned long prev_seed;

  int node, chan;
  int i, j, t, n, c;
  bool available;

  bool edge;

  num_fails = config.GetInt( "link_failures" );
  
  if ( _size && num_fails ) {
    prev_seed = RandomIntLong( );
    RandomSeed( config.GetInt( "fail_seed" ) );

    vector<bool> fail_nodes(_size);

    for ( i = 0; i < _size; ++i ) {
      node = i;

      // edge test
      edge = false;
      for ( n = 0; n < _n; ++n ) {
	if ( ( ( node % _k ) == 0 ) ||
	     ( ( node % _k ) == _k - 1 ) ) {
	  edge = true;
	}
	node /= _k;
      }

      if ( edge ) {
	fail_nodes[i] = true;
      } else {
	fail_nodes[i] = false;
      }
    }

    for ( i = 0; i < num_fails; ++i ) {
      j = RandomInt( _size - 1 );
      available = false;

      for ( t = 0; ( t < _size ) && (!available); ++t ) {
	node = ( j + t ) % _size;
       
	if ( !fail_nodes[node] ) {
	  // check neighbors
	  c = RandomInt( 2*_n - 1 );

	  for ( n = 0; ( n < 2*_n ) && (!available); ++n ) {
	    chan = ( n + c ) % 2*_n;

	    if ( chan % 1 ) {
	      available = fail_nodes[_LeftNode( node, chan/2 )];
	    } else {
	      available = fail_nodes[_RightNode( node, chan/2 )];
	    }
	  }
	}
	
	if ( !available ) {
	  cout << "skipping " << node << endl;
	}
      }

      if ( t == _size ) {
	Error( "Could not find another possible fault channel" );
      }

      
      OutChannelFault( node, chan );
      fail_nodes[node] = true;

      for ( n = 0; ( n < _n ) && available ; ++n ) {
	fail_nodes[_LeftNode( node, n )]  = true;
	fail_nodes[_RightNode( node, n )] = true;
      }

      cout << "failure at node " << node << ", channel " 
	   << chan << endl;
    }

    RandomSeed( prev_seed );
  }
}
Exemple #2
0
static void _proc_msg(int new_fd, char *msg, slurm_addr_t cli_addr)
{
	/* Locks: Read job and node data */
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job, write node, read partition */
	slurmctld_lock_t job_write_lock2 = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node data */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
	char *cmd_ptr, *resp = NULL, *msg_decrypted = NULL;
	uid_t cmd_uid;
	uint32_t protocol_version = 0;

	if (!msg) {
		info("slurmctld/nonstop: NULL message received");
		resp = xstrdup("Error:\"NULL message received\"");
		goto send_resp;
	}

	msg_decrypted = _decrypt(msg, &cmd_uid);
	if (!msg_decrypted) {
		info("slurmctld/nonstop: Message decrypt failure");
		resp = xstrdup("Error:\"Message decrypt failure\"");
		goto send_resp;
	}
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg decrypted:%s", msg_decrypted);
	cmd_ptr = msg_decrypted;

	/* 123456789012345678901234567890 */
	if (xstrncmp(cmd_ptr, version_string, 13) == 0) {
		cmd_ptr = strchr(cmd_ptr + 13, ':');
		if (cmd_ptr) {
			cmd_ptr++;
			protocol_version = SLURM_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == 0) {
		info("slurmctld/nonstop: Message version invalid");
		resp = xstrdup("Error:\"Message version invalid\"");
		goto send_resp;
	}
	if (xstrncmp(cmd_ptr, "CALLBACK:JOBID:", 15) == 0) {
		resp = register_callback(cmd_ptr, cmd_uid, cli_addr,
					 protocol_version);
	} else if (xstrncmp(cmd_ptr, "DRAIN:NODES:", 12) == 0) {
		lock_slurmctld(node_write_lock);
		resp = drain_nodes_user(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(node_write_lock);
	} else if (xstrncmp(cmd_ptr, "DROP_NODE:JOBID:", 15) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = drop_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "GET_FAIL_NODES:JOBID:", 21) == 0) {
		lock_slurmctld(job_read_lock);
		resp = fail_nodes(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_read_lock);
	} else if (xstrncmp(cmd_ptr, "REPLACE_NODE:JOBID:", 19) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = replace_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "SHOW_CONFIG", 11) == 0) {
		resp = show_config(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "SHOW_JOB:JOBID:", 15) == 0) {
		resp = show_job(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "TIME_INCR:JOBID:", 16) == 0) {
		lock_slurmctld(job_write_lock);
		resp = time_incr(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock);
	} else {
		info("slurmctld/nonstop: Invalid command: %s", cmd_ptr);
		xstrfmtcat(resp, "%s ECMD", SLURM_VERSION_STRING);
	}

 send_resp:
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg send:%s", resp);
	_send_reply(new_fd, resp);
	xfree(resp);
	if (msg_decrypted)
		free(msg_decrypted);
	return;
}
Exemple #3
0
/*legacy, not sure how this fits into the own scheme of things*/
void KNCube::InsertRandomFaults( const Configuration &config )
{
  int num_fails = config.GetInt( "link_failures" );
  
  if ( _size && num_fails ) {
    vector<long> save_x;
    vector<double> save_u;
    SaveRandomState( save_x, save_u );
    int fail_seed;
    if ( config.GetStr( "fail_seed" ) == "time" ) {
      fail_seed = int( time( NULL ) );
      cout << "SEED: fail_seed=" << fail_seed << endl;
    } else {
      fail_seed = config.GetInt( "fail_seed" );
    }
    RandomSeed( fail_seed );

    vector<bool> fail_nodes(_size);

    for ( int i = 0; i < _size; ++i ) {
      int node = i;

      // edge test
      bool edge = false;
      for ( int n = 0; n < _n; ++n ) {
	if ( ( ( node % _k ) == 0 ) ||
	     ( ( node % _k ) == _k - 1 ) ) {
	  edge = true;
	}
	node /= _k;
      }

      if ( edge ) {
	fail_nodes[i] = true;
      } else {
	fail_nodes[i] = false;
      }
    }

    for ( int i = 0; i < num_fails; ++i ) {
      int j = RandomInt( _size - 1 );
      bool available = false;
      int node, chan;
      int t;

      for ( t = 0; ( t < _size ) && (!available); ++t ) {
		node = ( j + t ) % _size;
	       
		if ( !fail_nodes[node] ) {
		  // check neighbors
		  int c = RandomInt( 2*_n - 1 );

		  for ( int n = 0; ( n < 2*_n ) && (!available); ++n ) {
		    chan = ( n + c ) % 2*_n;

		    if ( chan % 1 ) {
		      available = fail_nodes[_LeftNode( node, chan/2 )];
		    } else {
		      available = fail_nodes[_RightNode( node, chan/2 )];
		    }
		  }
		}
		
		if ( !available ) {
		  cout << "skipping " << node << endl;
		}
      }

      if ( t == _size ) {
		Error( "Could not find another possible fault channel" );
      }

      
      OutChannelFault( node, chan );
      fail_nodes[node] = true;

      for ( int n = 0; ( n < _n ) && available ; ++n ) {
		fail_nodes[_LeftNode( node, n )]  = true;
		fail_nodes[_RightNode( node, n )] = true;
      }

      cout << "failure at node " << node << ", channel " 
	   << chan << endl;
    }

    RestoreRandomState( save_x, save_u );
  }
}