/*legacy, not sure how this fits into the own scheme of things*/ void KNCube::InsertRandomFaults( const Configuration &config ) { int num_fails; unsigned long prev_seed; int node, chan; int i, j, t, n, c; bool available; bool edge; num_fails = config.GetInt( "link_failures" ); if ( _size && num_fails ) { prev_seed = RandomIntLong( ); RandomSeed( config.GetInt( "fail_seed" ) ); vector<bool> fail_nodes(_size); for ( i = 0; i < _size; ++i ) { node = i; // edge test edge = false; for ( n = 0; n < _n; ++n ) { if ( ( ( node % _k ) == 0 ) || ( ( node % _k ) == _k - 1 ) ) { edge = true; } node /= _k; } if ( edge ) { fail_nodes[i] = true; } else { fail_nodes[i] = false; } } for ( i = 0; i < num_fails; ++i ) { j = RandomInt( _size - 1 ); available = false; for ( t = 0; ( t < _size ) && (!available); ++t ) { node = ( j + t ) % _size; if ( !fail_nodes[node] ) { // check neighbors c = RandomInt( 2*_n - 1 ); for ( n = 0; ( n < 2*_n ) && (!available); ++n ) { chan = ( n + c ) % 2*_n; if ( chan % 1 ) { available = fail_nodes[_LeftNode( node, chan/2 )]; } else { available = fail_nodes[_RightNode( node, chan/2 )]; } } } if ( !available ) { cout << "skipping " << node << endl; } } if ( t == _size ) { Error( "Could not find another possible fault channel" ); } OutChannelFault( node, chan ); fail_nodes[node] = true; for ( n = 0; ( n < _n ) && available ; ++n ) { fail_nodes[_LeftNode( node, n )] = true; fail_nodes[_RightNode( node, n )] = true; } cout << "failure at node " << node << ", channel " << chan << endl; } RandomSeed( prev_seed ); } }
/*
 * Decrypt one client request, dispatch it to the matching command handler
 * and send the resulting response string back to the client.
 *
 * new_fd   IN - descriptor handed to _send_reply() for the response
 * msg      IN - encrypted request; NULL produces an error response
 * cli_addr IN - client address, forwarded to register_callback()
 *
 * An error response is sent when msg is NULL, when _decrypt() fails, or
 * when the decrypted text does not begin with the first 13 characters of
 * version_string followed (somewhere later) by a ':'. Unrecognized commands
 * get a "<SLURM_VERSION_STRING> ECMD" response.
 */
static void _proc_msg(int new_fd, char *msg, slurm_addr_t cli_addr)
{
	/* Locks: Read job and node data */
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job, write node, read partition */
	slurmctld_lock_t job_write_lock2 = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node data */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
	char *cmd_ptr, *resp = NULL, *msg_decrypted = NULL;
	uid_t cmd_uid;
	uint32_t protocol_version = 0;

	if (!msg) {
		info("slurmctld/nonstop: NULL message received");
		resp = xstrdup("Error:\"NULL message received\"");
		goto send_resp;
	}

	msg_decrypted = _decrypt(msg, &cmd_uid);
	if (!msg_decrypted) {
		info("slurmctld/nonstop: Message decrypt failure");
		resp = xstrdup("Error:\"Message decrypt failure\"");
		goto send_resp;
	}
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg decrypted:%s", msg_decrypted);

	/* The command text starts right after the ':' that ends the version
	 * prefix; protocol_version stays 0 if that prefix is missing. */
	cmd_ptr = msg_decrypted;
	/*                 123456789012345678901234567890 */
	if (xstrncmp(cmd_ptr, version_string, 13) == 0) {
		cmd_ptr = strchr(cmd_ptr + 13, ':');
		if (cmd_ptr) {
			cmd_ptr++;
			protocol_version = SLURM_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == 0) {
		info("slurmctld/nonstop: Message version invalid");
		resp = xstrdup("Error:\"Message version invalid\"");
		goto send_resp;
	}

	/* Each comparison length is the full length of its keyword literal,
	 * including the trailing ':' where there is one. */
	if (xstrncmp(cmd_ptr, "CALLBACK:JOBID:", 15) == 0) {
		resp = register_callback(cmd_ptr, cmd_uid, cli_addr,
					 protocol_version);
	} else if (xstrncmp(cmd_ptr, "DRAIN:NODES:", 12) == 0) {
		lock_slurmctld(node_write_lock);
		resp = drain_nodes_user(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(node_write_lock);
	} else if (xstrncmp(cmd_ptr, "DROP_NODE:JOBID:", 16) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = drop_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "GET_FAIL_NODES:JOBID:", 21) == 0) {
		lock_slurmctld(job_read_lock);
		resp = fail_nodes(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_read_lock);
	} else if (xstrncmp(cmd_ptr, "REPLACE_NODE:JOBID:", 19) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = replace_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "SHOW_CONFIG", 11) == 0) {
		resp = show_config(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "SHOW_JOB:JOBID:", 15) == 0) {
		resp = show_job(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "TIME_INCR:JOBID:", 16) == 0) {
		lock_slurmctld(job_write_lock);
		resp = time_incr(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock);
	} else {
		info("slurmctld/nonstop: Invalid command: %s", cmd_ptr);
		xstrfmtcat(resp, "%s ECMD", SLURM_VERSION_STRING);
	}

send_resp:
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg send:%s", resp);
	_send_reply(new_fd, resp);
	xfree(resp);
	/* Released with free() rather than xfree(), as in the original;
	 * presumably _decrypt() returns malloc()'d memory -- confirm. */
	if (msg_decrypted)
		free(msg_decrypted);
	return;
}
/*legacy, not sure how this fits into the own scheme of things*/ void KNCube::InsertRandomFaults( const Configuration &config ) { int num_fails = config.GetInt( "link_failures" ); if ( _size && num_fails ) { vector<long> save_x; vector<double> save_u; SaveRandomState( save_x, save_u ); int fail_seed; if ( config.GetStr( "fail_seed" ) == "time" ) { fail_seed = int( time( NULL ) ); cout << "SEED: fail_seed=" << fail_seed << endl; } else { fail_seed = config.GetInt( "fail_seed" ); } RandomSeed( fail_seed ); vector<bool> fail_nodes(_size); for ( int i = 0; i < _size; ++i ) { int node = i; // edge test bool edge = false; for ( int n = 0; n < _n; ++n ) { if ( ( ( node % _k ) == 0 ) || ( ( node % _k ) == _k - 1 ) ) { edge = true; } node /= _k; } if ( edge ) { fail_nodes[i] = true; } else { fail_nodes[i] = false; } } for ( int i = 0; i < num_fails; ++i ) { int j = RandomInt( _size - 1 ); bool available = false; int node, chan; int t; for ( t = 0; ( t < _size ) && (!available); ++t ) { node = ( j + t ) % _size; if ( !fail_nodes[node] ) { // check neighbors int c = RandomInt( 2*_n - 1 ); for ( int n = 0; ( n < 2*_n ) && (!available); ++n ) { chan = ( n + c ) % 2*_n; if ( chan % 1 ) { available = fail_nodes[_LeftNode( node, chan/2 )]; } else { available = fail_nodes[_RightNode( node, chan/2 )]; } } } if ( !available ) { cout << "skipping " << node << endl; } } if ( t == _size ) { Error( "Could not find another possible fault channel" ); } OutChannelFault( node, chan ); fail_nodes[node] = true; for ( int n = 0; ( n < _n ) && available ; ++n ) { fail_nodes[_LeftNode( node, n )] = true; fail_nodes[_RightNode( node, n )] = true; } cout << "failure at node " << node << ", channel " << chan << endl; } RestoreRandomState( save_x, save_u ); } }