void
makeAndDisplayRegular( char* name, char* pool )
{
    Daemon startd( DT_STARTD, name, pool );
    Daemon schedd( DT_SCHEDD, name, pool );
    Daemon master( DT_MASTER, name, pool );

    if( ! startd.locate() ) {
        dprintf( dflag, "%s\n", startd.error() );
    } else {
        startd.display( dflag );
    }
    dprintf( dflag, "\n" );

    if( ! schedd.locate() ) {
        dprintf( dflag, "%s\n", schedd.error() );
    } else {
        schedd.display( dflag );
    }
    dprintf( dflag, "\n" );

    if( ! master.locate() ) {
        dprintf( dflag, "%s\n", master.error() );
    } else {
        master.display( dflag );
    }
    dprintf( dflag, "\n" );
}
void delegateGSI(boost::python::object fname)
{
    if (m_claim.empty()) {THROW_EX(ValueError, "No claim set for object.");}

    // Use the given proxy file, or fall back to the default X509 proxy.
    std::string proxy_file;
    if (fname.ptr() == Py_None)
    {
        proxy_file = get_x509_proxy_filename();
    }
    else
    {
        proxy_file = boost::python::extract<std::string>(fname);
    }

    DCStartd startd(m_addr.c_str());
    startd.setClaimId(m_claim);
    compat_classad::ClassAd reply;
    int irval;
    {
        condor::ModuleLock ml;
        irval = startd.delegateX509Proxy(proxy_file.c_str(), 0, NULL);
    }
    if (irval != OK) {THROW_EX(RuntimeError, "Startd failed to delegate GSI proxy.");}
}
void
testAPI( char* my_name, bool do_socks )
{
    char *name, *addr, *fullhost, *host, *pool, *error, *id;
    Daemon startd( DT_STARTD, my_name );
    if( ! startd.locate() ) {
        dprintf( dflag, "%s\n", startd.error() );
    }
    name = startd.name();
    addr = startd.addr();
    fullhost = startd.fullHostname();
    host = startd.hostname();
    pool = startd.pool();
    error = (char*)startd.error();
    id = startd.idStr();
    dprintf( dflag, "Type: %d (%s), Name: %s, Addr: %s\n",
             (int)startd.type(), daemonString(startd.type()),
             name ? name : "(null)", addr ? addr : "(null)" );
    dprintf( dflag, "FullHost: %s, Host: %s, Pool: %s, Port: %d\n",
             fullhost ? fullhost : "(null)", host ? host : "(null)",
             pool ? pool : "(null)", startd.port() );
    // isLocal() and isFound() are member functions and must be called;
    // the bare member names here would not compile.
    dprintf( dflag, "IsLocal: %s, IsFound: %s, IdStr: %s, Error: %s\n",
             startd.isLocal() ? "Y" : "N", startd.isFound() ? "Y" : "N",
             id ? id : "(null)", error ? error : "(null)" );
    if( do_socks ) {
        testSocks( &startd );
    }
}
bool
VMProc::reportVMInfoToStartd(int cmd, const char *value)
{
    Daemon startd(DT_STARTD, NULL);
    if( !startd.locate() ) {
        dprintf(D_ALWAYS,"ERROR: %s\n", startd.error());
        return false;
    }

    char* addr = startd.addr();
    if( !addr ) {
        dprintf(D_ALWAYS,"Can't find the address of local startd\n");
        return false;
    }

    // Using udp packet
    SafeSock ssock;
    ssock.timeout( 5 ); // 5 seconds timeout
    ssock.encode();
    if( !ssock.connect(addr) ) {
        dprintf( D_ALWAYS, "Failed to connect to local startd(%s)\n", addr);
        return false;
    }

    if( !startd.startCommand(cmd, &ssock) ) {
        dprintf( D_ALWAYS, "Failed to send UDP command(%s) to local startd %s\n",
                 getCommandString(cmd), addr);
        return false;
    }

    // Send the pid of this starter
    MyString s_pid;
    s_pid += (int)daemonCore->getpid();
    char *starter_pid = strdup(s_pid.Value());
    ASSERT(starter_pid);
    ssock.code(starter_pid);

    // Send vm info
    char *vm_value = strdup(value);
    ASSERT(vm_value);
    ssock.code(vm_value);

    if( !ssock.end_of_message() ) {
        dprintf( D_FULLDEBUG, "Failed to send EOM to local startd %s\n", addr);
        free(starter_pid);
        free(vm_value);
        return false;
    }

    free(starter_pid);
    free(vm_value);
    sleep(1);
    return true;
}
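// Note on the message sent above: the datagram the local startd receives
// carries two strings, this starter's pid followed by the vm info value,
// terminated by end_of_message(). The startd side is expected to read them
// back in the same order.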
bool Defrag::drain(const ClassAd &startd_ad)
{
    std::string name;
    startd_ad.LookupString(ATTR_NAME,name);

    dprintf(D_ALWAYS,"Initiating %s draining of %s.\n",
            m_draining_schedule_str.c_str(),name.c_str());

    DCStartd startd( &startd_ad );

    int graceful_completion = 0;
    startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,graceful_completion);
    int quick_completion = 0;
    startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,quick_completion);
    int graceful_badput = 0;
    startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,graceful_badput);
    int quick_badput = 0;
    startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,quick_badput);

    time_t now = time(NULL);
    std::string draining_check_expr;
    double badput_growth_tolerance = 1.25; // for now, this is hard-coded
    int negligible_badput = 1200;
    int negligible_deadline_slippage = 1200;

    if( m_draining_schedule <= DRAIN_GRACEFUL ) {
        dprintf(D_ALWAYS,"Expected draining completion time is %ds; expected draining badput is %d cpu-seconds\n",
                (int)(graceful_completion-now),graceful_badput);
        // formatstr() is used here rather than sprintf(), because the
        // target is a std::string, not a raw character buffer.
        formatstr(draining_check_expr,"%s <= %d && %s <= %d",
                  ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,
                  graceful_completion + negligible_deadline_slippage,
                  ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,
                  (int)(badput_growth_tolerance*graceful_badput) + negligible_badput);
    }
    else { // DRAIN_FAST and DRAIN_QUICK are effectively equivalent here
        dprintf(D_ALWAYS,"Expected draining completion time is %ds; expected draining badput is %d cpu-seconds\n",
                (int)(quick_completion-now),quick_badput);
        formatstr(draining_check_expr,"%s <= %d && %s <= %d",
                  ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,
                  quick_completion + negligible_deadline_slippage,
                  ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,
                  (int)(badput_growth_tolerance*quick_badput) + negligible_badput);
    }

    std::string request_id;
    bool resume_on_completion = true;
    bool rval = startd.drainJobs( m_draining_schedule, resume_on_completion,
                                  draining_check_expr.c_str(), request_id );
    if( !rval ) {
        dprintf(D_ALWAYS,"Failed to send request to drain %s: %s\n",
                startd.name(),startd.error());
        m_stats.DrainFailures += 1;
        return false;
    }
    m_stats.DrainSuccesses += 1;
    return true;
}
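// For illustration only (the numbers are made up): with a graceful schedule,
// the check expression built above comes out roughly like
//
//   ExpectedMachineGracefulDrainingCompletion <= 1500001200 &&
//   ExpectedMachineGracefulDrainingBadput <= 7450
//
// so the startd can reject the drain request if its current estimates have
// drifted past these tolerances, guarding against stale data in the defrag
// daemon's copy of the machine ad.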
int
updateMachineAdAt( const char * const name, const char * const pool,
                   const ClassAd & update, ClassAd & reply )
{
    DCStartd startd( name, pool );
    if( ! startd.locate() ) {
        fprintf( stderr, "Unable to locate startd: %s\n", startd.error() );
        return 1;
    }

    if( ! startd.updateMachineAd( & update, & reply ) ) {
        fprintf( stderr, "Unable to update machine ad: %s\n", startd.error() );
        return 1;
    }

    return 0;
}
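// A minimal usage sketch of the function above. The function name, startd
// name, and attribute here are illustrative assumptions, not taken from the
// original source:
static int exampleDisableStarts()
{
    ClassAd update, reply;
    update.Assign( "StartJobs", false );    // attribute name is an assumption
    return updateMachineAdAt( "exec-node.example.org", NULL, update, reply );
}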
void resume()
{
    if (m_claim.empty()) {THROW_EX(ValueError, "No claim set for object.");}

    DCStartd startd(m_addr.c_str());
    startd.setClaimId(m_claim);
    compat_classad::ClassAd reply;
    bool rval;
    {
        condor::ModuleLock ml;
        rval = startd.resumeClaim(&reply, 20);
    }
    if (!rval) {THROW_EX(RuntimeError, "Startd failed to resume claim.");}
}
void deactivate(VacateType vacate_type)
{
    if (m_claim.empty()) {THROW_EX(ValueError, "No claim set for object.");}

    DCStartd startd(m_addr.c_str());
    startd.setClaimId(m_claim);
    compat_classad::ClassAd reply;
    bool rval;
    {
        condor::ModuleLock ml;
        rval = startd.deactivateClaim(vacate_type, &reply, 20);
    }
    if (!rval) {THROW_EX(RuntimeError, "Startd failed to deactivate claim.");}
}
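// Note: VacateType selects how the claim is vacated; in HTCondor this is
// VACATE_GRACEFUL or VACATE_FAST, where graceful gives the running job a
// chance to shut down cleanly before the slot is reclaimed.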
int
main( int argc, char *argv[] )
{
    myDistro->Init( argc, argv );
    config();
    dprintf_config_tool_on_error(0);
    dprintf_OnExitDumpOnErrorBuffer(stderr);

    parseArgv( argc, argv );

    DCStartd startd( target, pool );

    if( ! startd.locate() ) {
        fprintf( stderr, "ERROR: %s\n", startd.error() );
        exit( 1 );
    }

    bool rval = false;

    if( cmd == DRAIN_JOBS ) {
        std::string request_id;
        rval = startd.drainJobs( how_fast, resume_on_completion,
                                 draining_check_expr, request_id );
        if( rval ) {
            printf( "Sent request to drain %s\n", startd.name() );
            if( dash_verbose && ! request_id.empty() ) {
                printf( "\tRequest id: %s\n", request_id.c_str() );
            }
        }
    }
    else if( cmd == CANCEL_DRAIN_JOBS ) {
        rval = startd.cancelDrainJobs( cancel_request_id );
        if( rval ) {
            printf( "Sent request to cancel draining on %s\n", startd.name() );
        }
    }

    if( ! rval ) {
        fprintf( stderr, "Attempt to send %s to startd %s failed\n%s\n",
                 getCommandString(cmd), startd.addr(), startd.error() );
        return 1;
    }
    dprintf_SetExitCode(0);
    return 0;
}
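// Example invocations of this tool, assuming it is built as condor_drain
// (the flag spellings are illustrative; consult the tool's usage text):
//
//   condor_drain -pool cm.example.org exec-node.example.org
//   condor_drain -cancel exec-node.example.org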
bool Defrag::cancel_drain(const ClassAd &startd_ad)
{
    std::string name;
    startd_ad.LookupString(ATTR_NAME,name);

    // This log line previously read "Initiating %s draining of %s.",
    // apparently copy-pasted from drain(); this function cancels draining.
    dprintf(D_ALWAYS,"Cancelling draining of %s.\n",name.c_str());

    DCStartd startd( &startd_ad );

    bool rval = startd.cancelDrainJobs( NULL );
    if ( rval ) {
        dprintf(D_FULLDEBUG,"Sent request to cancel draining on %s\n",startd.name());
    }
    else {
        dprintf(D_ALWAYS,"Unable to cancel draining on %s: %s\n",
                startd.name(),startd.error());
    }
    return rval;
}
void activate(boost::python::object ad_obj)
{
    if (m_claim.empty()) {THROW_EX(ValueError, "No claim set for object.");}

    compat_classad::ClassAd ad = boost::python::extract<ClassAdWrapper>(ad_obj)();
    if (ad.find(ATTR_JOB_KEYWORD) == ad.end())
    {
        ad.InsertAttr(ATTR_HAS_JOB_AD, true);
    }

    DCStartd startd(m_addr.c_str());
    startd.setClaimId(m_claim);
    compat_classad::ClassAd reply;
    int irval;
    {
        condor::ModuleLock ml;
        irval = startd.activateClaim(&ad, &reply, 20);
    }
    if (irval != OK) {THROW_EX(RuntimeError, "Startd failed to activate claim.");}
}
void requestCOD(boost::python::object constraint_obj, int lease_duration)
{
    classad_shared_ptr<classad::ExprTree> constraint;
    boost::python::extract<std::string> constraint_extract(constraint_obj);
    if (constraint_obj.ptr() == Py_None)
    {
        // No constraint given; request an unconstrained claim.
    }
    else if (constraint_extract.check())
    {
        classad::ClassAdParser parser;
        std::string constraint_str = constraint_extract();
        classad::ExprTree *expr_tmp = NULL;
        if (!parser.ParseExpression(constraint_str, expr_tmp)) {THROW_EX(ValueError, "Failed to parse request requirements expression");}
        constraint.reset(expr_tmp);
    }
    else
    {
        constraint.reset(convert_python_to_exprtree(constraint_obj));
    }

    compat_classad::ClassAd ad, reply;
    if (constraint.get())
    {
        classad::ExprTree *expr_tmp = constraint->Copy();
        ad.Insert(ATTR_REQUIREMENTS, expr_tmp);
    }
    ad.InsertAttr(ATTR_JOB_LEASE_DURATION, lease_duration);

    bool rval;
    DCStartd startd(m_addr.c_str());
    {
        condor::ModuleLock ml;
        rval = startd.requestClaim(CLAIM_COD, &ad, &reply, 20);
    }
    if (!rval) {THROW_EX(RuntimeError, "Failed to request claim from startd.");}
    if (!reply.EvaluateAttrString(ATTR_CLAIM_ID, m_claim)) {THROW_EX(RuntimeError, "Startd did not return a ClaimId.");}
}
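// A rough sketch of the COD claim lifecycle these bindings wrap, written
// directly against DCStartd. It assumes the same headers as the code above;
// the address, timeout, and ad contents are illustrative assumptions:
static bool exampleCodLifecycle(const std::string &startd_addr)
{
    DCStartd startd(startd_addr.c_str());
    compat_classad::ClassAd request_ad, reply;

    // 1. Request a claim; the startd hands back a ClaimId on success.
    if (!startd.requestClaim(CLAIM_COD, &request_ad, &reply, 20)) return false;
    std::string claim;
    if (!reply.EvaluateAttrString(ATTR_CLAIM_ID, claim)) return false;
    startd.setClaimId(claim);

    // 2. Activate the claim with a job ad, then release it when done.
    compat_classad::ClassAd job_ad;
    job_ad.InsertAttr(ATTR_HAS_JOB_AD, true);
    if (startd.activateClaim(&job_ad, &reply, 20) != OK) return false;
    return startd.releaseClaim(VACATE_GRACEFUL, &reply, 20);
}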
int
main( int argc, char *argv[] )
{
#ifndef WIN32
    // Ignore SIGPIPE so if we cannot connect to a daemon we do not
    // blow up with a sig 13.
    install_sig_handler(SIGPIPE, SIG_IGN );
#endif

    myDistro->Init( argc, argv );
    config();

    cmd = getCommandFromArgv( argc, argv );
    parseArgv( argc, argv );

    DCStartd startd( target, pool ? pool->addr() : NULL );

    if( needs_id ) {
        assert( claim_id );
        startd.setClaimId( claim_id );
    }

    if( ! startd.locate() ) {
        fprintf( stderr, "ERROR: %s\n", startd.error() );
        exit( 1 );
    }

    bool rval = false;
    int irval = NOT_OK; // initialized in case no case below sets it
    ClassAd reply;
    ClassAd ad;

    switch( cmd ) {
    case CA_REQUEST_CLAIM:
        fillRequestAd( &ad );
        rval = startd.requestClaim( CLAIM_COD, &ad, &reply, timeout );
        break;
    case CA_ACTIVATE_CLAIM:
        fillActivateAd( &ad );
        irval = startd.activateClaim( &ad, &reply, timeout );
        rval = (irval == OK);
        break;
    case CA_SUSPEND_CLAIM:
        rval = startd.suspendClaim( &reply, timeout );
        break;
    case CA_RESUME_CLAIM:
        rval = startd.resumeClaim( &reply, timeout );
        break;
    case CA_DEACTIVATE_CLAIM:
        rval = startd.deactivateClaim( vacate_type, &reply, timeout );
        break;
    case CA_RELEASE_CLAIM:
        rval = startd.releaseClaim( vacate_type, &reply, timeout );
        break;
    case CA_RENEW_LEASE_FOR_CLAIM:
        rval = startd.renewLeaseForClaim( &reply, timeout );
        break;
    case DELEGATE_GSI_CRED_STARTD:
        irval = startd.delegateX509Proxy( proxy_file, 0, NULL );
        rval = (irval == OK);
        break;
    }

    if( ! rval ) {
        fprintf( stderr, "Attempt to send %s to startd %s failed\n%s\n",
                 getCommandString(cmd), startd.addr(), startd.error() );
        return 1;
    }

    printOutput( &reply, &startd );
    return 0;
}
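// Example session with this tool, assuming it is built as condor_cod (the
// subcommand spellings, hostname, and claim id are illustrative):
//
//   condor_cod request -name exec-node.example.org      # CA_REQUEST_CLAIM
//   condor_cod activate -id "<claim-id>" ...            # CA_ACTIVATE_CLAIM
//   condor_cod release -id "<claim-id>"                 # CA_RELEASE_CLAIM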
int
part_send_job( int test_starter, char *host, int &reason, char *capability,
               char * /*schedd*/, PROC *proc, int &sd1, int &sd2, char **name )
{
    int reply;
    ReliSock *sock = NULL;
    StartdRec stRec;
    PORTS ports;
    bool done = false;
    int retry_delay = 3;
    int num_retries = 0;

    // make sure we have the job classad
    InitJobAd(proc->id.cluster, proc->id.proc);

    while( !done ) {
        Daemon startd(DT_STARTD, host, NULL);
        if (!(sock = (ReliSock*)startd.startCommand( ACTIVATE_CLAIM, Stream::reli_sock, 90))) {
            dprintf( D_ALWAYS, "startCommand(ACTIVATE_CLAIM) to startd failed\n");
            goto returnfailure;
        }

        // Send the capability
        ClaimIdParser idp( capability );
        dprintf(D_FULLDEBUG, "send capability %s\n", idp.publicClaimId() );
        if( !sock->put_secret(capability) ) {
            dprintf( D_ALWAYS, "sock->put(\"%s\") failed\n", idp.publicClaimId());
            goto returnfailure;
        }

        // Send the starter number
        if( test_starter ) {
            dprintf( D_ALWAYS, "Requesting Alternate Starter %d\n", test_starter );
        } else {
            dprintf( D_ALWAYS, "Requesting Primary Starter\n" );
        }
        if( !sock->code(test_starter) ) {
            dprintf( D_ALWAYS, "sock->code(%d) failed\n", test_starter );
            goto returnfailure;
        }

        // Send the job info
        if( !JobAd->put(*sock) ) {
            dprintf( D_ALWAYS, "failed to send job ad\n" );
            goto returnfailure;
        }

        if( !sock->end_of_message() ) {
            dprintf( D_ALWAYS, "failed to send message to startd\n" );
            goto returnfailure;
        }

        // We're done sending. Now, get the reply.
        sock->decode();
        if( !sock->code(reply) || !sock->end_of_message() ) {
            dprintf( D_ALWAYS, "failed to receive reply from startd\n" );
            goto returnfailure;
        }

        switch( reply ) {
        case OK:
            dprintf( D_ALWAYS, "Shadow: Request to run a job was ACCEPTED\n" );
            done = true;
            break;

        case NOT_OK:
            dprintf( D_ALWAYS, "Shadow: Request to run a job was REFUSED\n" );
            goto returnfailure;
            break;

        case CONDOR_TRY_AGAIN:
            num_retries++;
            dprintf( D_ALWAYS, "Shadow: Request to run a job was TEMPORARILY REFUSED\n" );
            if( num_retries > 20 ) {
                dprintf( D_ALWAYS, "Shadow: Too many retries, giving up.\n" );
                goto returnfailure;
            }
            delete sock;
            dprintf( D_ALWAYS, "Shadow: will try again in %d seconds\n", retry_delay );
            sleep( retry_delay );
            break;

        default:
            dprintf( D_ALWAYS, "Unknown reply from startd for command ACTIVATE_CLAIM\n" );
            dprintf( D_ALWAYS, "Shadow: Request to run a job was REFUSED\n" );
            goto returnfailure;
            break;
        }
    }

    /* start flock : dhruba */
    sock->decode();
    memset( &stRec, '\0', sizeof(stRec) );
    if( !sock->code(stRec) || !sock->end_of_message() ) {
        dprintf(D_ALWAYS, "Can't read reply from startd.\n");
        goto returnfailure;
    }
    ports = stRec.ports;
    if( stRec.ip_addr ) {
        host = stRec.server_name;
        if(name) {
            *name = strdup(stRec.server_name);
        }
        dprintf(D_FULLDEBUG, "host = %s inet_addr = 0x%x port1 = %d port2 = %d\n",
                host, stRec.ip_addr, ports.port1, ports.port2 );
    } else {
        dprintf(D_FULLDEBUG, "host = %s port1 = %d port2 = %d\n",
                host, ports.port1, ports.port2 );
    }

    if( ports.port1 == 0 ) {
        dprintf( D_ALWAYS, "Shadow: Request to run a job on %s was REFUSED\n", host );
        goto returnfailure;
    }
    /* end flock ; dhruba */

    // We don't use the server_name in the StartdRec, because our
    // DNS query may fail or may give us the wrong IP address
    // (either because it's stale or because we're talking to a
    // machine with multiple network interfaces). Sadly, we can't
    // use the ip_addr either, because the startd doesn't send it in
    // the correct byte ordering on little-endian machines. So, we
    // grab the IP address from the ReliSock, since we know the
    // startd always uses the same IP address for all of its
    // communication.
    char sinfulstring[SINFUL_STRING_BUF_SIZE];

    generate_sinful(sinfulstring, SINFUL_STRING_BUF_SIZE, sock->peer_ip_str(), ports.port1);
    if( (sd1 = do_connect(sinfulstring, (char *)0, (u_short)ports.port1)) < 0 ) {
        dprintf( D_ALWAYS, "failed to connect to scheduler on %s\n", sinfulstring );
        goto returnfailure;
    }

    generate_sinful(sinfulstring, SINFUL_STRING_BUF_SIZE, sock->peer_ip_str(), ports.port2);
    if( (sd2 = do_connect(sinfulstring, (char *)0, (u_short)ports.port2)) < 0 ) {
        dprintf( D_ALWAYS, "failed to connect to scheduler on %s\n", sinfulstring );
        close(sd1);
        goto returnfailure;
    }

    delete sock;
    sock = NULL;

    if ( stRec.server_name ) {
        free( stRec.server_name );
    }
    return 0;

returnfailure:
    reason = JOB_NOT_STARTED;
    delete sock;
    return -1;
}
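// For reference: generate_sinful() above formats the peer address as an
// HTCondor "sinful string" of the form "<ip:port>", e.g. "<192.168.1.10:40000>"
// (the address shown is illustrative), which do_connect() then uses to reach
// the two ports the startd handed back.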