bool DCCollector::finishUpdate( DCCollector *self, Sock* sock, ClassAd* ad1, ClassAd* ad2 ) { // This is a static function so that we can call it from a // nonblocking startCommand() callback without worrying about // longevity of the DCCollector instance. sock->encode(); if( ad1 && ! putClassAd(sock, *ad1) ) { if(self) { self->newError( CA_COMMUNICATION_ERROR, "Failed to send ClassAd #1 to collector" ); } return false; } if( ad2 && ! putClassAd(sock, *ad2) ) { if(self) { self->newError( CA_COMMUNICATION_ERROR, "Failed to send ClassAd #2 to collector" ); return false; } } if( ! sock->end_of_message() ) { if(self) { self->newError( CA_COMMUNICATION_ERROR, "Failed to send EOM to collector" ); } return false; } return true; }
// Send a go-ahead decision to the requester over m_sock.
//
//   go_ahead - decision to send; stored in the message as an integer under
//              ATTR_RESULT
//   reason   - optional text stored under ATTR_ERROR_STRING; may be NULL
//
// Returns true and records the go-ahead time if the message was written,
// false (after logging) otherwise.
bool
TransferQueueRequest::SendGoAhead(XFER_QUEUE_ENUM go_ahead,char const *reason)
{
    ASSERT( m_sock );

    m_sock->encode();

    ClassAd reply;
    reply.Assign(ATTR_RESULT, static_cast<int>(go_ahead));
    if( reason ) {
        reply.Assign(ATTR_ERROR_STRING, reason);
    }

    // How often transfer processes should send a report of I/O activity;
    // 0 means never.
    int io_report_period = param_integer("TRANSFER_IO_REPORT_INTERVAL",10,0);
    reply.Assign(ATTR_REPORT_INTERVAL, io_report_period);

    bool delivered = putClassAd( m_sock, reply ) && m_sock->end_of_message();
    if( !delivered ) {
        dprintf(D_ALWAYS,
                "TransferQueueRequest: failed to send GoAhead to %s\n",
                Description() );
        return false;
    }

    m_gave_go_ahead = true;
    m_time_go_ahead = time(NULL);
    return true;
}
// Write the body of a claim request to the startd.
//
// Sends, in order: the claim id (as a secret), the job ad, the scheduler
// address, the alive interval, and any extra claims.  The caller is
// responsible for end_of_message().
//
// Returns true on success; on failure logs, calls sockFailed() and
// returns false.
bool
ClaimStartdMsg::writeMsg( DCMessenger * /*messenger*/, Sock *sock )
{
    // Remember the peer's identity and address (used for hole punching).
    m_startd_fqu = sock->getFullyQualifiedUser();
    m_startd_ip_addr = sock->peer_ip_str();

    // Advertise in the request ad that this schedd understands the newer
    // protocol in which the claim response may carry leftover resources
    // from a partitionable slot.
    const bool send_leftovers = param_boolean("CLAIM_PARTITIONABLE_LEFTOVERS",true);
    m_job_ad.Assign("_condor_SEND_LEFTOVERS", send_leftovers);

    // Likewise for the protocol in which the claim response may carry the
    // ad and claim id of the partner of a paired slot.
    const bool send_paired_slot = param_boolean("CLAIM_PAIRED_SLOT",true);
    m_job_ad.Assign("_condor_SEND_PAIRED_SLOT", send_paired_slot);

    const bool encoded =
        sock->put_secret( m_claim_id.c_str() ) &&
        putClassAd( sock, m_job_ad ) &&
        sock->put( m_scheduler_addr.c_str() ) &&
        sock->put( m_alive_interval ) &&
        this->putExtraClaims(sock);

    if( encoded ) {
        // end_of_message() is done by caller
        return true;
    }

    dprintf(failureDebugLevel(),
            "Couldn't encode request claim to startd %s\n",
            description() );
    sockFailed( sock );
    return false;
}
// Forward a client's reverse-connect request to the target daemon.
//
// Builds a CCB_REQUEST message describing the request and writes it to the
// target's socket.  If the write fails, the request is finished with an
// error.
void
CCBServer::ForwardRequestToTarget( CCBServerRequest *request, CCBTarget *target )
{
    Sock *target_sock = target->getSock();

    ClassAd forward_ad;
    forward_ad.Assign( ATTR_COMMAND, CCB_REQUEST );
    forward_ad.Assign( ATTR_MY_ADDRESS, request->getReturnAddr() );
    forward_ad.Assign( ATTR_CLAIM_ID, request->getConnectID() );
    // for easier debugging
    forward_ad.Assign( ATTR_NAME, request->getSock()->peer_description() );

    MyString request_id_text;
    CCBIDToString( request->getRequestID(), request_id_text );
    forward_ad.Assign( ATTR_REQUEST_ID, request_id_text );

    target_sock->encode();
    const bool forwarded =
        putClassAd( target_sock, forward_ad ) && target_sock->end_of_message();

    if( forwarded ) {
        // Now wait for target to respond (HandleRequestResultsMsg).
        // We will get the response next time we poll the socket.
        // To get a faster response, we _could_ register the socket
        // now, if it has not already been registered.
        return;
    }

    dprintf(D_ALWAYS,
            "CCB: failed to forward request id %lu from %s to target "
            "daemon %s with ccbid %lu\n",
            request->getRequestID(),
            request->getSock()->peer_description(),
            target->getSock()->peer_description(),
            target->getCCBID());

    RequestFinished( request, false, "failed to forward request to target" );
}
// Write this message's ClassAd (m_msg) to sock.
// Returns true on success; on failure calls sockFailed() and returns false.
bool
ClassAdMsg::writeMsg( DCMessenger * /*messenger*/, Sock *sock )
{
    if( putClassAd( sock, m_msg ) ) {
        return true;
    }

    sockFailed( sock );
    return false;
}
bool DCStartd::drainJobs(int how_fast,bool resume_on_completion,char const *check_expr,char const *start_expr,std::string &request_id) { std::string error_msg; ClassAd request_ad; Sock *sock = startCommand( DRAIN_JOBS, Sock::reli_sock, 20 ); if( !sock ) { formatstr(error_msg,"Failed to start DRAIN_JOBS command to %s",name()); newError(CA_FAILURE,error_msg.c_str()); return false; } request_ad.Assign(ATTR_HOW_FAST,how_fast); request_ad.Assign(ATTR_RESUME_ON_COMPLETION,resume_on_completion); if( check_expr ) { request_ad.AssignExpr(ATTR_CHECK_EXPR,check_expr); } if( start_expr ) { request_ad.AssignExpr(ATTR_START_EXPR,start_expr); } if( !putClassAd(sock, request_ad) || !sock->end_of_message() ) { formatstr(error_msg,"Failed to compose DRAIN_JOBS request to %s",name()); newError(CA_FAILURE,error_msg.c_str()); delete sock; return false; } sock->decode(); ClassAd response_ad; if( !getClassAd(sock, response_ad) || !sock->end_of_message() ) { formatstr(error_msg,"Failed to get response to DRAIN_JOBS request to %s",name()); newError(CA_FAILURE,error_msg.c_str()); delete sock; return false; } response_ad.LookupString(ATTR_REQUEST_ID,request_id); bool result = false; int error_code = 0; response_ad.LookupBool(ATTR_RESULT,result); if( !result ) { std::string remote_error_msg; response_ad.LookupString(ATTR_ERROR_STRING,remote_error_msg); response_ad.LookupInteger(ATTR_ERROR_CODE,error_code); formatstr(error_msg, "Received failure from %s in response to DRAIN_JOBS request: error code %d: %s", name(),error_code,remote_error_msg.c_str()); newError(CA_FAILURE,error_msg.c_str()); delete sock; return false; } delete sock; return true; }
// The transferd only needs a small amount of the information from this // transfer request seriazed to it from the schedd. Basically the m_ip // classad which represents the header information for this treq, and // the job ads. int TransferRequest::put(Stream *sock) { ClassAd *ad = NULL; sock->encode(); // shove the internal header classad across putClassAd(sock, *m_ip); sock->end_of_message(); // now dump all of the jobads through m_todo_ads.Rewind(); while(m_todo_ads.Next(ad)) { putClassAd(sock, *ad); sock->end_of_message(); } return TRUE; }
bool DCStarter::createJobOwnerSecSession(int timeout,char const *job_claim_id,char const *starter_sec_session,char const *session_info,MyString &owner_claim_id,MyString &error_msg,MyString &starter_version,MyString &starter_addr) { ReliSock sock; if (IsDebugLevel(D_COMMAND)) { dprintf (D_COMMAND, "DCStarter::createJobOwnerSecSession(%s,...) making connection to %s\n", getCommandStringSafe(CREATE_JOB_OWNER_SEC_SESSION), _addr ? _addr : "NULL"); } if( !connectSock(&sock, timeout, NULL) ) { error_msg = "Failed to connect to starter"; return false; } if( !startCommand(CREATE_JOB_OWNER_SEC_SESSION, &sock,timeout,NULL,NULL,false,starter_sec_session) ) { error_msg = "Failed to send CREATE_JOB_OWNER_SEC_SESSION to starter"; return false; } ClassAd input; input.Assign(ATTR_CLAIM_ID,job_claim_id); input.Assign(ATTR_SESSION_INFO,session_info); sock.encode(); if( !putClassAd(&sock, input) || !sock.end_of_message() ) { error_msg = "Failed to compose CREATE_JOB_OWNER_SEC_SESSION to starter"; return false; } sock.decode(); ClassAd reply; if( !getClassAd(&sock, reply) || !sock.end_of_message() ) { error_msg = "Failed to get response to CREATE_JOB_OWNER_SEC_SESSION from starter"; return false; } bool success = false; reply.LookupBool(ATTR_RESULT,success); if( !success ) { reply.LookupString(ATTR_ERROR_STRING,error_msg); return false; } reply.LookupString(ATTR_CLAIM_ID,owner_claim_id); reply.LookupString(ATTR_VERSION,starter_version); // get the full starter address from the starter in case it contains // extra CCB info that we don't already know about reply.LookupString(ATTR_STARTER_IP_ADDR,starter_addr); return true; }
// Write the body of a claim swap request: the claim id (as a secret)
// followed by the options ad.  The caller is responsible for
// end_of_message().
//
// Returns true on success; on failure logs, calls sockFailed() and
// returns false.
bool
SwapClaimsMsg::writeMsg( DCMessenger * /*messenger*/, Sock *sock )
{
    const bool encoded =
        sock->put_secret(m_claim_id.c_str()) && putClassAd(sock, m_opts);

    if( encoded ) {
        // end_of_message() is done by caller
        return true;
    }

    dprintf(failureDebugLevel(),
            "Couldn't encode claim swap request to startd %s\n",
            description() );
    sockFailed(sock);
    return false;
}
bool DCStartd::cancelDrainJobs(char const *request_id) { std::string error_msg; ClassAd request_ad; Sock *sock = startCommand( CANCEL_DRAIN_JOBS, Sock::reli_sock, 20 ); if( !sock ) { formatstr(error_msg,"Failed to start CANCEL_DRAIN_JOBS command to %s",name()); newError(CA_FAILURE,error_msg.c_str()); return false; } if( request_id ) { request_ad.Assign(ATTR_REQUEST_ID,request_id); } if( !putClassAd(sock, request_ad) || !sock->end_of_message() ) { formatstr(error_msg,"Failed to compose CANCEL_DRAIN_JOBS request to %s",name()); newError(CA_FAILURE,error_msg.c_str()); return false; } sock->decode(); ClassAd response_ad; if( !getClassAd(sock, response_ad) || !sock->end_of_message() ) { formatstr(error_msg,"Failed to get response to CANCEL_DRAIN_JOBS request to %s",name()); newError(CA_FAILURE,error_msg.c_str()); delete sock; return false; } bool result = false; int error_code = 0; response_ad.LookupBool(ATTR_RESULT,result); if( !result ) { std::string remote_error_msg; response_ad.LookupString(ATTR_ERROR_STRING,remote_error_msg); response_ad.LookupInteger(ATTR_ERROR_CODE,error_code); formatstr(error_msg, "Received failure from %s in response to CANCEL_DRAIN_JOBS request: error code %d: %s", name(),error_code,remote_error_msg.c_str()); newError(CA_FAILURE,error_msg.c_str()); delete sock; return false; } delete sock; return true; }
// Write a ClassAd message to the CCB server over m_sock.
//
// Returns false without writing if there is no socket or a connect is
// still pending.  If the write fails, Disconnected() is called and false
// is returned.
bool
CCBListener::WriteMsgToCCB(ClassAd &msg)
{
    const bool can_write = m_sock && !m_waiting_for_connect;
    if( !can_write ) {
        return false;
    }

    m_sock->encode();
    if( putClassAd( m_sock, msg ) && m_sock->end_of_message() ) {
        return true;
    }

    Disconnected();
    return false;
}
// The negotiator wants us to send it a job.
//
// If no current job is selected and nextJob() finds none, NO_MORE_JOBS is
// sent and negotiation is marked finished.  Otherwise JOB_INFO and the
// current job ad are sent and the per-job resource counters are reset.
//
// Returns false if anything could not be written to sock.
bool
ScheddNegotiate::sendJobInfo(Sock *sock)
{
    sock->encode();

    const bool out_of_jobs = ( m_current_job_id.cluster == -1 ) && !nextJob();
    if( out_of_jobs ) {
        if( sock->snd_int(NO_MORE_JOBS,TRUE) ) {
            m_negotiation_finished = true;
            return true;
        }
        dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" );
        return false;
    }

    if( !sock->put(JOB_INFO) ) {
        dprintf( D_ALWAYS, "Can't send JOB_INFO to mgr\n" );
        return false;
    }

    // request match diagnostics
    m_current_job_ad.Assign(ATTR_WANT_MATCH_DIAGNOSTICS, true);

    // Send the ad to the negotiator
    if( !putClassAd(sock, m_current_job_ad) ) {
        dprintf( D_ALWAYS, "Can't send job ad to mgr\n" );
        sock->end_of_message();
        return false;
    }
    if( !sock->end_of_message() ) {
        dprintf( D_ALWAYS, "Can't send job eom to mgr\n" );
        return false;
    }

    // Defaults to one requested resource unless the job ad says otherwise.
    m_current_resources_delivered = 0;
    m_current_resources_requested = 1;
    m_current_job_ad.LookupInteger(ATTR_RESOURCE_REQUEST_COUNT,m_current_resources_requested);

    dprintf( D_FULLDEBUG,
             "Sent job %d.%d (autocluster=%d resources_requested=%d) to the negotiator\n",
             m_current_job_id.cluster, m_current_job_id.proc,
             m_current_auto_cluster_id, m_current_resources_requested );
    return true;
}
// Callback invoked once a reverse connection attempt has completed.
//
// The message ad describing the request is fetched from daemonCore's data
// pointer.  If the socket is connected, a CCB_REVERSE_CONNECT command and
// the message ad are written to it and the socket is handed to daemonCore;
// otherwise the failure is reported.  The message ad is always deleted and
// the reference taken when the callback was set up is dropped.
int
CCBListener::ReverseConnected(Stream *stream)
{
    Sock *sock = (Sock *)stream;
    ClassAd *msg_ad = (ClassAd *)daemonCore->GetDataPtr();
    ASSERT( msg_ad );

    if( sock ) {
        daemonCore->Cancel_Socket( sock );
    }

    char const *failure_reason = NULL;
    if( !sock || !sock->is_connected() ) {
        failure_reason = "failed to connect";
    }
    else {
        // The reverse-connect protocol is designed to look like a
        // raw cedar command, in case the thing we are connecting
        // to is a cedar command socket.
        sock->encode();
        int cmd = CCB_REVERSE_CONNECT;
        const bool written =
            sock->put(cmd) &&
            putClassAd( sock, *msg_ad ) &&
            sock->end_of_message();
        if( !written ) {
            failure_reason = "failure writing reverse connect command";
        }
    }

    if( failure_reason ) {
        ReportReverseConnectResult(msg_ad,false,failure_reason);
    }
    else {
        ((ReliSock*)sock)->isClient(false);
        daemonCore->HandleReqAsync(sock);
        sock = NULL; // daemonCore took ownership of sock
        ReportReverseConnectResult(msg_ad,true);
    }

    delete msg_ad;
    delete sock;   // no-op when ownership was handed off above
    decRefCount(); // we incremented ref count when setting up callback

    return KEEP_STREAM;
}
void CCBServer::RequestReply( Sock *sock, bool success, char const *error_msg, CCBID request_cid, CCBID target_cid ) { if( success && sock->readReady() ) { // the client must have disconnected (which is expected if // the client has already received the reversed connection) return; } ClassAd msg; msg.Assign( ATTR_RESULT, success ); msg.Assign( ATTR_ERROR_STRING, error_msg ); sock->encode(); if( !putClassAd( sock, msg ) || !sock->end_of_message() ) { // Would like to be completely quiet if success and the // client has disconnected, since this is normal; however, // the above write operations will generate noise when // they fail, so at least in FULLDEBUG, we explain what's // going on. Note that most of the time, we should not get // here for successful requests, because we either observe // the client disconnect earlier, or the above check on // the socket catches it. Why bother sending a reply on // success at all? Because if the client has not yet // seen the reverse connect and we just disconnect without // telling it the request was successful, then it will // think something has gone wrong. dprintf(success ? D_FULLDEBUG : D_ALWAYS, "CCB: failed to send result (%s) for request id %lu " "from %s requesting a reversed connection to target daemon " "with ccbid %lu: %s %s\n", success ? "request succeeded" : "request failed", request_cid, sock->peer_description(), target_cid, error_msg, success ? "(since the request was successful, it is expected " "that the client may disconnect before receiving " "results)" : "" ); } }
// Reply to a target daemon's heartbeat with an ALIVE message.
// If the reply cannot be written, the target is removed.
void
CCBServer::SendHeartbeatResponse( CCBTarget *target )
{
    Sock *target_sock = target->getSock();

    ClassAd alive_ad;
    alive_ad.Assign( ATTR_COMMAND, ALIVE );

    target_sock->encode();
    const bool sent = putClassAd( target_sock, alive_ad ) && target_sock->end_of_message();

    if( sent ) {
        dprintf(D_FULLDEBUG,"CCB: sent heartbeat to target %s\n",
                target_sock->peer_description());
        return;
    }

    dprintf(D_ALWAYS,
            "CCB: failed to send heartbeat to target "
            "daemon %s with ccbid %lu\n",
            target->getSock()->peer_description(),
            target->getCCBID());

    RemoveTarget( target );
}
// Remote call over qmgmt_sock: send the CONDOR_SendSpoolFileIfNeeded call
// number and the given ad, then read back the integer result.
//
// When the result is negative, the remote errno is read as well and
// copied into errno before returning.  Each wire operation is wrapped in
// neg_on_error(), which is defined elsewhere.
// NOTE(review): neg_on_error presumably returns a negative value from
// this function when the wrapped call fails -- confirm against the macro.
int
SendSpoolFileIfNeeded( ClassAd& ad )
{
    int rval = -1;

    CurrentSysCall = CONDOR_SendSpoolFileIfNeeded;

    // Request: call number, ad, end of message.
    qmgmt_sock->encode();
    neg_on_error( qmgmt_sock->code(CurrentSysCall) );
    neg_on_error( putClassAd(qmgmt_sock, ad) );
    neg_on_error( qmgmt_sock->end_of_message() );

    // Reply: result, then the remote errno only if the call failed.
    qmgmt_sock->decode();
    neg_on_error( qmgmt_sock->code(rval) );

    const bool remote_failed = ( rval < 0 );
    if( remote_failed ) {
        neg_on_error( qmgmt_sock->code(terrno) );
    }
    neg_on_error( qmgmt_sock->end_of_message() );

    if( remote_failed ) {
        errno = terrno;
    }
    return rval;
}
// I'm going to ask the schedd for where I can put the files for the jobs I've // specified. The schedd is going to respond with A) a message telling me it // has the answer right away, or B) an answer telling me I have to wait // an unknown length of time for the schedd to schedule me a place to put it. bool DCSchedd::requestSandboxLocation(ClassAd *reqad, ClassAd *respad, CondorError * errstack) { ReliSock rsock; int will_block; ClassAd status_ad; rsock.timeout(20); // years of research... :) if( ! rsock.connect(_addr) ) { dprintf( D_ALWAYS, "DCSchedd::requestSandboxLocation(): " "Failed to connect to schedd (%s)\n", _addr ); return false; } if( ! startCommand(REQUEST_SANDBOX_LOCATION, (Sock*)&rsock, 0, errstack) ) { dprintf( D_ALWAYS, "DCSchedd::requestSandboxLocation(): " "Failed to send command (REQUEST_SANDBOX_LOCATION) " "to schedd (%s)\n", _addr ); return false; } // First, if we're not already authenticated, force that now. if (!forceAuthentication( &rsock, errstack )) { dprintf( D_ALWAYS, "DCSchedd: authentication failure: %s\n", errstack->getFullText().c_str() ); return false; } rsock.encode(); /////////////////////////////////////////////////////////////////////// // Send my sandbox location request packet to the schedd. 
/////////////////////////////////////////////////////////////////////// // This request ad will either contain: // ATTR_TREQ_PEER_VERSION // ATTR_TREQ_HAS_CONSTRAINT // ATTR_TREQ_JOBID_LIST // ATTR_TREQ_FTP // // OR // // ATTR_TREQ_DIRECTION // ATTR_TREQ_PEER_VERSION // ATTR_TREQ_HAS_CONSTRAINT // ATTR_TREQ_CONSTRAINT // ATTR_TREQ_FTP dprintf(D_ALWAYS, "Sending request ad.\n"); if (putClassAd(&rsock, *reqad) != 1) { dprintf(D_ALWAYS,"DCSchedd:requestSandboxLocation(): " "Can't send reqad to the schedd\n"); return false; } rsock.end_of_message(); rsock.decode(); /////////////////////////////////////////////////////////////////////// // Read back a response ad which will tell me which jobs the schedd // said I could modify and whether or not I'm am going to have to block // before getting the payload of the transferd location/capability ad. /////////////////////////////////////////////////////////////////////// // This status ad will contain // ATTR_TREQ_INVALID_REQUEST (set to true) // ATTR_TREQ_INVALID_REASON // // OR // ATTR_TREQ_INVALID_REQUEST (set to false) // ATTR_TREQ_JOBID_ALLOW_LIST // ATTR_TREQ_JOBID_DENY_LIST // ATTR_TREQ_WILL_BLOCK dprintf(D_ALWAYS, "Receiving status ad.\n"); if (getClassAd(&rsock, status_ad) == false) { dprintf(D_ALWAYS, "Schedd closed connection to me. Aborting sandbox " "submission.\n"); return false; } rsock.end_of_message(); status_ad.LookupInteger(ATTR_TREQ_WILL_BLOCK, will_block); dprintf(D_ALWAYS, "Client will %s\n", will_block==1?"block":"not block"); if (will_block == 1) { // set to 20 minutes. rsock.timeout(60*20); } /////////////////////////////////////////////////////////////////////// // Read back the payload ad from the schedd about the transferd location // and capability string I can use for the fileset I wish to transfer. /////////////////////////////////////////////////////////////////////// // read back the response ad from the schedd which contains a // td sinful string, and a capability. 
These represent my ability to // read/write a certain fileset somewhere. // This response ad from the schedd will contain: // // ATTR_TREQ_INVALID_REQUEST (set to true) // ATTR_TREQ_INVALID_REASON // // OR // // ATTR_TREQ_INVALID_REQUEST (set to false) // ATTR_TREQ_CAPABILITY // ATTR_TREQ_TD_SINFUL // ATTR_TREQ_JOBID_ALLOW_LIST dprintf(D_ALWAYS, "Receiving response ad.\n"); if (getClassAd(&rsock, *respad) != true) { dprintf(D_ALWAYS,"DCSchedd:requestSandboxLocation(): " "Can't receive respond ad from the schedd\n"); return false; } rsock.end_of_message(); return true; }
int DCStartd::activateClaim( ClassAd* job_ad, int starter_version, ReliSock** claim_sock_ptr ) { int reply; dprintf( D_FULLDEBUG, "Entering DCStartd::activateClaim()\n" ); setCmdStr( "activateClaim" ); if( claim_sock_ptr ) { // our caller wants a pointer to the socket we used to // successfully activate the claim. right now, set it to // NULL to signify error, and if everything works out, // we'll give them a pointer to the real object. *claim_sock_ptr = NULL; } if( ! claim_id ) { newError( CA_INVALID_REQUEST, "DCStartd::activateClaim: called with NULL claim_id, failing" ); return CONDOR_ERROR; } // if this claim is associated with a security session ClaimIdParser cidp(claim_id); char const *sec_session = cidp.secSessionId(); Sock* tmp; tmp = startCommand( ACTIVATE_CLAIM, Stream::reli_sock, 20, NULL, NULL, false, sec_session ); if( ! tmp ) { newError( CA_COMMUNICATION_ERROR, "DCStartd::activateClaim: Failed to send command ACTIVATE_CLAIM to the startd" ); return CONDOR_ERROR; } if( ! tmp->put_secret(claim_id) ) { newError( CA_COMMUNICATION_ERROR, "DCStartd::activateClaim: Failed to send ClaimId to the startd" ); delete tmp; return CONDOR_ERROR; } if( ! tmp->code(starter_version) ) { newError( CA_COMMUNICATION_ERROR, "DCStartd::activateClaim: Failed to send starter_version to the startd" ); delete tmp; return CONDOR_ERROR; } if( ! putClassAd(tmp, *job_ad) ) { newError( CA_COMMUNICATION_ERROR, "DCStartd::activateClaim: Failed to send job ClassAd to the startd" ); delete tmp; return CONDOR_ERROR; } if( ! tmp->end_of_message() ) { newError( CA_COMMUNICATION_ERROR, "DCStartd::activateClaim: Failed to send EOM to the startd" ); delete tmp; return CONDOR_ERROR; } // Now, try to get the reply tmp->decode(); if( !tmp->code(reply) || !tmp->end_of_message()) { std::string err = "DCStartd::activateClaim: "; err += "Failed to receive reply from "; err += _addr ? 
_addr : "NULL"; newError( CA_COMMUNICATION_ERROR, err.c_str() ); delete tmp; return CONDOR_ERROR; } dprintf( D_FULLDEBUG, "DCStartd::activateClaim: " "successfully sent command, reply is: %d\n", reply ); if( reply == OK && claim_sock_ptr ) { *claim_sock_ptr = (ReliSock*)tmp; } else { // in any other case, we're going to leak this ReliSock // object if we don't delete it here... delete tmp; } return reply; }
bool DCTransferQueue::RequestTransferQueueSlot(bool downloading,filesize_t sandbox_size,char const *fname,char const *jobid,char const *queue_user,int timeout,MyString &error_desc) { ASSERT(fname); ASSERT(jobid); if( GoAheadAlways( downloading ) ) { m_xfer_downloading = downloading; m_xfer_fname = fname; m_xfer_jobid = jobid; return true; } CheckTransferQueueSlot(); if( m_xfer_queue_sock ) { // A request has already been made. // Currently, this is a no-op, because any upload/download slot // is as good as any other. In the future, there may be // different queues for different paths. ASSERT( m_xfer_downloading == downloading ); m_xfer_fname = fname; m_xfer_jobid = jobid; return true; } time_t started = time(NULL); CondorError errstack; // Our caller has to finish this operation in the specified // amount of time or risk not responding to the file transfer // peer in time, so ignore the timeout multiplier and set the // timeout exactly as specified. m_xfer_queue_sock = reliSock( timeout, 0, &errstack, false, true ); if( !m_xfer_queue_sock ) { formatstr(m_xfer_rejected_reason, "Failed to connect to transfer queue manager for job %s (%s): %s.", jobid, fname, errstack.getFullText().c_str() ); error_desc = m_xfer_rejected_reason; dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str()); return false; } if( timeout ) { timeout -= time(NULL)-started; if( timeout <= 0 ) { timeout = 1; } } if (IsDebugLevel(D_COMMAND)) { int cmd = TRANSFER_QUEUE_REQUEST; dprintf (D_COMMAND, "DCTransferQueue::RequestTransferQueueSlot(%s,...) making connection to %s\n", getCommandStringSafe(cmd), _addr ? 
_addr : "NULL"); } bool connected = startCommand( TRANSFER_QUEUE_REQUEST, m_xfer_queue_sock, timeout, &errstack ); if( !connected ) { delete m_xfer_queue_sock; m_xfer_queue_sock = NULL; formatstr(m_xfer_rejected_reason, "Failed to initiate transfer queue request for job %s (%s): %s.", jobid, fname, errstack.getFullText().c_str() ); error_desc = m_xfer_rejected_reason; dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str()); return false; } m_xfer_downloading = downloading; m_xfer_fname = fname; m_xfer_jobid = jobid; ClassAd msg; msg.Assign(ATTR_DOWNLOADING,downloading); msg.Assign(ATTR_FILE_NAME,fname); msg.Assign(ATTR_JOB_ID,jobid); msg.Assign(ATTR_USER,queue_user); msg.Assign(ATTR_SANDBOX_SIZE,sandbox_size); m_xfer_queue_sock->encode(); if( !putClassAd(m_xfer_queue_sock, msg) || !m_xfer_queue_sock->end_of_message() ) { formatstr(m_xfer_rejected_reason, "Failed to write transfer request to %s for job %s " "(initial file %s).", m_xfer_queue_sock->peer_description(), m_xfer_jobid.c_str(), m_xfer_fname.c_str()); error_desc = m_xfer_rejected_reason; dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str()); return false; } m_xfer_queue_sock->decode(); // Request has been initiated. Now sender should call // PollForTransferQueueSlot() to get response. m_xfer_queue_pending = true; return true; }
int part_send_job( int test_starter, char *host, int &reason, char *capability, char * /*schedd*/, PROC *proc, int &sd1, int &sd2, char **name) { int reply; ReliSock *sock = NULL; StartdRec stRec; PORTS ports; bool done = false; int retry_delay = 3; int num_retries = 0; // make sure we have the job classad InitJobAd(proc->id.cluster, proc->id.proc); while( !done ) { Daemon startd(DT_STARTD, host, NULL); if (!(sock = (ReliSock*)startd.startCommand ( ACTIVATE_CLAIM, Stream::reli_sock, 90))) { dprintf( D_ALWAYS, "startCommand(ACTIVATE_CLAIM) to startd failed\n"); goto returnfailure; } // Send the capability ClaimIdParser idp( capability ); dprintf(D_FULLDEBUG, "send capability %s\n", idp.publicClaimId() ); if( !sock->put_secret(capability) ) { dprintf( D_ALWAYS, "sock->put(\"%s\") failed\n",idp.publicClaimId()); goto returnfailure; } // Send the starter number if( test_starter ) { dprintf( D_ALWAYS, "Requesting Alternate Starter %d\n", test_starter ); } else { dprintf( D_ALWAYS, "Requesting Primary Starter\n" ); } if( !sock->code(test_starter) ) { dprintf( D_ALWAYS, "sock->code(%d) failed\n", test_starter ); goto returnfailure; } // Send the job info if( !putClassAd(sock, *JobAd) ) { dprintf( D_ALWAYS, "failed to send job ad\n" ); goto returnfailure; } if( !sock->end_of_message() ) { dprintf( D_ALWAYS, "failed to send message to startd\n" ); goto returnfailure; } // We're done sending. Now, get the reply. 
sock->decode(); if( !sock->code(reply) || !sock->end_of_message() ) { dprintf( D_ALWAYS, "failed to receive reply from startd\n" ); goto returnfailure; } switch( reply ) { case OK: dprintf( D_ALWAYS, "Shadow: Request to run a job was ACCEPTED\n" ); done = true; break; case NOT_OK: dprintf( D_ALWAYS, "Shadow: Request to run a job was REFUSED\n"); goto returnfailure; break; case CONDOR_TRY_AGAIN: num_retries++; dprintf( D_ALWAYS, "Shadow: Request to run a job was TEMPORARILY REFUSED\n" ); if( num_retries > 20 ) { dprintf( D_ALWAYS, "Shadow: Too many retries, giving up.\n" ); goto returnfailure; } delete sock; dprintf( D_ALWAYS, "Shadow: will try again in %d seconds\n", retry_delay ); sleep( retry_delay ); break; default: dprintf(D_ALWAYS,"Unknown reply from startd for command ACTIVATE_CLAIM\n"); dprintf(D_ALWAYS,"Shadow: Request to run a job was REFUSED\n"); goto returnfailure; break; } } /* start flock : dhruba */ sock->decode(); memset( &stRec, '\0', sizeof(stRec) ); if( !sock->code(stRec) || !sock->end_of_message() ) { dprintf(D_ALWAYS, "Can't read reply from startd.\n"); goto returnfailure; } ports = stRec.ports; if( stRec.ip_addr ) { host = stRec.server_name; if(name) { *name = strdup(stRec.server_name); } dprintf(D_FULLDEBUG, "host = %s inet_addr = 0x%x port1 = %d port2 = %d\n", host, stRec.ip_addr,ports.port1, ports.port2 ); } else { dprintf(D_FULLDEBUG, "host = %s port1 = %d port2 = %d\n", host, ports.port1, ports.port2 ); } if( ports.port1 == 0 ) { dprintf( D_ALWAYS, "Shadow: Request to run a job on %s was REFUSED\n", host ); goto returnfailure; } /* end flock ; dhruba */ // We don't use the server_name in the StartdRec, because our // DNS query may fail or may give us the wrong IP address // (either because it's stale or because we're talking to a // machine with multiple network interfaces). Sadly, we can't // use the ip_addr either, because the startd doesn't send it in // the correct byte ordering on little-endian machines. 
So, we // grab the IP address from the ReliSock, since we konw the // startd always uses the same IP address for all of its // communication. char sinfulstring[SINFUL_STRING_BUF_SIZE]; generate_sinful(sinfulstring, SINFUL_STRING_BUF_SIZE, sock->peer_ip_str(), ports.port1); if( (sd1 = do_connect(sinfulstring, (char *)0, (u_short)ports.port1)) < 0 ) { dprintf( D_ALWAYS, "failed to connect to scheduler on %s\n", sinfulstring ); goto returnfailure; } generate_sinful(sinfulstring, SINFUL_STRING_BUF_SIZE, sock->peer_ip_str(), ports.port2); if( (sd2 = do_connect(sinfulstring, (char *)0, (u_short)ports.port2)) < 0 ) { dprintf( D_ALWAYS, "failed to connect to scheduler on %s\n", sinfulstring ); close(sd1); goto returnfailure; } delete sock; sock = NULL; if ( stRec.server_name ) { free( stRec.server_name ); } return 0; returnfailure: reason = JOB_NOT_STARTED; delete sock; return -1; }
/* Continue reading from rsock the rest of the protcol for this encapsulation method */ int TransferD::accept_transfer_request_encapsulation_old_classads(Stream *sock) { int i; ClassAd *ad = NULL; TransferRequest *treq = NULL; MyString cap; ClassAd respad; dprintf(D_ALWAYS, "Entering " "TransferD::accept_transfer_request_encapsulation_old_classads()\n"); sock->decode(); ///////////////////////////////////////////////////////////////////////// // Accept the transfer request from the schedd. ///////////////////////////////////////////////////////////////////////// /* read the transfer request header packet upon construction */ ad = new ClassAd(); if (getClassAd(sock, *ad) == false) { // XXX don't fail here, just go back to daemoncore EXCEPT("XXX Couldn't init initial ad from stream!"); } sock->end_of_message(); dprintf(D_ALWAYS, "Read treq header.\n"); // initialize the header information of the TransferRequest object. treq = new TransferRequest(ad); if (treq == NULL) { EXCEPT("Out of memory!"); } /* read the information packet which describes the rest of the protocol */ if (treq->get_num_transfers() <= 0) { EXCEPT("Protocol error!"); } // read all the work ads associated with this TransferRequest for (i = 0; i < treq->get_num_transfers(); i++) { ad = new ClassAd(); if (ad == NULL) { EXCEPT("Out of memory!"); } if (getClassAd(sock, *ad) == false) { EXCEPT("Expected %d transfer job ads, got %d instead.", treq->get_num_transfers(), i); } sock->end_of_message(); dprintf(D_ALWAYS, "Read treq job ad[%d].\n", i); treq->append_task(ad); } sock->end_of_message(); sock->encode(); ///////////////////////////////////////////////////////////////////////// // See if I can honor this request's protocol choice ///////////////////////////////////////////////////////////////////////// switch(treq->get_xfer_protocol()) { case FTP_CFTP: // Transferd may use the FileTransfer Object protocol respad.Assign(ATTR_TREQ_INVALID_REQUEST, FALSE); break; default: dprintf(D_ALWAYS, "Transfer 
Request uses an unsupported file " "transfer protocol. Rejecting it.\n"); // Currently, I don't support anything else.... respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE); respad.Assign(ATTR_TREQ_INVALID_REASON, "Transferd doesn't support client required file transfer " "protocol."); // tell the schedd we don't want to do this request putClassAd(sock, respad); sock->end_of_message(); delete treq; // wait for the next request to come in.... return KEEP_STREAM; break; } ///////////////////////////////////////////////////////////////////////// // Create a capability for this request, making sure it is unique to all // rest of them, then send it back. ///////////////////////////////////////////////////////////////////////// cap = gen_capability(); treq->set_capability(cap); respad.Assign(ATTR_TREQ_CAPABILITY, cap); dprintf(D_ALWAYS, "Assigned capability to treq: %s.\n", cap.Value()); // This respose ad will contain: // // ATTR_TREQ_INVALID_REQUEST (set to true) // ATTR_TREQ_INVALID_REASON // // OR // // ATTR_TREQ_INVALID_REQUEST (set to false) // ATTR_TREQ_CAPABILITY // putClassAd(sock, respad); sock->end_of_message(); dprintf(D_ALWAYS, "Reported capability back to schedd.\n"); // If nothing times out or broke connection, and I think the schedd has // gotten the above information, then queue this request to deal with // at the appropriate time. m_treqs.insert(cap, treq); // get ready to read another treq. sock->decode(); dprintf(D_ALWAYS, "Waiting for another transfer request from schedd.\n"); dprintf(D_ALWAYS, "Leaving " "TransferD::accept_transfer_request_encapsulation_old_classads()\n"); return KEEP_STREAM; }
// Read history from a remote schedd static void readHistoryRemote(classad::ExprTree *constraintExpr) { printHeader(); if(longformat && use_xml) { std::string out; AddClassAdXMLFileHeader(out); printf("%s\n", out.c_str()); } classad::ClassAd ad; classad::ExprList *projList(new classad::ExprList()); classad::ExprTree *projTree = static_cast<classad::ExprTree*>(projList); ad.Insert(ATTR_PROJECTION, projTree); ad.Insert(ATTR_REQUIREMENTS, constraintExpr); ad.InsertAttr(ATTR_NUM_MATCHES, specifiedMatch <= 0 ? -1 : specifiedMatch); DCSchedd schedd(g_name.size() ? g_name.c_str() : NULL, g_pool.size() ? g_pool.c_str() : NULL); if (!schedd.locate(Daemon::LOCATE_FOR_LOOKUP)) { fprintf(stderr, "Unable to locate remote schedd (name=%s, pool=%s).\n", g_name.c_str(), g_pool.c_str()); exit(1); } Sock* sock; if (!(sock = schedd.startCommand(QUERY_SCHEDD_HISTORY, Stream::reli_sock, 0))) { fprintf(stderr, "Unable to send history command to remote schedd;\n" "Typically, either the schedd is not responding, does not authorize you, or does not support remote history.\n"); exit(1); } classad_shared_ptr<Sock> sock_sentry(sock); if (!putClassAd(sock, ad) || !sock->end_of_message()) { fprintf(stderr, "Unable to send request to remote schedd; likely a server or network error.\n"); exit(1); } while (true) { compat_classad::ClassAd ad; if (!getClassAd(sock, ad)) { fprintf(stderr, "Failed to recieve remote ad.\n"); exit(1); } long long intVal; if (ad.EvaluateAttrInt(ATTR_OWNER, intVal) && (intVal == 0)) { // Last ad. 
if (!sock->end_of_message()) { fprintf(stderr, "Unable to close remote socket.\n"); } sock->close(); std::string errorMsg; if (ad.EvaluateAttrInt(ATTR_ERROR_CODE, intVal) && intVal && ad.EvaluateAttrString(ATTR_ERROR_STRING, errorMsg)) { fprintf(stderr, "Error %lld: %s\n", intVal, errorMsg.c_str()); exit(intVal); } if (ad.EvaluateAttrInt("MalformedAds", intVal) && intVal) { fprintf(stderr, "Remote side had parse errors on history file"); exit(1); } if (!ad.EvaluateAttrInt(ATTR_NUM_MATCHES, intVal) || (intVal != matchCount)) { fprintf(stderr, "Client and server do not agree on number of ads sent;\n" "Indicates lost network packets or an internal error\n"); exit(1); } break; } matchCount++; printJob(ad); } if(longformat && use_xml) { std::string out; AddClassAdXMLFileFooter(out); printf("%s\n", out.c_str()); } }
int do_Q_request(ReliSock *syscall_sock,bool &may_fork) { int request_num = -1; int rval; syscall_sock->decode(); assert( syscall_sock->code(request_num) ); dprintf(D_SYSCALLS, "Got request #%d\n", request_num); switch( request_num ) { case CONDOR_InitializeConnection: { // dprintf( D_ALWAYS, "InitializeConnection()\n" ); bool authenticated = true; // Authenticate socket, if not already done by daemonCore if( !syscall_sock->triedAuthentication() ) { if( IsDebugLevel(D_SECURITY) ) { MyString methods; SecMan::getAuthenticationMethods( WRITE, &methods ); dprintf(D_SECURITY,"Calling authenticate(%s) in qmgmt_receivers\n", methods.Value()); } CondorError errstack; if( ! SecMan::authenticate_sock(syscall_sock, WRITE, &errstack) ) { // Failed to authenticate dprintf( D_ALWAYS, "SCHEDD: authentication failed: %s\n", errstack.getFullText().c_str() ); authenticated = false; } } if ( authenticated ) { InitializeConnection( syscall_sock->getOwner(), syscall_sock->getDomain() ); } else { InitializeConnection( NULL, NULL ); } return 0; } case CONDOR_InitializeReadOnlyConnection: { // dprintf( D_ALWAYS, "InitializeReadOnlyConnection()\n" ); // Since InitializeConnection() does nothing, and we need // to record the fact that this is a read-only connection, // but we have to do it in the socket (since we don't have // any other persistent data structure, and it's probably // the right place anyway), set the FQU. // // We need to record if this is a read-only connection so that // we can avoid expanding $$ in GetJobAd; simply checking if the // connection is authenticated isn't sufficient, because the // security session cache means that read-only connection could // be authenticated by a previous authenticated connection from // the same address (when using host-based security) less than // the expiration period ago. 
syscall_sock->setFullyQualifiedUser( "read-only" ); // same as InitializeConnection but no authenticate() InitializeConnection( NULL, NULL ); may_fork = true; return 0; } case CONDOR_SetEffectiveOwner: { MyString owner; int terrno; assert( syscall_sock->get(owner) ); assert( syscall_sock->end_of_message() ); rval = QmgmtSetEffectiveOwner( owner.Value() ); terrno = errno; syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() ); char const *fqu = syscall_sock->getFullyQualifiedUser(); dprintf(D_SYSCALLS, "\tSetEffectiveOwner\n"); dprintf(D_SYSCALLS, "\tauthenticated user = '******'\n", fqu ? fqu : ""); dprintf(D_SYSCALLS, "\trequested owner = '%s'\n", owner.Value()); dprintf(D_SYSCALLS, "\trval %d, errno %d\n", rval, terrno); return 0; } case CONDOR_NewCluster: { int terrno; assert( syscall_sock->end_of_message() );; errno = 0; rval = NewCluster( ); terrno = errno; dprintf(D_SYSCALLS, "\tNewCluster: rval = %d, errno = %d\n",rval,terrno ); if ( rval > 0 ) { dprintf( D_AUDIT, *syscall_sock, "Submitting new job %d.0\n", rval ); } syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; dprintf(D_FULLDEBUG,"schedd: NewCluster rval %d errno %d\n",rval,terrno); return 0; } case CONDOR_NewProc: { int cluster_id = -1; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->end_of_message() );; errno = 0; rval = NewProc( cluster_id ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); if ( rval > 0 ) { dprintf( D_AUDIT, *syscall_sock, "Submitting new job %d.%d\n", cluster_id, rval ); } syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; 
dprintf(D_FULLDEBUG,"schedd: NewProc rval %d errno %d\n",rval,terrno); return 0; } case CONDOR_DestroyProc: { int cluster_id = -1; int proc_id = -1; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->end_of_message() );; errno = 0; rval = DestroyProc( cluster_id, proc_id ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; dprintf(D_FULLDEBUG,"schedd: DestroyProc cluster %d proc %d rval %d errno %d\n",cluster_id,proc_id,rval,terrno); return 0; } case CONDOR_DestroyCluster: { int cluster_id = -1; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->end_of_message() );; errno = 0; rval = DestroyCluster( cluster_id ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; return 0; } #if 0 case CONDOR_DestroyClusterByConstraint: { char *constraint=NULL; int terrno; assert( syscall_sock->code(constraint) ); assert( syscall_sock->end_of_message() );; errno = 0; rval = DestroyClusterByConstraint( constraint ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } free( (char *)constraint ); assert( syscall_sock->end_of_message() );; return 0; } #endif case CONDOR_SetAttributeByConstraint: case CONDOR_SetAttributeByConstraint2: { char *attr_name=NULL; char *attr_value=NULL; char *constraint=NULL; int terrno; 
SetAttributeFlags_t flags = 0; assert( syscall_sock->code(constraint) ); dprintf( D_SYSCALLS, " constraint = %s\n",constraint); assert( syscall_sock->code(attr_value) ); assert( syscall_sock->code(attr_name) ); if( request_num == CONDOR_SetAttributeByConstraint2 ) { assert( syscall_sock->code( flags ) ); } assert( syscall_sock->end_of_message() );; if (strcmp (attr_name, ATTR_MYPROXY_PASSWORD) == 0) { errno = 0; dprintf( D_SYSCALLS, "SetAttributeByConstraint (MyProxyPassword) not supported...\n"); rval = 0; terrno = errno; } else { errno = 0; rval = SetAttributeByConstraint( constraint, attr_name, attr_value, flags ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); if ( rval == 0 ) { dprintf( D_AUDIT, *syscall_sock, "Set Attribute By Constraint %s, " "%s = %s\n", constraint, attr_name, attr_value); } } syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } free( (char *)constraint ); free( (char *)attr_value ); free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_SetAttribute: case CONDOR_SetAttribute2: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; char *attr_value=NULL; int terrno; SetAttributeFlags_t flags = 0; const char *users_username; const char *condor_username; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_value) ); assert( syscall_sock->code(attr_name) ); if( request_num == CONDOR_SetAttribute2 ) { assert( syscall_sock->code( flags ) ); } users_username = syscall_sock->getOwner(); condor_username = get_condor_username(); if (attr_name) dprintf(D_SYSCALLS,"\tattr_name = %s\n",attr_name); if (attr_value) dprintf(D_SYSCALLS,"\tattr_value = %s\n",attr_value); assert( syscall_sock->end_of_message() );; // ckireyev: // We do NOT want to 
include MyProxy password in the ClassAd (since it's a secret) // I'm not sure if this is the best place to do this, but.... if (attr_name && attr_value && strcmp (attr_name, ATTR_MYPROXY_PASSWORD) == 0) { errno = 0; dprintf( D_SYSCALLS, "Got MyProxyPassword, stashing...\n"); rval = SetMyProxyPassword (cluster_id, proc_id, attr_value); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); } else { errno = 0; rval = SetAttribute( cluster_id, proc_id, attr_name, attr_value, flags ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); // If we're modifying a previously-submitted job AND either // the client's username is not HTCondor's (i.e. not a // daemon) OR the client says we should log... if( (cluster_id != active_cluster_num) && (rval == 0) && ( strcmp(users_username, condor_username) || (flags & SHOULDLOG) ) ) { dprintf( D_AUDIT, *syscall_sock, "Set Attribute for job %d.%d, " "%s = %s\n", cluster_id, proc_id, attr_name, attr_value); } } free( (char *)attr_value ); free( (char *)attr_name ); if( flags & SetAttribute_NoAck ) { if( rval < 0 ) { return -1; } } else { syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() ); } return 0; } case CONDOR_SetTimerAttribute: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; int duration = 0; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); if (attr_name) dprintf(D_SYSCALLS,"\tattr_name = %s\n",attr_name); assert( syscall_sock->code(duration) ); dprintf(D_SYSCALLS,"\tduration = %d\n",duration); assert( syscall_sock->end_of_message() );; errno = 0; rval = SetTimerAttribute( cluster_id, proc_id, attr_name, duration ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, 
errno = %d\n", rval, terrno ); dprintf( D_AUDIT, *syscall_sock, "Set Timer Attribute for job %d.%d, " "attr_name = %s, duration = %d\n", cluster_id, proc_id, attr_name, duration); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_BeginTransaction: { int terrno; assert( syscall_sock->end_of_message() );; errno = 0; rval = 0; // BeginTransaction returns void (sigh), so always success BeginTransaction( ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_AbortTransaction: { int terrno; assert( syscall_sock->end_of_message() );; errno = 0; rval = 0; // AbortTransaction returns void (sigh), so always success AbortTransaction( ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_CommitTransactionNoFlags: case CONDOR_CommitTransaction: { int terrno; int flags; if( request_num == CONDOR_CommitTransaction ) { assert( syscall_sock->code(flags) ); } else { flags = 0; } assert( syscall_sock->end_of_message() );; errno = 0; CondorError errstack; rval = CheckTransaction( flags, & errstack ); terrno = errno; dprintf( D_SYSCALLS, "\tflags = %d, rval = %d, errno = %d\n", flags, rval, terrno ); if( rval >= 0 ) { errno = 0; CommitTransaction( flags ); // CommitTransaction() never returns on failure rval = 0; terrno = errno; dprintf( D_SYSCALLS, "\tflags = %d, rval = %d, errno = %d\n", flags, rval, terrno ); } syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( 
syscall_sock->code(terrno) ); const CondorVersionInfo *vers = syscall_sock->get_peer_version(); if (vers && vers->built_since_version(8, 3, 4)) { // Send a classad, for less backwards-incompatibility. int code = 1; const char * reason = "QMGMT rejected job submission."; if( errstack.subsys() ) { code = 2; reason = errstack.message(); } ClassAd reply; reply.Assign( "ErrorCode", code ); reply.Assign( "ErrorReason", reason ); assert( putClassAd( syscall_sock, reply ) ); } } assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetAttributeFloat: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; float value = 0.0; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); assert( syscall_sock->end_of_message() );; errno = 0; if( QmgmtMayAccessAttribute( attr_name ) ) { rval = GetAttributeFloat( cluster_id, proc_id, attr_name, &value ); } else { rval = -1; } terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( syscall_sock->code(value) ); } free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetAttributeInt: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; int value = 0; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); dprintf( D_SYSCALLS, " attr_name = %s\n", attr_name ); assert( syscall_sock->end_of_message() );; errno = 0; if( QmgmtMayAccessAttribute( attr_name ) ) { rval = GetAttributeInt( cluster_id, proc_id, attr_name, &value ); } else { rval = 
-1; } terrno = errno; if (rval < 0) { dprintf( D_SYSCALLS, "GetAttributeInt(%d, %d, %s) not found.\n", cluster_id, proc_id, attr_name); } else { dprintf( D_SYSCALLS, " value: %d\n", value ); dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); } syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( syscall_sock->code(value) ); } free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetAttributeString: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; char *value = NULL; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); assert( syscall_sock->end_of_message() );; errno = 0; if( QmgmtMayAccessAttribute( attr_name ) ) { rval = GetAttributeStringNew( cluster_id, proc_id, attr_name, &value ); } else { rval = -1; } terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( syscall_sock->code(value) ); } free( (char *)value ); free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetAttributeExpr: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); assert( syscall_sock->end_of_message() );; char *value = NULL; errno = 0; if( QmgmtMayAccessAttribute( attr_name ) ) { rval = GetAttributeExprNew( cluster_id, proc_id, attr_name, &value ); } else { rval = -1; } terrno = errno; dprintf( 
D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); if ( !syscall_sock->code(rval) ) { free(value); return -1; } if( rval < 0 ) { if ( !syscall_sock->code(terrno) ) { free(value); return -1; } } if( rval >= 0 ) { if ( !syscall_sock->code(value) ) { free(value); return -1; } } free( (char *)value ); free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetDirtyAttributes: { int cluster_id = -1; int proc_id = -1; ClassAd updates; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->end_of_message() );; errno = 0; rval = GetDirtyAttributes( cluster_id, proc_id, &updates ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); if ( !syscall_sock->code(rval) ) { return -1; } if( rval < 0 ) { if ( !syscall_sock->code(terrno) ) { return -1; } } if( rval >= 0 ) { assert( putClassAd(syscall_sock, updates) ); } assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_DeleteAttribute: { int cluster_id = -1; int proc_id = -1; char *attr_name=NULL; int terrno; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->code(attr_name) ); assert( syscall_sock->end_of_message() );; errno = 0; rval = DeleteAttribute( cluster_id, proc_id, attr_name ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } free( (char *)attr_name ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetJobAd: { int cluster_id = -1; int proc_id = -1; ClassAd *ad = NULL; int terrno; bool delete_ad = 
false; assert( syscall_sock->code(cluster_id) ); dprintf( D_SYSCALLS, " cluster_id = %d\n", cluster_id ); assert( syscall_sock->code(proc_id) ); dprintf( D_SYSCALLS, " proc_id = %d\n", proc_id ); assert( syscall_sock->end_of_message() );; // dprintf( D_ALWAYS, "(%d.%d) isAuthenticated() = %d\n", cluster_id, proc_id, syscall_sock->isAuthenticated() ); // dprintf( D_ALWAYS, "(%d.%d) getOwner() = %s\n", cluster_id, proc_id, syscall_sock->getOwner() ); errno = 0; // Only fetch the jobad for legal values of cluster/proc if( cluster_id >= 1 ) { if( proc_id >= 0 ) { const char * fqu = syscall_sock->getFullyQualifiedUser(); if( fqu != NULL && strcmp( fqu, "read-only" ) != 0 ) { // expand $$() macros in the jobad as required by GridManager. // The GridManager depends on the fact that the following call // expands $$ and saves the expansions to disk in case of // restart. ad = GetJobAd_as_ClassAd( cluster_id, proc_id, true, true ); delete_ad = true; // note : since we expanded the ad, ad is now a deep // copy of the ad in memory, so we must delete it below. } else { ad = GetJobAd_as_ClassAd( cluster_id, proc_id, false, false ); } } else if( proc_id == -1 ) { // allow cluster ad to be queried as required by preen, but // do NOT ask to expand $$() macros in a cluster ad! ad = GetJobAd_as_ClassAd( cluster_id, proc_id, false, false ); } } terrno = errno; rval = ad ? 0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) ); } // If we called GetJobAd() with the third bool argument set // to True (expandedAd), it does a deep copy of the ad in the // queue in order to expand the $$() attributes. So we must // delete it. 
if (delete_ad) delete ad; assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetJobByConstraint: { char *constraint=NULL; ClassAd *ad; int terrno; assert( syscall_sock->code(constraint) ); assert( syscall_sock->end_of_message() );; errno = 0; ad = GetJobByConstraint_as_ClassAd( constraint ); terrno = errno; rval = ad ? 0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) ); } FreeJobAd(ad); free( (char *)constraint ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetNextJob: { ClassAd *ad; int initScan = 0; int terrno; assert( syscall_sock->code(initScan) ); dprintf( D_SYSCALLS, " initScan = %d\n", initScan ); assert( syscall_sock->end_of_message() );; errno = 0; ad = GetNextJob( initScan ); terrno = errno; rval = ad ? 0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) ); } FreeJobAd(ad); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetNextJobByConstraint: { char *constraint=NULL; ClassAd *ad; int initScan = 0; int terrno; assert( syscall_sock->code(initScan) ); dprintf( D_SYSCALLS, " initScan = %d\n", initScan ); if ( !(syscall_sock->code(constraint)) ) { if (constraint != NULL) { free(constraint); constraint = NULL; } return -1; } assert( syscall_sock->end_of_message() );; errno = 0; ad = GetNextJobByConstraint( constraint, initScan ); terrno = errno; rval = ad ? 
0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) ); } FreeJobAd(ad); free( (char *)constraint ); assert( syscall_sock->end_of_message() );; return 0; } case CONDOR_GetNextDirtyJobByConstraint: { char *constraint=NULL; ClassAd *ad; int initScan = 0; int terrno; assert( syscall_sock->code(initScan) ); dprintf( D_SYSCALLS, " initScan = %d\n", initScan ); if ( !(syscall_sock->code(constraint)) ) { if (constraint != NULL) { free(constraint); constraint = NULL; } return -1; } assert( syscall_sock->end_of_message() ); errno = 0; ad = GetNextDirtyJobByConstraint( constraint, initScan ); terrno = errno; rval = ad ? 0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) ); } FreeJobAd(ad); free( (char *)constraint ); assert( syscall_sock->end_of_message() ); return 0; } case CONDOR_SendSpoolFile: { char *filename=NULL; int terrno; assert( syscall_sock->code(filename) ); assert( syscall_sock->end_of_message() );; errno = 0; rval = SendSpoolFile( filename ); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); #if 0 syscall_sock->encode(); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } assert( syscall_sock->end_of_message() );; #endif free( (char *)filename ); return 0; } case CONDOR_SendSpoolFileIfNeeded: { int terrno; ClassAd ad; assert( getClassAd(syscall_sock, ad) ); assert( syscall_sock->end_of_message() );; errno = 0; rval = SendSpoolFileIfNeeded(ad); terrno = errno; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); return 0; } case 
CONDOR_GetAllJobsByConstraint: { char *constraint=NULL; char *projection=NULL; ClassAd *ad; int terrno; int initScan = 1; classad::References proj; if ( !(syscall_sock->code(constraint)) ) { if (constraint != NULL) { free(constraint); constraint = NULL; } return -1; } if ( !(syscall_sock->code(projection)) ) { if (projection != NULL) { free(constraint); free(projection); projection = NULL; } return -1; } dprintf( D_SYSCALLS, " constraint = %s\n", constraint ); dprintf( D_SYSCALLS, " projection = %s\n", projection ? projection : ""); assert( syscall_sock->end_of_message() );; // if there is a projection, convert it into a set of attribute names if (projection) { StringTokenIterator list(projection); const std::string * attr; while ((attr = list.next_string())) { proj.insert(*attr); } } syscall_sock->encode(); do { errno = 0; ad = GetNextJobByConstraint( constraint, initScan ); initScan=0; // one first time through, otherwise 0 terrno = errno; rval = ad ? 0 : -1; dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno ); assert( syscall_sock->code(rval) ); if( rval < 0 ) { assert( syscall_sock->code(terrno) ); } if( rval >= 0 ) { assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE, proj.empty() ? NULL : &proj) ); FreeJobAd(ad); } } while (rval >= 0); assert( syscall_sock->end_of_message() );; free( (char *)constraint ); free( (char *)projection ); return 0; } case CONDOR_CloseSocket: { assert( syscall_sock->end_of_message() );; return -1; } } /* End of switch */ return -1; } /* End of function */
/*
 * Ask the schedd to perform `action` on a set of jobs.
 *
 * The jobs are selected either by `constraint` (a ClassAd expression) or by
 * `ids` (a list of job ids); supplying both, or neither, is a programming
 * error and triggers EXCEPT().
 *
 * Parameters:
 *   action           - the action to perform (sent as ATTR_JOB_ACTION)
 *   constraint       - expression selecting jobs, or NULL
 *   ids              - list of job ids, or NULL
 *   reason           - human-readable reason; sent as a string under
 *                      reason_attr when both are non-NULL
 *   reason_attr      - attribute name to store `reason` under
 *   reason_code      - expression text; sent under reason_code_attr when
 *                      both are non-NULL
 *   reason_code_attr - attribute name to store `reason_code` under
 *   result_type      - sent as ATTR_ACTION_RESULT_TYPE
 *   notify_scheduler - sent as ATTR_NOTIFY_JOB_SCHEDULER
 *   errstack         - passed to startCommand()/forceAuthentication();
 *                      may be NULL
 *
 * Returns the result ad read from the schedd (caller owns it), including
 * when the schedd reports the action failed.  Returns NULL on any local or
 * communication failure.
 */
ClassAd*
DCSchedd::actOnJobs( JobAction action,
					 const char* constraint, StringList* ids,
					 const char* reason, const char* reason_attr,
					 const char* reason_code, const char* reason_code_attr,
					 action_result_type_t result_type,
					 bool notify_scheduler,
					 CondorError * errstack )
{
	char* tmp = NULL;
	int size, reply;
	ReliSock rsock;

		// // // // // // // //
		// Construct the ad we want to send
		// // // // // // // //

	ClassAd cmd_ad;

		// Typed Assign() calls avoid formatting "name = value" text into
		// a fixed-size buffer and re-parsing it.
	cmd_ad.Assign( ATTR_JOB_ACTION, (int)action );
	cmd_ad.Assign( ATTR_ACTION_RESULT_TYPE, (int)result_type );
	cmd_ad.Assign( ATTR_NOTIFY_JOB_SCHEDULER, notify_scheduler );

	if( constraint ) {
		if( ids ) {
				// This is a programming error, not a run-time one
			EXCEPT( "DCSchedd::actOnJobs has both constraint and ids!" );
		}
			// +4 covers " = " and the terminating NUL.
		size = strlen(constraint) + strlen(ATTR_ACTION_CONSTRAINT) + 4;
		tmp = (char*) malloc( size*sizeof(char) );
		if( !tmp ) {
			EXCEPT( "Out of memory!" );
		}
		sprintf( tmp, "%s = %s", ATTR_ACTION_CONSTRAINT, constraint );
		if( ! cmd_ad.Insert(tmp) ) {
			dprintf( D_ALWAYS, "DCSchedd::actOnJobs: "
					 "Can't insert constraint (%s) into ClassAd!\n",
					 constraint );
			free( tmp );
			return NULL;
		}
		free( tmp );
		tmp = NULL;
	} else if( ids ) {
		char* action_ids = ids->print_to_string();
		if ( action_ids ) {
			cmd_ad.Assign( ATTR_ACTION_IDS, action_ids );
			free(action_ids);
			action_ids = NULL;
		}
	} else {
		EXCEPT( "DCSchedd::actOnJobs called without constraint or ids" );
	}

	if( reason_attr && reason ) {
			// Store the reason as a string value.  Splicing it between
			// double quotes into expression text would break on a reason
			// that itself contains a double quote.
		cmd_ad.Assign( reason_attr, reason );
	}

	if( reason_code_attr && reason_code ) {
		cmd_ad.AssignExpr(reason_code_attr,reason_code);
	}

		// // // // // // // //
		// On the wire protocol
		// // // // // // // //

	rsock.timeout(20);   // years of research... :)
	if( ! rsock.connect(_addr) ) {
		dprintf( D_ALWAYS, "DCSchedd::actOnJobs: "
				 "Failed to connect to schedd (%s)\n", _addr );
		return NULL;
	}
	if( ! startCommand(ACT_ON_JOBS, (Sock*)&rsock, 0, errstack) ) {
		dprintf( D_ALWAYS, "DCSchedd::actOnJobs: "
				 "Failed to send command (ACT_ON_JOBS) to the schedd\n" );
		return NULL;
	}
		// First, if we're not already authenticated, force that now.
	if (!forceAuthentication( &rsock, errstack )) {
			// errstack is an optional out-parameter; don't dereference it
			// when the caller passed NULL.
		dprintf( D_ALWAYS, "DCSchedd: authentication failure: %s\n",
				 errstack ? errstack->getFullText().c_str() : "" );
		return NULL;
	}

		// Now, put the command classad on the wire
	if( ! (putClassAd(&rsock, cmd_ad) && rsock.end_of_message()) ) {
		dprintf( D_ALWAYS, "DCSchedd:actOnJobs: Can't send classad\n" );
		return NULL;
	}

		// Next, we need to read the reply from the schedd if things
		// are ok and it's going to go forward.  If the schedd can't
		// read our reply to this ClassAd, it assumes we got killed
		// and it should abort its transaction
	rsock.decode();
	ClassAd* result_ad = new ClassAd();
	if( ! (getClassAd(&rsock, *result_ad) && rsock.end_of_message()) ) {
		dprintf( D_ALWAYS, "DCSchedd:actOnJobs: "
				 "Can't read response ad from %s\n", _addr );
		delete( result_ad );
		return NULL;
	}

		// If the action totally failed, the schedd will already have
		// aborted the transaction and closed up shop, so there's no
		// reason trying to continue.  However, we still want to
		// return the result ad we got back so that our caller can
		// figure out what went wrong.
	reply = FALSE;
	result_ad->LookupInteger( ATTR_ACTION_RESULT, reply );
	if( reply != OK ) {
		dprintf( D_ALWAYS, "DCSchedd:actOnJobs: Action failed\n" );
		return result_ad;
	}

		// Tell the schedd we're still here and ready to go
	rsock.encode();
	int answer = OK;
	if( ! (rsock.code(answer) && rsock.end_of_message()) ) {
		dprintf( D_ALWAYS, "DCSchedd:actOnJobs: Can't send reply\n" );
		delete( result_ad );
		return NULL;
	}

		// finally, make sure the schedd didn't blow up trying to
		// commit these changes to the job queue...
	rsock.decode();
	if( ! (rsock.code(reply) && rsock.end_of_message()) ) {
		dprintf( D_ALWAYS, "DCSchedd:actOnJobs: "
				 "Can't read confirmation from %s\n", _addr );
		delete( result_ad );
		return NULL;
	}

	return result_ad;
}
int CCBServer::HandleRegistration(int cmd,Stream *stream) { ReliSock *sock = (ReliSock *)stream; ASSERT( cmd == CCB_REGISTER ); // Avoid lengthy blocking on communication with our peer. // This command-handler should not get called until data // is ready to read. sock->timeout(1); ClassAd msg; sock->decode(); if( !getClassAd( sock, msg ) || !sock->end_of_message() ) { dprintf(D_ALWAYS, "CCB: failed to receive registration " "from %s.\n", sock->peer_description() ); return FALSE; } SetSmallBuffers(sock); MyString name; if( msg.LookupString(ATTR_NAME,name) ) { // target daemon name is purely for debugging purposes name.formatstr_cat(" on %s",sock->peer_description()); sock->set_peer_description(name.Value()); } CCBTarget *target = new CCBTarget(sock); MyString reconnect_cookie_str,reconnect_ccbid_str; CCBID reconnect_cookie,reconnect_ccbid; bool reconnected = false; if( msg.LookupString(ATTR_CLAIM_ID,reconnect_cookie_str) && CCBIDFromString(reconnect_cookie,reconnect_cookie_str.Value()) && msg.LookupString( ATTR_CCBID,reconnect_ccbid_str) && CCBIDFromContactString(reconnect_ccbid,reconnect_ccbid_str.Value()) ) { target->setCCBID( reconnect_ccbid ); reconnected = ReconnectTarget( target, reconnect_cookie ); } if( !reconnected ) { AddTarget( target ); } CCBReconnectInfo *reconnect_info = GetReconnectInfo( target->getCCBID() ); ASSERT( reconnect_info ); sock->encode(); ClassAd reply_msg; MyString ccb_contact; CCBIDToString( reconnect_info->getReconnectCookie(),reconnect_cookie_str ); // We send our address as part of the CCB contact string, rather // than letting the target daemon fill it in. This is to give us // potential flexibility on the CCB server side to do things like // assign different targets to different CCB server sub-processes, // each with their own command port. 
CCBIDToContactString( m_address.Value(), target->getCCBID(), ccb_contact ); reply_msg.Assign(ATTR_CCBID,ccb_contact.Value()); reply_msg.Assign(ATTR_COMMAND,CCB_REGISTER); reply_msg.Assign(ATTR_CLAIM_ID,reconnect_cookie_str.Value()); if( !putClassAd( sock, reply_msg ) || !sock->end_of_message() ) { dprintf(D_ALWAYS, "CCB: failed to send registration response " "to %s.\n", sock->peer_description() ); RemoveTarget( target ); return KEEP_STREAM; // we have already closed this socket } return KEEP_STREAM; }
bool DCStarter::peek(bool transfer_stdout, ssize_t &stdout_offset, bool transfer_stderr, ssize_t &stderr_offset, const std::vector<std::string> &filenames, std::vector<ssize_t> &offsets, size_t max_bytes, bool &retry_sensible, PeekGetFD &next, std::string &error_msg, unsigned timeout, const std::string &sec_session_id, DCTransferQueue *xfer_q) { compat_classad::ClassAd ad; ad.InsertAttr(ATTR_JOB_OUTPUT, transfer_stdout); ad.InsertAttr("OutOffset", stdout_offset); ad.InsertAttr(ATTR_JOB_ERROR, transfer_stderr); ad.InsertAttr("ErrOffset", stderr_offset); ad.InsertAttr(ATTR_VERSION, CondorVersion()); size_t total_files = 0; total_files += transfer_stdout ? 1 : 0; total_files += transfer_stderr ? 1 : 0; if (filenames.size()) { total_files += filenames.size(); std::vector<classad::ExprTree *> filelist; filelist.reserve(filenames.size()); std::vector<classad::ExprTree *> offsetlist; offsetlist.reserve(filenames.size()); std::vector<ssize_t>::const_iterator it2 = offsets.begin(); for (std::vector<std::string>::const_iterator it = filenames.begin(); it != filenames.end() && it2 != offsets.end(); it++, it2++) { classad::Value value; value.SetStringValue(*it); filelist.push_back(classad::Literal::MakeLiteral(value)); value.SetIntegerValue(*it2); offsetlist.push_back(classad::Literal::MakeLiteral(value)); } classad::ExprTree *list(classad::ExprList::MakeExprList(filelist)); ad.Insert("TransferFiles", list); list = classad::ExprList::MakeExprList(offsetlist); ad.Insert("TransferOffsets", list); } ad.InsertAttr(ATTR_MAX_TRANSFER_BYTES, static_cast<long long>(max_bytes)); ReliSock sock; if( !connectSock(&sock, timeout, NULL) ) { error_msg = "Failed to connect to starter"; return false; } if( !startCommand(STARTER_PEEK, &sock, timeout, NULL, NULL, false, sec_session_id.c_str()) ) { error_msg = "Failed to send START_PEEK to starter"; return false; } sock.encode(); if (!putClassAd(&sock, ad) || !sock.end_of_message()) { error_msg = "Failed to send request to starter"; return false; 
} compat_classad::ClassAd response; sock.decode(); if (!getClassAd(&sock, response) || !sock.end_of_message()) { error_msg = "Failed to read response for peeking at logs."; return false; } dPrintAd(D_FULLDEBUG, response); bool success = false; if (!response.EvaluateAttrBool(ATTR_RESULT, success) || !success) { response.EvaluateAttrBool(ATTR_RETRY, retry_sensible); error_msg = "Remote operation failed."; response.EvaluateAttrString(ATTR_ERROR_STRING, error_msg); return false; } classad::Value valueX; classad_shared_ptr<classad::ExprList> list; if (!response.EvaluateAttr("TransferFiles", valueX) || !valueX.IsSListValue(list)) { error_msg = "Unable to evaluate starter response"; return false; } classad_shared_ptr<classad::ExprList> offlist; if (!response.EvaluateAttr("TransferOffsets", valueX) || !valueX.IsSListValue(offlist)) { error_msg = "Unable to evaluate starter response (missing offsets)"; return false; } size_t remaining = max_bytes; size_t file_count = 0; classad::ExprList::const_iterator it2 = offlist->begin(); for (classad::ExprList::const_iterator it = list->begin(); it != list->end() && it2 != offlist->end(); it++, it2++) { classad::Value value; (*it2)->Evaluate(value); off_t off = -1; value.IsIntegerValue(off); (*it)->Evaluate(value); std::string filename; int64_t xfer_fd = -1; if (!value.IsStringValue(filename) && value.IsIntegerValue(xfer_fd)) { if (xfer_fd == 0) filename = "_condor_stdout"; if (xfer_fd == 1) filename = "_condor_stderr"; } int fd = next.getNextFD(filename); filesize_t size = -1; int retval; if ((retval = sock.get_file(&size, fd, false, false, remaining, xfer_q)) && (retval != GET_FILE_MAX_BYTES_EXCEEDED)) { error_msg = "Internal error when transferring file " + filename; } else if (size >= 0) { remaining -= max_bytes; file_count++; off += size; } else { error_msg = "Failed to transfer file " + filename; } if (xfer_fd == 0) { stdout_offset = off; //dprintf(D_FULLDEBUG, "New stdout offset: %ld\n", stdout_offset); } else if (xfer_fd == 1) 
{ stderr_offset = off; } else { std::vector<ssize_t>::iterator it4 = offsets.begin(); for (std::vector<std::string>::const_iterator it3 = filenames.begin(); it3 != filenames.end() && it4 != offsets.end(); it3++, it4++) { if (*it3 == filename) *it4 = off; } } } size_t remote_file_count; if (!sock.get(remote_file_count) || !sock.end_of_message()) { error_msg = "Unable to get remote file count."; return false; } if (file_count != remote_file_count) { std::stringstream ss; ss << "Recieved " << file_count << " files, but remote side thought it sent " << remote_file_count << " files"; error_msg = ss.str(); return false; } if ((total_files != file_count) && !error_msg.size()) { error_msg = "At least one file transfer failed."; return false; } return true; }
bool DCStarter::startSSHD(char const *known_hosts_file,char const *private_client_key_file,char const *preferred_shells,char const *slot_name,char const *ssh_keygen_args,ReliSock &sock,int timeout,char const *sec_session_id,MyString &remote_user,MyString &error_msg,bool &retry_is_sensible) { retry_is_sensible = false; #ifndef HAVE_SSH_TO_JOB error_msg = "This version of Condor does not support ssh key exchange."; return false; #else if( !connectSock(&sock, timeout, NULL) ) { error_msg = "Failed to connect to starter"; return false; } if( !startCommand(START_SSHD, &sock,timeout,NULL,NULL,false,sec_session_id) ) { error_msg = "Failed to send START_SSHD to starter"; return false; } ClassAd input; if( preferred_shells && *preferred_shells ) { input.Assign(ATTR_SHELL,preferred_shells); } if( slot_name && *slot_name ) { // This is a little silly. // We are telling the remote side the name of the slot so // that it can put it in the welcome message. input.Assign(ATTR_NAME,slot_name); } if( ssh_keygen_args && *ssh_keygen_args ) { input.Assign(ATTR_SSH_KEYGEN_ARGS,ssh_keygen_args); } sock.encode(); if( !putClassAd(&sock, input) || !sock.end_of_message() ) { error_msg = "Failed to send START_SSHD request to starter"; return false; } ClassAd result; sock.decode(); if( !getClassAd(&sock, result) || !sock.end_of_message() ) { error_msg = "Failed to read response to START_SSHD from starter"; return false; } bool success = false; result.LookupBool(ATTR_RESULT,success); if( !success ) { std::string remote_error_msg; result.LookupString(ATTR_ERROR_STRING,remote_error_msg); error_msg.formatstr("%s: %s",slot_name,remote_error_msg.c_str()); retry_is_sensible = false; result.LookupBool(ATTR_RETRY,retry_is_sensible); return false; } result.LookupString(ATTR_REMOTE_USER,remote_user); std::string public_server_key; if( !result.LookupString(ATTR_SSH_PUBLIC_SERVER_KEY,public_server_key) ) { error_msg = "No public ssh server key received in reply to START_SSHD"; return false; } std::string 
private_client_key; if( !result.LookupString(ATTR_SSH_PRIVATE_CLIENT_KEY,private_client_key) ) { error_msg = "No ssh client key received in reply to START_SSHD"; return false; } // store the private client key unsigned char *decode_buf = NULL; int length = -1; condor_base64_decode(private_client_key.c_str(),&decode_buf,&length); if( !decode_buf ) { error_msg = "Error decoding ssh client key."; return false; } FILE *fp = safe_fcreate_fail_if_exists(private_client_key_file,"a",0400); if( !fp ) { error_msg.formatstr("Failed to create %s: %s", private_client_key_file,strerror(errno)); free( decode_buf ); return false; } if( fwrite(decode_buf,length,1,fp)!=1 ) { error_msg.formatstr("Failed to write to %s: %s", private_client_key_file,strerror(errno)); fclose( fp ); free( decode_buf ); return false; } if( fclose(fp)!=0 ) { error_msg.formatstr("Failed to close %s: %s", private_client_key_file,strerror(errno)); free( decode_buf ); return false; } fp = NULL; free( decode_buf ); decode_buf = NULL; // store the public server key in the known_hosts file length = -1; condor_base64_decode(public_server_key.c_str(),&decode_buf,&length); if( !decode_buf ) { error_msg = "Error decoding ssh server key."; return false; } fp = safe_fcreate_fail_if_exists(known_hosts_file,"a",0600); if( !fp ) { error_msg.formatstr("Failed to create %s: %s", known_hosts_file,strerror(errno)); free( decode_buf ); return false; } // prepend a host name pattern (*) to the public key to make a valid // record in the known_hosts file fprintf(fp,"* "); if( fwrite(decode_buf,length,1,fp)!=1 ) { error_msg.formatstr("Failed to write to %s: %s", known_hosts_file,strerror(errno)); fclose( fp ); free( decode_buf ); return false; } if( fclose(fp)!=0 ) { error_msg.formatstr("Failed to close %s: %s", known_hosts_file,strerror(errno)); free( decode_buf ); return false; } fp = NULL; free( decode_buf ); decode_buf = NULL; return true; #endif }
bool DCSchedd::getJobConnectInfo( PROC_ID jobid, int subproc, char const *session_info, int timeout, CondorError *errstack, MyString &starter_addr, MyString &starter_claim_id, MyString &starter_version, MyString &slot_name, MyString &error_msg, bool &retry_is_sensible, int &job_status, MyString &hold_reason) { ClassAd input; ClassAd output; input.Assign(ATTR_CLUSTER_ID,jobid.cluster); input.Assign(ATTR_PROC_ID,jobid.proc); if( subproc != -1 ) { input.Assign(ATTR_SUB_PROC_ID,subproc); } input.Assign(ATTR_SESSION_INFO,session_info); ReliSock sock; if( !connectSock(&sock,timeout,errstack) ) { error_msg = "Failed to connect to schedd"; dprintf( D_ALWAYS, "%s\n",error_msg.Value()); return false; } if( !startCommand(GET_JOB_CONNECT_INFO, &sock, timeout, errstack) ) { error_msg = "Failed to send GET_JOB_CONNECT_INFO to schedd"; dprintf( D_ALWAYS, "%s\n",error_msg.Value()); return false; } if( !forceAuthentication(&sock, errstack) ) { error_msg = "Failed to authenticate"; dprintf( D_ALWAYS, "%s\n",error_msg.Value()); return false; } sock.encode(); if( !putClassAd(&sock, input) || !sock.end_of_message() ) { error_msg = "Failed to send GET_JOB_CONNECT_INFO to schedd"; dprintf( D_ALWAYS, "%s\n",error_msg.Value()); return false; } sock.decode(); if( !getClassAd(&sock, output) || !sock.end_of_message() ) { error_msg = "Failed to get response from schedd"; dprintf( D_ALWAYS, "%s\n",error_msg.Value()); return false; } if( IsFulldebug(D_FULLDEBUG) ) { std::string adstr; sPrintAd(adstr, output, true); dprintf(D_FULLDEBUG,"Response for GET_JOB_CONNECT_INFO:\n%s\n", adstr.c_str()); } bool result=false; output.LookupBool(ATTR_RESULT,result); if( !result ) { output.LookupString(ATTR_HOLD_REASON,hold_reason); output.LookupString(ATTR_ERROR_STRING,error_msg); retry_is_sensible = false; output.LookupBool(ATTR_RETRY,retry_is_sensible); output.LookupInteger(ATTR_JOB_STATUS,job_status); } else { output.LookupString(ATTR_STARTER_IP_ADDR,starter_addr); 
output.LookupString(ATTR_CLAIM_ID,starter_claim_id); output.LookupString(ATTR_VERSION,starter_version); output.LookupString(ATTR_REMOTE_HOST,slot_name); } return result; }
bool ScheddNegotiate::sendJobInfo(Sock *sock, bool just_sig_attrs) { // The Negotiator wants us to send it a job. sock->encode(); if( m_current_job_id.cluster == -1 && !nextJob() ) { if( !sock->snd_int(NO_MORE_JOBS,TRUE) ) { dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" ); return false; } m_negotiation_finished = true; return true; } if( !sock->put(JOB_INFO) ) { dprintf( D_ALWAYS, "Can't send JOB_INFO to mgr\n" ); return false; } // If schedd wants pslot preemption, advertise here m_current_job_ad.Assign(ATTR_WANT_PSLOT_PREEMPTION, param_boolean("ALLOW_PSLOT_PREEMPTION", false)); // request match diagnostics // 0 = no match diagnostics // 1 = match diagnostics string // 2 = match diagnostics string decorated w/ autocluster + jobid m_current_job_ad.Assign(ATTR_WANT_MATCH_DIAGNOSTICS, (int) 2); m_current_job_ad.Assign(ATTR_WANT_PSLOT_PREEMPTION, param_boolean("ALLOW_PSLOT_PREEMPTION", false)); // Send the ad to the negotiator int putad_result = 0; std::string auto_cluster_attrs; if ( just_sig_attrs && m_current_job_ad.LookupString(ATTR_AUTO_CLUSTER_ATTRS, auto_cluster_attrs) ) { // don't send the entire job ad; just send significant attrs classad::References sig_attrs; StringTokenIterator list(auto_cluster_attrs); const std::string *attr; while ((attr = list.next_string())) { sig_attrs.insert(*attr); } // besides significant attrs, we also always want to send these attrs cuz // the matchmaker explicitly looks for them (for dprintfs or whatever). sig_attrs.insert(ATTR_OWNER); sig_attrs.insert(ATTR_CLUSTER_ID); sig_attrs.insert(ATTR_PROC_ID); sig_attrs.insert(ATTR_RESOURCE_REQUEST_COUNT); sig_attrs.insert(ATTR_GLOBAL_JOB_ID); sig_attrs.insert(ATTR_AUTO_CLUSTER_ID); sig_attrs.insert(ATTR_WANT_MATCH_DIAGNOSTICS); sig_attrs.insert(ATTR_WANT_PSLOT_PREEMPTION); sig_attrs.insert(ATTR_WANT_CLAIMING); // used for Condor-G matchmaking // ship it! putad_result = putClassAd(sock, m_current_job_ad, 0, &sig_attrs); } else { // send the entire classad. 
perhaps we are doing this because the // ad does not have ATTR_AUTO_CLUSTER_ATTRS defined for some reason, // or perhaps we are doing this because we were explicitly told to do so. putad_result = putClassAd(sock, m_current_job_ad); } if( !putad_result ) { dprintf( D_ALWAYS, "Can't send job ad to mgr\n" ); sock->end_of_message(); return false; } if( !sock->end_of_message() ) { dprintf( D_ALWAYS, "Can't send job eom to mgr\n" ); return false; } m_current_resources_delivered = 0; m_current_resources_requested = 1; m_current_job_ad.LookupInteger(ATTR_RESOURCE_REQUEST_COUNT,m_current_resources_requested); dprintf( D_FULLDEBUG, "Sent job %d.%d (autocluster=%d resources_requested=%d) to the negotiator\n", m_current_job_id.cluster, m_current_job_id.proc, m_current_auto_cluster_id, m_current_resources_requested ); return true; }
// when a transferd registers itself, it identifies who it is. The connection // is then held open and the schedd periodically might send more transfer // requests to the transferd. Also, if the transferd dies, the schedd is // informed quickly and reliably due to the closed connection. bool DCSchedd::register_transferd(MyString sinful, MyString id, int timeout, ReliSock **regsock_ptr, CondorError *errstack) { ReliSock *rsock; int invalid_request = 0; ClassAd regad; ClassAd respad; std::string errstr; std::string reason; if (regsock_ptr != NULL) { // Our caller wants a pointer to the socket we used to succesfully // register the claim. The NULL pointer will represent failure and // this will only be set to something real if everything was ok. *regsock_ptr = NULL; } // This call with automatically connect to _addr, which was set in the // constructor of this object to be the schedd in question. rsock = (ReliSock*)startCommand(TRANSFERD_REGISTER, Stream::reli_sock, timeout, errstack); if( ! rsock ) { dprintf( D_ALWAYS, "DCSchedd::register_transferd: " "Failed to send command (TRANSFERD_REGISTER) " "to the schedd\n" ); errstack->push("DC_SCHEDD", 1, "Failed to start a TRANSFERD_REGISTER command."); return false; } // First, if we're not already authenticated, force that now. if (!forceAuthentication( rsock, errstack )) { dprintf( D_ALWAYS, "DCSchedd::register_transferd authentication " "failure: %s\n", errstack->getFullText().c_str() ); errstack->push("DC_SCHEDD", 1, "Failed to authenticate properly."); return false; } rsock->encode(); // set up my registration request. regad.Assign(ATTR_TREQ_TD_SINFUL, sinful); regad.Assign(ATTR_TREQ_TD_ID, id); // This is the initial registration identification ad to the schedd // It contains: // ATTR_TREQ_TD_SINFUL // ATTR_TREQ_TD_ID putClassAd(rsock, regad); rsock->end_of_message(); // Get the response from the schedd. 
rsock->decode(); // This is the response ad from the schedd: // It contains: // ATTR_TREQ_INVALID_REQUEST // // OR // // ATTR_TREQ_INVALID_REQUEST // ATTR_TREQ_INVALID_REASON getClassAd(rsock, respad); rsock->end_of_message(); respad.LookupInteger(ATTR_TREQ_INVALID_REQUEST, invalid_request); if (invalid_request == FALSE) { // not an invalid request if (regsock_ptr) *regsock_ptr = rsock; return true; } respad.LookupString(ATTR_TREQ_INVALID_REASON, reason); errstack->pushf("DC_SCHEDD", 1, "Schedd refused registration: %s", reason.c_str()); return false; }