// Shared initialization code int CondorLockFile::BuildLock( const char *l_url, const char *l_name ) { #ifdef WIN32 dprintf( D_ALWAYS, "File locks not supported under Windows\n" ); return -1; #endif // Verify the rank if ( Rank( l_url ) <= 0 ) { return -1; } // Copy the URL & name out this->lock_url = l_url; this->lock_name = l_name; // Create the lock file name from it formatstr( lock_file, "%s/%s.lock", l_url + 5, l_name ); // Build a temporary file name char hostname[128]; if ( condor_gethostname( hostname, sizeof( hostname ) ) ) { sprintf( hostname, "unknown-%d", rand( ) ); } formatstr( temp_file, "%s.%s-%d", lock_file.c_str(), hostname, getpid( ) ); dprintf( D_FULLDEBUG, "HA Lock Init: lock file='%s'\n", lock_file.c_str() ); dprintf( D_FULLDEBUG, "HA Lock Init: temp file='%s'\n", temp_file.c_str() ); // Build the lock internals return ImplementLock( ); }
// Reduce a resource-manager contact string to its canonical "host:port"
// form.  An unspecified port defaults to 2119 (the standard GRAM port).
// Returns a pointer into a function-static buffer, valid until the next call.
const char *GlobusResource::CanonicalName( const char *name )
{
	static std::string canonical;

	char *host = NULL;
	char *port = NULL;
	parse_resource_manager_string( name, &host, &port, NULL, NULL );

	const char *effective_port = ( *port != '\0' ) ? port : "2119";
	formatstr( canonical, "%s:%s", host, effective_port );

	free( host );
	free( port );

	return canonical.c_str();
}
// Validate a VacateType value.  Returns true for the two legal flavors;
// otherwise records a CA_INVALID_REQUEST error and returns false.
bool DCStartd::checkVacateType( VacateType t )
{
	if( t == VACATE_GRACEFUL || t == VACATE_FAST ) {
		return true;
	}

	std::string err_msg;
	formatstr(err_msg, "Invalid VacateType (%d)", (int)t);
	newError( CA_INVALID_REQUEST, err_msg.c_str() );
	return false;
}
/* get variable string value */
/*
 * Render the value of shell variable `vp` as a string.  Unset variables
 * yield the shared `null` string; string variables return a pointer into
 * the stored value (offset by vp->type); integer variables are formatted
 * right-to-left into a stack buffer, with a "<base>#" prefix for
 * non-decimal bases, then either field-formatted (RJUST/LJUST) or copied
 * into ATEMP storage.
 */
char *
str_val(struct tbl *vp)
{
	char *s;

	if ((vp->flag&SPECIAL))
		getspec(vp);	/* refresh special variables ($SECONDS etc.) */
	if (!(vp->flag&ISSET))
		s = null;		/* special to dollar() */
	else if (!(vp->flag&INTEGER))	/* string source */
		s = vp->val.s + vp->type;
	else {				/* integer source */
		/* worst case number length is when base=2, so use BITS(long) */
		/* minus base # number null */
		char strbuf[1 + 2 + 1 + BITS(long) + 1];
		const char *digits = (vp->flag & UCASEV_AL) ?
		    "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" :
		    "0123456789abcdefghijklmnopqrstuvwxyz";
		unsigned long n;
		int base;

		/* build the text backwards from the end of strbuf */
		s = strbuf + sizeof(strbuf);
		if (vp->flag & INT_U)
			n = (unsigned long) vp->val.i;
		else
			n = (vp->val.i < 0) ? -vp->val.i : vp->val.i;
		base = (vp->type == 0) ? 10 : vp->type;

		*--s = '\0';
		do {
			*--s = digits[n % base];
			n /= base;
		} while (n != 0);
		/* non-decimal values are printed as "<base>#<digits>" */
		if (base != 10) {
			*--s = '#';
			*--s = digits[base % 10];
			if (base >= 10)
				*--s = digits[base / 10];
		}
		if (!(vp->flag & INT_U) && vp->val.i < 0)
			*--s = '-';
		if (vp->flag & (RJUST|LJUST)) /* case already dealt with */
			s = formatstr(vp, s);
		else
			s = str_save(s, ATEMP);
	}
	return s;
}
void CheckSpoolVersion(char const *spool, int spool_min_version_i_support, int spool_cur_version_i_support, int &spool_min_version,int &spool_cur_version) { spool_min_version = 0; // before 7.5.5 there was no version stamp spool_cur_version = 0; std::string vers_fname; formatstr(vers_fname,"%s%cspool_version",spool,DIR_DELIM_CHAR); FILE *vers_file = safe_fopen_wrapper_follow(vers_fname.c_str(),"r"); if( vers_file ) { if( 1 != fscanf(vers_file, "minimum compatible spool version %d\n", &spool_min_version) ) { EXCEPT("Failed to find minimum compatible spool version in %s\n", vers_fname.c_str()); } if( 1 != fscanf(vers_file, "current spool version %d\n", &spool_cur_version) ) { EXCEPT("Failed to find current spool version in %s\n", vers_fname.c_str()); } fclose(vers_file); } dprintf(D_FULLDEBUG,"Spool format version requires >= %d (I support version %d)\n", spool_min_version, spool_cur_version_i_support); dprintf(D_FULLDEBUG,"Spool format version is %d (I require version >= %d)\n", spool_min_version, spool_min_version_i_support); if( spool_min_version > spool_cur_version_i_support ) { EXCEPT("According to %s, the SPOOL directory requires that I support spool version %d, but I only support %d.\n", vers_fname.c_str(), spool_min_version, spool_cur_version_i_support); } if( spool_cur_version < spool_min_version_i_support ) { EXCEPT("According to %s, the SPOOL directory is written in spool version %d, but I only support versions back to %d.\n", vers_fname.c_str(), spool_cur_version, spool_min_version_i_support); } }
/* set variable to string value */
/*
 * Assign string `s` to shell variable `vq`.  Returns 1 on success, 0 on
 * failure (read-only variable with error_ok set, or failed arithmetic
 * evaluation for an integer-typed variable).  Bit 0x4 of error_ok
 * suppresses the read-only check (used for special assignments).
 */
int
setstr(struct tbl *vq, const char *s, int error_ok)
{
	char *salloc = NULL;
	bool no_ro_check = tobool(error_ok & 0x4);

	error_ok &= ~0x4;
	if ((vq->flag & RDONLY) && !no_ro_check) {
		warningf(true, "read-only: %s", vq->name);
		if (!error_ok)
			errorfxz(2);
		return (0);
	}
	if (!(vq->flag&INTEGER)) {
		/* string dest */
		if ((vq->flag&ALLOC)) {
#ifndef MKSH_SMALL
			/* debugging: catch self-assignment before freeing val.s */
			if (s >= vq->val.s &&
			    s <= vq->val.s + strlen(vq->val.s)) {
				internal_errorf(
				    "setstr: %s=%s: assigning to self",
				    vq->name, s);
			}
#endif
			afree(vq->val.s, vq->areap);
		}
		vq->flag &= ~(ISSET|ALLOC);
		vq->type = 0;
		/* apply case/justify transformations before storing */
		if (s && (vq->flag & (UCASEV_AL|LCASEV|LJUST|RJUST)))
			s = salloc = formatstr(vq, s);
		if ((vq->flag&EXPORT))
			exportprep(vq, s);
		else {
			strdupx(vq->val.s, s, vq->areap);
			vq->flag |= ALLOC;
		}
	} else {
		/* integer dest: evaluate s as an arithmetic expression */
		if (!v_evaluate(vq, s, error_ok, true))
			return (0);
	}
	vq->flag |= ISSET;
	if ((vq->flag&SPECIAL))
		setspec(vq);	/* propagate to special-variable machinery */
	afree(salloc, ATEMP);
	return (1);
}
// Replace the cached remote job id and mirror it into the job ad via
// BaseJob (an empty string clears the ad attribute when job_id is NULL).
void NordugridJob::SetRemoteJobId( const char *job_id )
{
	free( remoteJobId );
	remoteJobId = job_id ? strdup( job_id ) : NULL;

	std::string full_job_id;
	if ( job_id ) {
		formatstr( full_job_id, "nordugrid %s %s", resourceManagerString,
				   job_id );
	}
	BaseJob::SetRemoteJobId( full_job_id.c_str() );
}
/*
 * Append a timestamped hex/ASCII dump of `str` (size bytes) to the access
 * log: one header line, then rows of 16 bytes formatted as
 *   OFFS | xx xx ... xx | <printable>
 * where the printable column comes from the project helper formatstr().
 * No-op when level <= 0 or the log fd is unavailable.
 */
void write_hex(int level, const char* filename, const char* funcname, int lineno, const char* head, int size, char* str)
{
	if(level<=0)
		return;

	int fd = -1;
	struct tm tm;
	struct timeval tv;
	time_t now = time (NULL);

	gettimeofday(&tv,NULL);
	localtime_r(&now, &tm);
	filename = basename(filename);

	if((fd = access_log())<=0)
		return;

	int i,j,end;
	int line=0;
	char buffer[100];
	unsigned char buf[17];

	memset(buffer,0,100);
	/* BUGFIX: filename/funcname/head are caller-controlled and unbounded;
	 * plain sprintf could overflow the 100-byte buffer.  snprintf truncates
	 * safely instead. */
	snprintf(buffer,sizeof(buffer),
		"%02d%02d-%02d:%02d:%02d.%06ld|%s(%d)%s|%s,size=[%d]\n",
		tm.tm_mon + 1, tm.tm_mday,tm.tm_hour, tm.tm_min, tm.tm_sec,tv.tv_usec,
		filename, lineno, funcname,head,size);
	write(fd, buffer,strlen(buffer));

	memset(buf,0,17);
	memset(buffer,0,100);

	/* full 16-byte rows; row text is bounded (~72 chars), fits buffer */
	line = size/16;
	for(i=0;i<line;i++) {
		memcpy(buf,str+16*i,16);
		sprintf(buffer,"%04X | %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X | %s\n",
			(i+1)*16,(unsigned char)buf[0],buf[1],buf[2],buf[3],buf[4],buf[5],buf[6],buf[7],
			buf[8],buf[9],buf[10],buf[11],buf[12],buf[13],buf[14],buf[15],formatstr(buf,16));
		write(fd, buffer, strlen(buffer));
	}

	/* trailing partial row: pad the hex column to keep the '|' aligned
	 * (6 + 16*3 = 54 is the printable-column offset) */
	end = size - 16*i;
	if(end > 0) {
		memset(buf,0,17);
		memset(buffer,0,100);
		memcpy(buf,str+16*i,end);
		sprintf(buffer,"%04X |",i*16+end);
		for(j=0;j<end;j++) {
			sprintf(buffer+6+j*3," %02X",buf[j]);
		}
		for(;j<16;j++) {
			sprintf(buffer+6+j*3,"   ");
		}
		sprintf(buffer+54," | %s\n",formatstr(buf,end));
		write(fd, buffer, strlen(buffer));
	}
}
/* Print every NEWLINE-separated sequence name in `list`, one per line,
 * after a header naming the input file and its detected format.  Names
 * longer than 255 characters are silently truncated. */
static void dumpSeqList(char *list, short format)
{
	char line[256];
	long i, pos, listlen;

	listlen = strlen(list);
	printf("Sequences in %s (format is %s)\n", inputfile, formatstr(format));

	pos = 0;
	for (i = 0; i < listlen; i++) {
		if (list[i] == (char)NEWLINE) {
			line[pos] = '\0';
			puts(line);
			pos = 0;
		} else if (pos < 255) {
			line[pos++] = list[i];
		}
	}
	putchar('\n');
}
/*
 * Join two path components with exactly one separator, trimming any
 * trailing separators from `first` and leading separators from `second`.
 * Returns a freshly formatted string from formatstr(), or NULL when
 * either input is NULL.
 */
char* combinepath(const char* first, const char *second)
{
	size_t len_first, off_second = 0;

	if (first == NULL || second == NULL)
		return NULL;

	len_first = strlen(first);
	/* BUGFIX: guard len_first > 0 -- an empty or all-separator `first`
	 * previously underflowed and read first[-1] (undefined behavior). */
	while (len_first > 0 && is_path_separator(first[len_first-1]))
		len_first--;
	while (is_path_separator(second[off_second]))
		off_second++;

	return formatstr("%.*s%c%s", (int) len_first, first,
			OSAL_DIR_SEPARATORS[0], second + off_second);
}
// Publish this resource's attributes into the given ClassAd, extending
// the BaseResource ad with the condor-specific name, the proxy identity
// (when present), and the gahp's statistics.
void CondorResource::PublishResourceAd( ClassAd *resource_ad )
{
	BaseResource::PublishResourceAd( resource_ad );

	std::string buff;
	// BUGFIX: poolName may be NULL (no pool given at construction);
	// passing NULL through %s is undefined behavior.  Use the same
	// "NULL" placeholder the constructor uses for the gahp key.
	formatstr( buff, "condor %s %s", resourceName,
			   poolName ? poolName : "NULL" );
	resource_ad->Assign( ATTR_NAME, buff.c_str() );
	if ( proxySubject ) {
		resource_ad->Assign( ATTR_X509_USER_PROXY_SUBJECT, proxySubject );
	}
	if ( proxyFQAN ) {
		resource_ad->Assign( ATTR_X509_USER_PROXY_FQAN, proxyFQAN );
	}

	gahp->PublishStats( resource_ad );
}
// Register a job with this resource.  The first time each submitter id
// is seen, OR a clause for it into the constraint expression used when
// polling the remote schedd.
void CondorResource::CondorRegisterJob( CondorJob *job, const char *submitter_id )
{
	BaseResource::RegisterJob( job );

	if ( submitter_ids.contains( submitter_id ) ) {
		return;
	}
	submitter_ids.append( submitter_id );

	if ( !submitter_constraint.empty() ) {
		submitter_constraint += "||";
	}
	formatstr_cat( submitter_constraint, "(%s=?=\"%s\")",
				   ATTR_SUBMITTER_ID, submitter_id );
}
//------------------------------------------------------------------------- bool MakePathAbsolute(MyString &filePath, MyString &errMsg) { bool result = true; if ( !fullpath( filePath.Value() ) ) { MyString currentDir; if ( ! condor_getcwd( currentDir ) ) { formatstr( errMsg, "condor_getcwd() failed with errno %d (%s) at %s:%d", errno, strerror(errno), __FILE__, __LINE__ ); result = false; } filePath = currentDir + DIR_DELIM_STRING + filePath; } return result; }
// Send one I/O usage report to the transfer queue server, then zero the
// "recent" counters and schedule the next report.  The report is a single
// space-separated line of unsigned ints:
//   <now> <usec since last report> <bytes sent> <bytes received>
//   <usec file read> <usec file write> <usec net read> <usec net write>
// When `disconnect` is true, an empty string is sent afterwards to tell
// the server this client is done.
void
DCTransferQueue::SendReport(time_t now,bool disconnect)
{
	std::string report;
	UtcTime now_usec;
	now_usec.getTime();
	// Clamp a negative interval (clock skew/first call) to zero.
	long interval = now_usec.difference_usec(m_last_report);
	if( interval < 0 ) {
		interval = 0;
	}
	formatstr(report,"%u %u %u %u %u %u %u %u",
			  (unsigned)now,
			  (unsigned)interval,
			  m_recent_bytes_sent,
			  m_recent_bytes_received,
			  m_recent_usec_file_read,
			  m_recent_usec_file_write,
			  m_recent_usec_net_read,
			  m_recent_usec_net_write);

	if( m_xfer_queue_sock ) {
		m_xfer_queue_sock->encode();
		// A failed report is only logged; the counters are still reset
		// below, so the lost sample is dropped rather than re-sent.
		if ( !m_xfer_queue_sock->put(report.c_str()) ||
			 !m_xfer_queue_sock->end_of_message() )
		{
			dprintf(D_FULLDEBUG,"Failed to send transfer queue i/o report.\n");
		}
		if( disconnect ) {
			// Tell the server we are done.
			m_xfer_queue_sock->put("");
			m_xfer_queue_sock->end_of_message();
		}
	}

	// Reset the per-interval counters and schedule the next report.
	m_recent_bytes_sent = 0;
	m_recent_bytes_received = 0;
	m_recent_usec_file_read = 0;
	m_recent_usec_file_write = 0;
	m_recent_usec_net_read = 0;
	m_recent_usec_net_write = 0;
	m_last_report = now_usec;
	m_next_report = now + m_report_interval;
}
void Defrag::saveState() { ClassAd ad; ad.Assign(ATTR_LAST_POLL,(int)m_last_poll); std::string new_state_file; formatstr(new_state_file,"%s.new",m_state_file.c_str()); FILE *fp; if( !(fp = safe_fopen_wrapper_follow(new_state_file.c_str(), "w")) ) { EXCEPT("failed to save state to %s",new_state_file.c_str()); } else { fPrintAd(fp, ad); fclose( fp ); if( rotate_file(new_state_file.c_str(),m_state_file.c_str())!=0 ) { EXCEPT("failed to save state to %s",m_state_file.c_str()); } } }
// @rootdir: 欲遍历的根目录 // @subdir: 此次要遍历的除去根目录的部分的子目录. void walk_subdir_win32(const char* rootdir, char *subdir, int deepen, fn_walk_dir fn, uint32_t *ctx) { HANDLE hFind; WIN32_FIND_DATA finddata; BOOL fOk; int64_t timestamp = 0; char cFileNameComp[_MAX_PATH]; hFind = FindFirstFile("*.*", &finddata); fOk = (hFind != INVALID_HANDLE_VALUE); while (fOk) { sprintf(cFileNameComp, "%s\\%s", subdir, finddata.cFileName); timestamp = FileTimeToUnixTime(finddata.ftLastWriteTime); if (finddata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { // 目录 if ((strcmp(finddata.cFileName, ".") != 0) && (strcmp(finddata.cFileName, "..") != 0)) { if (deepen && fn) { // 由浅到深: 是先调用fn,再去在该目上递归 fn(cFileNameComp, FILE_ATTRIBUTE_DIRECTORY, posix_mku64(finddata.nFileSizeLow, finddata.nFileSizeHigh), timestamp, ctx); } SetCurrentDirectory(formatstr("%s\\%s\\", rootdir, cFileNameComp)); walk_subdir_win32(rootdir, cFileNameComp, deepen, fn, ctx); if (!deepen && fn) { // 由深到浅: 是先在该目上递归,再去调动fn fn(cFileNameComp, FILE_ATTRIBUTE_DIRECTORY, posix_mku64(finddata.nFileSizeLow, finddata.nFileSizeHigh), timestamp, ctx); } } } else { // 文件 if (fn) { fn(cFileNameComp, 0, posix_mku64(finddata.nFileSizeLow, finddata.nFileSizeHigh), timestamp, ctx); } } fOk = FindNextFile(hFind, &finddata); } if (hFind != INVALID_HANDLE_VALUE) { FindClose(hFind); } return; }
// Fill the given ClassAd with this resource's identity, configuration,
// and job counts (idle/running tallies over all registered jobs).
void BaseResource::PublishResourceAd( ClassAd *resource_ad )
{
	std::string name_buf;
	formatstr( name_buf, "%s %s", ResourceType(), resourceName );
	resource_ad->Assign( ATTR_NAME, name_buf.c_str() );

	resource_ad->Assign( "HashName", GetHashName() );
	resource_ad->Assign( ATTR_SCHEDD_NAME, ScheddName );
	resource_ad->Assign( ATTR_SCHEDD_IP_ADDR, ScheddObj->addr() );
	resource_ad->Assign( ATTR_OWNER, myUserName );
	if ( SelectionValue ) {
		resource_ad->Assign( ATTR_GRIDMANAGER_SELECTION_VALUE, SelectionValue );
	}
	resource_ad->Assign( "NumJobs", registeredJobs.Number() );
	resource_ad->Assign( "JobLimit", jobLimit );
	resource_ad->Assign( "SubmitsAllowed", submitsAllowed.Number() );
	resource_ad->Assign( "SubmitsWanted", submitsWanted.Number() );
	if ( resourceDown ) {
		resource_ad->Assign( ATTR_GRID_RESOURCE_UNAVAILABLE_TIME,
							 (int)lastStatusChange );
	}

	// Count registered jobs by their Condor state.
	int idle_count = 0;
	int running_count = 0;
	BaseJob *job;
	registeredJobs.Rewind();
	while ( registeredJobs.Next( job ) ) {
		if ( job->condorState == IDLE ) {
			idle_count++;
		} else if ( job->condorState == RUNNING ) {
			running_count++;
		}
	}
	resource_ad->Assign( ATTR_RUNNING_JOBS, running_count );
	resource_ad->Assign( ATTR_IDLE_JOBS, idle_count );
}
static void initJobExprs() { static bool done = false; if(done) { return; } formatstr(expr_matched_or_undef, "(%s =!= %s)", ATTR_JOB_MATCHED, expr_false); formatstr(expr_managed, "(%s =?= \"%s\")", ATTR_JOB_MANAGED, MANAGED_EXTERNAL); formatstr(expr_not_managed, "(%s =!= \"%s\")", ATTR_JOB_MANAGED, MANAGED_EXTERNAL); formatstr(expr_not_held, "(%s != %d)", ATTR_JOB_STATUS, HELD); formatstr(expr_schedd_job_constraint, "(%s)", ScheddJobConstraint); // The gridmanager never wants to see this job again. // It should be in the process of leaving the queue. formatstr(expr_completely_done, "(%s =?= \"%s\")", ATTR_JOB_MANAGED, MANAGED_DONE); formatstr(expr_not_completely_done, "(%s =!= \"%s\")", ATTR_JOB_MANAGED, MANAGED_DONE); done = true; }
void WriteSpoolVersion(char const *spool,int spool_min_version_i_write,int spool_cur_version_i_support) { std::string vers_fname; formatstr(vers_fname,"%s%cspool_version",spool,DIR_DELIM_CHAR); FILE *vers_file = safe_fcreate_replace_if_exists(vers_fname.c_str(),"w"); if( !vers_file ) { EXCEPT("Failed to open %s for writing.\n",vers_fname.c_str()); } if( fprintf(vers_file,"minimum compatible spool version %d\n", spool_min_version_i_write) < 0 || fprintf(vers_file,"current spool version %d\n", spool_cur_version_i_support) < 0 || fflush(vers_file) != 0 || fsync(fileno(vers_file)) != 0 || fclose(vers_file) != 0 ) { EXCEPT("Error writing spool version to %s\n",vers_fname.c_str()); } }
// we will use ec2 command "status_all" to do the Ping work void EC2Resource::DoPing( unsigned& ping_delay, bool& ping_complete, bool& ping_succeeded ) { // Since EC2 doesn't use proxy, we should use Startup() to replace isInitialized() if ( gahp->isStarted() == false ) { dprintf( D_ALWAYS,"gahp server not up yet, delaying ping\n" ); ping_delay = 5; return; } ping_delay = 0; std::string error_code; int rc = gahp->ec2_vm_server_type( resourceName, m_public_key_file, m_private_key_file, m_serverType, error_code ); if ( rc == GAHPCLIENT_COMMAND_PENDING ) { ping_complete = false; } else if ( rc != 0 ) { ping_complete = true; // If the service returns an authorization failure, that means // the service is up, so return true. Individual jobs with // invalid authentication tokens will then go on hold, which is // what we want (rather than saying idle). if( error_code != "" ) { if( strstr( error_code.c_str(), "(401)" ) != NULL ) { ping_succeeded = true; m_hadAuthFailure = true; formatstr( authFailureMessage, "(%s): '%s'", error_code.c_str(), gahp->getErrorString() ); } } else { ping_succeeded = false; } } else { ping_complete = true; ping_succeeded = true; } return; }
bool PandadClassAdLogPlugin::getGlobalJobID( int cluster, int proc, std::string & globalJobID ) { static char const * hostname = NULL; if( hostname == NULL ) { // We can't block on name resolution, so use what we've already got. hostname = param( "FULL_HOSTNAME" ); if( hostname == NULL || strlen( hostname ) == 0 ) { hostname = param( "HOSTNAME" ); } if( hostname == NULL || strlen( hostname ) == 0 ) { hostname = param( "IP_ADDRESS" ); } if( hostname == NULL || strlen( hostname ) == 0 ) { dprintf( D_ALWAYS, "Unable to determine hostname portion of global job IDs, using '[unknown]'.\n" ); hostname = "[unknown]"; } } formatstr( globalJobID, "%s:%d.%d", hostname, cluster, proc ); return true; }
void GlobusResource::CleanupMonitorJob() { if ( monitorGramJobId ) { monitorGahp->globus_gram_client_job_cancel( monitorGramJobId ); free( monitorGramJobId ); monitorGramJobId = NULL; monitorGramJobStatus = GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN; monitorGramErrorCode = 0; } if ( monitorDirectory ) { std::string tmp_dir; formatstr( tmp_dir, "%s.remove", monitorDirectory ); MSC_SUPPRESS_WARNING_FIXME(6031) // warning: return value of 'rename' ignored. rename( monitorDirectory, tmp_dir.c_str() ); free( monitorDirectory ); monitorDirectory = NULL; Directory tmp( tmp_dir.c_str() ); tmp.Remove_Entire_Directory(); MSC_SUPPRESS_WARNING_FIXME(6031) // warning: return value of 'rmdir' ignored. rmdir( tmp_dir.c_str() ); } if(monitorJobStatusFile) { free(monitorJobStatusFile); monitorJobStatusFile = NULL; } if(monitorLogFile) { free(monitorLogFile); monitorLogFile = NULL; } }
void Reconfig() { contact_schedd_interval = param_integer ("C_GAHP_CONTACT_SCHEDD_DELAY", 5); // When GSI authentication is used, we're willing to trust schedds // which have the same credential as the job if ( proxySubjectName ) { char *daemon_subjects = param( "GSI_DAEMON_NAME" ); if ( daemon_subjects ) { std::string buff; formatstr( buff, "%s,%s", daemon_subjects, proxySubjectName ); dprintf( D_ALWAYS, "Setting %s=%s\n", "GSI_DAEMON_NAME", buff.c_str() ); // We must use our daemon subsystem prefix in case the // admin used it in the config file. config_insert( "C_GAHP_WORKER_THREAD.GSI_DAEMON_NAME", buff.c_str() ); free( daemon_subjects ); } } }
// Launch the job inside a Docker container.  Gathers the image, command,
// arguments, and environment from the job ad, opens the std fds as the
// user, then hands everything to DockerAPI::run(), which Create_Process()es
// the docker client so the normal reaper fires when the container exits.
// Returns TRUE on success, FALSE/0 on failure.
int DockerProc::StartJob() {
	std::string imageID;

	if( ! JobAd->LookupString( ATTR_DOCKER_IMAGE, imageID ) ) {
		dprintf( D_ALWAYS | D_FAILURE, "%s not defined in job ad, unable to start job.\n", ATTR_DOCKER_IMAGE );
		return FALSE;
	}

	std::string command;
	JobAd->LookupString( ATTR_JOB_CMD, command );
	dprintf( D_FULLDEBUG, "%s: '%s'\n", ATTR_JOB_CMD, command.c_str() );

	std::string sandboxPath = Starter->jic->jobRemoteIWD();

	//
	// This code is deliberately wrong, probably for backwards-compability.
	// (See the code in JICShadow::beginFileTransfer(), which assumes that
	// we transferred the executable if ATTR_TRANSFER_EXECUTABLE is unset.)
	// Rather than risk breaking anything by fixing condor_submit (which
	// does not set ATTR_TRANSFER_EXECUTABLE unless it's false) -- and
	// introducing a version dependency -- assume the executable was
	// transferred unless it was explicitly noted otherwise.
	//
	bool transferExecutable = true;
	JobAd->LookupBool( ATTR_TRANSFER_EXECUTABLE, transferExecutable );
	if( transferExecutable ) {
		// Transferred executables live in the sandbox; absolutize the path.
		command = sandboxPath + "/" + command;
	}

	ArgList args;
	args.SetArgV1SyntaxToCurrentPlatform();
	MyString argsError;
	if( ! args.AppendArgsFromClassAd( JobAd, & argsError ) ) {
		dprintf( D_ALWAYS | D_FAILURE, "Failed to read job arguments from job ad: '%s'.\n", argsError.c_str() );
		return FALSE;
	}

	Env job_env;
	MyString env_errors;
	if( !Starter->GetJobEnv(JobAd,&job_env,&env_errors) ) {
		dprintf( D_ALWAYS, "Aborting DockerProc::StartJob: %s\n", env_errors.Value());
		return 0;
	}

	// The GlobalJobID is unsuitable by virtue its octothorpes.  This
	// construction is informative, but could be made even less likely
	// to collide if it had a timestamp.
	formatstr( containerName, "HTCJob%d_%d_%s_PID%d",
		Starter->jic->jobCluster(),
		Starter->jic->jobProc(),
		Starter->getMySlotName().c_str(), // note: this can be "" for single slot machines.
		getpid() );

	//
	// Do I/O redirection (includes streaming).
	//
	// -2 marks "not opened"; all three are replaced below or we bail out.
	int childFDs[3] = { -2, -2, -2 };
	{
	// Open the std files with the *user's* privileges, not the daemon's.
	TemporaryPrivSentry sentry(PRIV_USER);
	// getStdFile() returns -1 on error.

	if( -1 == (childFDs[0] = openStdFile( SFT_IN, NULL, true, "Input file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stdin.\n" );
		return FALSE;
	}
	if( -1 == (childFDs[1] = openStdFile( SFT_OUT, NULL, true, "Output file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stdout.\n" );
		// Close the fd we already opened before bailing.
		daemonCore->Close_FD( childFDs[0] );
		return FALSE;
	}
	if( -1 == (childFDs[2] = openStdFile( SFT_ERR, NULL, true, "Error file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stderr.\n" );
		daemonCore->Close_FD( childFDs[0] );
		daemonCore->Close_FD( childFDs[1] );
		return FALSE;
	}
	}

	// Ulog the execute event
	Starter->jic->notifyJobPreSpawn();

	CondorError err;
	// DockerAPI::run() returns a PID from daemonCore->Create_Process(), which
	// makes it suitable for passing up into VanillaProc.  This combination
	// will trigger the reaper(s) when the container terminates.

	ClassAd *machineAd = Starter->jic->machClassAd();

	std::list<std::string> extras;
	buildExtraVolumes(extras);

	int rv = DockerAPI::run( *machineAd, containerName, imageID, command, args, job_env, sandboxPath, extras, JobPid, childFDs, err );
	if( rv < 0 ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerAPI::run( %s, %s, ... ) failed with return value %d\n", imageID.c_str(), command.c_str(), rv );
		return FALSE;
	}
	dprintf( D_FULLDEBUG, "DockerAPI::run() returned pid %d\n", JobPid );
	// TODO: Start a timer to poll for job usage updates.

	++num_pids; // Used by OsProc::PublishUpdateAd().
	return TRUE;
}
// Reaper for the docker client process: when it exits, inspect the
// container (with retries, since the docker daemon may lag the container's
// actual exit) to decide whether the job was OOM-killed, failed inside
// docker, or exited normally -- then substitute the container's exit code
// for the client's and defer to VanillaProc::JobReaper().
bool DockerProc::JobReaper( int pid, int status ) {
	TemporaryPrivSentry sentry(PRIV_ROOT);
	dprintf( D_ALWAYS, "DockerProc::JobReaper()\n" );

	//
	// This should mean that the container has terminated.
	//
	if( pid == JobPid ) {
		//
		// Even running Docker in attached mode, we have a race condition
		// between the container exiting and the docker daemon noticing
		// that the container has exited.
		//
		int rv = -1;
		bool running = false;
		ClassAd dockerAd;
		CondorError error;
		// Years of careful research.
		// Retry inspect for up to ~20 seconds before giving up.
		for( int i = 0; i < 20; ++i ) {
			rv = DockerAPI::inspect( containerName, & dockerAd, error );
			if( rv < 0 ) {
				dprintf( D_FULLDEBUG, "Failed to inspect (for removal) container '%s'; sleeping a second (%d already slept) to give Docker a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			if( ! dockerAd.LookupBool( "Running", running ) ) {
				dprintf( D_FULLDEBUG, "Inspection of container '%s' failed to reveal its running state; sleeping a second (%d already slept) to give Docke a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			if( running ) {
				dprintf( D_FULLDEBUG, "Inspection reveals that container '%s' is still running; sleeping a second (%d already slept) to give Docker a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			break;
		}

		// FIXME: Move all this shared conditional-checking into a function.

		// Inspect never succeeded: assume the image itself was bad and
		// put the job on hold.
		if( rv < 0 ) {
			dprintf( D_ALWAYS | D_FAILURE, "Failed to inspect (for removal) container '%s'.\n", containerName.c_str() );
			std::string imageName;
			if( ! JobAd->LookupString( ATTR_DOCKER_IMAGE, imageName ) ) {
				dprintf( D_ALWAYS | D_FAILURE, "%s not defined in job ad.\n", ATTR_DOCKER_IMAGE );
				imageName = "Unknown"; // shouldn't ever happen
			}

			std::string message;
			formatstr(message, "Cannot start container: invalid image name: %s", imageName.c_str());

			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_InvalidDockerImage, 0);
			return VanillaProc::JobReaper( pid, status );
		}

		if( ! dockerAd.LookupBool( "Running", running ) ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal its running state.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}

		if( running ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection reveals that container '%s' is still running.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}

		// FIXME: Rethink returning a classad.  Having to check for missing
		// attributes blows.

		// TODO: Set status appropriately (as if it were from waitpid()).

		// OOM kill: hold the job, remove the container, shut down.
		std::string oomkilled;
		if (! dockerAd.LookupString( "OOMKilled", oomkilled)) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal whether it was OOM killed. Assuming it was not.\n", containerName.c_str() );
		}

		if (oomkilled.find("true") == 0) {
			ClassAd *machineAd = Starter->jic->machClassAd();
			int memory;
			machineAd->LookupInteger(ATTR_MEMORY, memory);
			std::string message;
			formatstr(message, "Docker job exhaused %d Mb memory", memory);
			dprintf(D_ALWAYS, "%s, going on hold\n", message.c_str());

			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_JobOutOfResources, 0);
			DockerAPI::rm( containerName, error );

			if ( Starter->Hold( ) ) {
				Starter->allJobsDone();
				this->JobExit();
			}

			Starter->ShutdownFast();
			return 0;
		}

		// See if docker could not run the job
		// most likely invalid executable
		std::string dockerError;
		if (! dockerAd.LookupString( "DockerError", dockerError)) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal whether there was an internal docker error.\n", containerName.c_str() );
		}

		if (dockerError.length() > 0) {
			std::string message;
			formatstr(message, "Error running docker job: %s", dockerError.c_str());
			dprintf(D_ALWAYS, "%s, going on hold\n", message.c_str());

			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_FailedToCreateProcess, 0);
			DockerAPI::rm( containerName, error );

			if ( Starter->Hold( ) ) {
				Starter->allJobsDone();
				this->JobExit();
			}

			Starter->ShutdownFast();
			return 0;
		}

		// Normal exit: report the container's exit code, not the client's.
		int dockerStatus;
		if( ! dockerAd.LookupInteger( "ExitCode", dockerStatus ) ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal its exit code.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}
		dprintf( D_FULLDEBUG, "Setting status of Docker job to %d.\n", dockerStatus );
		status = dockerStatus;

		// TODO: Record final job usage.

		// We don't have to do any process clean-up, because container.
		// We'll do the disk clean-up after we've transferred files.
	}

	// This helps to make ssh-to-job more plausible.
	return VanillaProc::JobReaper( pid, status );
}
// Construct a CondorResource for the given remote schedd (and optional
// pool), caching the proxy identity when one is supplied.  Registers the
// schedd-poll timer and creates three GahpClient instances (poll, ping,
// lease) that all share one key and the same condor_gahp process args.
CondorResource::CondorResource( const char *resource_name, const char *pool_name, const Proxy *proxy )
	: BaseResource( resource_name )
{
	hasLeases = true;

	if ( proxy != NULL ) {
		proxySubject = strdup( proxy->subject->subject_name );
		proxyFQAN = strdup( proxy->subject->fqan );
	} else {
		proxySubject = NULL;
		proxyFQAN = NULL;
	}

	scheddPollTid = TIMER_UNSET;
	scheddName = strdup( resource_name );
	gahp = NULL;
	ping_gahp = NULL;
	scheddStatusActive = false;
	submitter_constraint = "";
	if ( pool_name != NULL ) {
		poolName = strdup( pool_name );
	} else {
		poolName = NULL;
	}

	// Fire immediately; DoScheddPoll reschedules itself thereafter.
	scheddPollTid = daemonCore->Register_Timer( 0,
							(TimerHandlercpp)&CondorResource::DoScheddPoll,
							"CondorResource::DoScheddPoll", (Service*)this );

	char *gahp_path = param("CONDOR_GAHP");
	if ( gahp_path == NULL ) {
		EXCEPT( "CONDOR_GAHP not defined in condor config file" );
	} else {
		// TODO remove scheddName from the gahp server key if/when
		//   a gahp server can handle multiple schedds
		std::string buff;
		ArgList args;
		// Gahp key: "CONDOR/<pool>/<schedd>/<fqan>" with "NULL" placeholders.
		formatstr( buff, "CONDOR/%s/%s/%s", poolName ? poolName : "NULL",
				   scheddName, proxyFQAN ? proxyFQAN : "NULL" );
		args.AppendArg("-f");
		args.AppendArg("-s");
		args.AppendArg(scheddName);
		if ( poolName != NULL ) {
			args.AppendArg("-P");
			args.AppendArg(poolName);
		}

		gahp = new GahpClient( buff.c_str(), gahp_path, &args );
		gahp->setNotificationTimerId( scheddPollTid );
		gahp->setMode( GahpClient::normal );
		gahp->setTimeout( CondorJob::gahpCallTimeout );

		// pingTimerId and updateLeasesTimerId are presumably set up by
		// BaseResource -- not visible here; confirm before relying on it.
		ping_gahp = new GahpClient( buff.c_str(), gahp_path, &args );
		ping_gahp->setNotificationTimerId( pingTimerId );
		ping_gahp->setMode( GahpClient::normal );
		ping_gahp->setTimeout( CondorJob::gahpCallTimeout );

		lease_gahp = new GahpClient( buff.c_str(), gahp_path, &args );
		lease_gahp->setNotificationTimerId( updateLeasesTimerId );
		lease_gahp->setMode( GahpClient::normal );
		lease_gahp->setTimeout( CondorJob::gahpCallTimeout );

		free( gahp_path );
	}
}
// Renew the job leases for all registered jobs on the remote schedd.
// Two-phase gahp pattern: the first call issues the command (returning
// GAHPCLIENT_COMMAND_PENDING), and a later invocation with the same
// command collects the result.  Outputs: update_delay (retry delay when
// the gahp isn't ready), update_complete, and the list of job ids whose
// leases were successfully updated.
void CondorResource::DoUpdateLeases( unsigned& update_delay, bool& update_complete, SimpleList<PROC_ID>& update_succeeded )
{
	int rc;
	BaseJob *curr_job;
	SimpleList<PROC_ID> jobs;
	SimpleList<int> expirations;
	SimpleList<PROC_ID> updated;

	dprintf(D_FULLDEBUG,"*** DoUpdateLeases called\n");

	if ( lease_gahp->isStarted() == false ) {
		dprintf( D_ALWAYS,"gahp server not up yet, delaying lease update\n" );
		update_delay = 5;
		return;
	}

	update_delay = 0;

	if ( leaseUpdates.IsEmpty() ) {
		dprintf( D_FULLDEBUG, "*** Job lease list empty, returning success immediately\n" );
		update_complete = true;
		return;
	}

	// Build the job-id/expiration lists only when issuing the command;
	// on the collection pass the gahp already has them.
	if ( updateLeasesCmdActive == false ) {
		leaseUpdates.Rewind();
		while ( leaseUpdates.Next( curr_job ) ) {
				// TODO When remote-job-id is homogenized and stored in
				//   BaseJob, BaseResource can skip jobs that don't have a
				//   a remote-job-id yet
			if ( ((CondorJob*)curr_job)->remoteJobId.cluster != 0 ) {
				jobs.Append( ((CondorJob*)curr_job)->remoteJobId );
				expirations.Append( m_sharedLeaseExpiration );
			}
		}
	}

	rc = lease_gahp->condor_job_update_lease( scheddName, jobs, expirations,
											  updated );

	if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
		update_complete = false;
	} else if ( rc != 0 ) {
		dprintf( D_FULLDEBUG, "*** Lease update failed!\n" );
		update_complete = true;
	} else {
		dprintf( D_FULLDEBUG, "*** Lease udpate succeeded!\n" );
		update_complete = true;

		// Map each updated remote id back to the local job via the
		// "condor <schedd> <pool> <cluster>.<proc>" key.
		// NOTE(review): poolName can be NULL here (see the constructor);
		// NULL through %s is undefined behavior -- verify.
		PROC_ID curr_id;
		std::string id_str;
		updated.Rewind();
		while ( updated.Next( curr_id ) ) {
			formatstr( id_str, "condor %s %s %d.%d", scheddName, poolName,
					   curr_id.cluster, curr_id.proc );
			if ( BaseJob::JobsByRemoteId.lookup( HashKey( id_str.c_str() ),
												 curr_job ) == 0 ) {
				update_succeeded.Append( curr_job->procID );
			}
		}
	}
}
// Timer handler that polls the remote schedd for the status of all
// registered jobs.  Polls are shared across every CondorResource talking
// to the same schedd/pool (tracked in PollInfoByName), and the gahp
// command runs in two phases: issue the constrained status query, then on
// a later invocation collect the resulting ads, route each ad to its job,
// and notify any job that unexpectedly had no ad.
void CondorResource::DoScheddPoll()
{
	int rc;
	ScheddPollInfo *poll_info = NULL;

	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 scheddStatusActive == false ) {
			// No jobs or we can't talk to the schedd, so no point
			// in polling
		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
		return;
	}

	if ( gahp->isStarted() == false ) {
		// The gahp isn't started yet. Wait a few seconds for a CondorJob
		// object to start it (and possibly initialize x509 credentials).
		daemonCore->Reset_Timer( scheddPollTid, 5 );
		return;
	}

	PollInfoByName.lookup( HashKey( HashName( scheddName, poolName, NULL ) ),
						   poll_info );

	// Disable the timer; every exit path below re-arms it (or the pending
	// gahp command will re-trigger us via its notification timer).
	daemonCore->Reset_Timer( scheddPollTid, TIMER_NEVER );

	if ( scheddStatusActive == false ) {

			// We share polls across all CondorResource objects going to
			// the same schedd. If another object has done a poll
			// recently, then don't bother doing one ourselves.
		if ( poll_info == NULL ) {
			poll_info = new ScheddPollInfo;
			poll_info->m_lastPoll = 0;
			poll_info->m_pollActive = false;
			PollInfoByName.insert( HashKey( HashName( scheddName, poolName,
													  NULL ) ),
								   poll_info );
		}

		if ( poll_info->m_pollActive == true ||
			 poll_info->m_lastPoll + BatchStatusInterval() > time(NULL) ) {
			daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
			return;
		}

			// start schedd status command
		dprintf( D_FULLDEBUG, "Starting collective poll: %s\n",
				 scheddName );
		std::string constraint;

			// create a list of jobs we expect to hear about in our
			// status command
			// Since we're sharing the results of this status command with
			// all CondorResource objects going to the same schedd, look
			// for their jobs as well.
		poll_info->m_submittedJobs.Rewind();
		while ( poll_info->m_submittedJobs.Next() ) {
			poll_info->m_submittedJobs.DeleteCurrent();
		}
		CondorResource *next_resource;
		BaseJob *job;
		std::string job_id;
		ResourcesByName.startIterations();
		while ( ResourcesByName.iterate( next_resource ) != 0 ) {
			// Only resources for the same schedd *and* pool share this poll.
			if ( strcmp( scheddName, next_resource->scheddName ) ||
				 strcmp( poolName ? poolName : "",
						 next_resource->poolName ? next_resource->poolName : "" ) ) {
				continue;
			}
			next_resource->registeredJobs.Rewind();
			while ( ( job = next_resource->registeredJobs.Next() ) ) {
				// Only jobs that already have a remote id can show up.
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
					poll_info->m_submittedJobs.Append( (CondorJob *)job );
				}
			}
		}

		formatstr( constraint, "(%s)", submitter_constraint.c_str() );

		rc = gahp->condor_job_status_constrained( scheddName,
												  constraint.c_str(),
												  NULL, NULL );
		if ( rc != GAHPCLIENT_COMMAND_PENDING ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd: %s\n",
					 rc, scheddName );
			EXCEPT( "condor_job_status_constrained failed!" );
		}
		scheddStatusActive = true;
		poll_info->m_pollActive = true;

	} else {
			// finish schedd status command
		int num_status_ads;
		ClassAd **status_ads = NULL;

		ASSERT( poll_info );
		rc = gahp->condor_job_status_constrained( NULL, NULL,
												  &num_status_ads,
												  &status_ads );
		if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
			// Still running; the gahp will re-trigger this timer.
			return;
		} else if ( rc != 0 ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd %s\n",
					 rc, scheddName );
			dprintf( D_ALWAYS, "Requesting ping of resource\n" );
			RequestPing( NULL );
		}

		if ( rc == 0 ) {
			// Route each returned ad to its job (keyed by
			// "condor <schedd> <pool> <cluster>.<proc>"); ads with no
			// matching job are discarded.
			for ( int i = 0; i < num_status_ads; i++ ) {
				int cluster, proc;
				int rc2;
				std::string job_id_string;
				BaseJob *base_job = NULL;
				CondorJob *job;

				if( status_ads[i] == NULL ) {
					dprintf(D_ALWAYS, "DoScheddPoll was given null pointer for classad #%d\n", i);
					continue;
				}

				status_ads[i]->LookupInteger( ATTR_CLUSTER_ID, cluster );
				status_ads[i]->LookupInteger( ATTR_PROC_ID, proc );

				formatstr( job_id_string, "condor %s %s %d.%d", scheddName,
						   poolName, cluster, proc );

				rc2 = BaseJob::JobsByRemoteId.lookup( HashKey( job_id_string.c_str() ),
													  base_job );
				job = dynamic_cast<CondorJob*>( base_job );
				if ( rc2 == 0 ) {
					// NotifyNewRemoteStatus takes ownership of the ad.
					job->NotifyNewRemoteStatus( status_ads[i] );
					poll_info->m_submittedJobs.Delete( job );
				} else {
					delete status_ads[i];
				}
			}

			poll_info->m_lastPoll = time(NULL);
		}
		poll_info->m_pollActive = false;

		// Free only the array; the ads themselves were handed off or
		// deleted above.
		if ( status_ads != NULL ) {
			free( status_ads );
		}

			// Check if any jobs were missing from the status result
		if ( rc == 0 ) {
			CondorJob *job;
			std::string job_id;
			poll_info->m_submittedJobs.Rewind();
			while ( ( job = poll_info->m_submittedJobs.Next() ) ) {
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
						// We should have gotten a status ad for this job,
						// but didn't. Tell the job that there may be
						// something wrong by giving it a NULL status ad.
					job->NotifyNewRemoteStatus( NULL );
				}
				poll_info->m_submittedJobs.DeleteCurrent();
			}
		}

		scheddStatusActive = false;

		dprintf( D_FULLDEBUG, "Collective poll complete: %s\n", scheddName );

		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
	}
}
bool GlobusResource::SubmitMonitorJob() { // return true if job submitted, else return false int now = time(NULL); int rc; char *monitor_executable; std::string contact; std::string RSL; StopMonitorJob(); /* Create monitor directory and files */ g_MonitorUID++; std::string buff; formatstr( buff, "%s/grid-monitor.%s.%d", GridmanagerScratchDir, resourceName, g_MonitorUID ); monitorDirectory = strdup( buff.c_str() ); if ( mkdir( monitorDirectory, 0700 ) < 0 ) { dprintf( D_ALWAYS, "SubmitMonitorJob: mkdir(%s,0700) failed, " "errno=%d (%s)\n", monitorDirectory, errno, strerror( errno ) ); free( monitorDirectory ); monitorDirectory = NULL; return false; } formatstr( buff, "%s/grid-monitor-job-status", monitorDirectory ); monitorJobStatusFile = strdup( buff.c_str() ); formatstr( buff, "%s/grid-monitor-log", monitorDirectory ); monitorLogFile = strdup( buff.c_str() ); rc = creat( monitorJobStatusFile, S_IREAD|S_IWRITE ); if ( rc < 0 ) { dprintf( D_ALWAYS, "Failed to submit grid_monitor to %s: " "creat(%s,%d) failed, errno=%d (%s)\n", resourceName, monitorJobStatusFile, S_IREAD|S_IWRITE, errno, strerror( errno ) ); return false; } else { close( rc ); } rc = creat( monitorLogFile, S_IREAD|S_IWRITE ); if ( rc < 0 ) { dprintf( D_ALWAYS, "Failed to submit grid_monitor to %s: " "creat(%s,%d) failed, errno=%d (%s)\n", resourceName, monitorLogFile, S_IREAD|S_IWRITE, errno, strerror( errno ) ); return false; } else { close( rc ); } jobStatusFileLastReadTime = now; logFileLastReadTime = now; monitor_executable = param( "GRID_MONITOR" ); if ( monitor_executable == NULL ) { dprintf( D_ALWAYS, "Failed to submit grid_monitor to %s: " "GRID_MONITOR not defined!\n", resourceName ); return false; } monitorGahp->setMode( GahpClient::normal ); const char *gassServerUrl = monitorGahp->getGlobusGassServerUrl(); formatstr( RSL, "&(executable=%s%s)(stdout=%s%s)(arguments='--dest-url=%s%s')", gassServerUrl, monitor_executable, gassServerUrl, monitorLogFile, gassServerUrl, monitorJobStatusFile ); 
free( monitor_executable ); formatstr( contact, "%s/jobmanager-fork", resourceName ); std::string job_contact; rc = monitorGahp->globus_gram_client_job_request( contact.c_str(), RSL.c_str(), 1, monitorGahp->getGt2CallbackContact(), job_contact, false ); if ( rc != GAHPCLIENT_COMMAND_PENDING ) { dprintf( D_ALWAYS, "Failed to submit grid_monitor to %s: " "globus_gram_client_job_request() returned %d!\n", resourceName, rc ); return false; } monitorSubmitActive = true; return true; }
void BaseResource::Reconfig() { int tmp_int; char *param_value; std::string param_name; tmp_int = param_integer( "GRIDMANAGER_RESOURCE_PROBE_INTERVAL", 5 * 60 ); setProbeInterval( tmp_int ); jobLimit = -1; formatstr( param_name, "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE_%s", ResourceType() ); param_value = param( param_name.c_str() ); if ( param_value == NULL ) { param_value = param( "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE" ); } if ( param_value != NULL ) { char *tmp1; char *tmp2; StringList limits( param_value ); limits.rewind(); if ( limits.number() > 0 ) { jobLimit = atoi( limits.next() ); while ( (tmp1 = limits.next()) && (tmp2 = limits.next()) ) { if ( strstr( resourceName, tmp1 ) != 0 ) { jobLimit = atoi( tmp2 ); } } } free( param_value ); } if ( jobLimit <= 0 ) { jobLimit = DEFAULT_MAX_SUBMITTED_JOBS_PER_RESOURCE; } // If the jobLimit was widened, move jobs from Wanted to Allowed and // signal them while ( submitsAllowed.Length() < jobLimit && submitsWanted.Length() > 0 ) { BaseJob *wanted_job = submitsWanted.Head(); submitsWanted.Delete( wanted_job ); submitsAllowed.Append( wanted_job ); wanted_job->SetEvaluateState(); } formatstr( param_name, "GRIDMANAGER_JOB_PROBE_RATE_%s", ResourceType() ); m_paramJobPollRate = param_integer( param_name.c_str(), -1 ); if ( m_paramJobPollRate <= 0 ) { m_paramJobPollRate = param_integer( "GRIDMANAGER_JOB_PROBE_RATE", DEFAULT_JOB_POLL_RATE ); } if ( m_paramJobPollRate <= 0 ) { m_paramJobPollRate = DEFAULT_JOB_POLL_RATE; } const char *legacy_job_poll_param = NULL; const char *type = ResourceType(); if ( strcmp( type, "condor" ) == 0 ) { legacy_job_poll_param = "CONDOR_JOB_POLL_INTERVAL"; } else if ( strcmp( type, "batch" ) == 0 || strcmp( type, "pbs" ) == 0 || strcmp( type, "lsf" ) == 0 || strcmp( type, "nqs" ) == 0 || strcmp( type, "sge" ) == 0 || strcmp( type, "naregi" ) == 0 ) { legacy_job_poll_param = "INFN_JOB_POLL_INTERVAL"; } formatstr( param_name, "GRIDMANAGER_JOB_PROBE_INTERVAL_%s", ResourceType() ); 
m_paramJobPollInterval = param_integer( param_name.c_str(), -1 ); if ( m_paramJobPollInterval <= 0 ) { m_paramJobPollInterval = param_integer( "GRIDMANAGER_JOB_PROBE_INTERVAL", -1 ); } if ( m_paramJobPollInterval <= 0 && legacy_job_poll_param ) { m_paramJobPollInterval = param_integer( legacy_job_poll_param, -1 ); } if ( m_paramJobPollInterval <= 0 ) { m_paramJobPollInterval = DEFAULT_JOB_POLL_INTERVAL; } SetJobPollInterval(); _collectorUpdateInterval = param_integer ( "GRIDMANAGER_COLLECTOR_UPDATE_INTERVAL", 5*60 ); }