void HoldJob( const char* long_reason, const char* short_reason, int reason_code, int reason_subcode ) { char subject[ BUFSIZ ]; FILE *mailer; sprintf( subject, "Condor Job %d.%d put on hold\n", Proc->id.cluster, Proc->id.proc ); if( ! JobAd ) { dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!\n" ); exit( JOB_SHOULD_HOLD ); } ExitReason = JOB_SHOULD_HOLD; if ( !ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ) { dprintf( D_ALWAYS, "Failed to connect to schedd!\n" ); } SetAttributeString( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON, short_reason ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_CODE, reason_code ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_SUBCODE, reason_subcode ); if ( !DisconnectQ(0) ) { dprintf( D_ALWAYS, "Failed to commit updated job queue status!\n" ); } mailer = email_user_open(JobAd, subject); if( ! mailer ) { // User didn't want email, so just exit now with the right // value so the schedd actually puts the job on hold. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); } fprintf( mailer, "Your condor job " ); if( Proc->args_v1or2[0] ) { ArgList args; MyString args_string; args.AppendArgsV1or2Raw(Proc->args_v1or2[0],NULL); args.GetArgsStringForDisplay(&args_string); fprintf( mailer, "%s %s ", Proc->cmd[0], args_string.Value() ); } else { fprintf( mailer, "%s ", Proc->cmd[0] ); } fprintf( mailer, "\nis being put on hold.\n\n" ); fprintf( mailer, "%s", long_reason ); email_close(mailer); // Now that the user knows why, exit with the right code. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); }
// Deliver the message currently being composed (if any) and reset
// this Email object so it can be reused for another message.
// Returns false when no message was ever opened.
bool
Email::send( void )
{
	if( fp == NULL ) {
			// Never opened, so there is nothing to deliver.
		return false;
	}
	email_close( fp );
	init();
	return true;
}
void NotifyUser( char *buf, PROC *proc ) { FILE *mailer; char subject[ BUFSIZ ]; dprintf(D_FULLDEBUG, "NotifyUser() called.\n"); sprintf( subject, "Condor Job %d.%d", proc->id.cluster, proc->id.proc ); if( ! JobAd ) { dprintf( D_ALWAYS, "In NotifyUser() w/ NULL JobAd!\n" ); return; } // email HACK for John Bent <*****@*****.**> // added by Derek Wright <*****@*****.**> 2005-02-20 char* email_cc = param( "EMAIL_NOTIFICATION_CC" ); if( email_cc ) { bool allows_cc = true; int bool_val; if( JobAd->LookupBool(ATTR_ALLOW_NOTIFICATION_CC, bool_val) ) { dprintf( D_FULLDEBUG, "Job defined %s to %s\n", ATTR_ALLOW_NOTIFICATION_CC, bool_val ? "TRUE" : "FALSE" ); allows_cc = (bool)bool_val; } else { dprintf( D_FULLDEBUG, "%s not defined, assuming TRUE\n", ATTR_ALLOW_NOTIFICATION_CC ); } if( allows_cc ) { dprintf( D_FULLDEBUG, "%s is TRUE, sending email to \"%s\"\n", ATTR_ALLOW_NOTIFICATION_CC, email_cc ); mailer = email_open( email_cc, subject ); publishNotifyEmail( mailer, buf, proc ); email_close( mailer ); } else { dprintf( D_FULLDEBUG, "%s is FALSE, not sending email copy\n", ATTR_ALLOW_NOTIFICATION_CC ); } free( email_cc ); email_cc = NULL; } /* If user loaded program incorrectly, always send a message. */ if( MainSymbolExists == TRUE ) { switch( proc->notification ) { case NOTIFY_NEVER: return; case NOTIFY_ALWAYS: break; case NOTIFY_COMPLETE: if( proc->status == COMPLETED ) { break; } else { return; } case NOTIFY_ERROR: if( (proc->status == COMPLETED) && (WTERMSIG(JobStatus)!= 0) ) { break; } else { return; } default: dprintf(D_ALWAYS, "Condor Job %d.%d has a notification of %d\n", proc->id.cluster, proc->id.proc, proc->notification ); } } mailer = email_user_open(JobAd, subject); if( mailer == NULL ) { dprintf(D_ALWAYS, "Shadow: Cannot notify user( %s, %s, %s )\n", subject, proc->owner, "w" ); return; } publishNotifyEmail( mailer, buf, proc ); email_close(mailer); }
// Examine the transfer queue and decide which waiting clients may run.
// Steps: (1) tally per-user running/idle counts and total active
// uploads/downloads, (2) adjust the disk-load concurrency throttle if
// enabled, (3) hand out GoAhead to the best waiting clients, (4) gather
// statistics about clients that are still waiting, and (5) when clients
// are waiting, forcibly dequeue one "ancient" active transfer that has
// exceeded MAX_TRANSFER_QUEUE_AGE.
void TransferQueueManager::CheckTransferQueue() {
	TransferQueueRequest *client = NULL;
	int downloading = 0;
	int uploading = 0;
	bool clients_waiting = false;

		// We are running now, so clear the pending-check timer id.
	m_check_queue_timer = -1;

	ClearTransferCounts();

		// Pass 1: count running/idle transfers, per user and in total.
	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( client->m_gave_go_ahead ) {
			GetUserRec(client->m_up_down_queue_user).running++;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
		else {
			GetUserRec(client->m_up_down_queue_user).idle++;
		}
	}

	if( m_throttle_disk_load ) {
		int old_concurrency_limit = m_throttle_disk_load_max_concurrency;
			// Combined read+write disk load over the short and long
			// exponential-moving-average horizons.
		double disk_load_short = m_iostats.file_read.EMAValue(m_disk_throttle_short_horizon.c_str()) + m_iostats.file_write.EMAValue(m_disk_throttle_short_horizon.c_str());
		double disk_load_long = m_iostats.file_read.EMAValue(m_disk_throttle_long_horizon.c_str()) + m_iostats.file_write.EMAValue(m_disk_throttle_long_horizon.c_str());

		if( disk_load_short > m_disk_load_high_throttle ) {
				// above the high water mark, do not start more transfers
			m_throttle_disk_load_max_concurrency = uploading + downloading;
		}
		else if( disk_load_long > m_disk_load_low_throttle || disk_load_short > m_disk_load_low_throttle ) {
				// between the high and low water mark, keep the
				// concurrency limit as is (but at least 1)
			if( m_throttle_disk_load_max_concurrency < 1 ) {
				m_throttle_disk_load_max_concurrency = 1;
				m_throttle_disk_load_incremented = time(NULL);
			}
		}
		else {
				// below the low water mark, slowly increase the
				// concurrency limit if we are running into it
			if( uploading + downloading == m_throttle_disk_load_max_concurrency ) {
				time_t now = time(NULL);
				if( m_throttle_disk_load_incremented > now ) {
					m_throttle_disk_load_incremented = now; // clock jumped back
				}
					// Only bump the limit once per increment_wait period.
				if( m_throttle_disk_load_incremented == 0 || now-m_throttle_disk_load_incremented >= m_throttle_disk_load_increment_wait ) {
					m_throttle_disk_load_incremented = now;
					m_throttle_disk_load_max_concurrency += 1;
						// Jump straight up to the low-throttle floor.
					if( m_throttle_disk_load_max_concurrency < floor(m_disk_load_low_throttle) ) {
						m_throttle_disk_load_max_concurrency = floor(m_disk_load_low_throttle);
					}
				}
			}
		}

		if( old_concurrency_limit != m_throttle_disk_load_max_concurrency ) {
			dprintf(D_ALWAYS,
					"TransferQueueManager: adjusted concurrency limit by %+d based on disk load: "
					"new limit %d, load %s %f %s %f, throttle %f to %f\n",
					m_throttle_disk_load_max_concurrency-old_concurrency_limit,
					m_throttle_disk_load_max_concurrency,
					m_disk_throttle_short_horizon.c_str(), disk_load_short,
					m_disk_throttle_long_horizon.c_str(), disk_load_long,
					m_disk_load_low_throttle,
					m_disk_load_high_throttle);
		}
	}

		// schedule new transfers
		// (a max_uploads/max_downloads value <= 0 means unlimited)
	while( uploading < m_max_uploads || m_max_uploads <= 0 ||
		   downloading < m_max_downloads || m_max_downloads <= 0 )
	{
		TransferQueueRequest *best_client = NULL;
		int best_recency = 0;
		unsigned int best_running_count = 0;

			// Stop scheduling when the disk-load throttle is saturated.
		if( m_throttle_disk_load &&
			(uploading + downloading >= m_throttle_disk_load_max_concurrency) )
		{
			break;
		}

			// Scan all waiting clients and pick the best candidate.
		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				continue;
			}
			if( (client->m_downloading && (downloading < m_max_downloads || m_max_downloads <= 0)) ||
				((!client->m_downloading) && (uploading < m_max_uploads || m_max_uploads <= 0)) )
			{
				TransferQueueUser &this_user = GetUserRec(client->m_up_down_queue_user);
				unsigned int this_user_active_count = this_user.running;
				int this_user_recency = this_user.recency;

				bool this_client_is_better = false;
				if( !best_client ) {
					this_client_is_better = true;
				}
				else if( best_client->m_downloading != client->m_downloading ) {
						// effectively treat up/down queues independently
					if( client->m_downloading ) {
						this_client_is_better = true;
					}
				}
				else if( best_running_count > this_user_active_count ) {
						// prefer users with fewer active transfers
						// (only counting transfers in one direction
						// for this comparison)
					this_client_is_better = true;
				}
				else if( best_recency > this_user_recency ) {
						// if still tied: round robin
					this_client_is_better = true;
				}

				if( this_client_is_better ) {
					best_client = client;
					best_running_count = this_user_active_count;
					best_recency = this_user_recency;
				}
			}
		}

		client = best_client;
		if( !client ) {
				// nobody eligible to start right now
			break;
		}

		dprintf(D_FULLDEBUG,
				"TransferQueueManager: sending GoAhead to %s.\n",
				client->Description() );

		if( !client->SendGoAhead() ) {
				// Client is gone/broken: drop it from the queue.
			dprintf(D_FULLDEBUG,
					"TransferQueueManager: failed to send GoAhead; "
					"dequeueing %s.\n",
					client->Description() );
			delete client;
			m_xfer_queue.Delete(client);
			TransferQueueChanged();
		}
		else {
				// Transfer started: update round-robin state and counts.
			SetRoundRobinRecency(client->m_up_down_queue_user);
			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			user.running += 1;
			user.idle -= 1;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
	}

		// now that we have finished scheduling new transfers,
		// examine requests that are still waiting
	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( !client->m_gave_go_ahead ) {
			clients_waiting = true;

			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			int age = time(NULL) - client->m_time_born;
			if( client->m_downloading ) {
				m_waiting_to_download++;
				if( age > m_download_wait_time ) {
						// track the longest wait time
					m_download_wait_time = age;
				}
				m_iostats.download_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.download_MB_waiting += client->m_sandbox_size_MB;
			}
			else {
				m_waiting_to_upload++;
				if( age > m_upload_wait_time ) {
					m_upload_wait_time = age;
				}
				m_iostats.upload_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.upload_MB_waiting += client->m_sandbox_size_MB;
			}
		}
	}

	m_uploading = uploading;
	m_downloading = downloading;

	if( clients_waiting ) {
			// queue is full; check for ancient clients
		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				int age = time(NULL) - client->m_time_go_ahead;
				int max_queue_age = client->m_max_queue_age;
				if( max_queue_age > 0 && max_queue_age < age ) {
						// Killing this client will not stop the current
						// file that is being transfered by it (which
						// presumably has stalled for some reason). However,
						// it should prevent any additional files in the
						// sandbox from being transferred.
					dprintf(D_ALWAYS,"TransferQueueManager: forcibly "
							"dequeueing ancient (%ds old) entry for %s, "
							"because it is older than "
							"MAX_TRANSFER_QUEUE_AGE=%ds.\n",
							age,
							client->Description(),
							max_queue_age);

						// Tell the admin why their transfer died.
					FILE *email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS,
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
					}
					else {
						fprintf( email,
								 "A file transfer for\n%s\ntook longer than MAX_TRANSFER_QUEUE_AGE=%ds,\n"
								 "so this transfer is being removed from the transfer queue,\n"
								 "which will abort further transfers for this attempt to run this job.\n\n"
								 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
								 "but be aware that transfers which take a long time will delay other\n"
								 "transfers from starting if the maximum number of concurrent transfers\n"
								 "is exceeded. Therefore, it is advisable to also review the settings\n"
								 "of MAX_CONCURRENT_UPLOADS and/or MAX_CONCURRENT_DOWNLOADS.\n\n"
								 "The transfer queue currently has %d/%d uploads,\n"
								 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
								 "and %d transfers waiting %ds to download.\n",
								 client->Description(),
								 max_queue_age,
								 m_uploading, m_max_uploads,
								 m_downloading, m_max_downloads,
								 m_waiting_to_upload, m_upload_wait_time,
								 m_waiting_to_download, m_download_wait_time );

						email_close ( email );
					}

					delete client;
					m_xfer_queue.DeleteCurrent();
					TransferQueueChanged();
						// Only delete more ancient clients if the
						// next pass of this function finds there is pressure
						// on the queue.
					break;
				}
			}
		}
	}
}
// Email the administrator one message listing all active transfers
// that have exceeded MAX_TRANSFER_QUEUE_AGE and have not been reported
// yet.  The email is opened lazily (only if there is something to
// report), and each offending client is flagged so it is reported at
// most once.
void TransferQueueManager::notifyAboutTransfersTakingTooLong()
{
	SimpleListIterator<TransferQueueRequest *> itr(m_xfer_queue);
	TransferQueueRequest *client = NULL;
	FILE *email = NULL;

	while( itr.Next(client) ) {
		if( client->m_gave_go_ahead && !client->m_notified_about_taking_too_long ) {
			int age = time(NULL) - client->m_time_go_ahead;
			int max_queue_age = client->m_max_queue_age;
			if( max_queue_age > 0 && max_queue_age < age ) {
					// Mark as reported so subsequent calls skip it.
				client->m_notified_about_taking_too_long = true;
				if( !email ) {
						// First offender: open the message and write
						// the explanatory header once.
					email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS,
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
						return;
					}
					fprintf( email,
							 "Below is a list of file transfers that took longer than\n"
							 "MAX_TRANSFER_QUEUE_AGE=%ds. When other transfers are waiting\n"
							 "to start, these old transfer attempts will be aborted.\n"
							 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
							 "but be aware that transfers which take a long time will delay other\n"
							 "transfers from starting if the maximum number of concurrent transfers\n"
							 "is exceeded. Therefore, it is advisable to also review the settings\n"
							 "of MAX_CONCURRENT_UPLOADS, MAX_CONCURRENT_DOWNLOADS, and/or\n"
							 "FILE_TRANSFER_DISK_LOAD_THROTTLE.\n\n"
							 "The transfer queue currently has %d/%d uploads,\n"
							 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
							 "and %d transfers waiting %ds to download.\n",
							 max_queue_age,
							 m_uploading, m_max_uploads,
							 m_downloading, m_max_downloads,
							 m_waiting_to_upload, m_upload_wait_time,
							 m_waiting_to_download, m_download_wait_time);

						// Include current I/O load over the shortest
						// EMA horizon, if one is configured.
					char const *ema_horizon = m_iostats.bytes_sent.ShortestHorizonEMAName();
					if( ema_horizon ) {
						fprintf(email, "Upload %s I/O load: %.0f bytes/s %.3f disk load %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_sent.EMAValue(ema_horizon),
								m_iostats.file_read.EMAValue(ema_horizon),
								m_iostats.net_write.EMAValue(ema_horizon));
						fprintf(email, "Download %s I/O load: %.0f bytes/s %.3f disk load %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_received.EMAValue(ema_horizon),
								m_iostats.file_write.EMAValue(ema_horizon),
								m_iostats.net_read.EMAValue(ema_horizon));
					}

					fprintf(email,"\n\nTransfers older than MAX_TRANSFER_QUEUE_AGE=%ds:\n\n",max_queue_age);
				}

					// One line per offending transfer.
				fprintf( email, "%s\n", client->SinlessDescription() );
			}
		}
	}

	if( email ) {
		email_close ( email );
	}
}
// Reaper callback invoked by DaemonCore when a condor_gridmanager
// child process exits.  Logs how the gridmanager exited, emails the
// administrator (rate-limited to once per six hours) if it died
// because it could not write its log (DPRINTF_ERROR), and then cleans
// up the node: cancels its timers, removes it from the pid hash
// table, deletes its scratch directory (as the job owner), and frees
// the node.  Always returns 0.
int GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.
	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

		// Build a human-readable description of how it exited; the
		// owner may be unknown if the pid was not in our table.
	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) {
		owner_safe = owner;
	}
	else {
		owner_safe = "Unknown";
	}
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
							   WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
							   daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );

		// DPRINTF_ERROR means the gridmanager could not write its own
		// log file (GRIDMANAGER_LOG permissions problem).
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS,
				"The gridmanager had a problem writing its log. "
				"Check the permissions of the file specified by GRIDMANAGER_LOG; "
				"it needs to be writable by Condor.\n");

			/* send email to the admin about this, but only
			 * every six hours - enough to not be ignored, but
			 * not enough to be a pest. If only my children were
			 * so helpful and polite. Ah, well, we can always dream...
			 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
						"The condor_gridmanager had an error writing its log file. Check the \n"
						"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
						"the condor_config file. This file needs to be writable as user %s to enable\n"
						"the condor_gridmanager daemon to write to it. \n\n"
						"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
						"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
				// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}

	// Remove node from our hash table
	gman_pid_table->remove(owner);

	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
		// Only attempt the cleanup if we can switch to the job
		// owner's uid; the directory is removed as that user.
	if ( IsDirectory(scratchdir) &&
		 init_user_ids(gman_node->owner, gman_node->domain) )
	{
		priv_state saved_priv = set_user_priv();
		// Must put this in braces so the Directory object
		// destructor is called, which will free the iterator
		// handle. If we didn't do this, the below rmdir
		// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}