Esempio n. 1
0
/**
 * Put the job on hold.
 *
 * Records the hold reason (long form goes to the user via email, short
 * form plus code/subcode go into the job ad in the schedd's queue),
 * optionally emails the user, and then exits the shadow with
 * JOB_SHOULD_HOLD so the schedd actually places the job on hold.
 *
 * @param long_reason    full explanation, written into the email body
 * @param short_reason   brief explanation, stored in ATTR_HOLD_REASON
 * @param reason_code    stored in ATTR_HOLD_REASON_CODE
 * @param reason_subcode stored in ATTR_HOLD_REASON_SUBCODE
 *
 * Never returns: always calls exit(JOB_SHOULD_HOLD).
 */
void
HoldJob( const char* long_reason, const char* short_reason, int reason_code,
		 int reason_subcode )
{
    char subject[ BUFSIZ ];	
	FILE *mailer;

		// Use snprintf so enormous cluster/proc ids can never overflow
		// the fixed-size buffer.  Also, do NOT embed a newline in the
		// subject: it would prematurely terminate the Subject: header
		// (NotifyUser() builds its subject without one, too).
	snprintf( subject, sizeof(subject), "Condor Job %d.%d put on hold", 
			 Proc->id.cluster, Proc->id.proc );

	if( ! JobAd ) {
		dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!\n" );
		exit( JOB_SHOULD_HOLD );
	}

	ExitReason = JOB_SHOULD_HOLD;

		// Record the hold reason in the job queue.  On connect failure we
		// log and fall through; the subsequent Set/Disconnect calls will
		// fail but we still want to email the user and exit correctly.
	if ( !ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ) {
		dprintf( D_ALWAYS, "Failed to connect to schedd!\n" );
	}
	SetAttributeString( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON,
						short_reason );
	SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_CODE,
					 reason_code );
	SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_SUBCODE,
					 reason_subcode );
	if ( !DisconnectQ(0) ) {
		dprintf( D_ALWAYS, "Failed to commit updated job queue status!\n" );
	}

	mailer = email_user_open(JobAd, subject);
	if( ! mailer ) {
			// User didn't want email, so just exit now with the right
			// value so the schedd actually puts the job on hold.
		dprintf( D_ALWAYS, "Job going into Hold state.\n");
		dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n",
			JOB_SHOULD_HOLD);
		exit( JOB_SHOULD_HOLD );
	}

		// Describe the job (command plus any arguments) in the email body.
	fprintf( mailer, "Your condor job " );
	if( Proc->args_v1or2[0] ) {
		ArgList args;
		MyString args_string;
		args.AppendArgsV1or2Raw(Proc->args_v1or2[0],NULL);
		args.GetArgsStringForDisplay(&args_string);

		fprintf( mailer, "%s %s ", Proc->cmd[0], args_string.Value() );
	} else {
		fprintf( mailer, "%s ", Proc->cmd[0] );
	}
	fprintf( mailer, "\nis being put on hold.\n\n" );
	fprintf( mailer, "%s", long_reason );
	email_close(mailer);

		// Now that the user knows why, exit with the right code. 
	dprintf( D_ALWAYS, "Job going into Hold state.\n");
	dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n",
		JOB_SHOULD_HOLD);
	exit( JOB_SHOULD_HOLD );
}
Esempio n. 2
0
bool
Email::send( void )
{
	if( ! fp ) {
		return false;
	}
	email_close(fp);
	init();
	return true;
}
Esempio n. 3
0
/**
 * Send the user a notification email about their job, honoring the
 * job's notification policy (never/always/complete/error).  Also
 * honors the EMAIL_NOTIFICATION_CC config knob, which unconditionally
 * sends a copy (subject only to ATTR_ALLOW_NOTIFICATION_CC in the ad).
 *
 * @param buf   message text, published into the email by
 *              publishNotifyEmail()
 * @param proc  the job's PROC record (id, owner, status, notification)
 */
void
NotifyUser( char *buf, PROC *proc )
{
	FILE *mailer;
	char subject[ BUFSIZ ];

	dprintf(D_FULLDEBUG, "NotifyUser() called.\n");

		// snprintf: huge cluster/proc ids must never overflow subject.
	snprintf( subject, sizeof(subject), "Condor Job %d.%d", 
			 proc->id.cluster, proc->id.proc );

	if( ! JobAd ) {
		dprintf( D_ALWAYS, "In NotifyUser() w/ NULL JobAd!\n" );
		return;
	}

		// email HACK for John Bent <*****@*****.**>
		// added by Derek Wright <*****@*****.**> 2005-02-20
	char* email_cc = param( "EMAIL_NOTIFICATION_CC" );
	if( email_cc ) {
		bool allows_cc = true;
		int bool_val;
		if( JobAd->LookupBool(ATTR_ALLOW_NOTIFICATION_CC, bool_val) ) {
			dprintf( D_FULLDEBUG, "Job defined %s to %s\n",
					 ATTR_ALLOW_NOTIFICATION_CC,
					 bool_val ? "TRUE" : "FALSE" );
			allows_cc = (bool)bool_val;
		} else {
			dprintf( D_FULLDEBUG, "%s not defined, assuming TRUE\n",
					 ATTR_ALLOW_NOTIFICATION_CC );
		}
		if( allows_cc ) {
			dprintf( D_FULLDEBUG, "%s is TRUE, sending email to \"%s\"\n",
					 ATTR_ALLOW_NOTIFICATION_CC, email_cc );
			mailer = email_open( email_cc, subject );
				// BUGFIX: email_open() can return NULL (e.g. no mailer
				// program); don't hand a NULL FILE* to
				// publishNotifyEmail()/email_close().
			if( mailer ) {
				publishNotifyEmail( mailer, buf, proc );
				email_close( mailer );
			} else {
				dprintf( D_ALWAYS,
						 "Failed to open notification cc email to \"%s\"\n",
						 email_cc );
			}
		} else {
			dprintf( D_FULLDEBUG,
					 "%s is FALSE, not sending email copy\n",
					 ATTR_ALLOW_NOTIFICATION_CC );
		}
		free( email_cc );
		email_cc = NULL;
	}

	/* If user loaded program incorrectly, always send a message. */
	if( MainSymbolExists == TRUE ) {
		switch( proc->notification ) {
		case NOTIFY_NEVER:
			return;
		case NOTIFY_ALWAYS:
			break;
		case NOTIFY_COMPLETE:
			if( proc->status == COMPLETED ) {
				break;
			} else {
				return;
			}
		case NOTIFY_ERROR:
				// only notify if the job completed by dying on a signal
			if( (proc->status == COMPLETED) && (WTERMSIG(JobStatus)!= 0) ) {
				break;
			} else {
				return;
			}
		default:
				// unknown policy: log it, then send the mail anyway
			dprintf(D_ALWAYS, "Condor Job %d.%d has a notification of %d\n",
					proc->id.cluster, proc->id.proc, proc->notification );
		}
	}

	mailer = email_user_open(JobAd, subject);
	if( mailer == NULL ) {
		dprintf(D_ALWAYS,
				"Shadow: Cannot notify user( %s, %s, %s )\n",
				subject, proc->owner, "w"
		);
		return;
	}
	publishNotifyEmail( mailer, buf, proc );
	email_close(mailer);
}
Esempio n. 4
0
/**
 * Recompute per-user transfer counts and hand out GoAhead messages to
 * queued transfer requests, subject to MAX_CONCURRENT_UPLOADS /
 * MAX_CONCURRENT_DOWNLOADS and (optionally) a disk-load based throttle.
 * After scheduling, it tallies wait statistics for requests still
 * queued and, when clients are waiting, forcibly dequeues at most one
 * "ancient" active transfer that exceeded MAX_TRANSFER_QUEUE_AGE
 * (emailing the admin about it).
 */
void
TransferQueueManager::CheckTransferQueue() {
	TransferQueueRequest *client = NULL;
	int downloading = 0;
	int uploading = 0;
	bool clients_waiting = false;

	m_check_queue_timer = -1;

	ClearTransferCounts();

		// Pass 1: count running transfers (per direction) and idle
		// requests, attributing each to its user record.
	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( client->m_gave_go_ahead ) {
			GetUserRec(client->m_up_down_queue_user).running++;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
		else {
			GetUserRec(client->m_up_down_queue_user).idle++;
		}
	}

		// Adjust the disk-load concurrency limit using short/long
		// exponential moving averages of file I/O load.
	if( m_throttle_disk_load ) {
		int old_concurrency_limit = m_throttle_disk_load_max_concurrency;

		double disk_load_short = m_iostats.file_read.EMAValue(m_disk_throttle_short_horizon.c_str()) +
		                         m_iostats.file_write.EMAValue(m_disk_throttle_short_horizon.c_str());
		double disk_load_long =  m_iostats.file_read.EMAValue(m_disk_throttle_long_horizon.c_str()) +
		                         m_iostats.file_write.EMAValue(m_disk_throttle_long_horizon.c_str());

		if( disk_load_short > m_disk_load_high_throttle ) {
				// above the high water mark, do not start more transfers
			m_throttle_disk_load_max_concurrency = uploading + downloading;
		}
		else if( disk_load_long > m_disk_load_low_throttle || disk_load_short > m_disk_load_low_throttle ) {
				// between the high and low water mark, keep the concurrency limit as is (but at least 1)
			if( m_throttle_disk_load_max_concurrency < 1 ) {
				m_throttle_disk_load_max_concurrency = 1;
				m_throttle_disk_load_incremented = time(NULL);
			}
		}
		else {
				// below the low water mark, slowly increase the concurrency limit if we are running into it
			if( uploading + downloading == m_throttle_disk_load_max_concurrency ) {
				time_t now = time(NULL);
				if( m_throttle_disk_load_incremented > now ) {
					m_throttle_disk_load_incremented = now; // clock jumped back
				}
				if( m_throttle_disk_load_incremented == 0 || now-m_throttle_disk_load_incremented >= m_throttle_disk_load_increment_wait ) {
					m_throttle_disk_load_incremented = now;
					m_throttle_disk_load_max_concurrency += 1;
					if( m_throttle_disk_load_max_concurrency < floor(m_disk_load_low_throttle) ) {
						m_throttle_disk_load_max_concurrency = floor(m_disk_load_low_throttle);
					}
				}
			}
		}

		if( old_concurrency_limit != m_throttle_disk_load_max_concurrency ) {
			dprintf(D_ALWAYS,
					"TransferQueueManager: adjusted concurrency limit by %+d based on disk load: "
					"new limit %d, load %s %f %s %f, throttle %f to %f\n",
					m_throttle_disk_load_max_concurrency-old_concurrency_limit,
					m_throttle_disk_load_max_concurrency,
					m_disk_throttle_short_horizon.c_str(),
					disk_load_short,
					m_disk_throttle_long_horizon.c_str(),
					disk_load_long,
					m_disk_load_low_throttle,
					m_disk_load_high_throttle);
		}
	}

		// schedule new transfers
	while( uploading < m_max_uploads || m_max_uploads <= 0 ||
		   downloading < m_max_downloads || m_max_downloads <= 0 )
	{
		TransferQueueRequest *best_client = NULL;
		int best_recency = 0;
		unsigned int best_running_count = 0;

		if( m_throttle_disk_load && (uploading + downloading >= m_throttle_disk_load_max_concurrency) ) {
			break;
		}

			// Pick the best waiting request: downloads beat uploads,
			// then fewest active transfers for that user, then round
			// robin by recency.
		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				continue;
			}
			if( (client->m_downloading && 
				(downloading < m_max_downloads || m_max_downloads <= 0)) ||
				((!client->m_downloading) &&
				(uploading < m_max_uploads || m_max_uploads <= 0)) )
			{
				TransferQueueUser &this_user = GetUserRec(client->m_up_down_queue_user);
				unsigned int this_user_active_count = this_user.running;
				int this_user_recency = this_user.recency;

				bool this_client_is_better = false;
				if( !best_client ) {
					this_client_is_better = true;
				}
				else if( best_client->m_downloading != client->m_downloading ) {
						// effectively treat up/down queues independently
					if( client->m_downloading ) {
						this_client_is_better = true;
					}
				}
				else if( best_running_count > this_user_active_count ) {
						// prefer users with fewer active transfers
						// (only counting transfers in one direction for this comparison)
					this_client_is_better = true;
				}
				else if( best_recency > this_user_recency ) {
						// if still tied: round robin
					this_client_is_better = true;
				}

				if( this_client_is_better ) {
					best_client = client;
					best_running_count = this_user_active_count;
					best_recency = this_user_recency;
				}
			}
		}

		client = best_client;
		if( !client ) {
			break;
		}

		dprintf(D_FULLDEBUG,
				"TransferQueueManager: sending GoAhead to %s.\n",
				client->Description() );

		if( !client->SendGoAhead() ) {
			dprintf(D_FULLDEBUG,
					"TransferQueueManager: failed to send GoAhead; "
					"dequeueing %s.\n",
					client->Description() );

				// BUGFIX: remove the request from the queue *before*
				// freeing it; Delete() searches the list for this
				// pointer, so deleting first used a dangling pointer.
			m_xfer_queue.Delete(client);
			delete client;

			TransferQueueChanged();
		}
		else {
			SetRoundRobinRecency(client->m_up_down_queue_user);
			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			user.running += 1;
			user.idle -= 1;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
	}


		// now that we have finished scheduling new transfers,
		// examine requests that are still waiting
	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( !client->m_gave_go_ahead ) {
			clients_waiting = true;

			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			int age = time(NULL) - client->m_time_born;
			if( client->m_downloading ) {
				m_waiting_to_download++;
				if( age > m_download_wait_time ) {
					m_download_wait_time = age;
				}
				m_iostats.download_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.download_MB_waiting += client->m_sandbox_size_MB;
			}
			else {
				m_waiting_to_upload++;
				if( age > m_upload_wait_time ) {
					m_upload_wait_time = age;
				}
				m_iostats.upload_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.upload_MB_waiting += client->m_sandbox_size_MB;
			}
		}
	}

	m_uploading = uploading;
	m_downloading = downloading;


	if( clients_waiting ) {
			// queue is full; check for ancient clients
		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				int age = time(NULL) - client->m_time_go_ahead;
				int max_queue_age = client->m_max_queue_age;
				if( max_queue_age > 0 && max_queue_age < age ) {
						// Killing this client will not stop the current
						// file that is being transfered by it (which
						// presumably has stalled for some reason).  However,
						// it should prevent any additional files in the
						// sandbox from being transferred.
					dprintf(D_ALWAYS,"TransferQueueManager: forcibly "
							"dequeueing  ancient (%ds old) entry for %s, "
							"because it is older than "
							"MAX_TRANSFER_QUEUE_AGE=%ds.\n",
							age,
							client->Description(),
							max_queue_age);


					FILE *email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS, 
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
					} else {
						fprintf( email,
								 "A file transfer for\n%s\ntook longer than MAX_TRANSFER_QUEUE_AGE=%ds,\n"
								 "so this transfer is being removed from the transfer queue,\n"
								 "which will abort further transfers for this attempt to run this job.\n\n"
								 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
								 "but be aware that transfers which take a long time will delay other\n"
								 "transfers from starting if the maximum number of concurrent transfers\n"
								 "is exceeded.  Therefore, it is advisable to also review the settings\n"
								 "of MAX_CONCURRENT_UPLOADS and/or MAX_CONCURRENT_DOWNLOADS.\n\n"
								 "The transfer queue currently has %d/%d uploads,\n"
								 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
								 "and %d transfers waiting %ds to download.\n",
								 client->Description(),
								 max_queue_age,
								 m_uploading,
								 m_max_uploads,
								 m_downloading,
								 m_max_downloads,
								 m_waiting_to_upload,
								 m_upload_wait_time,
								 m_waiting_to_download,
								 m_download_wait_time
								 );

						email_close ( email );
					}

						// Unlink from the queue before freeing the
						// request (same discipline as above).
					m_xfer_queue.DeleteCurrent();
					delete client;
					TransferQueueChanged();
						// Only delete more ancient clients if the
						// next pass of this function finds there is pressure
						// on the queue.
					break;
				}
			}
		}
	}
}
Esempio n. 5
0
void
TransferQueueManager::notifyAboutTransfersTakingTooLong()
{
	SimpleListIterator<TransferQueueRequest *> itr(m_xfer_queue);
	TransferQueueRequest *client = NULL;

	FILE *email = NULL;

	while( itr.Next(client) ) {
		if( client->m_gave_go_ahead && !client->m_notified_about_taking_too_long ) {
			int age = time(NULL) - client->m_time_go_ahead;
			int max_queue_age = client->m_max_queue_age;
			if( max_queue_age > 0 && max_queue_age < age ) {
				client->m_notified_about_taking_too_long = true;
				if( !email ) {
					email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS, 
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
						return;
					}
					fprintf( email,
							 "Below is a list of file transfers that took longer than\n"
							 "MAX_TRANSFER_QUEUE_AGE=%ds.  When other transfers are waiting\n"
							 "to start, these old transfer attempts will be aborted.\n"
							 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
							 "but be aware that transfers which take a long time will delay other\n"
							 "transfers from starting if the maximum number of concurrent transfers\n"
							 "is exceeded.  Therefore, it is advisable to also review the settings\n"
							 "of MAX_CONCURRENT_UPLOADS, MAX_CONCURRENT_DOWNLOADS, and/or\n"
							 "FILE_TRANSFER_DISK_LOAD_THROTTLE.\n\n"
							 "The transfer queue currently has %d/%d uploads,\n"
							 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
							 "and %d transfers waiting %ds to download.\n",
							 max_queue_age,
							 m_uploading,
							 m_max_uploads,
							 m_downloading,
							 m_max_downloads,
							 m_waiting_to_upload,
							 m_upload_wait_time,
							 m_waiting_to_download,
							 m_download_wait_time);

					char const *ema_horizon = m_iostats.bytes_sent.ShortestHorizonEMAName();
					if( ema_horizon ) {
						fprintf(email,
								"Upload %s I/O load: %.0f bytes/s  %.3f disk load  %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_sent.EMAValue(ema_horizon),
								m_iostats.file_read.EMAValue(ema_horizon),
								m_iostats.net_write.EMAValue(ema_horizon));

						fprintf(email,
								"Download %s I/O load: %.0f bytes/s  %.3f disk load  %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_received.EMAValue(ema_horizon),
								m_iostats.file_write.EMAValue(ema_horizon),
								m_iostats.net_read.EMAValue(ema_horizon));
					}
					fprintf(email,"\n\nTransfers older than MAX_TRANSFER_QUEUE_AGE=%ds:\n\n",max_queue_age);
				}

				fprintf( email, "%s\n", client->SinlessDescription() );
			}
		}
	}
	if( email ) {
		email_close ( email );
	}
}
Esempio n. 6
0
// Reaper callback invoked by DaemonCore when a condor_gridmanager child
// exits.  Logs the exit, emails the admin (rate-limited to once per six
// hours) if the gridmanager died because it could not write its log,
// then tears down the per-owner bookkeeping: cancels timers, removes
// the hash-table entry, deletes the scratch directory as the owning
// user, and frees the node.  Always returns 0.
int 
GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.

	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

	// Log the exit even if we could not match the pid to an owner;
	// decode the wait status into a human-readable reason first.
	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) { owner_safe = owner; }
	else { owner_safe = "Unknown"; }
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
							 WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
							 daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );
	// DPRINTF_ERROR exit code means the gridmanager could not write its
	// own log file -- a config/permissions problem worth telling the
	// admin about.
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS, 
			"The gridmanager had a problem writing its log. "
			"Check the permissions of the file specified by GRIDMANAGER_LOG; "
			"it needs to be writable by Condor.\n");

			/* send email to the admin about this, but only
			 * every six hours - enough to not be ignored, but
			 * not enough to be a pest.  If only my children were
			 * so helpful and polite.  Ah, well, we can always dream...
			 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
					"The condor_gridmanager had an error writing its log file.  Check the  \n"
					"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
					"the condor_config file.  This file needs to be writable as user %s to enable\n"
					"the condor_gridmanager daemon to write to it. \n\n"
					"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
					"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
					// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}	
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}
	// Remove node from our hash table
	gman_pid_table->remove(owner);
	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
	// Only attempt removal if we can switch to the owning user; the
	// directory contents are owned by them, not by Condor.
	if ( IsDirectory(scratchdir) && 
		 init_user_ids(gman_node->owner, gman_node->domain) ) 
	{
		priv_state saved_priv = set_user_priv();
			// Must put this in braces so the Directory object
			// destructor is called, which will free the iterator
			// handle.  If we didn't do this, the below rmdir 
			// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		// Restore the privilege state we entered with before dropping
		// the temporary user ids.
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}