Example #1
FILE*
Email::open_stream( ClassAd* ad, int exit_reason, const char* subject )
{
	if( ! shouldSend(ad, exit_reason) ) {
			// nothing to do
		return NULL;
	}

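	// cluster, proc, fp, and email_admin are members of this Email
	// object, declared in the class (not shown in this snippet).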
	ad->LookupInteger( ATTR_CLUSTER_ID, cluster );
	ad->LookupInteger( ATTR_PROC_ID, proc );

	MyString full_subject;
	full_subject.formatstr( "Condor Job %d.%d", cluster, proc );
	if( subject ) {
		full_subject += " ";
		full_subject += subject;
	}
	if(email_admin) {
		fp = email_admin_open( full_subject.Value() );
	} else {
		fp = email_user_open_id( ad, cluster, proc, full_subject.Value() );
	}
	return fp; 
}
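The stream returned here comes from email_admin_open() or email_user_open_id(), and a NULL return means shouldSend() decided no mail is wanted. A minimal caller sketch, assuming (consistently with Examples #2-#4 below) that the stream is written with fprintf() and finished with email_close(); the function name, subject, and body are placeholders:

#include <cstdio>

// Hypothetical caller sketch. Email and ClassAd come from HTCondor headers
// (not shown here); email_close() is assumed, as in the later examples.
extern void email_close( FILE *mailer );

void mailJobNotice( Email &mailer, ClassAd *job_ad, int exit_reason )
{
	FILE *fp = mailer.open_stream( job_ad, exit_reason, "job completed" );
	if ( fp == NULL ) {
		return;                 // shouldSend() vetoed the message
	}
	fprintf( fp, "Your job has completed.\n" );
	email_close( fp );
}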
Example #2
void
TransferQueueManager::notifyAboutTransfersTakingTooLong()
{
	SimpleListIterator<TransferQueueRequest *> itr(m_xfer_queue);
	TransferQueueRequest *client = NULL;

	FILE *email = NULL;

	while( itr.Next(client) ) {
		if( client->m_gave_go_ahead && !client->m_notified_about_taking_too_long ) {
			int age = time(NULL) - client->m_time_go_ahead;
			int max_queue_age = client->m_max_queue_age;
			if( max_queue_age > 0 && max_queue_age < age ) {
				client->m_notified_about_taking_too_long = true;
				if( !email ) {
					email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS, 
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
						return;
					}
					fprintf( email,
							 "Below is a list of file transfers that took longer than\n"
							 "MAX_TRANSFER_QUEUE_AGE=%ds.  When other transfers are waiting\n"
							 "to start, these old transfer attempts will be aborted.\n"
							 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
							 "but be aware that transfers which take a long time will delay other\n"
							 "transfers from starting if the maximum number of concurrent transfers\n"
							 "is exceeded.  Therefore, it is advisable to also review the settings\n"
							 "of MAX_CONCURRENT_UPLOADS, MAX_CONCURRENT_DOWNLOADS, and/or\n"
							 "FILE_TRANSFER_DISK_LOAD_THROTTLE.\n\n"
							 "The transfer queue currently has %d/%d uploads,\n"
							 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
							 "and %d transfers waiting %ds to download.\n",
							 max_queue_age,
							 m_uploading,
							 m_max_uploads,
							 m_downloading,
							 m_max_downloads,
							 m_waiting_to_upload,
							 m_upload_wait_time,
							 m_waiting_to_download,
							 m_download_wait_time);

					char const *ema_horizon = m_iostats.bytes_sent.ShortestHorizonEMAName();
					if( ema_horizon ) {
						fprintf(email,
								"Upload %s I/O load: %.0f bytes/s  %.3f disk load  %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_sent.EMAValue(ema_horizon),
								m_iostats.file_read.EMAValue(ema_horizon),
								m_iostats.net_write.EMAValue(ema_horizon));

						fprintf(email,
								"Download %s I/O load: %.0f bytes/s  %.3f disk load  %.3f net load\n",
								ema_horizon,
								m_iostats.bytes_received.EMAValue(ema_horizon),
								m_iostats.file_write.EMAValue(ema_horizon),
								m_iostats.net_read.EMAValue(ema_horizon));
					}
					fprintf(email,"\n\nTransfers older than MAX_TRANSFER_QUEUE_AGE=%ds:\n\n",max_queue_age);
				}

				fprintf( email, "%s\n", client->SinlessDescription() );
			}
		}
	}
	if( email ) {
		email_close ( email );
	}
}
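Example #2 uses a lazy-open pattern worth noting: the admin message is opened at most once, when the first offending transfer is found; the explanatory header is written once; and each further offender appends a single line. A minimal sketch of the same pattern, with a hypothetical Item type standing in for TransferQueueRequest and the email_* signatures assumed from the calls above:

#include <cstdio>
#include <vector>

// Declarations matching the calls in the examples (signatures assumed).
extern FILE *email_admin_open( const char *subject );
extern void  email_close( FILE *mailer );

// Hypothetical item type standing in for TransferQueueRequest.
struct Item { bool offending; const char *desc; };

void notifyOffenders( const std::vector<Item> &items )
{
	FILE *email = NULL;                     // opened lazily, at most once
	for ( const Item &it : items ) {
		if ( !it.offending ) continue;
		if ( !email ) {
			email = email_admin_open( "placeholder subject" );
			if ( !email ) return;           // could not start the message
			fprintf( email, "Explanatory header, written once.\n\n" );
		}
		fprintf( email, "%s\n", it.desc );  // one line per offender
	}
	if ( email ) {
		email_close( email );               // sent only if anything was written
	}
}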
Example #3
void
TransferQueueManager::CheckTransferQueue() {
	TransferQueueRequest *client = NULL;
	int downloading = 0;
	int uploading = 0;
	bool clients_waiting = false;

	m_check_queue_timer = -1;

	ClearTransferCounts();

	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( client->m_gave_go_ahead ) {
			GetUserRec(client->m_up_down_queue_user).running++;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
		else {
			GetUserRec(client->m_up_down_queue_user).idle++;
		}
	}

	if( m_throttle_disk_load ) {
		int old_concurrency_limit = m_throttle_disk_load_max_concurrency;

		double disk_load_short = m_iostats.file_read.EMAValue(m_disk_throttle_short_horizon.c_str()) +
		                         m_iostats.file_write.EMAValue(m_disk_throttle_short_horizon.c_str());
		double disk_load_long =  m_iostats.file_read.EMAValue(m_disk_throttle_long_horizon.c_str()) +
		                         m_iostats.file_write.EMAValue(m_disk_throttle_long_horizon.c_str());

		if( disk_load_short > m_disk_load_high_throttle ) {
				// above the high water mark, do not start more transfers
			m_throttle_disk_load_max_concurrency = uploading + downloading;
		}
		else if( disk_load_long > m_disk_load_low_throttle || disk_load_short > m_disk_load_low_throttle ) {
				// between the high and low water mark, keep the concurrency limit as is (but at least 1)
			if( m_throttle_disk_load_max_concurrency < 1 ) {
				m_throttle_disk_load_max_concurrency = 1;
				m_throttle_disk_load_incremented = time(NULL);
			}
		}
		else {
				// below the low water mark, slowly increase the concurrency limit if we are running into it
			if( uploading + downloading == m_throttle_disk_load_max_concurrency ) {
				time_t now = time(NULL);
				if( m_throttle_disk_load_incremented > now ) {
					m_throttle_disk_load_incremented = now; // clock jumped back
				}
				if( m_throttle_disk_load_incremented == 0 || now-m_throttle_disk_load_incremented >= m_throttle_disk_load_increment_wait ) {
					m_throttle_disk_load_incremented = now;
					m_throttle_disk_load_max_concurrency += 1;
					if( m_throttle_disk_load_max_concurrency < floor(m_disk_load_low_throttle) ) {
						m_throttle_disk_load_max_concurrency = floor(m_disk_load_low_throttle);
					}
				}
			}
		}

		if( old_concurrency_limit != m_throttle_disk_load_max_concurrency ) {
			dprintf(D_ALWAYS,
					"TransferQueueManager: adjusted concurrency limit by %+d based on disk load: "
					"new limit %d, load %s %f %s %f, throttle %f to %f\n",
					m_throttle_disk_load_max_concurrency-old_concurrency_limit,
					m_throttle_disk_load_max_concurrency,
					m_disk_throttle_short_horizon.c_str(),
					disk_load_short,
					m_disk_throttle_long_horizon.c_str(),
					disk_load_long,
					m_disk_load_low_throttle,
					m_disk_load_high_throttle);
		}
	}

		// schedule new transfers
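		// Note: a limit of 0 (or below) disables the corresponding cap,
		// since the "m_max_... <= 0" half of each condition is then
		// always true.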
	while( uploading < m_max_uploads || m_max_uploads <= 0 ||
		   downloading < m_max_downloads || m_max_downloads <= 0 )
	{
		TransferQueueRequest *best_client = NULL;
		int best_recency = 0;
		unsigned int best_running_count = 0;

		if( m_throttle_disk_load && (uploading + downloading >= m_throttle_disk_load_max_concurrency) ) {
			break;
		}

		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				continue;
			}
			if( (client->m_downloading && 
				(downloading < m_max_downloads || m_max_downloads <= 0)) ||
				((!client->m_downloading) &&
				(uploading < m_max_uploads || m_max_uploads <= 0)) )
			{
				TransferQueueUser &this_user = GetUserRec(client->m_up_down_queue_user);
				unsigned int this_user_active_count = this_user.running;
				int this_user_recency = this_user.recency;

				bool this_client_is_better = false;
				if( !best_client ) {
					this_client_is_better = true;
				}
				else if( best_client->m_downloading != client->m_downloading ) {
						// effectively treat up/down queues independently
					if( client->m_downloading ) {
						this_client_is_better = true;
					}
				}
				else if( best_running_count > this_user_active_count ) {
						// prefer users with fewer active transfers
						// (only counting transfers in one direction for this comparison)
					this_client_is_better = true;
				}
				else if( best_recency > this_user_recency ) {
						// if still tied: round robin
					this_client_is_better = true;
				}

				if( this_client_is_better ) {
					best_client = client;
					best_running_count = this_user_active_count;
					best_recency = this_user_recency;
				}
			}
		}

		client = best_client;
		if( !client ) {
			break;
		}

		dprintf(D_FULLDEBUG,
				"TransferQueueManager: sending GoAhead to %s.\n",
				client->Description() );

		if( !client->SendGoAhead() ) {
			dprintf(D_FULLDEBUG,
					"TransferQueueManager: failed to send GoAhead; "
					"dequeueing %s.\n",
					client->Description() );

			m_xfer_queue.Delete(client); // unlink from the queue first,
			delete client;               // then free the request

			TransferQueueChanged();
		}
		else {
			SetRoundRobinRecency(client->m_up_down_queue_user);
			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			user.running += 1;
			user.idle -= 1;
			if( client->m_downloading ) {
				downloading += 1;
			}
			else {
				uploading += 1;
			}
		}
	}


		// now that we have finished scheduling new transfers,
		// examine requests that are still waiting
	m_xfer_queue.Rewind();
	while( m_xfer_queue.Next(client) ) {
		if( !client->m_gave_go_ahead ) {
			clients_waiting = true;

			TransferQueueUser &user = GetUserRec(client->m_up_down_queue_user);
			int age = time(NULL) - client->m_time_born;
			if( client->m_downloading ) {
				m_waiting_to_download++;
				if( age > m_download_wait_time ) {
					m_download_wait_time = age;
				}
				m_iostats.download_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.download_MB_waiting += client->m_sandbox_size_MB;
			}
			else {
				m_waiting_to_upload++;
				if( age > m_upload_wait_time ) {
					m_upload_wait_time = age;
				}
				m_iostats.upload_MB_waiting += client->m_sandbox_size_MB;
				user.iostats.upload_MB_waiting += client->m_sandbox_size_MB;
			}
		}
	}

	m_uploading = uploading;
	m_downloading = downloading;


	if( clients_waiting ) {
			// queue is full; check for ancient clients
		m_xfer_queue.Rewind();
		while( m_xfer_queue.Next(client) ) {
			if( client->m_gave_go_ahead ) {
				int age = time(NULL) - client->m_time_go_ahead;
				int max_queue_age = client->m_max_queue_age;
				if( max_queue_age > 0 && max_queue_age < age ) {
						// Killing this client will not stop the current
						// file that is being transferred by it (which
						// presumably has stalled for some reason).  However,
						// it should prevent any additional files in the
						// sandbox from being transferred.
					dprintf(D_ALWAYS,"TransferQueueManager: forcibly "
							"dequeueing  ancient (%ds old) entry for %s, "
							"because it is older than "
							"MAX_TRANSFER_QUEUE_AGE=%ds.\n",
							age,
							client->Description(),
							max_queue_age);


					FILE *email = email_admin_open("file transfer took too long");
					if( !email ) {
							// Error sending the message
						dprintf( D_ALWAYS, 
								 "ERROR: Can't send email to the Condor "
								 "Administrator\n" );
					} else {
						fprintf( email,
								 "A file transfer for\n%s\ntook longer than MAX_TRANSFER_QUEUE_AGE=%ds,\n"
								 "so this transfer is being removed from the transfer queue,\n"
								 "which will abort further transfers for this attempt to run this job.\n\n"
								 "To avoid this timeout, MAX_TRANSFER_QUEUE_AGE may be increased,\n"
								 "but be aware that transfers which take a long time will delay other\n"
								 "transfers from starting if the maximum number of concurrent transfers\n"
								 "is exceeded.  Therefore, it is advisable to also review the settings\n"
								 "of MAX_CONCURRENT_UPLOADS and/or MAX_CONCURRENT_DOWNLOADS.\n\n"
								 "The transfer queue currently has %d/%d uploads,\n"
								 "%d/%d downloads, %d transfers waiting %ds to upload,\n"
								 "and %d transfers waiting %ds to download.\n",
								 client->Description(),
								 max_queue_age,
								 m_uploading,
								 m_max_uploads,
								 m_downloading,
								 m_max_downloads,
								 m_waiting_to_upload,
								 m_upload_wait_time,
								 m_waiting_to_download,
								 m_download_wait_time
								 );

						email_close ( email );
					}

					delete client;
					m_xfer_queue.DeleteCurrent();
					TransferQueueChanged();
						// Only delete more ancient clients if the
						// next pass of this function finds there is pressure
						// on the queue.
					break;
				}
			}
		}
	}
}
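The disk-load throttle at the top of Example #3 is a high/low water-mark (hysteresis) controller: above the high mark the concurrency cap is frozen at current usage; between the marks it holds steady; below the low mark it is raised one slot at a time, no more often than the configured wait. A stripped-down sketch of that control loop; all names and default values here are illustrative, not HTCondor's:

#include <algorithm>
#include <cmath>
#include <ctime>

// Hysteresis throttle sketch; names and values are illustrative.
struct DiskThrottle {
	double high_mark = 2.0;         // high water mark on disk load
	double low_mark  = 1.0;         // low water mark on disk load
	int    max_concurrency = 1;     // current concurrency cap
	time_t last_increment = 0;      // when the cap was last raised
	int    increment_wait = 60;     // minimum seconds between raises

	void adjust( double load_short, double load_long, int active )
	{
		if ( load_short > high_mark ) {
			// above the high water mark: freeze the cap at current usage
			max_concurrency = active;
		}
		else if ( load_long > low_mark || load_short > low_mark ) {
			// between the marks: hold the cap steady (but at least 1)
			max_concurrency = std::max( max_concurrency, 1 );
		}
		else if ( active == max_concurrency ) {
			// below the low mark and the cap is binding: raise it slowly
			time_t now = time(NULL);
			if ( last_increment > now ) {
				last_increment = now;  // clock jumped backwards
			}
			if ( last_increment == 0 ||
			     now - last_increment >= increment_wait ) {
				last_increment = now;
				max_concurrency = std::max( max_concurrency + 1,
				                            (int)floor( low_mark ) );
			}
		}
	}
};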
Example #4
int 
GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.

	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) { owner_safe = owner; }
	else { owner_safe = "Unknown"; }
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
							 WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
							 daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS, 
			"The gridmanager had a problem writing its log. "
			"Check the permissions of the file specified by GRIDMANAGER_LOG; "
			"it needs to be writable by Condor.\n");

			/* send email to the admin about this, but only
			 * every six hours - enough to not be ignored, but
			 * not enough to be a pest.  If only my children were
			 * so helpful and polite.  Ah, well, we can always dream...
			 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
					"The condor_gridmanager had an error writing its log file.  Check the  \n"
					"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
					"the condor_config file.  This file needs to be writable as user %s to enable\n"
					"the condor_gridmanager daemon to write to it. \n\n"
					"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
					"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
					// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}	
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}
	// Remove node from our hash table
	gman_pid_table->remove(owner);
	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
	if ( IsDirectory(scratchdir) && 
		 init_user_ids(gman_node->owner, gman_node->domain) ) 
	{
		priv_state saved_priv = set_user_priv();
			// Must put this in braces so the Directory object
			// destructor is called, which will free the iterator
			// handle.  If we didn't do this, the below rmdir 
			// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}
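Example #4 throttles the admin mail with a function-local static timestamp, so repeated gridmanager log failures generate at most one message every six hours. The same rate-limiting pattern in isolation, a minimal sketch with placeholder subject and body and the email_* signatures assumed from the examples above:

#include <cstdio>
#include <ctime>

// Declarations matching the calls in the examples (signatures assumed).
extern FILE *email_admin_open( const char *subject );
extern void  email_close( FILE *mailer );

void warnAdminRateLimited()
{
	static time_t last_email = 0;             // persists across calls
	const time_t min_interval = 6 * 60 * 60;  // six hours, as in Example #4
	if ( time(NULL) - last_email > min_interval ) {
		last_email = time(NULL);
		FILE *email = email_admin_open( "placeholder subject" );
		if ( email ) {
			fprintf( email, "placeholder body\n" );
			email_close( email );
		}
	}
}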