/*
** Build the spool path for job <cluster>.<proc> and store it in spool_path.
**
** The SPOOL configuration value is looked up with param() and handed to
** gen_ckpt_name() (last argument 0) together with the job ids.  Both
** lookups are asserted to succeed.  The two C strings are released with
** free() once the result has been copied into the caller's std::string.
*/
void
SpooledJobFiles::getJobSpoolPath(int cluster, int proc, std::string &spool_path)
{
	char *spool = param("SPOOL");
	ASSERT( spool );

	char *buf = gen_ckpt_name(spool, cluster, proc, 0);
	ASSERT( buf );

	// Copy before freeing: spool_path must own its own storage.
	spool_path.assign(buf);

	free(spool);
	free(buf);
}
void RemoveNewShadowDroppings(char *cluster, char *proc) { char names[2][1024]; int j; char *ckpt_name; char *myspool; struct stat buf; int clusternum, procnum; memset(&names[0], 0, 1024); memset(&names[1], 0, 1024); /* XXX I'm sorry. There are some incompatibilities between the new shadow and the old shadow. The new shadow now makes a _directory_ with the usual ckeckpoint name because there might eventually be more than one file that has to get checkpointed with a job. The old shadow is dumb, and it only makes a _file_ named the usual checkpoint name. So a contention happens when we are using opsys/arch to choose an executable name for both NT and UNIX between vanilla only jobs and standard universe jobs. What happens is that the old shadow gets back a correct stat() on the new shadow created directory but misinterprets it as a file and hilarity ensues. So, my nasty hack is to make the old shadow determine if the file it found is actually a directory and if so, then remove it and everything underneath it. I somehow feel that this might bite us in the ass in the future, so each time the shadow does this, it logs it so a human can figure out what happened. I don't have to worry about the converse issue of a new shadow starting up with an old file-based checkpoint because whomever adds standard universe support to the new shadow will have to do something intelligent, and our submit program places expressions into the requirements attribute in the job forcing a checkpointed job to always run on the architecture it checkpointed on. -psilord 7/30/01 */ myspool = param("SPOOL"); if (myspool == NULL) { EXCEPT ("RemoveNewShadowDroppings(): No Spool directory!?!\n"); } clusternum = atoi(cluster); procnum = atoi(proc); if (clusternum < 0 || procnum < 0) /* sanity checks */ { dprintf(D_ALWAYS, "RemoveNewShadowDroppings(): Asked to deal with " "negative cluster or proc numbers. 
Ignoring.\n"); free(myspool); return; } ckpt_name = gen_ckpt_name( myspool, clusternum, procnum, 0 ); strcpy(names[0], ckpt_name); strcpy(names[1], ckpt_name); strcat(names[1], ".tmp"); free(ckpt_name); ckpt_name = NULL; for (j = 0; j < 2; j++) { if (stat(names[j], &buf) == 0) { /* ok, we have a hit, let's see if it is a directory... */ if (IsDirectory(names[j]) == true) { /* it is, so blow away everything inside it */ { Directory todd_droppings(names[j]); if (todd_droppings.Remove_Entire_Directory() == false) { dprintf(D_ALWAYS, "RemoveNewShadowDroppings(): Old " "shadow failed to remove new shadow ckpt directory " "contents: %s\n", names[j]); } } /* now delete the directory itself */ if (rmdir(names[j]) < 0 && errno != ENOENT) { dprintf(D_ALWAYS, "RemoveNewShadowDroppings(): Old shadow " "failed to remove new shadow ckpt directory: %s (%s)\n", names[j], strerror(errno)); } else { dprintf(D_ALWAYS, "RemoveNewShadowDroppings(): Old shadow " "removed new shadow ckpt directory: %s\n", names[j]); } } } } free(myspool); }
/* ** Opens job queue (Q), and reads in process structure (Proc) as side ** affects. */ void start_job( char *cluster_id, char *proc_id ) { int cluster_num; int proc_num; char *tmp; Proc->id.cluster = atoi( cluster_id ); Proc->id.proc = atoi( proc_id ); cluster_num = atoi( cluster_id ); proc_num = atoi( proc_id ); InitJobAd(cluster_num, proc_num); // make sure we have the job classad if (MakeProc(JobAd, Proc) < 0) { EXCEPT("MakeProc()"); } JobAd->LookupFloat(ATTR_BYTES_SENT, TotalBytesSent); JobAd->LookupFloat(ATTR_BYTES_RECVD, TotalBytesRecvd); JobAd->LookupFloat(ATTR_RSC_BYTES_SENT, RSCBytesSent); JobAd->LookupFloat(ATTR_RSC_BYTES_RECVD, RSCBytesRecvd); JobAd->LookupInteger(ATTR_NUM_RESTARTS, NumRestarts); // by default, we round ATTR_NUM_CKPTS, so fetch the raw value // here (if available) for us to increment later. if ( !JobAd->LookupInteger(ATTR_NUM_CKPTS_RAW, NumCkpts) ) { JobAd->LookupInteger(ATTR_NUM_CKPTS, NumCkpts); } // Grab the ClaimID (a.k.a. "capability") from the job classad // and put it in our global variable for use everywhere else. if (GlobalCap) { free(GlobalCap); GlobalCap = NULL; } JobAd->LookupString(ATTR_CLAIM_ID, &GlobalCap); if (! 
GlobalCap) { EXCEPT("ad does not include %s!", ATTR_CLAIM_ID); } #define TESTING #if !defined(HPUX) && !defined(TESTING) if( Proc->status != RUNNING ) { dprintf( D_ALWAYS, "Shadow: Asked to run proc %d.%d, but status = %d\n", Proc->id.cluster, Proc->id.proc, Proc->status ); dprintf(D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_BAD_STATUS); exit( JOB_BAD_STATUS ); /* don't cleanup here */ } #endif LocalUsage = Proc->local_usage; RemoteUsage = Proc->remote_usage[0]; ImageSize = Proc->image_size; if (Proc->universe != CONDOR_UNIVERSE_STANDARD) { strcpy( CkptName, "" ); strcpy( TmpCkptName, "" ); } else { tmp = gen_ckpt_name( Spool, Proc->id.cluster, Proc->id.proc, 0 ); snprintf( CkptName, MAXPATHLEN, "%s", tmp ); sprintf( TmpCkptName, "%s.tmp", CkptName ); free(tmp); tmp = NULL; } tmp = gen_ckpt_name( Spool, Proc->id.cluster, ICKPT, 0 ); snprintf( ICkptName, MAXPATHLEN, "%s", tmp ); free(tmp); tmp = NULL; strcpy( RCkptName, CkptName ); }