void slave_consistency_process (struct slave_database *sdb) { int i; time_t now; while (1) { for (i=0; i<MAXTASKS; i++) { if ((sdb->comp->status.task[i].used) //&& (sdb->comp->status.task[i].status != TASKSTATUS_LOADING) && (kill(sdb->comp->status.task[i].pid,0) == -1)) { now = time(NULL); if ((sdb->comp->status.task[i].status != TASKSTATUS_LOADING) && now != -1 && ((now - sdb->comp->status.task[i].start_loading_time) < MAXTASKLOADINGTIME)) { // Still loading... no timeout continue; } // There is process registered as running, but not running. Or it could be loading but without the real process running after the timeout. semaphore_lock(sdb->semid); sdb->comp->status.task[i].used = 0; semaphore_release(sdb->semid); log_auto(L_WARNING,"Process registered as running was not running. Removed."); } else if ((sdb->comp->status.task[i].used) && (sdb->comp->status.task[i].status == TASKSTATUS_LOADING)) { // The process is already running but marked as loading. We have to update. semaphore_lock(sdb->semid); sdb->comp->status.task[i].status = TASKSTATUS_RUNNING; semaphore_release(sdb->semid); log_auto(L_DEBUG2,"Process previously loading is now running."); } } sleep (SLAVEDELAY); } }
void slave_set_limits (struct slave_database *sdb) { if (sdb->limits.autoenable.flags & AEF_ACTIVE) { sdb->comp->limits.autoenable.flags = sdb->limits.autoenable.flags; sdb->comp->limits.autoenable.h = sdb->limits.autoenable.h % 24; sdb->comp->limits.autoenable.m = sdb->limits.autoenable.m % 60; log_auto (L_INFO,"Setting autoenable time to %i:%02i", sdb->comp->limits.autoenable.h, sdb->comp->limits.autoenable.m); } if (sdb->flags & SDBF_SETMAXCPUS) { sdb->comp->limits.nmaxcpus = (sdb->limits.nmaxcpus > sdb->comp->limits.nmaxcpus) ? sdb->comp->limits.nmaxcpus : sdb->limits.nmaxcpus; log_auto (L_INFO,"Setting maximum number of CPUs to %i",sdb->comp->limits.nmaxcpus); } }
void sigpipe_handler (int signal) { // fix compiler warning (void)signal; /* This is not an error because it only happens on a connection handler */ log_auto (L_WARNING,"Broken connection while reading or writing"); exit (1); }
void check_tasks (struct computer_status *cstatus, int64_t semid) { int i; semaphore_lock (semid); cstatus->ntasks = 0; for (i=0;i<MAXTASKS;i++) { if (cstatus->task[i].used) { if (cstatus->task[i].status == TASKSTATUS_RUNNING) { /* If the task is LOADING then there is no process running yet */ if (kill(cstatus->task[i].pid,0) == 0) { /* check if task is running */ cstatus->ntasks++; } else { /* task is registered but not running */ log_auto(L_WARNING,"Check tasks found no task where there should have been one."); cstatus->task[i].used = 0; } } else { // FIXME: LOADING or FINISHED ? cstatus->ntasks++; } } } semaphore_release (semid); }
/*
 * printf-style logging wrapper for the slave.
 *
 * level: log level; nothing is done when log_level_dest(level) is false.
 * fmt:   printf-style format string, followed by its arguments.
 *
 * The message is expanded here and handed to log_auto() through "%s".
 * log_auto() is itself variadic, so a va_list must not be passed to it
 * as if it were the argument list (the previous code did that, which is
 * undefined behaviour). Messages longer than LOG_MSG_MAX-1 characters
 * are truncated.
 *
 * The host name lookup that used to live here was removed: its result
 * was never used.
 */
void log_slave_computer (int level, char *fmt, ...) {
  enum { LOG_MSG_MAX = 1024 };
  char msg[LOG_MSG_MAX];
  va_list ap;

  if (!log_level_dest(level)) {
    return;
  }

  msg[0] = '\0'; /* stays empty if vsnprintf fails */
  va_start (ap,fmt);
  vsnprintf (msg,sizeof(msg),fmt,ap);
  va_end (ap);

  log_auto (level,"%s",msg);
}
void sigalarm_handler (int signal) { // fix compiler warning (void)signal; /* This is not an error because it only happens on a connection handler */ log_auto (L_WARNING,"Connection time exceeded"); exit (1); }
void clean_out (int signal) { int rc; pid_t child_pid; int i; struct sigaction ignore; struct sigaction action_dfl; // fix compiler warning (void)signal; /* Ignore new int signals that could arrive during clean up */ ignore.sa_handler = SIG_IGN; sigemptyset (&ignore.sa_mask); ignore.sa_flags = 0; sigaction (SIGINT, &ignore, NULL); // Handle SIGCLD properly action_dfl.sa_flags = 0; #ifdef __IRIX action_dfl.sa_handler = SIG_DFL; #else action_dfl.sa_handler = (void *)SIG_DFL; #endif sigemptyset (&action_dfl.sa_mask); #ifdef __OSX sigaction (SIGCHLD, &action_dfl, NULL); #else sigaction (SIGCLD, &action_dfl, NULL); #endif log_auto (L_INFO,"Cleaning..."); for (i=0; i<MAXTASKS; i++) { if (sdb.comp->status.task[i].used) kill(-sdb.comp->status.task[i].pid,SIGINT); } kill (0,SIGINT); while ((child_pid = wait (&rc)) != -1) { printf ("Child arrived ! %i\n",(int)child_pid); } request_slavexit (sdb.comp->hwinfo.id,SLAVE); computer_free (sdb.comp); if (semctl ((int)sdb.semid,0,IPC_RMID,NULL) == -1) { perror ("semid"); } if (shmctl ((int)sdb.shmid,IPC_RMID,NULL) == -1) { perror ("shmid"); } exit (0); }
int database_load (struct database *wdb) { /* This function returns 1 on success and 0 on failure */ /* It logs failure and maybe it should leave that task to the calling function */ /* README : this function writes to the database without locking */ struct database_hdr hdr; char *basedir; char filename[BUFFERLEN]; int fd; int c; /* counters */ // FIXME: no filename guessing. if ((basedir = getenv ("DRQUEUE_DB")) == NULL) { /* This should never happen because we check it at the beginning of the program */ drerrno = DRE_NOENVROOT; return 0; } snprintf (filename, BUFFERLEN - 1, "%s/drqueue.db", basedir); if ((fd = open (filename, O_RDONLY)) == -1) { drerrno = DRE_ERROROPENING; return 0; } read_32b (fd, &hdr.magic); if (hdr.magic != DB_MAGIC) { drerrno = DRE_DIFFILEFORMAT; close (fd); return 0; } read_32b (fd, &hdr.version); if (hdr.version != database_version_id()) { drerrno = DRE_DIFVERSION; close (fd); return 0; } read_16b (fd, &hdr.job_size); if (hdr.job_size != MAXJOBS) { drerrno = DRE_DIFJOBSIZE; close (fd); return 0; } for (c = 0; c < hdr.job_size; c++) { job_init(&wdb->job[c]); if (!database_job_load (fd, &wdb->job[c])) { log_auto (L_ERROR,"database_load(): error loading job number %i. (%s)",c,strerror(drerrno_system)); close (fd); return 0; } } drerrno = DRE_NOERROR; close (fd); return 1; }
/*
 * Attaches the shared memory segment shmid to this process.
 *
 * Returns the address given by shmat(). When shmat() fails an error is
 * logged and SIGINT is sent to the whole process group; the failure
 * value (void *)-1 is still returned if execution continues.
 */
void *attach_shared_memory_slave (int64_t shmid) {
  void *addr = shmat ((int)shmid,0,0);

  if (addr == (void *)-1) {
    log_auto (L_ERROR,"Problem attaching slave shared memory segment");
    kill (0,SIGINT);
  }

  return addr;
}
/*
 * printf-style logging wrapper that logs in the context of a task.
 *
 * task:  stored in the global logger_task before logging. It is left
 *        set afterwards, as it was before.
 * level: log level; nothing is done when log_level_dest(level) is false.
 * fmt:   printf-style format string, followed by its arguments.
 *
 * The message is expanded here and handed to log_auto() through "%s".
 * log_auto() is itself variadic, so a va_list must not be passed to it
 * as if it were the argument list (the previous code did that, which is
 * undefined behaviour). Messages longer than LOG_MSG_MAX-1 characters
 * are truncated.
 */
void log_slave_task (struct task *task,int level,char *fmt,...) {
  enum { LOG_MSG_MAX = 1024 };
  char msg[LOG_MSG_MAX];
  va_list ap;

  if (!log_level_dest(level)) {
    return;
  }

  logger_task = task;

  msg[0] = '\0'; /* stays empty if vsnprintf fails */
  va_start (ap,fmt);
  vsnprintf (msg,sizeof(msg),fmt,ap);
  va_end (ap);

  log_auto (level,"%s",msg);
}
int database_job_save (int sfd, struct job *job) { struct job bswapped; char *buf = (char*)&bswapped; uint32_t datasize; datasize = sizeof (struct job); datasize = htonl (datasize); //if (!dr_file_write(sfd,(char*)&datasize,sizeof(datasize))) { if (!dr_write(sfd, (char*)&datasize, sizeof(datasize))) { log_auto (L_ERROR,"database_job_save(): error saving job data size (%u). (%s)",ntohl(datasize),strerror(drerrno_system)); return 0; } datasize = ntohl (datasize); job_bswap_to_network (job,&bswapped); //if (!dr_file_write(sfd,buf,datasize)) { if (!dr_write(sfd, buf, datasize)) { log_auto (L_ERROR,"database_job_save(): error saving job main information. (%s)",strerror(drerrno_system)); return 0; } if (job->used) { if (!database_job_save_envvars (sfd,job)) { log_auto (L_ERROR,"database_job_save(): error saving job environment variables. (%s)",strerror(drerrno_system)); return 0; } if (!database_job_save_frames(sfd,job)) { log_auto (L_ERROR,"database_job_save(): could not save job frames. (%s)",strerror(drerrno_system)); return 0; } if (!database_job_save_blocked_hosts(sfd,job)) { log_auto (L_ERROR,"database_job_save(): could not save job blocked hosts. (%s)",strerror(drerrno_system)); return 0; } } drerrno = DRE_NOERROR; return 1; }
/*
 * Obtains the shared memory segment that holds the slave's
 * struct computer.
 *
 * The IPC key comes from ftok() on "$DRQUEUE_BIN/KEY_SLAVE".
 * force: when zero the segment must not exist yet (IPC_EXCL); when
 *        non-zero an existing segment is accepted.
 *
 * Returns the segment id. Every failure is logged and ends the process
 * with exit(1).
 */
int64_t get_shared_memory_slave (int force) {
  char keyfile[BUFFERLEN];
  char *bindir = getenv ("DRQUEUE_BIN");
  key_t key;
  int shmflg = IPC_CREAT|0600;
  int64_t shmid;

  if (!bindir) {
    log_auto (L_ERROR,"get_shared_memory_slave(): environment variable DRQUEUE_BIN not defined.");
    exit (1);
  }
  snprintf (keyfile,BUFFERLEN-1,"%s/%s",bindir,KEY_SLAVE);

  log_auto (L_DEBUG,"get_shared_memory_slave(): using file '%s' as key for shared resources.",keyfile);

  key = ftok (keyfile,'A');
  if (key == -1) {
    log_auto (L_ERROR,"get_shared_memory_slave(): error obtaining key for shared memory (ftok): %s", strerror(errno));
    exit (1);
  }

  if (!force) {
    /* Refuse to reuse a segment that another slave may be using */
    shmflg |= IPC_EXCL;
  }

  shmid = (int64_t)shmget (key,sizeof(struct computer),shmflg);
  if (shmid == (int64_t)-1) {
    log_auto (L_ERROR,"get_shared_memory_slave(): error allocating shared memory space (shmget): %s", strerror(errno));
    if (!force) {
      fprintf (stderr,"Try with option -f (if you are _sure_ that no other slave is running)\n");
    }
    exit (1);
  }

  return shmid;
}
/*
 * printf-style logging wrapper that logs in the context of a computer.
 *
 * computer: stored in the global logger_computer while logging and
 *           reset to NULL afterwards.
 * level:    log level; nothing is done when log_level_dest(level) is
 *           false.
 * fmt:      printf-style format string, followed by its arguments.
 *
 * The message is expanded here and handed to log_auto() through "%s".
 * log_auto() is itself variadic, so a va_list must not be passed to it
 * as if it were the argument list (the previous code did that, which is
 * undefined behaviour). Messages longer than LOG_MSG_MAX-1 characters
 * are truncated.
 */
void log_master_computer (struct computer *computer, int level, char *fmt, ...) {
  enum { LOG_MSG_MAX = 1024 };
  char msg[LOG_MSG_MAX];
  va_list ap;

  if (!log_level_dest(level)) {
    return;
  }

  msg[0] = '\0'; /* stays empty if vsnprintf fails */
  va_start (ap,fmt);
  vsnprintf (msg,sizeof(msg),fmt,ap);
  va_end (ap);

  logger_computer = computer;
  log_auto (level,"%s",msg);
  logger_computer = NULL;
}
/*
 * printf-style logging wrapper that logs in the context of a job.
 *
 * NOTE(review): the function name looks like a truncated
 * "log_master_job"; it is kept exactly as found. Confirm against the
 * header and the callers.
 *
 * job:   stored in the global logger_job while logging and reset to
 *        NULL afterwards.
 * level: log level; nothing is done when log_level_dest(level) is false.
 * fmt:   printf-style format string, followed by its arguments.
 *
 * The message is expanded here and handed to log_auto() through "%s".
 * log_auto() is itself variadic, so a va_list must not be passed to it
 * as if it were the argument list (the previous code did that, which is
 * undefined behaviour). Messages longer than LOG_MSG_MAX-1 characters
 * are truncated.
 */
void og_master_job (struct job *job, int level, char *fmt, ...) {
  enum { LOG_MSG_MAX = 1024 };
  char msg[LOG_MSG_MAX];
  va_list ap;

  if (!log_level_dest (level)) {
    return;
  }

  msg[0] = '\0'; /* stays empty if vsnprintf fails */
  va_start (ap,fmt);
  vsnprintf (msg,sizeof(msg),fmt,ap);
  va_end (ap);

  logger_job = job;
  log_auto (level,"%s",msg);
  logger_job = NULL;
}
int64_t get_semaphores_slave (void) { key_t key; int64_t semid; struct sembuf op; char file[BUFFERLEN]; char *root; root = getenv("DRQUEUE_BIN"); if (root) { snprintf (file,BUFFERLEN-1,"%s/%s",root,KEY_SLAVE); } else { log_auto (L_ERROR,"get_semaphores_slave(): environment variable DRQUEUE_BIN not defined."); exit (1); } log_auto (L_DEBUG,"get_semaphores_slave(): using file '%s' as key for shared resources.",file); if ((key = ftok (file,'A')) == -1) { log_auto (L_ERROR,"Getting key for semaphores"); kill (0,SIGINT); } if ((semid = (int64_t)semget (key,1, IPC_CREAT|0600)) == (int64_t)-1) { log_auto (L_ERROR,"Getting semaphores: %s",strerror(errno)); kill (0,SIGINT); } #if _SEM_SEMUN_UNDEFINED == 1 || defined (__CYGWIN) union semun { int val; struct semid_ds *buf; unsigned short int *array; struct seminfo *__buf; } u_semun; #else union semun u_semun; #endif u_semun.val = 1; if (semctl ((int)semid,0,SETVAL,u_semun) == -1) { drerrno_system = errno; log_auto (L_ERROR,"get_semaphores_slave(): could not set initial semaphore value. (%s)",strerror(drerrno_system)); kill (0,SIGINT); } if (semctl ((int)semid,0,GETVAL) == 0) { op.sem_num = 0; op.sem_op = 1; op.sem_flg = 0; if (semop((int)semid,&op,1) == -1) { log_auto (L_ERROR,"semaphore_release: %s",strerror(errno)); kill(0,SIGINT); } } return semid; }
int database_job_load (int sfd, struct job *job) { uint32_t datasize; //if (!dr_file_read(sfd,(char*)&datasize,sizeof(datasize))) { if (!dr_read(sfd,(char*)&datasize,sizeof(datasize))) { log_auto (L_ERROR,"database_job_load(): error reading job data size (%u). (%s)",ntohl(datasize),strerror(drerrno_system)); return 0; } datasize = ntohl (datasize); if (datasize != sizeof (struct job)) { log_auto (L_ERROR,"database_job_load(): job data sizes do not match. Read: %u Current: %u",datasize,sizeof(struct job)); return 0; } job_delete(job); //if (!dr_file_read(sfd,(char*)job,datasize)) { if (!dr_read(sfd, (char*)job, datasize)) { log_auto (L_ERROR,"database_job_load(): error reading job main information. (%s)",strerror(drerrno_system)); return 0; } job_bswap_from_network (job,job); job_fix_received_invalid (job); if (job->used) { if (!database_job_load_envvars (sfd,job)) { log_auto (L_ERROR,"database_job_load(): error reading job environment variables. (%s)",strerror(drerrno_system)); return 0; } if (!database_job_load_frames(sfd,job)) { log_auto (L_ERROR,"database_job_load(): error reading job frame information. (%s)",strerror(drerrno_system)); return 0; } if (!database_job_load_blocked_hosts(sfd,job)) { log_auto (L_ERROR,"database_job_load(): error reading job blocked hosts list. (%s)",strerror(drerrno_system)); return 0; } } drerrno = DRE_NOERROR; return 1; }
void slave_listening_process (struct slave_database *sdb) { pid_t child_pid; int sfd,csfd,highest_fd; if ((sfd = get_socket(SLAVEPORT)) == -1) { log_auto(L_ERROR,"Unable to open socket (server)"); slave_exit(SIGINT); } highest_fd = sfd+1; log_auto (L_DEBUG,"Highest file descriptor after initialization %i",highest_fd); log_auto (L_INFO,"Slave waiting for remote requests"); while (1) { if ((csfd = accept_socket_slave (sfd)) != -1) { // Ignore children exit codes & do not let zombies around ignore_sigcld(); if ((child_pid = fork()) == 0) { // Child process set_signal_handlers_child_chandler (); close (sfd); alarm (MAXTIMECONNECTION); handle_request_slave (csfd,sdb); close (csfd); exit (0); } else if (child_pid == -1) { // Parent still drerrno_system = errno; log_auto (L_ERROR,"slave_listening_process(): error forking on slave_listening_process. (%s)",strerror(drerrno_system)); } // Parent close (csfd); if (csfd > highest_fd) log_auto (L_DEBUG,"slave_listening_process(): csfd has grown over the default highest (csfd=%i)",csfd); } else { // csfd == -1 log_auto (L_ERROR,"slave_listening_process(): error accepting connection. (%s)",strerror(drerrno_system)); } } exit (0); }
int database_save (struct database *wdb) { /* This function returns 1 on success and 0 on failure */ /* It logs failure and maybe it should leave that task to the calling function */ /* README : this function reads from the database memory without locking */ struct database_hdr hdr; char *basedir; char dir[BUFFERLEN]; char filename[BUFFERLEN]; int fd; uint32_t c; // FIXME: this all filename guessing should be inside a function if ((basedir = getenv ("DRQUEUE_DB")) == NULL) { /* This should never happen because we check it at the beginning of the program */ log_auto (L_ERROR,"database_save() : DRQUEUE_DB environment variable could not be found. Master db cannot be saved."); drerrno = DRE_NOENVROOT; return 0; } snprintf (dir, BUFFERLEN - 1, "%s", basedir); snprintf (filename, BUFFERLEN - 1, "%s/drqueue.db", dir); // Lock it semaphore_lock(wdb->semid); if (database_backup(wdb) == 0) { // FIXME: filename should be a value returned by a function log_auto (L_ERROR,"database_save() : there was an error while backing up old database. NOT SAVING current one. (file: %s)", filename); } // FIXME: // dbfd = database_file_open(filename) log_auto (L_INFO,"Storing DB into: '%s'",filename); if ((fd = open (filename, O_CREAT | O_TRUNC | O_RDWR, 0664)) == -1) { if (errno == ENOENT) { /* If its because the directory does not exist we try creating it first */ #ifdef _WIN32 if (mkdir (dir) == -1) { #else if (mkdir (dir, 0775) == -1) { #endif drerrno_system = errno; log_auto (L_WARNING,"Could not create database directory. Check permissions: %s. (%s)", dir,strerror(drerrno_system)); drerrno = DRE_COULDNOTCREATE; return 0; } if ((fd = open (filename, O_CREAT | O_TRUNC | O_RDWR, 0664)) == -1) { log_auto (L_WARNING,"Could not open database file for writing. Check permissions: %s. (%s)", filename,strerror(drerrno_system)); drerrno = DRE_COULDNOTCREATE; return 0; } } else { /* could not open the file for other reasons */ log_auto (L_WARNING,"Could not open database file for writing. 
Check permissions: %s", filename); drerrno = DRE_COULDNOTCREATE; return 0; } } // FIXME: database_header_save() hdr.magic = DB_MAGIC; hdr.version = database_version_id(); hdr.job_size = MAXJOBS; write_32b (fd, &hdr.magic); write_32b (fd, &hdr.version); write_16b (fd, &hdr.job_size); for (c = 0; c < hdr.job_size; c++) { logger_job = &wdb->job[c]; if (!database_job_save (fd, &wdb->job[c])) { // FIXME: report log_auto (L_ERROR,"database_save(): error saving job number %i. (%s)",c,strerror(drerrno_system)); return 0; } } logger_job = NULL; log_auto (L_INFO,"Database saved successfully."); semaphore_release(wdb->semid); // Unlock it return 1; } int database_job_save_frames (int sfd, struct job *job) { int nframes = job_nframes (job); struct frame_info *fi; int i; if ((fi = attach_frame_shared_memory (job->fishmid)) == (void *) -1) { // Store empty frames in an attemp to save other jobs // FIXME: Warning CORRUPT struct frame_info fi2; job_frame_info_init (&fi2); for (i = 0; i < nframes; i++) { if (!send_frame_info (sfd, &fi2)) { return 0; } } } else { for (i = 0; i < nframes; i++) { if (!send_frame_info (sfd, &fi[i])) { detach_frame_shared_memory (fi); return 0; } } detach_frame_shared_memory (fi); } return 1; }
void get_loadavg (uint16_t *loadavg) { #if defined(__LINUX) /* __LINUX */ FILE *f_loadavg; float a,b,c; if ((f_loadavg = fopen("/proc/loadavg","r")) == NULL) { perror ("get_loadavg: fopen"); exit (1); } fscanf (f_loadavg,"%f %f %f",&a,&b,&c); loadavg[0] = (uint16_t) (a * 100); loadavg[1] = (uint16_t) (b * 100); loadavg[2] = (uint16_t) (c * 100); fclose (f_loadavg); #elif defined(__IRIX) /* __IRIX */ sgt_cookie_t cookie; uint32_t tla[3]; SGT_COOKIE_SET_KSYM (&cookie,KSYM_AVENRUN); if (sysget (SGT_KSYM,(char *)tla,sizeof(uint32_t)*3,SGT_READ,&cookie) == -1) { FILE *uptime; char buf[BUFFERLEN]; char *fd; /* first digit */ float f1,f2,f3; if ((uptime = popen ("/usr/bsd/uptime","r")) == NULL) { fprintf (stderr,"Warning: Problems executing '/usr/bsd/uptime'\n"); tla[0] = tla[1] = tla[2] = 0; } while (fgets (buf,BUFFERLEN,uptime) != NULL) { if ((fd = strstr(buf,"average:")) != NULL) { while (!isdigit((int)*fd)) fd++; if (sscanf (fd,"%f, %f, %f",&f1,&f2,&f3) != 3) { log_auto (L_WARNING,"Problems on get_loadavg\n"); f1 = f2 = f3 = 0; } tla[0] = f1 * 1000; tla[1] = f2 * 1000; tla[2] = f3 * 1000; } } pclose (uptime); } loadavg[0] = (uint16_t) (tla[0]/10); loadavg[1] = (uint16_t) (tla[1]/10); loadavg[2] = (uint16_t) (tla[2]/10); #elif defined(__OSX) || defined(__FREEBSD) double fls[3]; if (getloadavg(fls,3)<3) { log_auto (L_WARNING,"Problems on getloadavg\n"); fls[0]=fls[1]=fls[2]=0.0; } loadavg[0] = (uint16_t) (fls[0] * 100); loadavg[1] = (uint16_t) (fls[1] * 100); loadavg[2] = (uint16_t) (fls[2] * 100); #elif defined(__CYGWIN) /* __CYGWIN */ FILE *f_loadavg; float a,b,c; if ((f_loadavg = fopen("/proc/loadavg","r")) == NULL) { perror ("get_loadavg: fopen"); exit (1); } fscanf (f_loadavg,"%f %f %f",&a,&b,&c); loadavg[0] = a * 100; loadavg[1] = b * 100; loadavg[2] = c * 100; fclose (f_loadavg); #else # error You need to define the OS, or OS defined not supported #endif }
int main (int argc,char *argv[]) { int force = 0; pid_t consistency_pid; pid_t listener_pid; slave_get_options(&argc,&argv,&force,&sdb); logtool = DRQ_LOG_TOOL_SLAVE; // Set some standard defaults based on DRQUEUE_ROOT (must be already set!) set_default_env(); // Config files overrides environment // Read the config file after reading the arguments, as those may change // the path to the config file if (sdb.conf[0]) { config_parse(sdb.conf); } else { config_parse_tool("slave"); } if (!common_environment_check()) { log_auto (L_ERROR,"Error checking the environment: %s",drerrno_str()); exit (1); } set_signal_handlers (); //system ("env | grep DRQUEUE"); sdb.shmid = get_shared_memory_slave (force); sdb.comp = attach_shared_memory_slave (sdb.shmid); sdb.semid = get_semaphores_slave (); log_auto (L_INFO,"Starting..."); computer_init (sdb.comp); sdb.comp->used = 1; get_hwinfo (&sdb.comp->hwinfo); computer_limits_cpu_init (sdb.comp); // computer_init_limits depends on the hardware information slave_set_limits (&sdb); // Override defaults logger_computer = sdb.comp; report_hwinfo (&sdb.comp->hwinfo); fprintf (stderr,"Working silently..."); register_slave (sdb.comp); // Before sending the limits we have to set the pools computer_pool_set_from_environment (&sdb.comp->limits); computer_pool_list (&sdb.comp->limits); update_computer_limits(&sdb.comp->limits); /* Does not need to be locked because at this point */ /* because there is only one process running. 
The rest of the time */ /* either we call it locked or we make a copy of the limits while locked */ /* and then we send that copy */ if (pipe(phantom) != 0) { fprintf (stderr,"Phantom pipe could not be created\n"); exit (1); } if ((listener_pid = fork()) == 0) { /* Create the listening process */ log_auto (L_INFO,"Listener process starting..."); set_signal_handlers_child_listening (); slave_listening_process (&sdb); log_auto (L_INFO,"Listener process exiting..."); exit (0); } else if (listener_pid == -1) { drerrno_system = errno; log_auto (L_ERROR,"Could not create the listener process. (%s)", strerror(drerrno_system)); slave_exit(SIGINT); } if ((consistency_pid = fork()) == 0) { // Create the consistency checks process // Signal are treated the same way as the listening process log_auto (L_INFO,"Consistency process starting..."); set_signal_handlers_child_listening (); slave_consistency_process (&sdb); log_auto (L_INFO,"Consistency process exiting..."); exit (0); } else if (consistency_pid == -1) { drerrno_system = errno; log_auto (L_ERROR,"Could not create the listener process. (%s)", strerror(drerrno_system)); slave_exit(SIGINT); } while (1) { get_computer_status (&sdb.comp->status,sdb.semid); computer_autoenable_check (&sdb); /* Check if it's time for autoenable */ while (computer_available(sdb.comp)) { uint16_t itask; if (request_job_available(&sdb,&itask)) { launch_task(&sdb,itask); update_computer_status (&sdb); } else { // computer not available break; // break the while loop } } /* WARNING could be in this loop forever if no care is taken !! 
*/ update_computer_status (&sdb); /* sends the computer status to the master */ /* Does not need to be locked because we lock inside it */ FD_ZERO(&read_set); FD_SET(phantom[0],&read_set); timeout.tv_sec = SLAVEDELAY; timeout.tv_usec = 0; rs = select (phantom[0]+1,&read_set,NULL,NULL,&timeout); switch (rs) { case -1: /* Error in select */ log_auto(L_ERROR,"Select call failed"); case 0: log_auto(L_DEBUG,"Slave loop (select call timeout)"); break; default: if (FD_ISSET(phantom[0],&read_set)) { log_auto(L_DEBUG,"Select call, notification came. Available for reading."); read(phantom[0],buffer,BUFFERLEN); } else { log_auto(L_WARNING,"Select call, report this message, please. It should never happen."); } } } exit (0); }
void launch_task (struct slave_database *sdb, uint16_t itask) { /* Here we get the job ready in the process task structure pointed by itask */ int rc; pid_t task_pid,waiter_pid; extern char **environ; char *exec_path; struct task *ttask; logtool = DRQ_LOG_TOOL_SLAVE_TASK; ttask = malloc (sizeof(*ttask)); memcpy(ttask,&sdb->comp->status.task[itask],sizeof(*ttask)); logger_task = ttask; if ((waiter_pid = fork()) == 0) { // // WAITER PROCESS // This process reports the execution of the command itself // set_signal_handlers_child_launcher (); if ((task_pid = fork()) == 0) { // // TASK PROCESS // This process executes the task // This child also creates the directory for logging if it doesn't exist // and prepares the file descriptors so every output will be logged // const char *new_argv[4]; int lfd; /* logger fd */ #ifdef __CYGWIN new_argv[0] = SHELL_NAME; new_argv[1] = "-c"; new_argv[2] = sdb->comp->status.task[itask].jobcmd; new_argv[3] = NULL; /* new_argv[0] = SHELL_NAME; */ /* if ((new_argv[1] = malloc(MAXCMDLEN)) == NULL) */ /* exit (1); */ /* cygwin_conv_to_posix_path(sdb->comp->status.task[itask].jobcmd,(char*)new_argv[1]); */ /* new_argv[2] = NULL; */ #else new_argv[0] = SHELL_NAME; new_argv[1] = "-c"; new_argv[2] = sdb->comp->status.task[itask].jobcmd; new_argv[3] = NULL; #endif setpgid(0,0); /* So this process doesn't receive signals from the others */ set_signal_handlers_task_exec (); if ((lfd = log_dumptask_open (&sdb->comp->status.task[itask])) != -1) { // Log on the logger file whatever goes to stdout and stderr dup2 (lfd,STDOUT_FILENO); dup2 (lfd,STDERR_FILENO); close (lfd); } task_environment_set(&sdb->comp->status.task[itask]); #ifdef __CYGWIN exec_path = SHELL_PATH; /* exec_path = malloc(PATH_MAX); */ /* char *dr_bin = getenv("DRQUEUE_BIN"); */ /* if (dr_bin) { */ /* snprintf (exec_path,PATH_MAX,"%s/tcsh.exe",dr_bin); */ /* } */ #else exec_path = SHELL_PATH; #endif execve(exec_path,(char*const*)new_argv,environ); // Wouldn't reach this point unless 
error on execve drerrno_system = errno; log_auto(L_ERROR,"launch_task(): error on execve. (%s)",strerror(drerrno_system)); slave_exit(SIGINT); } else if (task_pid == -1) { log_auto(L_ERROR,"lauch_task(): Fork failed. Task not created."); //semaphore_lock(sdb->semid); //sdb->comp->status.task[itask].used = 0; /* We don't need the task anymore */ //semaphore_release(sdb->semid); } // Then we set the process as loading // Later on, well make a check for every loading frame and if running change its status semaphore_lock(sdb->semid); sdb->comp->status.task[itask].status = TASKSTATUS_LOADING; sdb->comp->status.task[itask].start_loading_time = time(NULL); sdb->comp->status.task[itask].pid = task_pid; sdb->comp->status.ntasks = computer_ntasks (sdb->comp); sdb->comp->status.nrunning = computer_nrunning (sdb->comp); semaphore_release(sdb->semid); if (waitpid(task_pid,&rc,0) == -1) { // It forked on task_pid but waitpid says it doesn't exist. drerrno_system = errno; log_auto(L_ERROR,"lauch_task(): task process (%i) does not exist. (%s)",task_pid,strerror(drerrno_system)); semaphore_lock(sdb->semid); sdb->comp->status.task[itask].used = 0; /* We don't need the task anymore */ sdb->comp->status.ntasks = computer_ntasks (sdb->comp); sdb->comp->status.nrunning = computer_nrunning (sdb->comp); semaphore_release(sdb->semid); // FIXME: notify the master ? 
} else { // waitpid returned successfully /* We have to clean the task and send the info to the master */ /* consider WIFSIGNALED(status), WTERMSIG(status), WEXITSTATUS(status) */ /* we pass directly the status (translated to DR) to the master and he decides what to do with the frame */ semaphore_lock(sdb->semid); sdb->comp->status.task[itask].exitstatus = 0; sdb->comp->status.task[itask].status = TASKSTATUS_FINISHED; sdb->comp->status.ntasks = computer_ntasks (sdb->comp); sdb->comp->status.nrunning = computer_nrunning (sdb->comp); if (WIFSIGNALED(rc)) { /* Process exited abnormally either killed by us or by itself (SIGSEGV) */ /* printf ("\n\nSIGNALED with %i\n",WTERMSIG(rc)); */ sdb->comp->status.task[itask].exitstatus |= DR_SIGNALEDFLAG ; sdb->comp->status.task[itask].exitstatus |= WTERMSIG(rc); log_auto(L_INFO,"Task signaled"); } else { if (WIFEXITED(rc)) { /* printf ("\n\nEXITED with %i\n",WEXITSTATUS(rc)); */ sdb->comp->status.task[itask].exitstatus |= DR_EXITEDFLAG ; sdb->comp->status.task[itask].exitstatus |= WEXITSTATUS(rc); /* printf ("\n\nEXITED with %i\n",DR_WEXITSTATUS(sdb->comp->status.task[itask].exitstatus)); */ log_auto(L_INFO,"Task finished"); } else { log_auto(L_WARNING,"Task finished with rc = %i",rc); } } semaphore_release(sdb->semid); request_task_finished (sdb,itask); semaphore_lock(sdb->semid); sdb->comp->status.task[itask].used = 0; /* We don't need the task anymore */ semaphore_release(sdb->semid); } exit (0); } else if (waiter_pid == -1) { log_auto(L_WARNING,"Fork failed for task waiter"); semaphore_lock(sdb->semid); sdb->comp->status.task[itask].used = 0; /* We don't need the task anymore */ semaphore_release(sdb->semid); exit (1); } else { // in this "else" waiter_pid actually contains the PID of the waiter process } }