/*
 * Tear down a finished job step.
 *
 * job - step state; may be NULL. Its accounting data is always destroyed,
 *       the job struct itself only when job->batch is not set.
 * msg - the launch request that started the step; its payload and the
 *       message itself are released here.
 * rc  - not used by this function.
 */
static void
_step_cleanup(slurmd_job_t *job, slurm_msg_t *msg, int rc)
{
	if (job != NULL) {
		jobacct_gather_g_destroy(job->jobacct);
		if (!job->batch)
			job_destroy(job);
	}

	/*
	 * The launch message must outlive the job step: the job struct
	 * keeps pointers into it (the switch jobinfo pointer, for one),
	 * so its payload is only released at this point.
	 */
	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
		slurm_free_job_launch_msg(msg->data);
	} else if (msg->msg_type == REQUEST_LAUNCH_TASKS) {
		slurm_free_launch_tasks_request_msg(msg->data);
	} else {
		fatal("handle_launch_message: Unrecognized launch RPC");
	}

	jobacct_gather_g_destroy(step_complete.jobacct);
	xfree(msg);
}
/*
 * JS native: send a value to Erlang and wait for the answer.
 *
 * argv[0] is converted with to_erl(), wrapped by vm_mk_message() and sent
 * to the pid stored in the VM's current job. The function then takes the
 * next job from vm->jobs:
 *   - a job_close job aborts the call with a "Context closing." error;
 *   - otherwise the job's args are converted with to_js() into *rval.
 *
 * Returns JS_TRUE on success, JS_FALSE when no argument was supplied, when
 * the message could not be sent, or when the context is closing.
 */
static JSBool
jserl_send(JSContext* cx, JSObject* obj, uintN argc, jsval* argv, jsval* rval)
{
    vm_ptr vm = (vm_ptr) JS_GetContextPrivate(cx);
    ErlNifEnv* env;
    job_ptr job;
    ENTERM mesg;
    int sent;

    /*
     * argc is unsigned, so a "< 0" comparison can never be true and
     * argv[0] would be read even when the script passed no arguments.
     */
    if(argc < 1)
    {
        return JS_FALSE;
    }

    assert(vm != NULL && "Context has no vm.");

    env = enif_alloc_env();
    mesg = vm_mk_message(env, to_erl(env, cx, argv[0]));

    sent = enif_send(NULL, &(vm->curr_job->pid), env, mesg);

    /*
     * The message environment is ours to release whether or not the send
     * succeeded; without this every call leaks one environment.
     */
    enif_free_env(env);

    // If pid is not alive, raise an error.
    // XXX: Can I make this uncatchable?
    if(!sent)
    {
        JS_ReportError(cx, "Context closing.");
        return JS_FALSE;
    }

    job = queue_receive(vm->jobs);
    if(job->type == job_close)
    {
        // XXX: Can I make this uncatchable?
        job_destroy(job);
        JS_ReportError(cx, "Context closing.");
        return JS_FALSE;
    }

    assert(job->type == job_response && "Invalid message response.");

    *rval = to_js(job->env, cx, job->args);
    job_destroy(job);

    return JS_TRUE;
}
/*
 * Queue a job_response job carrying a copy of `data` on vm->jobs via
 * queue_send().
 *
 * Returns 1 when the job was handed to the queue, 0 when the job could not
 * be created or queue_send() refused it (the job is destroyed in that case).
 */
int
vm_send(vm_ptr vm, ENTERM data)
{
    job_ptr resp = job_create();

    if(resp == NULL)
    {
        return 0;
    }

    resp->type = job_response;
    resp->args = enif_make_copy(resp->env, data);

    if(queue_send(vm->jobs, resp))
    {
        return 1;
    }

    /* The queue did not take ownership, so the job is still ours. */
    job_destroy(resp);
    return 0;
}
/*
 * Unlink job `j` from list `jl`, cancel it if it is still running, then
 * destroy and free it.
 *
 * Both j->prev and j->next are dereferenced unconditionally, so `j` must
 * have valid neighbours on either side.
 */
void joblist_remove(joblist *jl, job *j)
{
    job *before;
    job *after;

    if (debug > 1) {
        printf("SERVER: removing job id %d\n", j->id);
    }

    /* Splice the neighbours together, bypassing j. */
    before = j->prev;
    after = j->next;
    after->prev = before;
    before->next = after;
    jl->N--;

    if (j->state == running) {
        job_cancel(j);
    }

    job_destroy(j);
    free(j);
}
/*
 * Build a job_call job (reference, caller pid, function name and arguments,
 * with the terms copied into the job's own environment) and push it onto
 * vm->jobs.
 *
 * Returns 1 if the job was queued, 0 if it could not be created or queued.
 */
int
vm_add_call(vm_ptr vm, ENTERM ref, ENPID pid, ENTERM name, ENTERM args)
{
    job_ptr call = job_create();
    int queued = 0;

    if(call != NULL)
    {
        call->type = job_call;
        call->ref = enif_make_copy(call->env, ref);
        call->pid = pid;
        call->name = enif_make_copy(call->env, name);
        call->args = enif_make_copy(call->env, args);

        if(queue_push(vm->jobs, call))
        {
            queued = 1;
        }
        else
        {
            /* Push failed: the job never reached the queue. */
            job_destroy(call);
        }
    }

    return queued;
}
/*
 * Build a job_eval job and push it onto vm->jobs.
 *
 * The reference term is copied into the job's environment and the script
 * bytes in `bin` are copied into a binary allocated for the job, so the
 * caller's `bin` is not retained.
 *
 * Returns 1 if the job was queued; 0 if the job or its script binary could
 * not be allocated or the queue rejected the job.
 */
int
vm_add_eval(vm_ptr vm, ENTERM ref, ENPID pid, ENBINARY bin)
{
    job_ptr job = job_create();

    /*
     * job_create() can fail (the error label below already expects a NULL
     * job); bail out before touching any field.
     */
    if(job == NULL) goto error;

    job->type = job_eval;
    job->ref = enif_make_copy(job->env, ref);
    job->pid = pid;

    if(!enif_alloc_binary(bin.size, &(job->script))) goto error;
    memcpy(job->script.data, bin.data, bin.size);

    if(!queue_push(vm->jobs, job)) goto error;

    return 1;

error:
    if(job != NULL) job_destroy(job);
    return 0;
}
/*
 * Worker entry point for one scheduler job.
 *
 * data      - the GIOSchedulerJob to run
 * user_data - not used
 *
 * If the job has a cancellable it is pushed as the current cancellable for
 * the duration of the run and popped afterwards. The job function is called
 * at least once and then again for as long as it keeps returning TRUE; the
 * job is destroyed at the end.
 */
static void
io_job_thread (gpointer data,
               gpointer user_data)
{
  GIOSchedulerJob *sched_job = data;

  if (sched_job->cancellable)
    g_cancellable_push_current (sched_job->cancellable);

  /* Re-invoke the job function until it reports that it is finished. */
  while (sched_job->job_func (sched_job, sched_job->cancellable, sched_job->data))
    ;

  if (sched_job->cancellable)
    g_cancellable_pop_current (sched_job->cancellable);

  job_destroy (sched_job);
}
/* * job_retrieve() will retrieve the disk copy of a job associated with the * transfer file name passed in. It returns a pointer to a job structure * or a NULL if the job was not on disk. */ job_t * job_retrieve(char *xFile, char *spool) { int retry_cnt = 0; char *s; jobfile_t *file; char cFile[BUFSIZ]; char buf[BUFSIZ]; int fd; flock_t flk; job_t *tmp; syslog(LOG_DEBUG, "job_retrieve(%s)", xFile); if ((tmp = (job_t *)calloc(1, sizeof (*tmp))) == NULL) { return (NULL); } if ((file = calloc(1, sizeof (*file))) == NULL) { free(tmp); return (NULL); } flk.l_type = F_RDLCK; flk.l_whence = 1; flk.l_start = 0; flk.l_len = 0; (void) memset(buf, NULL, sizeof (buf)); /* get job id, from binding file name */ (void) strlcpy(buf, xFile + strlen(_xfer_file_prefix) + 1, sizeof (buf)); buf[3] = NULL; tmp->job_id = atoi(buf); /* Construct data file and control file names */ (void) strlcpy(cFile, _control_file_prefix, sizeof (cFile)); (void) strlcat(cFile, xFile + strlen(_xfer_file_prefix), sizeof (cFile)); /* remove data file and control file whenever xFile is removed */ if ((fd = open(xFile, O_RDONLY)) < 0) { syslog(LOG_DEBUG, "job_retrieve(%s) open failed errno=%d", xFile, errno); if (get_job_from_cfile(file, cFile, xFile, tmp)) job_destroy(tmp); free(file); free(tmp); (void) unlink(xFile); (void) unlink(cFile); return (NULL); } /* * If failed to get a lock on the file, just return NULL. It will * be retried later. */ if ((fcntl(fd, F_SETLK, &flk)) < 0) { syslog(LOG_DEBUG, "job_retrieve(%s) lock failed errno=%d", xFile, errno); close(fd); free(file); free(tmp); return (NULL); } /* * Retry a few times if we failed to read or read returns 0, just * to make sure we tried hard before giving up. In practice, * there were cases of read() returning 0. To handle that * scenario just try a few times. 
*/ for (retry_cnt = 0; retry_cnt < MAX_RETRIES; retry_cnt++) { if ((read(fd, buf, sizeof (buf))) > 0) { close(fd); if ((s = strtok(buf, ":\n")) != NULL) tmp->job_server = strdup(s); if ((s = strtok(NULL, ":\n")) != NULL) tmp->job_printer = strdup(s); syslog(LOG_DEBUG, "job_retrieve(%s) success - %s:%s", xFile, tmp->job_server, tmp->job_printer); break; } } /* * If failed to read after MAX_RETRIES, return NULL and remove xFile, * and cFile. */ if (retry_cnt == MAX_RETRIES) { syslog(LOG_DEBUG, "job_retrieve(%s) unsuccessful", xFile); if (get_job_from_cfile(file, cFile, xFile, tmp)) job_destroy(tmp); free(file); free(tmp); (void) unlink(xFile); (void) unlink(cFile); return (NULL); } file->jf_src_path = strdup(xFile); file->jf_spl_path = strdup(cFile); if (!get_job_from_cfile(file, cFile, xFile, tmp)) { (void) unlink(file->jf_spl_path); /* control file */ (void) unlink(file->jf_src_path); /* binding file */ free(file->jf_src_path); free(file->jf_spl_path); free(file); free(tmp); return (NULL); } tmp->job_spool_dir = strdup(spool); return (tmp); }
/*! Main @param argc number of arguments @param argv arguments @return fuse_main()s return value */ int main(int argc, char **argv) { /* return value of fuse_main() */ int ret; /* for signal handling */ struct sigaction sig; /* argument handling */ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); /* file name for database */ char *db_file; /*------------------------* * install signal handler * *------------------------*/ /* set handling function */ sig.sa_handler = sig_handler; /* set (no) flags */ sig.sa_flags = 0; /* don't ignore any signal */ sigemptyset(&sig.sa_mask); /* install signal handler for USR1 and USR2 */ sigaction(SIGUSR1, &sig, NULL); sigaction(SIGUSR2, &sig, NULL); /*------------------* * handle arguments * *------------------*/ if (fuse_opt_parse(&args, &discofs_options, discofs_opts, discofs_opt_proc) == -1) return EXIT_FAILURE; /* after option parsing, remote mount point must be set */ if (!REMOTE_ROOT) { fprintf(stderr, "no remote filesystem given\n"); return EXIT_FAILURE; } /* a mount point for discofs must also be set */ if (!discofs_options.discofs_mp) { fprintf(stderr, "no mount point given\n"); return EXIT_FAILURE; } /* add "use_ino" to display inodes in stat(1)*/ fuse_opt_add_arg(&args, "-ouse_ino"); /*---------------* * set UID / GID * *---------------*/ /* set GID first since permissions might not be sufficient if UID was set beforehand */ if (discofs_options.gid) { VERBOSE("setting gid to %d\n", discofs_options.gid); if (setgid(discofs_options.gid)) { perror("setting gid"); return EXIT_FAILURE; } } if (discofs_options.uid) { VERBOSE("setting uid to %d\n", discofs_options.uid); if (setuid(discofs_options.uid)) { perror("setting uid"); return EXIT_FAILURE; } } /*--------------------* * initialize logging * *--------------------*/ /* if -d is specified, override logging settings */ if (discofs_options.debug) log_init(LOG_DEBUG, NULL); else log_init(discofs_options.loglevel, discofs_options.logfile); /*=========================* * 
INITIALIZE CACHE AND DB * *=========================*/ /* compute data root if not passed as option */ if (!discofs_options.data_root) discofs_options.data_root = paths_data_root(REMOTE_ROOT); if (!is_dir(discofs_options.data_root)) { if (mkdir_rec(discofs_options.data_root)) FATAL("failed to create data directory %s\n", discofs_options.data_root); } /*----------------------* * initialize cache dir * *----------------------*/ /* set cache dir */ CACHE_ROOT = join_path(discofs_options.data_root, "cache"); /* store length of cache root (to save a few hundred strlen() calls) */ CACHE_ROOT_LEN = strlen(CACHE_ROOT); /* delete cache if "clear" specified */ if (discofs_options.clear) { VERBOSE("deleting cache\n"); rmdir_rec(CACHE_ROOT); } /* create cache root if needed */ if (!is_dir(CACHE_ROOT)) { if (mkdir(CACHE_ROOT, S_IRWXU) != 0) FATAL("failed to create cache directory %s\n", CACHE_ROOT); } /*---------------------* * initialize database * *---------------------*/ /* set db filename */ db_file = join_path(discofs_options.data_root, "db.sqlite"); /* create database file if it doesn't exist */ int fd = open(db_file, (O_RDONLY | O_CREAT), (S_IRUSR | S_IWUSR)); if (fd == -1) { perror(db_file); FATAL("couldn't open or create database file\n"); } close(fd); /* initialize tables etc */ db_init(db_file, discofs_options.clear); /* try to load filesystem features from DB */ if (db_cfg_get_int(CFG_FS_FEATURES, &discofs_options.fs_features)) { /* if loading failed, try to determine them */ if (is_mounted(REMOTE_ROOT) && is_reachable(discofs_options.host)) { if (test_fs_features(&discofs_options.fs_features)) { ERROR("failed to test remote fs features\n"); discofs_options.fs_features = 0; } /* test succeeded, store value for next time */ else db_cfg_set_int(CFG_FS_FEATURES, discofs_options.fs_features); } /* nag and assume that no features available (but don't save that) */ else { ERROR("could not determine remote fs features"); discofs_options.fs_features = 0; } } 
/*------------------* * initialize stuff * *------------------*/ #define INIT(name) \ if (name ## _init()) \ FATAL("error initializing " #name) INIT(lock); INIT(sync); INIT(job); #undef INIT /*----------------------* * print options to log * *----------------------*/ log_options(LOG_VERBOSE, discofs_options); /*-----------------* * run fuse_main() * *-----------------*/ ret = fuse_main(args.argc, args.argv, &discofs_oper, NULL); /*------* * exit * *------*/ lock_destroy(); sync_destroy(); job_destroy(); /* free arguments */ fuse_opt_free_args(&args); /* close database connection */ db_destroy(); /* end logging */ INFO("exiting\n"); log_destroy(); /* return fuse_main()s return value */ return ret; }
void* vm_run(void* arg) { vm_ptr vm = (vm_ptr) arg; JSContext* cx; JSObject* gl; job_ptr job; ENTERM resp; int flags; cx = JS_NewContext(vm->runtime, vm->stack_size); if(cx == NULL) { fprintf(stderr, "Failed to create context.\n"); goto done; } JS_BeginRequest(cx); flags = 0; flags |= JSOPTION_VAROBJFIX; flags |= JSOPTION_STRICT; flags |= JSVERSION_LATEST; flags |= JSOPTION_COMPILE_N_GO; flags |= JSOPTION_XML; JS_SetOptions(cx, JS_GetOptions(cx) | flags); gl = JS_NewObject(cx, &global_class, NULL, NULL); if(gl == NULL) { fprintf(stderr, "Failed to create global object.\n"); goto done; } if(!JS_InitStandardClasses(cx, gl)) { fprintf(stderr, "Failed to initialize classes.\n"); goto done; } if(!install_jserl(cx, gl)) { fprintf(stderr, "Failed to install erlang object."); goto done; } JS_SetErrorReporter(cx, vm_report_error); JS_SetContextPrivate(cx, (void*) vm); JS_EndRequest(cx); while(1) { job = queue_pop(vm->jobs); if(job->type == job_close) { job_destroy(job); break; } JS_BeginRequest(cx); assert(vm->curr_job == NULL && "vm already has a job set."); vm->curr_job = job; if(job->type == job_eval) { resp = vm_eval(cx, gl, job); } else if(job->type == job_call) { resp = vm_call(cx, gl, job); } else { assert(0 && "Invalid job type."); } vm->curr_job = NULL; JS_EndRequest(cx); JS_MaybeGC(cx); // XXX: If pid is not alive, we just ignore it. enif_send(NULL, &(job->pid), job->env, resp); job_destroy(job); } done: JS_BeginRequest(cx); if(cx != NULL) JS_DestroyContext(cx); return NULL; }
/*
 * Create a slurmd job structure from a launch tasks message.
 *
 * msg - launch request; msg and msg->complete_nodelist must be non-NULL
 *       (checked with xassert).
 *
 * Returns the new job, or NULL when:
 *   - the uid is unknown (errno set to ESLURMD_UID_NOT_FOUND),
 *   - the gid is not valid for that user (ESLURMD_GID_NOT_FOUND),
 *   - a memory limit is set and the requested accounting frequency is
 *     higher than the configured one (ESLURMD_INVALID_ACCT_FREQ),
 *   - this node is not part of msg->complete_nodelist.
 *
 * Strings and the env/argv arrays are duplicated into the job; several
 * fields (switch_job, options, ...) are taken from msg by plain assignment.
 */
slurmd_job_t *
job_create(launch_tasks_request_msg_t *msg)
{
	struct passwd *pwd = NULL;
	slurmd_job_t  *job = NULL;
	srun_info_t   *srun = NULL;
	/* only referenced by the disabled address setup further down */
	slurm_addr_t   resp_addr;
	slurm_addr_t   io_addr;
	int            nodeid = NO_VAL;

	xassert(msg != NULL);
	xassert(msg->complete_nodelist != NULL);
	debug3("entering job_create");

	if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
		error("uid %ld not found on system", (long) msg->uid);
		slurm_seterrno (ESLURMD_UID_NOT_FOUND);
		return NULL;
	}
	if (!_valid_gid(pwd, &(msg->gid))) {
		slurm_seterrno (ESLURMD_GID_NOT_FOUND);
		_pwd_destroy(pwd);
		return NULL;
	}

	if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL)
	    && (msg->acctg_freq > conf->job_acct_gather_freq)) {
		error("Can't set frequency to %u, it is higher than %u.  "
		      "We need it to be at least at this level to "
		      "monitor memory usage.",
		      msg->acctg_freq, conf->job_acct_gather_freq);
		slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ);
		_pwd_destroy(pwd);
		return NULL;
	}

	job = xmalloc(sizeof(slurmd_job_t));
#ifndef HAVE_FRONT_END
	nodeid = nodelist_find(msg->complete_nodelist, conf->node_name);
	job->node_name = xstrdup(conf->node_name);
#else
	nodeid = 0;
	job->node_name = xstrdup(msg->complete_nodelist);
#endif
	if(nodeid < 0) {
		error("couldn't find node %s in %s",
		      job->node_name, msg->complete_nodelist);
		/*
		 * pwd has not been attached to the job yet (job->pwd is
		 * assigned below), so job_destroy() cannot release it;
		 * free it here like the other early error paths do.
		 */
		_pwd_destroy(pwd);
		job_destroy(job);
		return NULL;
	}

	job->state	= SLURMSTEPD_STEP_STARTING;
	job->pwd	= pwd;
	job->node_tasks	= msg->tasks_to_launch[nodeid];
	job->ntasks	= msg->ntasks;
	job->jobid	= msg->job_id;
	job->stepid	= msg->job_step_id;

	job->uid	= (uid_t) msg->uid;
	job->gid	= (gid_t) msg->gid;
	job->cwd	= xstrdup(msg->cwd);
	job->task_dist	= msg->task_dist;

	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->mem_bind_type = msg->mem_bind_type;
	job->mem_bind = xstrdup(msg->mem_bind);
	job->cpu_freq = msg->cpu_freq;
	job->ckpt_dir = xstrdup(msg->ckpt_dir);
	job->restart_dir = xstrdup(msg->restart_dir);
	job->cpus_per_task = msg->cpus_per_task;

	job->env     = _array_copy(msg->envc, msg->env);
	job->eio     = eio_handle_create();
	job->sruns   = list_create((ListDelF) _srun_info_destructor);
	job->clients = list_create(NULL); /* FIXME! Needs destructor */
	job->stdout_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->stderr_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
	job->incoming_count = 0;
	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
	job->outgoing_count = 0;
	job->outgoing_cache = list_create(NULL); /* FIXME! Needs destructor */

	job->envtp   = xmalloc(sizeof(env_t));
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;

	job->envtp->distribution = 0;
	job->envtp->cpu_bind_type = 0;
	job->envtp->cpu_bind = NULL;
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->ckpt_dir = NULL;
	//job->envtp->comm_port = msg->resp_port[nodeid % msg->num_resp_port];

	/*memcpy(&resp_addr, &msg->orig_addr, sizeof(slurm_addr_t));
	slurm_set_addr(&resp_addr,
		       msg->resp_port[nodeid % msg->num_resp_port],
		       NULL);
	job->user_managed_io = msg->user_managed_io;
	if (!msg->user_managed_io) {
		memcpy(&io_addr,   &msg->orig_addr, sizeof(slurm_addr_t));
		slurm_set_addr(&io_addr,
			       msg->io_port[nodeid % msg->num_io_port],
			       NULL);
	}*/

	/* The srun record is created without credential or addresses. */
	//srun = srun_info_create(msg->cred, &resp_addr, &io_addr);
	srun = srun_info_create(NULL, NULL, NULL);

	job->buffered_stdio = msg->buffered_stdio;
	job->labelio = msg->labelio;

	job->task_prolog = xstrdup(msg->task_prolog);
	job->task_epilog = xstrdup(msg->task_epilog);

	job->argc    = msg->argc;
	job->argv    = _array_copy(job->argc, msg->argv);

	job->nnodes  = msg->nnodes;
	job->nodeid  = nodeid;
	job->debug   = msg->slurmd_debug;
	job->cpus    = msg->cpus_allocated[nodeid];
	if (msg->acctg_freq != (uint16_t) NO_VAL)
		jobacct_gather_change_poll(msg->acctg_freq);
	job->multi_prog  = msg->multi_prog;
	job->timelimit   = (time_t) -1;
	job->task_flags  = msg->task_flags;
	job->switch_job  = msg->switch_job;
	job->pty         = msg->pty;
	job->open_mode   = msg->open_mode;
	job->options     = msg->options;
	format_core_allocs(msg->cred, conf->node_name,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);
	/* A step memory limit, when present, wins over the job limit. */
	if (job->step_mem) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->step_mem);
	} else if (job->job_mem) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->job_mem);
	}

#ifdef HAVE_CRAY
	/* This is only used for Cray emulation mode where slurmd is used to
	 * launch job steps. On a real Cray system, ALPS is used to launch
	 * the tasks instead of SLURM. SLURM's task launch RPC does NOT
	 * contain the reservation ID, so just use some non-zero value here
	 * for testing purposes. */
	job->resv_id = 1;
	select_g_select_jobinfo_set(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
				    &job->resv_id);
#endif

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	list_append(job->sruns, (void *) srun);

	_job_init_task_info(job, msg->global_task_ids[nodeid],
			    msg->ifname, msg->ofname, msg->efname);

	return job;
}