コード例 #1
0
ファイル: slurmstepd.c プロジェクト: lipari/slurm
/*
 * Release the resources held for a finished job step.
 *
 * job - step state; may be NULL. Its accounting data is destroyed, and the
 *       job struct itself is destroyed unless job->batch is set.
 * msg - launch request the step was created from; its payload and the
 *       message itself are freed here.
 * rc  - not used by this function.
 */
static void
_step_cleanup(slurmd_job_t *job, slurm_msg_t *msg, int rc)
{
	if (job != NULL) {
		jobacct_gather_g_destroy(job->jobacct);
		if (!job->batch)
			job_destroy(job);
	}

	/*
	 * The message is only released now, after the job struct, because
	 * the job struct keeps pointers into the message (for example the
	 * switch jobinfo pointer).
	 */
	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
		slurm_free_job_launch_msg(msg->data);
	else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
		slurm_free_launch_tasks_request_msg(msg->data);
	else
		fatal("handle_launch_message: Unrecognized launch RPC");

	jobacct_gather_g_destroy(step_complete.jobacct);
	xfree(msg);
}
コード例 #2
0
ファイル: vm.c プロジェクト: alepharchives/emonk
/*
 * JS native: convert argv[0] to an Erlang term, send it to the pid of the
 * vm's current job, then take the next job from the vm's queue and hand its
 * arguments back to the script through *rval.
 *
 * Returns JS_TRUE on success. Returns JS_FALSE when no argument was
 * supplied, when the send fails, or when the job received in reply is a
 * close request.
 */
static JSBool
jserl_send(JSContext* cx, JSObject* obj, uintN argc, jsval* argv, jsval* rval)
{
    vm_ptr vm = (vm_ptr) JS_GetContextPrivate(cx);
    ErlNifEnv* env;
    job_ptr job;
    ENTERM mesg;
    int sent;
    
    // argc is unsigned, so the former "argc < 0" test could never fire.
    // argv[0] is read below, so require at least one argument.
    if(argc < 1)
    {
        return JS_FALSE;
    }
    
    assert(vm != NULL && "Context has no vm.");
    
    env = enif_alloc_env();
    mesg = vm_mk_message(env, to_erl(env, cx, argv[0]));

    sent = enif_send(NULL, &(vm->curr_job->pid), env, mesg);

    // The message environment is ours to release whether or not the send
    // succeeded; without this every call leaked one environment.
    enif_free_env(env);

    // If pid is not alive, raise an error.
    // XXX: Can I make this uncatchable?
    if(!sent)
    {
        JS_ReportError(cx, "Context closing.");
        return JS_FALSE;
    }

    job = queue_receive(vm->jobs);
    if(job->type == job_close)
    {
        // XXX: Can I make this uncatchable?
        job_destroy(job);
        JS_ReportError(cx, "Context closing.");
        return JS_FALSE;
    }
    
    assert(job->type == job_response && "Invalid message response.");
    
    // Convert before destroying the job: the term lives in job->env.
    *rval = to_js(job->env, cx, job->args);
    job_destroy(job);

    return JS_TRUE;
}
コード例 #3
0
ファイル: vm.c プロジェクト: alepharchives/emonk
/*
 * Queue a job_response job carrying a copy of `data` on the vm's job queue.
 *
 * Returns 1 when the job was queued, 0 when the job could not be created or
 * queue_send() refused it (the job is destroyed in the latter case).
 */
int
vm_send(vm_ptr vm, ENTERM data)
{
    job_ptr reply = job_create();

    if(reply == NULL)
    {
        return 0;
    }

    reply->type = job_response;
    // Copy the term into the job's own environment.
    reply->args = enif_make_copy(reply->env, data);

    if(queue_send(vm->jobs, reply))
    {
        return 1;
    }

    job_destroy(reply);
    return 0;
}
コード例 #4
0
ファイル: job.c プロジェクト: spronk/suq
/*
 * Unlink job `j` from the doubly linked list `jl`, cancel it if it is
 * still running, then destroy and free it.
 */
void joblist_remove(joblist *jl, job *j)
{
    if (debug > 1)
    {
        printf("SERVER: removing job id %d\n", j->id);
    }

    /* splice the neighbours together, bypassing j */
    j->prev->next = j->next;
    j->next->prev = j->prev;
    --jl->N;

    /* a running job has to be cancelled before it is torn down */
    if (j->state == running)
    {
        job_cancel(j);
    }

    job_destroy(j);
    free(j);
}
コード例 #5
0
ファイル: vm.c プロジェクト: alepharchives/emonk
/*
 * Queue a job_call job on the vm's job queue.
 *
 * ref, name and args are copied into the new job's environment; pid is
 * stored by value.
 *
 * Returns 1 when the job was queued, 0 when the job could not be created or
 * queue_push() refused it (the job is destroyed in the latter case).
 */
int
vm_add_call(vm_ptr vm, ENTERM ref, ENPID pid, ENTERM name, ENTERM args)
{
    job_ptr call = job_create();

    if(call == NULL)
    {
        return 0;
    }

    call->type = job_call;
    call->ref = enif_make_copy(call->env, ref);
    call->pid = pid;
    call->name = enif_make_copy(call->env, name);
    call->args = enif_make_copy(call->env, args);

    if(queue_push(vm->jobs, call))
    {
        return 1;
    }

    job_destroy(call);
    return 0;
}
コード例 #6
0
ファイル: vm.c プロジェクト: alepharchives/emonk
/*
 * Queue a job_eval job on the vm's job queue.
 *
 * ref is copied into the new job's environment, pid is stored by value and
 * the script bytes in `bin` are copied into a binary allocated for the job.
 *
 * Returns 1 when the job was queued, 0 on any failure (job creation, binary
 * allocation or queue_push()); a job that was created is destroyed on
 * failure.
 */
int
vm_add_eval(vm_ptr vm, ENTERM ref, ENPID pid, ENBINARY bin)
{
    job_ptr job = job_create();
    // job_create() can fail; bail out before dereferencing the result.
    if(job == NULL) goto error;

    job->type = job_eval;
    job->ref = enif_make_copy(job->env, ref);
    job->pid = pid;
    
    if(!enif_alloc_binary(bin.size, &(job->script))) goto error;
    memcpy(job->script.data, bin.data, bin.size);

    if(!queue_push(vm->jobs, job)) goto error;

    return 1;

error:
    if(job != NULL) job_destroy(job);
    return 0;
}
コード例 #7
0
/*
 * Thread function that runs one scheduler job.
 *
 * data is the GIOSchedulerJob to run; user_data is not used. The job
 * function is called repeatedly for as long as it returns TRUE, with the
 * job's cancellable (if any) pushed as the current one for the duration.
 * The job is destroyed afterwards.
 */
static void
io_job_thread (gpointer data,
	       gpointer user_data)
{
  GIOSchedulerJob *job = data;

  if (job->cancellable != NULL)
    g_cancellable_push_current (job->cancellable);

  /* Always runs at least once; repeats while the job asks to continue. */
  while (job->job_func (job, job->cancellable, job->data))
    ;

  if (job->cancellable != NULL)
    g_cancellable_pop_current (job->cancellable);

  job_destroy (job);
}
コード例 #8
0
ファイル: job.c プロジェクト: andreiw/polaris
/*
 *  job_retrieve() will retrieve the disk copy of a job associated with the
 *	transfer file name passed in.  It returns a pointer to a job structure
 *	or a NULL if the job was not on disk.
 *
 *	xFile is the transfer (binding) file name; the job id is taken from
 *	the first three characters following the transfer prefix, and the
 *	control file name is derived from the same suffix.  spool is copied
 *	into the returned job's job_spool_dir.
 *
 *	When the transfer file cannot be opened, cannot be read after
 *	MAX_RETRIES attempts, or the control file cannot be parsed, both the
 *	transfer file and the control file are unlinked and NULL is returned.
 *	When only the read lock cannot be obtained, NULL is returned and the
 *	files are left in place so the caller can retry later.
 */
job_t *
job_retrieve(char *xFile, char *spool)
{
	int	retry_cnt = 0;
	char	*s;
	jobfile_t *file;
	char 	cFile[BUFSIZ];
	char	buf[BUFSIZ];
	int	fd;
	ssize_t	nread;
	flock_t flk;
	job_t	*tmp;

	syslog(LOG_DEBUG, "job_retrieve(%s)", xFile);
	if ((tmp = calloc(1, sizeof (*tmp))) == NULL) {
		return (NULL);
	}

	if ((file = calloc(1, sizeof (*file))) == NULL) {
		free(tmp);
		return (NULL);
	}

	/* read lock from the current offset (0 after open) to end of file */
	flk.l_type = F_RDLCK;
	flk.l_whence = SEEK_CUR;
	flk.l_start = 0;
	flk.l_len = 0;

	(void) memset(buf, 0, sizeof (buf));
	/* get job id, from binding file name */
	(void) strlcpy(buf, xFile + strlen(_xfer_file_prefix) + 1,
	    sizeof (buf));

	/* the job id is the first three characters of the suffix */
	buf[3] = '\0';
	tmp->job_id = atoi(buf);

	/* Construct data file and control file names */
	(void) strlcpy(cFile, _control_file_prefix, sizeof (cFile));
	(void) strlcat(cFile, xFile + strlen(_xfer_file_prefix),
	    sizeof (cFile));

	/* remove data file and control file whenever xFile is removed */
	if ((fd = open(xFile, O_RDONLY)) < 0) {
		syslog(LOG_DEBUG, "job_retrieve(%s) open failed errno=%d",
		    xFile, errno);
		if (get_job_from_cfile(file, cFile, xFile, tmp))
			job_destroy(tmp);
		free(file);
		free(tmp);
		(void) unlink(xFile);
		(void) unlink(cFile);
		return (NULL);
	}

	/*
	 * If failed to get a lock on the file, just return NULL. It will
	 * be retried later.
	 */
	if ((fcntl(fd, F_SETLK, &flk)) < 0) {
		syslog(LOG_DEBUG, "job_retrieve(%s) lock failed errno=%d",
		    xFile, errno);
		(void) close(fd);
		free(file);
		free(tmp);
		return (NULL);
	}

	/*
	 * Retry a few times if we failed to read or read returns 0, just
	 * to make sure we tried hard before giving up. In practice,
	 * there were cases of read() returning 0. To handle that
	 * scenario just try a few times.
	 */
	for (retry_cnt = 0; retry_cnt < MAX_RETRIES; retry_cnt++) {
		/* leave room for the terminator strtok() relies on */
		nread = read(fd, buf, sizeof (buf) - 1);
		if (nread > 0) {
			/*
			 * Terminate at the number of bytes actually read so
			 * that leftovers of the file name copied into buf
			 * above are never parsed as part of the contents.
			 */
			buf[nread] = '\0';
			if ((s = strtok(buf, ":\n")) != NULL)
				tmp->job_server = strdup(s);
			if ((s = strtok(NULL, ":\n")) != NULL)
				tmp->job_printer = strdup(s);
			syslog(LOG_DEBUG, "job_retrieve(%s) success - %s:%s",
			    xFile,
			    tmp->job_server ? tmp->job_server : "(null)",
			    tmp->job_printer ? tmp->job_printer : "(null)");
			break;
		}
	}

	/* done with the descriptor whether or not the read succeeded */
	(void) close(fd);

	/*
	 * If failed to read after MAX_RETRIES, return NULL and remove xFile,
	 * and cFile.
	 */
	if (retry_cnt == MAX_RETRIES) {
		syslog(LOG_DEBUG, "job_retrieve(%s) unsuccessful", xFile);
		if (get_job_from_cfile(file, cFile, xFile, tmp))
			job_destroy(tmp);
		free(file);
		free(tmp);
		(void) unlink(xFile);
		(void) unlink(cFile);
		return (NULL);
	}

	file->jf_src_path = strdup(xFile);
	file->jf_spl_path = strdup(cFile);

	if (!get_job_from_cfile(file, cFile, xFile, tmp)) {
		(void) unlink(file->jf_spl_path);  /* control file */
		(void) unlink(file->jf_src_path);  /* binding file */
		free(file->jf_src_path);
		free(file->jf_spl_path);
		free(file);
		/* release the strings duplicated from the binding file */
		free(tmp->job_server);
		free(tmp->job_printer);
		free(tmp);
		return (NULL);
	}

	tmp->job_spool_dir = strdup(spool);
	return (tmp);
}
コード例 #9
0
ファイル: discofs.c プロジェクト: laevar/discofs
/*! Main

Program entry point. In order: installs the handler for SIGUSR1/SIGUSR2,
parses the command line, optionally switches GID and UID, initializes
logging, prepares the data directory, cache directory and database,
determines the remote filesystem features, initializes the lock, sync and
job modules, runs fuse_main() and finally tears everything down again.

@param argc number of arguments
@param argv arguments
@return fuse_main()'s return value, or EXIT_FAILURE if option parsing,
        the mount point checks or the GID/UID switch fail
*/
int main(int argc, char **argv)
{
    /* return value of fuse_main() */
    int ret;

    /* for signal handling */
    struct sigaction sig;

    /* argument handling */
    struct fuse_args args = FUSE_ARGS_INIT(argc, argv);

    /* file name for database */
    char *db_file;


    /*------------------------*
     * install signal handler *
     *------------------------*/

    /* set handling function */
    sig.sa_handler = sig_handler;

    /* set (no) flags */
    sig.sa_flags = 0;

    /* empty mask: no additional signals are blocked while the handler runs */
    sigemptyset(&sig.sa_mask);

    /* install signal handler for USR1 and USR2 */
    sigaction(SIGUSR1, &sig, NULL);
    sigaction(SIGUSR2, &sig, NULL);


    /*------------------*
     * handle arguments *
     *------------------*/

    if (fuse_opt_parse(&args, &discofs_options, discofs_opts, discofs_opt_proc) == -1)
        return EXIT_FAILURE;

    /* after option parsing, remote mount point must be set */
    if (!REMOTE_ROOT)
    {
        fprintf(stderr, "no remote filesystem given\n");
        return EXIT_FAILURE;
    }

    /* a mount point for discofs must also be set */
    if (!discofs_options.discofs_mp)
    {
        fprintf(stderr, "no mount point given\n");
        return EXIT_FAILURE;
    }

    /* add "use_ino" to display inodes in stat(1)*/
    fuse_opt_add_arg(&args, "-ouse_ino");


    /*---------------*
     * set UID / GID *
     *---------------*/

    /* set GID first since permissions might not be
       sufficient if UID was set beforehand */
    if (discofs_options.gid)
    {
        VERBOSE("setting gid to %d\n", discofs_options.gid);
        if (setgid(discofs_options.gid))
        {
            perror("setting gid");
            return EXIT_FAILURE;
        }
    }
    if (discofs_options.uid)
    {
        VERBOSE("setting uid to %d\n", discofs_options.uid);
        if (setuid(discofs_options.uid))
        {
            perror("setting uid");
            return EXIT_FAILURE;
        }
    }


    /*--------------------*
     * initialize logging *
     *--------------------*/

    /* if -d is specified, override logging settings */
    if (discofs_options.debug)
        log_init(LOG_DEBUG, NULL);
    else
        log_init(discofs_options.loglevel, discofs_options.logfile);



    /*=========================*
     * INITIALIZE CACHE AND DB *
     *=========================*/

    /* compute data root if not passed as option */
    if (!discofs_options.data_root)
        discofs_options.data_root = paths_data_root(REMOTE_ROOT);

    /* create the data root (recursively) if it does not exist yet */
    if (!is_dir(discofs_options.data_root))
    {
        if (mkdir_rec(discofs_options.data_root))
            FATAL("failed to create data directory %s\n", discofs_options.data_root);
    }


    /*----------------------*
     * initialize cache dir *
     *----------------------*/

    /* set cache dir */
    CACHE_ROOT = join_path(discofs_options.data_root, "cache");

    /* store length of cache root (to save a few hundred strlen() calls)  */
    CACHE_ROOT_LEN = strlen(CACHE_ROOT);

    /* delete cache if "clear" specified */
    if (discofs_options.clear)
    {
        VERBOSE("deleting cache\n");
        rmdir_rec(CACHE_ROOT);
    }

    /* create cache root if needed (accessible by the owner only) */
    if (!is_dir(CACHE_ROOT))
    {
        if (mkdir(CACHE_ROOT, S_IRWXU) != 0)
            FATAL("failed to create cache directory %s\n", CACHE_ROOT);
    }


    /*---------------------*
     * initialize database *
     *---------------------*/

    /* set db filename */
    db_file = join_path(discofs_options.data_root, "db.sqlite");

    /* create database file if it doesn't exist; the descriptor is only
       needed for the creation and is closed again right away */
    int fd = open(db_file, (O_RDONLY | O_CREAT), (S_IRUSR | S_IWUSR));
    if (fd == -1)
    {
        perror(db_file);
        FATAL("couldn't open or create database file\n");
    }
    close(fd);

    /* initialize tables etc */
    db_init(db_file, discofs_options.clear);

    /* try to load filesystem features from DB
       (a non-zero return value is treated as "not loaded") */
    if (db_cfg_get_int(CFG_FS_FEATURES, &discofs_options.fs_features))
    {

        /* if loading failed, try to determine them */
        if (is_mounted(REMOTE_ROOT) && is_reachable(discofs_options.host))
        {
            if (test_fs_features(&discofs_options.fs_features))
            {
                ERROR("failed to test remote fs features\n");
                discofs_options.fs_features = 0;
            }
            /* test succeeded, store value for next time */
            else
                db_cfg_set_int(CFG_FS_FEATURES, discofs_options.fs_features);
        }
        /* nag and assume that no features available (but don't save that) */
        else
        {
            ERROR("could not determine remote fs features");
            discofs_options.fs_features = 0;
        }
    }


    /*------------------*
     * initialize stuff *
     *------------------*/
    /* INIT(x) calls x_init() and reports through FATAL if it returns non-zero */
    #define INIT(name)                          \
        if (name ## _init())                    \
            FATAL("error initializing " #name)
    INIT(lock);
    INIT(sync);
    INIT(job);
    #undef INIT


    /*----------------------*
     * print options to log *
     *----------------------*/
    log_options(LOG_VERBOSE, discofs_options);


    /*-----------------*
     * run fuse_main() *
     *-----------------*/
    ret = fuse_main(args.argc, args.argv, &discofs_oper, NULL);


    /*------*
     * exit *
     *------*/

    /* tear down the modules initialized via INIT() above */
    lock_destroy();
    sync_destroy();
    job_destroy();

    /* free arguments */
    fuse_opt_free_args(&args);

    /* close database connection */
    db_destroy();


    /* end logging */
    INFO("exiting\n");
    log_destroy();


    /* return fuse_main()s return value */
    return ret;
}
コード例 #10
0
ファイル: vm.c プロジェクト: alepharchives/emonk
/*
 * Thread entry point for a vm.
 *
 * arg is the vm_ptr to run. A JS context and global object are created for
 * the vm, then jobs are popped from vm->jobs and executed (eval or call)
 * until a job_close job is received. The result of every executed job is
 * sent to the pid stored in the job.
 *
 * Always returns NULL.
 */
void*
vm_run(void* arg)
{
    vm_ptr vm = (vm_ptr) arg;
    JSContext* cx;
    JSObject* gl;
    job_ptr job;
    ENTERM resp;
    int flags;
    
    cx = JS_NewContext(vm->runtime, vm->stack_size);
    if(cx == NULL)
    {
        fprintf(stderr, "Failed to create context.\n");
        goto done;
    }

    JS_BeginRequest(cx);

    flags = 0;
    flags |= JSOPTION_VAROBJFIX;
    flags |= JSOPTION_STRICT;
    flags |= JSVERSION_LATEST;
    flags |= JSOPTION_COMPILE_N_GO;
    flags |= JSOPTION_XML;
    JS_SetOptions(cx, JS_GetOptions(cx) | flags);
    
    gl = JS_NewObject(cx, &global_class, NULL, NULL);
    if(gl == NULL)
    {
        fprintf(stderr, "Failed to create global object.\n");
        goto done;
    }
    
    if(!JS_InitStandardClasses(cx, gl))
    {
        fprintf(stderr, "Failed to initialize classes.\n");
        goto done;
    }
    
    if(!install_jserl(cx, gl))
    {
        fprintf(stderr, "Failed to install erlang object.");
        goto done;
    }
    
    JS_SetErrorReporter(cx, vm_report_error);
    // Lets natives running on this context find their vm again.
    JS_SetContextPrivate(cx, (void*) vm);

    JS_EndRequest(cx);

    while(1)
    {
        job = queue_pop(vm->jobs);
        if(job->type == job_close)
        {
            job_destroy(job);
            break;
        }

        JS_BeginRequest(cx);
        assert(vm->curr_job == NULL && "vm already has a job set.");
        vm->curr_job = job;

        if(job->type == job_eval)
        {
            resp = vm_eval(cx, gl, job);
        }
        else if(job->type == job_call)
        {
            resp = vm_call(cx, gl, job);
        }
        else
        {
            assert(0 && "Invalid job type.");
            // When asserts are compiled out there is no response term to
            // send; drop the job instead of sending an uninitialized resp.
            vm->curr_job = NULL;
            JS_EndRequest(cx);
            job_destroy(job);
            continue;
        }

        vm->curr_job = NULL;
        JS_EndRequest(cx);
        JS_MaybeGC(cx);

        // XXX: If pid is not alive, we just ignore it.
        enif_send(NULL, &(job->pid), job->env, resp);

        job_destroy(job);
    }

done:
    // cx is NULL when context creation failed; it must not be handed to
    // JS_BeginRequest() in that case.
    if(cx != NULL)
    {
        JS_BeginRequest(cx);
        JS_DestroyContext(cx);
    }
    return NULL;
}
コード例 #11
0
/*
 * Create a slurmd job structure from a launch tasks message.
 *
 * msg - launch request; must not be NULL and must carry a
 *       complete_nodelist.
 *
 * Returns the newly allocated job, or NULL when the uid or gid cannot be
 * resolved, the requested accounting frequency is higher than the
 * configured one while a memory limit is set, or this node is not part of
 * the message's node list. The slurm errno is set for the first three
 * cases.
 */
slurmd_job_t *
job_create(launch_tasks_request_msg_t *msg)
{
    struct passwd *pwd = NULL;
    slurmd_job_t  *job = NULL;
    srun_info_t   *srun = NULL;
    /* only referenced by the disabled address setup further down */
    slurm_addr_t     resp_addr;
    slurm_addr_t     io_addr;
    int            nodeid = NO_VAL;

    xassert(msg != NULL);
    xassert(msg->complete_nodelist != NULL);
    debug3("entering job_create");
    if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
        error("uid %ld not found on system", (long) msg->uid);
        slurm_seterrno (ESLURMD_UID_NOT_FOUND);
        return NULL;
    }
    if (!_valid_gid(pwd, &(msg->gid))) {
        slurm_seterrno (ESLURMD_GID_NOT_FOUND);
        _pwd_destroy(pwd);
        return NULL;
    }

    if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL)
            && (msg->acctg_freq > conf->job_acct_gather_freq)) {
        error("Can't set frequency to %u, it is higher than %u.  "
              "We need it to be at least at this level to "
              "monitor memory usage.",
              msg->acctg_freq, conf->job_acct_gather_freq);
        slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ);
        _pwd_destroy(pwd);
        return NULL;
    }

    job = xmalloc(sizeof(slurmd_job_t));
#ifndef HAVE_FRONT_END
    nodeid = nodelist_find(msg->complete_nodelist, conf->node_name);
    job->node_name = xstrdup(conf->node_name);
#else
    nodeid = 0;
    job->node_name = xstrdup(msg->complete_nodelist);
#endif
    if(nodeid < 0) {
        error("couldn't find node %s in %s",
              job->node_name, msg->complete_nodelist);
        /* job->pwd has not been assigned yet, so job_destroy() cannot
         * release pwd for us; free it here to avoid leaking it. */
        _pwd_destroy(pwd);
        job_destroy(job);
        return NULL;
    }

    job->state	= SLURMSTEPD_STEP_STARTING;
    job->pwd	= pwd;
    job->node_tasks	= msg->tasks_to_launch[nodeid];
    job->ntasks	= msg->ntasks;
    job->jobid	= msg->job_id;
    job->stepid	= msg->job_step_id;

    job->uid	= (uid_t) msg->uid;
    job->gid	= (gid_t) msg->gid;
    job->cwd	= xstrdup(msg->cwd);
    job->task_dist	= msg->task_dist;

    job->cpu_bind_type = msg->cpu_bind_type;
    job->cpu_bind = xstrdup(msg->cpu_bind);
    job->mem_bind_type = msg->mem_bind_type;
    job->mem_bind = xstrdup(msg->mem_bind);
    job->cpu_freq = msg->cpu_freq;
    job->ckpt_dir = xstrdup(msg->ckpt_dir);
    job->restart_dir = xstrdup(msg->restart_dir);
    job->cpus_per_task = msg->cpus_per_task;

    job->env     = _array_copy(msg->envc, msg->env);
    job->eio     = eio_handle_create();
    job->sruns   = list_create((ListDelF) _srun_info_destructor);
    job->clients = list_create(NULL); /* FIXME! Needs destructor */
    job->stdout_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
    job->stderr_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
    job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
    job->incoming_count = 0;
    job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
    job->outgoing_count = 0;
    job->outgoing_cache = list_create(NULL); /* FIXME! Needs destructor */

    /* environment template: ids start out as "unset" (-1) */
    job->envtp   = xmalloc(sizeof(env_t));
    job->envtp->jobid = -1;
    job->envtp->stepid = -1;
    job->envtp->procid = -1;
    job->envtp->localid = -1;
    job->envtp->nodeid = -1;

    job->envtp->distribution = 0;
    job->envtp->cpu_bind_type = 0;
    job->envtp->cpu_bind = NULL;
    job->envtp->mem_bind_type = 0;
    job->envtp->mem_bind = NULL;
    job->envtp->ckpt_dir = NULL;
    //job->envtp->comm_port = msg->resp_port[nodeid % msg->num_resp_port];

    /*memcpy(&resp_addr, &msg->orig_addr, sizeof(slurm_addr_t));
    slurm_set_addr(&resp_addr,
    	       msg->resp_port[nodeid % msg->num_resp_port],
    	       NULL);
    job->user_managed_io = msg->user_managed_io;
    if (!msg->user_managed_io) {
    	memcpy(&io_addr,   &msg->orig_addr, sizeof(slurm_addr_t));
    	slurm_set_addr(&io_addr,
    		       msg->io_port[nodeid % msg->num_io_port],
    		       NULL);
    }*/
    //srun = srun_info_create(msg->cred, &resp_addr, &io_addr);
    /* the srun info is created without credential or addresses while the
     * address setup above is disabled */
    srun = srun_info_create(NULL, NULL, NULL);
    job->buffered_stdio = msg->buffered_stdio;
    job->labelio = msg->labelio;

    job->task_prolog = xstrdup(msg->task_prolog);
    job->task_epilog = xstrdup(msg->task_epilog);

    job->argc    = msg->argc;
    job->argv    = _array_copy(job->argc, msg->argv);

    job->nnodes  = msg->nnodes;
    job->nodeid  = nodeid;
    job->debug   = msg->slurmd_debug;
    job->cpus    = msg->cpus_allocated[nodeid];
    if (msg->acctg_freq != (uint16_t) NO_VAL)
        jobacct_gather_change_poll(msg->acctg_freq);
    job->multi_prog  = msg->multi_prog;
    job->timelimit   = (time_t) -1;
    job->task_flags  = msg->task_flags;
    job->switch_job  = msg->switch_job;
    job->pty         = msg->pty;
    job->open_mode   = msg->open_mode;
    job->options     = msg->options;
    format_core_allocs(msg->cred, conf->node_name,
                       &job->job_alloc_cores, &job->step_alloc_cores,
                       &job->job_mem, &job->step_mem);
    /* the step memory limit takes precedence over the job memory limit */
    if (job->step_mem) {
        jobacct_gather_set_mem_limit(job->jobid, job->stepid,
                                     job->step_mem);
    } else if (job->job_mem) {
        jobacct_gather_set_mem_limit(job->jobid, job->stepid,
                                     job->job_mem);
    }

#ifdef HAVE_CRAY
    /* This is only used for Cray emulation mode where slurmd is used to
     * launch job steps. On a real Cray system, ALPS is used to launch
     * the tasks instead of SLURM. SLURM's task launch RPC does NOT
     * contain the reservation ID, so just use some non-zero value here
     * for testing purposes. */
    job->resv_id = 1;
    select_g_select_jobinfo_set(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
                                &job->resv_id);
#endif

    get_cred_gres(msg->cred, conf->node_name,
                  &job->job_gres_list, &job->step_gres_list);

    list_append(job->sruns, (void *) srun);
    _job_init_task_info(job, msg->global_task_ids[nodeid],
                        msg->ifname, msg->ofname, msg->efname);

    return job;
}