/*
 * fork1_bind_child_identity
 *
 * Description:	Identity bookkeeping that every creation kind in fork1()
 *		performs on a freshly created child, in this order: MAC
 *		label association (CONFIG_MACF only), security token
 *		refresh, and audit record/session updates.
 *
 * Parameters:	child_proc		the newly created child process
 *
 * Notes:	The MAC association hands policies the credential/label
 *		that was referenced from the parent.  As the original
 *		author (JMM) observed, this is not really safe: the
 *		association can be dropped elsewhere without the policy
 *		being told.
 *
 *		Setting the security token propagates the change of PID
 *		and may yield a new credential when auditing.  The original
 *		commentary states that this has no effect for vfork because
 *		child_proc->task != current_task(), yet it is done anyway:
 *		the "child" runs in the parent task with the wrong token
 *		until execve() or _exit(), and a lot of "undefined" can
 *		happen before that.
 *
 *		<rdar://6640530> disallow everything but execve()/_exit()?
 */
static void
fork1_bind_child_identity(proc_t child_proc)
{
#if CONFIG_MACF
	mac_cred_label_associate_fork(child_proc->p_ucred, child_proc);
#endif

	set_security_token(child_proc);

	AUDIT_ARG(pid, child_proc->p_pid);
	AUDIT_SESSION_PROCNEW(child_proc->p_ucred);
}

/*
 * fork1
 *
 * Description:	Common code used by all new process creation other than the
 *		bootstrap of the initial process on the system.
 *
 * Parameters:	parent_proc		parent process of the process being
 *					created
 *		child_threadp		pointer to location to receive the
 *					Mach thread_t of the child process
 *		kind			kind of creation being requested
 *
 * Returns:	0			success; *child_threadp has been
 *					written (NULL for PROC_CREATE_VFORK,
 *					where no new thread is created)
 *		EAGAIN			process table full, per-uid process
 *					limit exceeded, or cloneproc() failed
 *		EINVAL			vfork() requested while already
 *					inside a vfork()
 *		ENOMEM			forkproc() failed
 *		(other)			whatever mac_proc_check_fork()
 *					returned, when CONFIG_MACF
 *
 * Notes:	Permissible values for 'kind':
 *
 *		PROC_CREATE_FORK	Create a complete process which will
 *					return actively running in both the
 *					parent and the child; the child copies
 *					the parent address space.
 *		PROC_CREATE_SPAWN	Create a complete process which will
 *					return actively running in the parent
 *					only after returning actively running
 *					in the child; the child address space
 *					is newly created by an image activator,
 *					after which the child is run.
 *		PROC_CREATE_VFORK	Creates a partial process which will
 *					borrow the parent task, thread, and
 *					uthread to return running in the child;
 *					the child address space and other parts
 *					are lazily created at execve() time, or
 *					the child is terminated, and the parent
 *					does not actively run until that
 *					happens.
 *
 *		Any other value panics.
 *
 *		Returning the child thread rather than the process structure
 *		may look odd, since the process is the only part guaranteed
 *		to be "new"; however, other references between Mach and BSD
 *		are not actually adjusted (see the block diagram above the
 *		implementation of vfork()), so the thread is the only handle
 *		that guarantees a way back to the rest of the information.
 */
int
fork1(proc_t parent_proc, thread_t *child_threadp, int kind)
{
	thread_t parent_thread = (thread_t)current_thread();
	uthread_t parent_ut = (uthread_t)get_bsdthread_info(parent_thread);
	proc_t child_proc = NULL;	/* set in switch, but compiler... */
	thread_t child_thread = NULL;
	uid_t uid;
	int uid_procs;
	int table_is_full;
	int err = 0;
	/* Only a true fork copies the parent's memory and thread state. */
	int is_fork = (kind == PROC_CREATE_FORK);

	/*
	 * Process entries are allocated dynamically, but a global ceiling
	 * still applies: nprocs is the current number of processes and
	 * maxproc is the limit.  The last slot is reserved for root, and
	 * not even root may go past the limit.
	 */
	uid = kauth_cred_get()->cr_ruid;

	proc_list_lock();
	table_is_full = (nprocs >= maxproc) ||
	    (uid != 0 && nprocs >= maxproc - 1);
	proc_list_unlock();

	if (table_is_full) {
		tablefull("proc");
		return (EAGAIN);
	}

	/*
	 * Charge the new process to this uid.  A nonprivileged user may not
	 * exceed their current limit, which is always less than what an
	 * rlim_t can hold.
	 * (locking protection is provided by list lock held in chgproccnt)
	 */
	uid_procs = chgproccnt(uid, 1);
	if (uid != 0 &&
	    (rlim_t)uid_procs > parent_proc->p_rlimit[RLIMIT_NPROC].rlim_cur) {
		err = EAGAIN;
		goto bad;
	}

#if CONFIG_MACF
	/*
	 * Ask the MAC policies applied to the process whether it may fork.
	 * This is an advisory-only check.
	 */
	err = mac_proc_check_fork(parent_proc);
	if (err != 0) {
		goto bad;
	}
#endif

	switch (kind) {
	case PROC_CREATE_VFORK:
		/*
		 * Refuse a vfork() from inside a vfork().  A fork should
		 * probably be refused here as well, in which case this test
		 * belongs outside the switch, because the proc struct
		 * contents would be copied from the child and the
		 * task/thread/uthread from the parent.  Nested vfork() is
		 * unsupported because it does not have to be supported; the
		 * same holds for fork(), posix_spawn() and every call other
		 * than execve() and _exit(), but history has been lenient
		 * about those, and (for now) that leniency continues.
		 *
		 * <rdar://6640521> Probably a source of random panics
		 */
		if (parent_ut->uu_flag & UT_VFORK) {
			printf("fork1 called within vfork by %s\n", parent_proc->p_comm);
			err = EINVAL;
			goto bad;
		}

		/*
		 * Mark the vfork as in progress.  Supporting nested vfork()
		 * would mean chaining our parent here (in effect a stack
		 * push); that is not done, because everything not specified
		 * in the standard is meant to be disallowed.
		 */
		proc_vfork_begin(parent_proc);

		/* The newly created process comes with signal lock held */
		child_proc = forkproc(parent_proc);
		if (child_proc == NULL) {
			/* Failed to allocate new process */
			proc_vfork_end(parent_proc);
			err = ENOMEM;
			goto bad;
		}

		fork1_bind_child_identity(child_proc);

		/*
		 * BORROW PARENT TASK, THREAD, UTHREAD FOR CHILD
		 *
		 * Note: nested vfork() support would "push" state here
		 * instead of setting it (see proc_vfork_end() for a
		 * description of the issues).
		 */
		child_proc->task = parent_proc->task;

		child_proc->p_lflag |= P_LINVFORK;
		child_proc->p_vforkact = parent_thread;
		child_proc->p_stat = SRUN;

		parent_ut->uu_flag |= UT_VFORK;
		parent_ut->uu_proc = child_proc;
		parent_ut->uu_userstate = (void *)act_thread_csave();
		parent_ut->uu_vforkmask = parent_ut->uu_sigmask;

		/* temporarily drop thread-set-id state */
		if (parent_ut->uu_flag & UT_SETUID) {
			parent_ut->uu_flag |= UT_WASSETUID;
			parent_ut->uu_flag &= ~UT_SETUID;
		}

		/* blow thread state information */
		/* XXX is this actually necessary, given syscall return? */
		thread_set_child(parent_thread, child_proc->p_pid);

		child_proc->p_acflag = AFORK;	/* forked but not exec'ed */

		/*
		 * Preserve the synchronization semantics of vfork: the child
		 * is flagged P_LPPWAIT so the parent waits for it to exec or
		 * exit, sleeping on our proc (in case of exit).
		 */
		child_proc->p_lflag |= P_LPPWAIT;
		pinsertchild(parent_proc, child_proc);	/* set visible */

		break;

	case PROC_CREATE_SPAWN:
		/*
		 * A spawned process, unlike a forked one, does not carry the
		 * parent's baggage with regard to address space copying,
		 * dtrace, and so on; is_fork is false for it.
		 */
	case PROC_CREATE_FORK:
		/*
		 * Cloning for a fork inherits the parent's task attributes
		 * and memory, since the child is in effect a duplicate with
		 * only minor differences.  Spawned processes do not inherit.
		 */
		child_thread = cloneproc(parent_proc->task, parent_proc,
		    is_fork ? TRUE : FALSE);
		if (child_thread == NULL) {
			/* Failed to create thread */
			err = EAGAIN;
			goto bad;
		}

		/* copy current thread state into the child thread (only for fork) */
		if (is_fork) {
			thread_dup(child_thread);
		}

		/* child_proc = child_thread->task->proc; */
		child_proc = (proc_t)(get_bsdtask_info(get_threadtask(child_thread)));

		fork1_bind_child_identity(child_proc);

		/*
		 * Blow thread state information; this is what gives the
		 * child process its "return" value from a fork() call.
		 *
		 * Note: this should probably move to fork() proper, since it
		 * is not relevant to spawn, and the value won't matter until
		 * the child is resumed there.  Anyone refactoring this code
		 * should consider doing that at the same time.
		 */
		thread_set_child(child_thread, child_proc->p_pid);

		child_proc->p_acflag = AFORK;	/* forked but not exec'ed */

// <rdar://6598155> dtrace code cleanup needed
#if CONFIG_DTRACE
		/*
		 * DTrace state is carried over only for new processes that
		 * copy the task and thread state and address space of their
		 * parent process.
		 */
		if (is_fork) {
			// <rdar://6598155> call dtrace specific function here instead of all this...
			/*
			 * APPLE NOTE: Solaris does a sprlock() and drops the
			 * proc_lock here.  We're cheating a bit and only
			 * taking the p_dtrace_sprlock lock.  A full sprlock
			 * would task_suspend the parent.
			 */
			lck_mtx_lock(&parent_proc->p_dtrace_sprlock);

			/*
			 * Remove all DTrace tracepoints from the child
			 * process.  This has to happen _before_ duplicating
			 * USDT providers since any associated probes may be
			 * immediately enabled.
			 */
			if (parent_proc->p_dtrace_count > 0) {
				dtrace_fasttrap_fork(parent_proc, child_proc);
			}

			lck_mtx_unlock(&parent_proc->p_dtrace_sprlock);

			/*
			 * Duplicate any lazy dof(s).  This must be done while
			 * NOT holding the parent sprlock!  Lock ordering is
			 * dtrace_dof_mode_lock, then sprlock.  It is
			 * imperative to always call
			 * dtrace_lazy_dofs_duplicate rather than null check
			 * and call if !NULL: with a NULL test, lazy dof
			 * faulting can race with us, letting us proceed past
			 * the helpers copy, after which the faulting code
			 * fails to copy the helpers to the child process.
			 */
			dtrace_lazy_dofs_duplicate(parent_proc, child_proc);

			/*
			 * Duplicate any helper actions and providers.  On
			 * Solaris an SFORKING flag informs the code enabling
			 * USDT probes that sprlock() may fail because the
			 * child is being forked.
			 */
			/*
			 * APPLE NOTE: As best I can tell, Apple's sprlock()
			 * equivalent never fails to find the child.  We do
			 * not set SFORKING.
			 */
			if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
				(*dtrace_helpers_fork)(parent_proc, child_proc);
			}
		}
#endif	/* CONFIG_DTRACE */

		break;

	default:
		panic("fork1 called with unknown kind %d", kind);
		break;
	}

	/* return the thread pointer to the caller */
	*child_threadp = child_thread;

bad:
	/*
	 * Success falls through to here with err == 0.  On failure the
	 * per-uid process count charged above is given back.  In the error
	 * case a 0 value is returned for the pid (ignored in the trampoline
	 * because of the error return); this is probably not necessary.
	 */
	if (err != 0) {
		(void)chgproccnt(uid, -1);
	}

	return (err);
}
/*
 * do_fork - build a child task, with a single thread, as a duplicate of
 * the forking task and thread described by 'info'.
 *
 * Steps, in order: account one more thread on the parent CPU's cluster,
 * create the child task, allocate the page that will hold the child
 * thread, duplicate the parent task, reset the child's signal manager,
 * duplicate the parent VMM, duplicate the calling thread into the new
 * page, reserve a thread-order slot in the child's bitmap, link the
 * thread into the task, register it with the scheduler and finalize its
 * CPU context.
 *
 * Parameters:
 *   info  in:  cpu, cid_exec, this_task, this_thread, current_clstr,
 *              flags, isPinned
 *         out: child_thread and child_task (written on success only)
 *
 * Returns:
 *   0       on success
 *   ENOMEM  the thread page could not be allocated
 *   EAGAIN  the child task has no free thread-order slot
 *   other   the error reported by task_create(), task_dup(), vmm_dup()
 *           or thread_dup()
 *
 * On any failure the thread accounting is rolled back, the thread page
 * (if allocated) is freed and the child task (if created) is destroyed.
 */
error_t do_fork(fork_info_t *info)
{
  kmem_req_t req;
  struct dqdt_attr_s attr;
  struct thread_s *child_thread;
  struct task_s *child_task;
  struct page_s *page;
  uint_t cid;
  error_t err;
  sint_t order;

  fork_dmsg(1, "%s: cpu %d, started [%d]\n",
	    __FUNCTION__,
	    cpu_get_id(),
	    cpu_time_stamp());

  child_thread = NULL;
  child_task   = NULL;
  page         = NULL;

  cid           = info->cpu->cluster->id;
  attr.cid      = cid;
  attr.cpu_id   = 0;
  attr.cid_exec = info->cid_exec;

  /* Account for the thread about to be created; undone on failure. */
  dqdt_update_threads_number(cid, attr.cpu_id, 1);

  err = task_create(&child_task, &attr, CPU_USR_MODE);

  /*
   * task_create() receives a writable pointer to attr, so re-establish
   * the cluster id that the failure path passes to
   * dqdt_update_threads_number().
   * NOTE(review): whether task_create() really modifies attr is not
   * visible from here - confirm.
   */
  attr.cid = cid;

  if(err)
    goto fail_task;

  fork_dmsg(1, "%s: cpu %d, ppid %d, task @0x%x, pid %d, task @0x%x [%d]\n",
	    __FUNCTION__,
	    cpu_get_id(),
	    info->this_task->pid,
	    info->this_task,
	    child_task->pid,
	    child_task,
	    cpu_time_stamp());

  req.type  = KMEM_PAGE;
  req.size  = ARCH_THREAD_PAGE_ORDER;
  req.flags = AF_KERNEL | AF_REMOTE;
  req.ptr   = info->current_clstr;
  page      = kmem_alloc(&req);

  if(page == NULL)
  {
    /* err still holds 0 from task_create(); report the real failure. */
    err = ENOMEM;
    goto fail_mem;
  }

  fork_dmsg(1, "%s: child pid will be %d on cluster %d, cpu %d [%d]\n",
	    __FUNCTION__,
	    child_task->pid,
	    child_task->cpu->cluster->id,
	    child_task->cpu->gid,
	    cpu_time_stamp());

  err = task_dup(child_task, info->this_task);

  if(err)
    goto fail_task_dup;

  /* Give the child a fresh signal manager rather than the duplicated one. */
  signal_manager_destroy(child_task);
  signal_manager_init(child_task);

  fork_dmsg(1, "%s: parent task has been duplicated [%d]\n",
	    __FUNCTION__,
	    cpu_time_stamp());

  child_task->current_clstr = info->current_clstr;

  err = vmm_dup(&child_task->vmm, &info->this_task->vmm);

  if(err)
    goto fail_vmm_dup;

  fork_dmsg(1, "%s: parent vmm has been duplicated [%d]\n",
	    __FUNCTION__,
	    cpu_time_stamp());

  child_thread = (struct thread_s*) ppm_page2addr(page);

  /* Set the child page before calling thread_dup */
  child_thread->info.page = page;

  err = thread_dup(child_task,
		   child_thread,
		   info->cpu,
		   info->cpu->cluster,
		   info->this_thread);

  if(err)
    goto fail_thread_dup;

  /* Adjust child_thread attributes */
  if(info->flags & PT_FORK_USE_AFFINITY)
  {
    child_thread->info.attr.flags |= (info->flags & ~(PT_ATTR_LEGACY_MASK));

    if(!(info->flags & PT_ATTR_MEM_PRIO))
      child_thread->info.attr.flags &= ~(PT_ATTR_MEM_PRIO);

    if(!(info->flags & PT_ATTR_AUTO_MGRT))
      child_thread->info.attr.flags &= ~(PT_ATTR_AUTO_MGRT);

    if(!(info->flags & PT_ATTR_AUTO_NXTT))
      child_thread->info.attr.flags &= ~(PT_ATTR_AUTO_NXTT);
  }

  fork_dmsg(1, "%s: parent current thread has been duplicated, tid %x [%d]\n",
	    __FUNCTION__,
	    child_thread,
	    cpu_time_stamp());

  if(info->isPinned)
    thread_migration_disabled(child_thread);
  else
    thread_migration_enabled(child_thread);

  /*
   * Reserve the thread-order slot before the thread is linked into the
   * task: the failure path frees the thread's page, so the thread must
   * not yet be on child_task->th_root when this check fails.
   */
  order = bitmap_ffs2(child_task->bitmap, 0, sizeof(child_task->bitmap));

  if(order == -1)
  {
    /* err still holds 0 from thread_dup(); report the real failure. */
    err = EAGAIN;
    goto fail_order;
  }

  list_add_last(&child_task->th_root, &child_thread->rope);
  child_task->threads_count = 1;
  child_task->threads_nr ++;
  child_task->state = TASK_READY;

  bitmap_clear(child_task->bitmap, order);
  child_thread->info.attr.key = order;
  child_thread->info.order    = order;
  child_task->next_order      = order + 1;
  child_task->max_order       = order;
  child_task->uid             = info->this_task->uid;
  child_task->parent          = info->this_task->pid;

  err = sched_register(child_thread);
  assert(err == 0);

  cpu_context_set_tid(&child_thread->info.pss, (reg_t)child_thread);
  cpu_context_set_pmm(&child_thread->info.pss, &child_task->vmm.pmm);
  cpu_context_dup_finlize(&child_thread->pws, &child_thread->info.pss);

  /* Initial return value and errno recorded for the child thread. */
  child_thread->info.retval = 0;
  child_thread->info.errno  = 0;

  info->child_thread = child_thread;
  info->child_task   = child_task;
  return 0;

fail_order:
fail_thread_dup:
fail_vmm_dup:
fail_task_dup:
  printk(WARNING, "WARNING: %s: destroy child thread\n", __FUNCTION__);
  req.ptr = page;
  kmem_free(&req);

fail_mem:
fail_task:
  /* FIXME: carried over from the original code, which gave no detail. */
  dqdt_update_threads_number(attr.cid, attr.cpu_id, -1);

  printk(WARNING, "WARNING: %s: destroy child task\n", __FUNCTION__);

  if(child_task != NULL)
    task_destroy(child_task);

  printk(WARNING, "WARNING: %s: fork err %d [%d]\n",
	 __FUNCTION__,
	 err,
	 cpu_time_stamp());

  return err;
}