/* Gets a small ev_q, with ev_mbox pointing to the vcpd mbox of vcoreid.  If
 * ev_flags has EVENT_VCORE_PRIVATE set, it'll give you the private mbox.  O/w,
 * you'll get the public one. */
struct event_queue *get_event_q_vcpd(uint32_t vcoreid, int ev_flags)
{
	struct event_queue *ev_q = get_event_q();

	if (ev_flags & EVENT_VCORE_PRIVATE)
		ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_private;
	else
		ev_q->ev_mbox = &vcpd_of(vcoreid)->ev_mbox_public;
	return ev_q;
}
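/* Hypothetical usage sketch (not from the original source): request a small
 * ev_q that delivers into vcore 0's private VCPD mbox and also IPIs that
 * vcore.  The ev_flags/ev_vcore fields and EVENT_IPI are assumed to follow
 * the usual event_queue conventions. */
static struct event_queue *example_private_ev_q(void)
{
	int flags = EVENT_IPI | EVENT_VCORE_PRIVATE;
	struct event_queue *ev_q = get_event_q_vcpd(0, flags);

	ev_q->ev_flags = flags;	/* how the kernel should deliver */
	ev_q->ev_vcore = 0;	/* which vcore to poke */
	return ev_q;
}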
/* Prep a pthread to run a signal handler.  The original context of the pthread
 * is saved, and a new context with a new stack is set up to run the signal
 * handler the next time the pthread is run. */
static void __pthread_prep_sighandler(struct pthread_tcb *pthread,
                                      void (*entry)(void),
                                      struct siginfo *info)
{
	struct user_context *ctx;

	pthread->sigdata = alloc_sigdata();
	if (info != NULL)
		pthread->sigdata->info = *info;
	init_user_ctx(&pthread->sigdata->u_ctx,
	              (uintptr_t)entry,
	              (uintptr_t)pthread->sigdata->stack);
	if (pthread->uthread.flags & UTHREAD_SAVED) {
		ctx = &pthread->uthread.u_ctx;
		if (pthread->uthread.flags & UTHREAD_FPSAVED) {
			pthread->sigdata->as = pthread->uthread.as;
			pthread->uthread.flags &= ~UTHREAD_FPSAVED;
		}
	} else {
		assert(current_uthread == &pthread->uthread);
		ctx = &vcpd_of(vcore_id())->uthread_ctx;
		save_fp_state(&pthread->sigdata->as);
	}
	swap_user_contexts(ctx, &pthread->sigdata->u_ctx);
}
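/* Sketch (an assumption, not the original helper): swap_user_contexts() is
 * treated here as a plain value swap of two saved contexts, so after the
 * call above the pthread restarts in the handler context, while the
 * interrupted context is parked in sigdata->u_ctx for later restoration. */
static void example_swap_user_contexts(struct user_context *a,
                                       struct user_context *b)
{
	struct user_context tmp = *a;

	*a = *b;
	*b = tmp;
}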
void __attribute__((constructor)) vcore_lib_init(void)
{
	uintptr_t mmap_block;

	/* Note this is racy, but okay.  The first time through, we are _S.
	 * Also, this is the "lowest" level constructor for now, so we don't need
	 * to call any other init functions after our run_once() call.  This may
	 * change in the future. */
	init_once_racy(return);
	/* Need to alloc vcore0's transition stuff here (technically, just the TLS)
	 * so that schedulers can use vcore0's transition TLS before it comes up in
	 * vcore_entry() */
	if (allocate_vcore_stack(0) || allocate_transition_tls(0))
		goto vcore_lib_init_fail;
	/* Initialize our VCPD event queues' ucqs, two pages per ucq, 4 per vcore */
	mmap_block = (uintptr_t)mmap(0, PGSIZE * 4 * max_vcores(),
	                             PROT_WRITE | PROT_READ,
	                             MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	/* Yeah, this doesn't fit in the error-handling scheme, but this whole
	 * system doesn't really handle failure, and needs a rewrite involving less
	 * mmaps/munmaps. */
	assert(mmap_block);
	/* Note we may end up doing vcore 0's elsewhere, for _Ss, or else have a
	 * separate ev_q for that. */
	for (int i = 0; i < max_vcores(); i++) {
		/* four pages total for both ucqs from the big block (2 pages each) */
		ucq_init_raw(&vcpd_of(i)->ev_mbox_public.ev_msgs,
		             mmap_block + (4 * i    ) * PGSIZE,
		             mmap_block + (4 * i + 1) * PGSIZE);
		ucq_init_raw(&vcpd_of(i)->ev_mbox_private.ev_msgs,
		             mmap_block + (4 * i + 2) * PGSIZE,
		             mmap_block + (4 * i + 3) * PGSIZE);
		/* Set the lowest level entry point for each vcore. */
		vcpd_of(i)->vcore_entry = (uintptr_t)__kernel_vcore_entry;
	}
	atomic_init(&vc_req_being_handled, 0);
	assert(!in_vcore_context());
	vcore_libc_init();
	return;
vcore_lib_init_fail:
	assert(0);
}
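/* Hypothetical helper (not in the original source) spelling out the page
 * arithmetic used above: each vcore i owns four consecutive pages of the big
 * mmap block, two for the public ucq and two for the private one. */
static uintptr_t example_vcore_ucq_pages(uintptr_t mmap_block, int i)
{
	/* pages [4i, 4i+1]: public ucq; pages [4i+2, 4i+3]: private ucq */
	return mmap_block + 4 * i * PGSIZE;
}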
/* Enables notifs, and deals with missed notifs by self notifying.  This should
 * be rare, so the syscall overhead isn't a big deal.  The other alternative
 * would be to uthread_yield(), which would require us to revert some uthread
 * interface changes. */
void enable_notifs(uint32_t vcoreid)
{
	__enable_notifs(vcoreid);
	wrmb();	/* need to read after the write that enabled notifs */
	/* Note we could get migrated before executing this.  If that happens, our
	 * vcore had gone into vcore context (which is what we wanted), and this
	 * self_notify to our old vcore is spurious and harmless. */
	if (vcpd_of(vcoreid)->notif_pending)
		sys_self_notify(vcoreid, EV_NONE, 0, TRUE);
}
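/* Hypothetical usage sketch: bracketing a short per-vcore critical section.
 * disable_notifs() is assumed to be the matching counterpart to
 * enable_notifs(); the self-notify in enable_notifs() covers any notif that
 * arrived while notifs were off. */
static void example_notif_critical_section(void)
{
	uint32_t vcoreid = vcore_id();

	disable_notifs(vcoreid);
	/* ... touch state that must not be interrupted by a notification ... */
	enable_notifs(vcoreid);
}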
void vcore_reenter(void (*entry_func)(void))
{
	assert(in_vcore_context());
	struct preempt_data *vcpd = vcpd_of(vcore_id());

	__vcore_reentry_func = entry_func;
	set_stack_pointer((void*)vcpd->vcore_stack);
	cmb();
	__vcore_reenter();
}
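/* Hypothetical usage sketch: from deep inside vcore context (e.g. a long
 * event-handling path), restart this vcore at the top of its entry function
 * with a fresh stack.  vcore_entry is assumed to be the normal 2LS entry
 * point; vcore_reenter() does not return. */
static void example_restart_vcore(void)
{
	assert(in_vcore_context());
	vcore_reenter(vcore_entry);
}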
void vcore_init(void)
{
	uintptr_t mmap_block;

	/* Note this is racy, but okay.  The first time through, we are _S */
	init_once_racy(return);
	/* Need to alloc vcore0's transition stuff here (technically, just the TLS)
	 * so that schedulers can use vcore0's transition TLS before it comes up in
	 * vcore_entry() */
	if (allocate_transition_stack(0) || allocate_transition_tls(0))
		goto vcore_init_fail;
	/* Initialize our VCPD event queues' ucqs, two pages per ucq, 4 per vcore */
	mmap_block = (uintptr_t)mmap(0, PGSIZE * 4 * max_vcores(),
	                             PROT_WRITE | PROT_READ,
	                             MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	/* Yeah, this doesn't fit in the error-handling scheme, but this whole
	 * system doesn't really handle failure, and needs a rewrite involving less
	 * mmaps/munmaps. */
	assert(mmap_block);
	/* Note we may end up doing vcore 0's elsewhere, for _Ss, or else have a
	 * separate ev_q for that. */
	for (int i = 0; i < max_vcores(); i++) {
		/* four pages total for both ucqs from the big block (2 pages each) */
		ucq_init_raw(&vcpd_of(i)->ev_mbox_public.ev_msgs,
		             mmap_block + (4 * i    ) * PGSIZE,
		             mmap_block + (4 * i + 1) * PGSIZE);
		ucq_init_raw(&vcpd_of(i)->ev_mbox_private.ev_msgs,
		             mmap_block + (4 * i + 2) * PGSIZE,
		             mmap_block + (4 * i + 3) * PGSIZE);
	}
	atomic_init(&vc_req_being_handled, 0);
	assert(!in_vcore_context());
	/* no longer need to enable notifs on vcore 0, it is set like that by
	 * default (so you drop into vcore context immediately on transitioning to
	 * _M) */
	vc_initialized = TRUE;
	return;
vcore_init_fail:
	assert(0);
}
/* This can return, if you failed to yield due to a concurrent event.  Note
 * we're atomically setting the CAN_RCV flag, and aren't bothering with CASing
 * (either with the kernel or uthread's handle_indirs()).  We don't particularly
 * care what other code does - we intend to set those flags no matter what. */
void vcore_yield(bool preempt_pending)
{
	unsigned long old_nr;
	uint32_t vcoreid = vcore_id();
	struct preempt_data *vcpd = vcpd_of(vcoreid);

	__sync_fetch_and_and(&vcpd->flags, ~VC_CAN_RCV_MSG);
	/* no wrmb() necessary, handle_events() has an mb() if it is checking */
	/* Clears notif pending and tries to handle events.  This is an optimization
	 * to avoid the yield syscall if we have an event pending.  If there is one,
	 * we want to unwind and return to the 2LS loop, where we may not want to
	 * yield anymore.
	 * Note that the kernel only cares about CAN_RCV_MSG for the desired vcore,
	 * not for a FALLBACK. */
	if (handle_events(vcoreid)) {
		__sync_fetch_and_or(&vcpd->flags, VC_CAN_RCV_MSG);
		return;
	}
	/* If we are yielding since we don't want the core, tell the kernel we want
	 * one less vcore (vc_yield assumes a dumb 2LS).
	 *
	 * If yield fails (slight race), we may end up having more vcores than
	 * amt_wanted for a while, and might lose one later on (after a
	 * preempt/timeslicing) - the 2LS will have to notice eventually if it
	 * actually needs more vcores (which it already needs to do).  amt_wanted
	 * could even be 0.
	 *
	 * In general, any time userspace decrements or sets to 0, it could get
	 * preempted, so the kernel will still give us at least one, until the last
	 * vcore properly yields without missing a message (and becomes a WAITING
	 * proc, which the ksched will not give cores to).
	 *
	 * I think it's possible for userspace to do this (lock, read amt_wanted,
	 * check all message queues for all vcores, subtract amt_wanted (not set to
	 * 0), unlock) so long as every event handler +1s the amt wanted, but that's
	 * a huge pain, and we already have event handling code making sure a
	 * process can't sleep (transition to WAITING) if a message arrives (can't
	 * yield if notif_pending, can't go WAITING without yielding, and the event
	 * posting the notif_pending will find the online VC or be delayed by
	 * spinlock til the proc is WAITING). */
	if (!preempt_pending) {
		do {
			old_nr = __procdata.res_req[RES_CORES].amt_wanted;
			if (old_nr == 0)
				break;
		} while (!__sync_bool_compare_and_swap(
		             &__procdata.res_req[RES_CORES].amt_wanted,
		             old_nr, old_nr - 1));
	}
	/* We can probably yield.  This may pop back up if notif_pending became set
	 * by the kernel after we cleared it and we lost the race. */
	sys_yield(preempt_pending);
	__sync_fetch_and_or(&vcpd->flags, VC_CAN_RCV_MSG);
}
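/* Hypothetical sketch of a dumb 2LS idle path: with no runnable uthreads,
 * give the core back.  vcore_yield() can return if a concurrent event won
 * the race, in which case we simply try again from the top. */
static void example_idle_vcore(void)
{
	while (1) {
		/* look for runnable uthreads here; if none were found: */
		vcore_yield(FALSE);
		/* yield aborted (event or notif_pending race): loop and recheck */
	}
}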
static int allocate_vcore_stack(int id)
{
	struct preempt_data *vcpd = vcpd_of(id);

	if (vcpd->vcore_stack)
		return 0;	// reuse old stack
	void *stackbot = mmap(0, TRANSITION_STACK_SIZE,
	                      PROT_READ | PROT_WRITE | PROT_EXEC,
	                      MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	if (stackbot == MAP_FAILED)
		return -1;	// errno set by mmap
	vcpd->vcore_stack = (uintptr_t)stackbot + TRANSITION_STACK_SIZE;
	return 0;
}
/* Helper: prepares a vcore for use.  Takes a block of pages for the UCQs.
 *
 * Vcores need certain things, such as a stack and TLS.  These are determined by
 * userspace.  Every vcore needs these set up before we drop into vcore context
 * on that vcore.  This means we need to prep before asking the kernel for those
 * vcores.
 *
 * We could have this function do its own mmap, at the expense of O(n) syscalls
 * when we prepare the extra vcores. */
static void __prep_vcore(int vcoreid, uintptr_t mmap_block)
{
	struct preempt_data *vcpd = vcpd_of(vcoreid);
	int ret;

	ret = allocate_vcore_stack(vcoreid);
	assert(!ret);
	ret = allocate_transition_tls(vcoreid);
	assert(!ret);

	vcpd->ev_mbox_public.type = EV_MBOX_UCQ;
	ucq_init_raw(&vcpd->ev_mbox_public.ucq,
	             mmap_block + 0 * PGSIZE,
	             mmap_block + 1 * PGSIZE);
	vcpd->ev_mbox_private.type = EV_MBOX_UCQ;
	ucq_init_raw(&vcpd->ev_mbox_private.ucq,
	             mmap_block + 2 * PGSIZE,
	             mmap_block + 3 * PGSIZE);

	/* Set the lowest level entry point for each vcore. */
	vcpd->vcore_entry = (uintptr_t)__kernel_vcore_entry;
}
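/* Hypothetical sketch: prep vcores [0, nr_vcores) before asking the kernel
 * for them, using one big mmap so __prep_vcore() itself stays syscall-free.
 * The mmap flags mirror the ones used elsewhere in this file. */
static int example_prep_vcores(int nr_vcores)
{
	uintptr_t block = (uintptr_t)mmap(0, PGSIZE * 4 * nr_vcores,
	                                  PROT_WRITE | PROT_READ,
	                                  MAP_POPULATE | MAP_ANONYMOUS, -1, 0);

	if ((void*)block == MAP_FAILED)
		return -1;
	for (int i = 0; i < nr_vcores; i++)
		__prep_vcore(i, block + 4 * i * PGSIZE);
	return 0;
}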
/* The lowest level function jumped to by the kernel on every vcore_entry.
 * Currently, this function is only necessary so we can set the tls_desc from
 * the vcpd for non x86_64 architectures.  We should consider removing this and
 * making it mandatory to set the tls_desc in the kernel.  We wouldn't even
 * need to pass the vcore id to user space at all if we did this.  It would
 * already be set in the preinstalled TLS as __vcore_id. */
static void __attribute__((noreturn)) __kernel_vcore_entry(void)
{
	/* The kernel sets the TLS desc for us, based on whatever is in VCPD.
	 *
	 * x86 32-bit TLS is pretty jacked up, so the kernel doesn't set the TLS
	 * desc for us.  It's a little more expensive to do it here, especially for
	 * amd64.  We can remove this when/if we overhaul 32-bit TLS.
	 *
	 * AFAIK, riscv's TLS changes are really cheap, and they don't do it in
	 * the kernel (yet/ever), so they can set their TLS here too. */
	int id = __vcore_id_on_entry;

#ifndef __x86_64__
	set_tls_desc(vcpd_of(id)->vcore_tls_desc);
#endif
	/* Every time the vcore comes up, it must set that it is in vcore context.
	 * uthreads may share the same TLS as their vcore (when uthreads do not have
	 * their own TLS), and if a uthread was preempted, __vcore_context == FALSE,
	 * and that will continue to be true the next time the vcore pops up. */
	__vcore_context = TRUE;
	vcore_entry();
	fprintf(stderr, "vcore_entry() should never return!\n");
	abort();
	__builtin_unreachable();
}