/* Returns -1 with errno set on error, or 0 on success.  This does not return
 * the number of cores actually granted (though some parts of the kernel do
 * internally).
 *
 * This tries to get "more vcores", based on the number we currently have.
 * We'll probably need smarter 2LSs in the future that just directly set
 * amt_wanted.  What happens is we can have a bunch of 2LS vcore contexts
 * trying to get "another vcore", which currently means more than num_vcores().
 * If you have someone ask for two more, and then someone else ask for one
 * more, how many you ultimately ask for depends on if the kernel heard you and
 * adjusted num_vcores in between the two calls.  Or maybe your amt_wanted
 * already was num_vcores + 5, so neither call is telling the kernel anything
 * new.  It comes down to "one more than I have" vs "one more than I've already
 * asked for".
 *
 * So for now, this will keep the older behavior (one more than I have).  It
 * will try to accumulate any concurrent requests, and adjust amt_wanted up.
 * Interleaving, repetitive calls (everyone asking for one more) may get
 * ignored.
 *
 * Note this doesn't block or anything (even though the minimum number
 * requested is 1), since the kernel won't block the call.
 *
 * There are a few concurrency concerns.  We have _max_vcores_ever_wanted,
 * initialization of new vcore stacks/TLSs, making sure we don't ask for too
 * many (minor point), and most importantly not asking the kernel for too much
 * or otherwise miscommunicating our desires to the kernel.  Remember, the
 * kernel wants just one answer from the process about what it wants, and it
 * is up to the process to figure that out.
 *
 * So we basically have one thread do the submitting/prepping/bookkeeping, and
 * other threads come in and just update the number wanted and make sure
 * someone is sorting things out.  This will perform a bit better too, since
 * only one vcore makes syscalls (which hammer the proc_lock).  This
 * essentially has cores submit work, and one core does the work (like Eric's
 * old delta functions).
 *
 * There's a slight semantic change: this will return 0 (success) for the
 * non-submitters, and 0 if we submitted; -1 only if the submitter had some
 * non-kernel failure.
 *
 * Also, beware that this (like the old version) doesn't protect against races
 * on num_vcores().  num_vcores() is how many you have now or very soon
 * (accounting for messages in flight that will take your cores), not how many
 * you told the kernel you want. */
int vcore_request(long nr_new_vcores)
{
	long nr_to_prep_now, nr_vcores_wanted;

	assert(vc_initialized);
	/* Early sanity checks */
	if ((nr_new_vcores < 0) || (nr_new_vcores + num_vcores() > max_vcores()))
		return -1;	/* consider ERRNO */
	/* Post our desires (ROS atomic_add() conflicts with glibc) */
	atomic_fetch_and_add(&nr_new_vcores_wanted, nr_new_vcores);
try_handle_it:
	cmb();	/* inc before swap.  the atomic is a CPU mb() */
	if (atomic_swap(&vc_req_being_handled, 1)) {
		/* We got a 1 back, so someone else is already working on it */
		return 0;
	}
	/* So now we're the ones supposed to handle things.  This does things in
	 * the "increment based on the number we have", vs "increment on the number
	 * we said we want" style.
	 *
	 * Figure out how many we have, though this is racy.  Yields/preempts/grants
	 * will change this over time, and we may end up asking for less than we
	 * had. */
	nr_vcores_wanted = num_vcores();
	/* Pull all of the vcores wanted into our local variable, where we'll deal
	 * with prepping/requesting that many vcores.  Keep doing this til we think
	 * no more are wanted. */
	while ((nr_to_prep_now = atomic_swap(&nr_new_vcores_wanted, 0))) {
		nr_vcores_wanted += nr_to_prep_now;
		/* Don't bother prepping or asking for more than we can ever get */
		nr_vcores_wanted = MIN(nr_vcores_wanted, max_vcores());
		/* Make sure all we might ask for are prepped */
		for (long i = _max_vcores_ever_wanted; i < nr_vcores_wanted; i++) {
			if (allocate_transition_stack(i) || allocate_transition_tls(i)) {
				atomic_set(&vc_req_being_handled, 0);	/* unlock and bail out */
				return -1;
			}
			_max_vcores_ever_wanted++;	/* done in the loop to handle failures */
		}
	}
	cmb();	/* force a reread of num_vcores() */
	/* Update amt_wanted if we now want *more* than what the kernel already
	 * knows.  See notes in the func doc. */
	if (nr_vcores_wanted > __procdata.res_req[RES_CORES].amt_wanted)
		__procdata.res_req[RES_CORES].amt_wanted = nr_vcores_wanted;
	/* If num_vcores isn't what we want, we can poke the ksched.  Due to some
	 * races with yield, our desires may be old.  Not a big deal; any vcores
	 * that pop up will just end up yielding (or get preempt messages.) */
	if (nr_vcores_wanted > num_vcores())
		sys_poke_ksched(0, RES_CORES);	/* 0 -> poke for ourselves */
	/* Unlock (which lets someone else work), and check to see if more work
	 * needs to be done.  If so, we'll make sure it gets handled. */
	atomic_set(&vc_req_being_handled, 0);	/* unlock, to allow others to try */
	wrmb();	/* check for any that might have come in while we were out */
	if (atomic_read(&nr_new_vcores_wanted))
		goto try_handle_it;
	return 0;
}
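/* Illustrative sketch, not part of the original source: how a 2LS hook might
 * call vcore_request() when runnable uthreads outnumber vcores.  Assumes the
 * parlib vcore header is included; nr_runnable_uthreads() is a hypothetical
 * 2LS-side counter, while num_vcores() and vcore_request() are the real calls
 * shown above. */
#include <stdio.h>

static void example_2ls_maybe_grow(void)
{
	long nr_runnable = nr_runnable_uthreads();	/* hypothetical 2LS counter */
	long nr_have = num_vcores();

	if (nr_runnable > nr_have) {
		/* Ask for the shortfall.  -1 means the request itself failed (bad
		 * args or a stack/TLS allocation failure); we just keep running on
		 * the vcores we already have. */
		if (vcore_request(nr_runnable - nr_have) < 0)
			printf("vcore_request failed\n");
	}
}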
/* Enables notifs, and deals with missed notifs by self notifying.  This should
 * be rare, so the syscall overhead isn't a big deal.  The other alternative
 * would be to uthread_yield(), which would require us to revert some uthread
 * interface changes. */
void enable_notifs(uint32_t vcoreid)
{
	__enable_notifs(vcoreid);
	wrmb();	/* need to read after the write that enabled notifs */
	/* Note we could get migrated before executing this.  If that happens, our
	 * vcore had gone into vcore context (which is what we wanted), and this
	 * self_notify to our old vcore is spurious and harmless. */
	if (vcpd_of(vcoreid)->notif_pending)
		sys_self_notify(vcoreid, EV_NONE, 0, TRUE);
}
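/* Illustrative sketch, not part of the original source: the usual pairing of
 * disable_notifs()/enable_notifs() around a short uthread critical section.
 * disable_notifs() and vcore_id() are assumed to be the parlib counterparts of
 * enable_notifs() above; do_critical_work() is a hypothetical placeholder. */
static void example_notif_critical_section(void)
{
	uint32_t vcoreid = vcore_id();

	disable_notifs(vcoreid);	/* keep us out of vcore context for a moment */
	do_critical_work();		/* hypothetical, must not block */
	/* enable_notifs() re-enables and self-notifies if a notif was missed */
	enable_notifs(vcoreid);
}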
/* Consumer side, returns TRUE on success and fills *msg with the ev_msg.  If
 * the ceq appears empty, it will return FALSE.  Messages that arrive after we
 * start looking may not be received on this call. */
bool get_ceq_msg(struct ceq *ceq, struct event_msg *msg)
{
	int32_t idx = get_ring_idx(ceq);

	if (idx == -1) {
		if (!ceq->ring_overflowed)
			return FALSE;
		/* We didn't get anything via the ring, but if we're overflowed, then
		 * we need to look in the array directly.  Note that we only handle
		 * overflow when we failed to get something.  Eventually, we'll deal
		 * with overflow (which should be very rare).  Also note that while we
		 * are dealing with overflow, the kernel could be producing and using
		 * the ring, and we could have consumers consuming from the ring.
		 *
		 * Overall, we need to clear the overflow flag, make sure the list is
		 * empty, and turn the flag back on if it isn't.  That'll make sure
		 * overflow is set if there's a chance there is a message in the array
		 * that doesn't have an idx in the ring.
		 *
		 * However, if we do that, there's a time when overflow isn't set and
		 * the ring is empty.  A concurrent consumer could think that the ring
		 * is empty, when in fact it isn't.  That's bad, since we could miss a
		 * message (i.e. sleep when we have a message we needed).  So we'll
		 * need to deal with concurrent consumers, and whatever we do will also
		 * need to deal with concurrent consumers who handle overflow too.
		 * Easiest thing is to just lock.  If the lock is set, then that also
		 * means the mailbox isn't empty. */
		spin_pdr_lock((struct spin_pdr_lock*)&ceq->u_lock);
		/* Check again - someone may have handled it while we were waiting on
		 * the lock */
		if (!ceq->ring_overflowed) {
			spin_pdr_unlock((struct spin_pdr_lock*)&ceq->u_lock);
			return FALSE;
		}
		ceq->ring_overflowed = FALSE;
		wrmb();	/* clear overflowed before reading event entries */
		for (int i = 0; i < ceq->nr_events; i++) {
			if (extract_ceq_msg(ceq, i, msg)) {
				/* We found something.  There might be more, but a future
				 * consumer will have to deal with it, or verify there isn't. */
				ceq->ring_overflowed = TRUE;
				spin_pdr_unlock((struct spin_pdr_lock*)&ceq->u_lock);
				return TRUE;
			}
		}
		/* Made it to the end, looks like there was no overflow left.  There
		 * could be new ones added behind us (they'd be in the ring or overflow
		 * would be turned on again), but those messages were added after we
		 * started consuming, and are therefore not our obligation to extract. */
		spin_pdr_unlock((struct spin_pdr_lock*)&ceq->u_lock);
		return FALSE;
	}
	if (!extract_ceq_msg(ceq, idx, msg))
		return FALSE;
	return TRUE;
}
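/* Illustrative sketch, not part of the original source: a consumer draining a
 * CEQ with get_ceq_msg() until it reports empty.  handle_one_msg() is a
 * hypothetical handler; the ceq is assumed to have been set up elsewhere. */
static void example_drain_ceq(struct ceq *ceq)
{
	struct event_msg msg;

	/* Each successful call fills msg; FALSE means the ceq looked empty.  A
	 * message arriving after we start looking may be left for a later call,
	 * per the comment above. */
	while (get_ceq_msg(ceq, &msg))
		handle_one_msg(&msg);	/* hypothetical */
}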
/* Attempts to message a vcore that may or may not have VC_CAN_RCV_MSG set.  If
 * so, we'll post the message and the message will eventually get dealt with
 * (when the vcore runs or when it is preempt-recovered). */
static bool try_spam_vcore(struct proc *p, uint32_t vcoreid,
                           struct event_msg *ev_msg, int ev_flags)
{
	/* Not sure if we can or not, so check before spamming.  Technically, the
	 * only critical part is that we __alert, then check can_alert. */
	if (can_msg_vcore(vcoreid)) {
		spam_vcore(p, vcoreid, ev_msg, ev_flags);
		wrmb();	/* prev write (notif_pending) must come before following reads */
		if (can_msg_vcore(vcoreid))
			return TRUE;
	}
	return FALSE;
}
/* Helper: will try to message (INDIR/IPI) a list member (lists of vcores).  We
 * use this on the online and bulk_preempted vcore lists.  If this succeeds in
 * alerting a vcore on the list, it'll return TRUE.  We need to be careful
 * here, since we're reading a list that could be concurrently modified.  The
 * important thing is that we can always fail if we're unsure (such as with
 * lists being temporarily empty).  The caller will be able to deal with it via
 * the ultimate fallback. */
static bool spam_list_member(struct vcore_tailq *list, struct proc *p,
                             struct event_msg *ev_msg, int ev_flags)
{
	struct vcore *vc, *vc_first;
	uint32_t vcoreid;
	int loops = 0;

	vc = TAILQ_FIRST(list);
	/* If the list appears empty, we'll bail out (failing) after the loop. */
	while (vc) {
		vcoreid = vcore2vcoreid(p, vc);
		/* Post the alert.  Not using the try_spam_vcore() helper since I want
		 * something more customized for the lists. */
		spam_vcore(p, vcoreid, ev_msg, ev_flags);
		wrmb();	/* prev write (notif_pending) must come before following reads */
		/* If they are still alertable after we sent the msg, then they'll get
		 * it before yielding (racing with userspace yield here).  This check
		 * is not as critical as the next one, but will allow us to alert
		 * vcores that happen to concurrently be moved from the active to the
		 * bulk_preempt list. */
		if (can_msg_vcore(vcoreid))
			return TRUE;
		/* As a backup, if they are still the first on the list, then they are
		 * still going to get the message.  For the online list, proc_yield()
		 * will return them to userspace (where they will get the message)
		 * because __alert_vcore() set notif_pending.  For the BP list, they
		 * will either be turned on later, or have a preempt message sent
		 * about their demise.
		 *
		 * We race on list membership (and not exclusively on VC_CAN_RCV_MSG),
		 * so that when it fails we can get a new vcore to try (or know with
		 * high probability that there are none). */
		vc_first = TAILQ_FIRST(list);
		if (vc == vc_first)
			return TRUE;
		/* At this point, the list has changed and the vcore we tried yielded,
		 * so we try the *new* list head.  Track loops for sanity reasons. */
		if (loops++ > 10) {
			warn("Too many (%d) attempts to find a vcore, failing!", loops);
			return FALSE;	/* always safe to fail! */
		}
		/* Get set up for your attack run! */
		vc = vc_first;
	}
	return FALSE;
}
/* Glibc initial blockon, usable before parlib code can init things (or if it
 * never can, like for RTLD).  MCPs will need the 'uthread-aware' blockon. */
void __ros_scp_syscall_blockon(struct syscall *sysc)
{
	/* Need to disable notifs before registering, so we don't take a __notify
	 * that drops us into VC ctx and forces us to eat the notif_pending that
	 * was meant to prevent us from yielding if the syscall completed early. */
	__procdata.vcore_preempt_data[0].notif_disabled = TRUE;
	/* Ask for a SYSCALL event when the sysc is done.  We don't need a handler,
	 * we just need the kernel to restart us from proc_yield.  If register
	 * fails, we're already done. */
	if (register_evq(sysc, &__ros_scp_simple_evq)) {
		/* Sending false for now - we want to signal proc code that we want to
		 * wait (piggybacking on the MCP meaning of this variable) */
		__ros_syscall_noerrno(SYS_yield, FALSE, 0, 0, 0, 0, 0);
	}
	/* Manually doing an enable_notifs for VC 0 */
	__procdata.vcore_preempt_data[0].notif_disabled = FALSE;
	wrmb();	/* need to read after the write that enabled notifs */
	if (__procdata.vcore_preempt_data[0].notif_pending)
		__ros_syscall_noerrno(SYS_self_notify, 0, EV_NONE, 0, TRUE, 0, 0);
}
/* Helper, from u/p/uthread.c.  Keep it in sync.  (don't want to move this into
 * glibc yet). */
static bool register_evq(struct syscall *sysc, struct event_queue *ev_q)
{
	int old_flags;

	sysc->ev_q = ev_q;
	wrmb();	/* don't let that write pass any future reads (flags) */
	/* Try and set the SC_UEVENT flag (so the kernel knows to look at ev_q) */
	do {
		/* no cmb() needed, the atomic_read will reread flags */
		old_flags = atomic_read(&sysc->flags);
		/* Spin if the kernel is mucking with syscall flags */
		while (old_flags & SC_K_LOCK)
			old_flags = atomic_read(&sysc->flags);
		/* If the kernel finishes while we are trying to sign up for an event,
		 * we need to bail out */
		if (old_flags & (SC_DONE | SC_PROGRESS)) {
			sysc->ev_q = 0;	/* not necessary, but might help with bugs */
			return FALSE;
		}
	} while (!atomic_cas(&sysc->flags, old_flags, old_flags | SC_UEVENT));
	return TRUE;
}
/* This is the 'post (work) and poke' style of sync.  We make sure the poke
 * tracker's function runs.  Once this returns, the func either has run or is
 * currently running (in case someone else is running now).  We won't wait or
 * spin or anything, and it is safe to call this recursively (deeper in the
 * call-graph).
 *
 * It's up to the caller to somehow post its work.  We'll also pass arg to the
 * func, ONLY IF the caller is the one to execute it - so there's no guarantee
 * the func(specific_arg) combo will actually run.  It's more for info
 * purposes/optimizations/etc.  If no one uses it, I'll get rid of it. */
void poke(struct poke_tracker *tracker, void *arg)
{
	atomic_set(&tracker->need_to_run, TRUE);
	/* will need to repeatedly do it if someone keeps posting work */
	do {
		/* want a wrmb() between posting work/need_to_run and in_progress.
		 * the swap provides the HW mb.  just need a cmb, which we do in the
		 * loop to cover the iterations (even though i can't imagine the
		 * compiler reordering the check it needed to do for the branch).. */
		cmb();
		/* poke / make sure someone does it.  if we get a TRUE (1) back,
		 * someone is already running and will deal with the posted work
		 * (probably on their next loop).  if we got a 0 back, we won the race
		 * and have the 'lock'. */
		if (atomic_swap(&tracker->run_in_progress, TRUE))
			return;
		/* if we're here, then we're the one who needs to run the func. */
		/* clear the 'need to run', since we're running it now.  new users
		 * will set it again.  this write needs to be wmb()'d after
		 * in_progress.  the swap provided the HW mb(). */
		cmb();	/* no internal HW mb */
		atomic_set(&tracker->need_to_run, FALSE);
		/* run the actual function.  the poke sync makes sure only one caller
		 * is in that func at a time. */
		assert(tracker->func);
		tracker->func(arg);
		/* ensure the in_prog write comes after the run_again. */
		wmb();	/* no internal HW mb */
		atomic_set(&tracker->run_in_progress, FALSE);
		/* in_prog write must come before run_again read */
		wrmb();
	} while (atomic_read(&tracker->need_to_run));
}
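/* Illustrative sketch, not part of the original source: the 'post work, then
 * poke' pattern with a poke_tracker.  Callers post work somewhere the func can
 * find it (here a hypothetical atomic counter), then call poke(); the sync
 * above guarantees exactly one caller at a time runs example_consume_work().
 * Assumes <stdio.h> and the atomic/poke headers used by the code above. */
static atomic_t example_work_posted;	/* hypothetical shared work counter */

static void example_consume_work(void *arg)
{
	long nr = atomic_swap(&example_work_posted, 0);	/* take all posted work */

	/* arg is only advisory (see the func doc); nr is the real work count */
	printf("handling %ld posted work items\n", nr);
}

static struct poke_tracker example_poker = {.func = example_consume_work};

static void example_post_and_poke(void)
{
	atomic_inc(&example_work_posted);	/* post the work first */
	poke(&example_poker, NULL);		/* make sure someone runs the func */
}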
/* Send an event to ev_q, based on the parameters in ev_q's flags.  We don't
 * accept null ev_qs, since the caller ought to be checking before bothering to
 * make a msg and send it to the event_q.  Vcoreid is who the kernel thinks the
 * message ought to go to (for IPIs).  Appropriate for things like
 * EV_PREEMPT_PENDING, where we tell the affected vcore.  To have the message
 * go where the kernel suggests, set EVENT_VCORE_APPRO(priate). */
void send_event(struct proc *p, struct event_queue *ev_q, struct event_msg *msg,
                uint32_t vcoreid)
{
	struct proc *old_proc;
	struct event_mbox *ev_mbox = 0;

	assert(p);
	printd("[kernel] sending msg to proc %p, ev_q %p\n", p, ev_q);
	if (!ev_q) {
		warn("[kernel] Null ev_q - kernel code should check before sending!");
		return;
	}
	if (!is_user_rwaddr(ev_q, sizeof(struct event_queue))) {
		/* Ought to kill them, just warn for now */
		printk("[kernel] Illegal addr for ev_q\n");
		return;
	}
	/* This should be caught by "future technology" that can tell when the
	 * kernel PFs on the user's behalf.  For now, we catch common userspace
	 * bugs (had this happen a few times). */
	if (!PTE_ADDR(ev_q)) {
		printk("[kernel] Bad addr %p for ev_q\n", ev_q);
		return;
	}
	/* ev_q is a user pointer, so we need to make sure we're in the right
	 * address space */
	old_proc = switch_to(p);
	/* If we're an _S, just spam vcore0, and wake up if necessary. */
	if (!__proc_is_mcp(p)) {
		spam_vcore(p, 0, msg, ev_q->ev_flags);
		wrmb();	/* don't let the notif_pending write pass the state read */
		/* using the same pattern as in spam_public (which can have multiple
		 * unblock callbacks) */
		if (p->state == PROC_WAITING)
			proc_wakeup(p);
		goto out;
	}
	/* Get the vcoreid that we'll message (if appropriate).  For INDIR and
	 * SPAMMING, this is the first choice of a vcore, but other vcores might
	 * get it.  Common case is !APPRO and !ROUNDROBIN.  Note we are clobbering
	 * the vcoreid parameter. */
	if (!(ev_q->ev_flags & EVENT_VCORE_APPRO))
		vcoreid = ev_q->ev_vcore;	/* use the ev_q's vcoreid */
	/* Note that RR overwrites APPRO */
	if (ev_q->ev_flags & EVENT_ROUNDROBIN) {
		/* Pick a vcore, round-robin style.  Assuming ev_vcore was the previous
		 * one used.  Note that round-robin overrides the passed-in vcoreid.
		 * Also note this may be 'wrong' if num_vcores changes. */
		vcoreid = (ev_q->ev_vcore + 1) % p->procinfo->num_vcores;
		ev_q->ev_vcore = vcoreid;
	}
	if (!vcoreid_is_safe(vcoreid)) {
		/* Ought to kill them, just warn for now */
		printk("[kernel] Vcoreid %d unsafe! (too big?)\n", vcoreid);
		goto out;
	}
	/* If we're a SPAM_PUBLIC, they just want us to spam the message.  Note we
	 * don't care about the mbox, since it'll go to VCPD public mboxes, and
	 * we'll prefer to send it to whatever vcoreid we determined at this point
	 * (via APPRO or whatever). */
	if (ev_q->ev_flags & EVENT_SPAM_PUBLIC) {
		spam_public_msg(p, msg, vcoreid, ev_q->ev_flags & EVENT_SPAM_FLAGS);
		goto out;
	}
	/* We aren't spamming and we know the default vcore, and now we need to
	 * figure out which mbox to use.  If they provided an mbox, we'll use it.
	 * If not, we'll use a VCPD mbox (public or private, depending on the
	 * flags). */
	ev_mbox = ev_q->ev_mbox;
	if (!ev_mbox)
		ev_mbox = get_vcpd_mbox(vcoreid, ev_q->ev_flags);
	/* At this point, we ought to have the right mbox to send the msg to, and
	 * which vcore to alert (IPI/INDIR) (if applicable).  The mbox could be the
	 * vcore's vcpd ev_mbox. */
	if (!ev_mbox) {
		/* This shouldn't happen any more, this is more for sanity's sake */
		warn("[kernel] ought to have an mbox by now!");
		goto out;
	}
	/* Even if we're using an mbox in procdata (VCPD), we want a user pointer */
	if (!is_user_rwaddr(ev_mbox, sizeof(struct event_mbox))) {
		/* Ought to kill them, just warn for now */
		printk("[kernel] Illegal addr for ev_mbox\n");
		goto out;
	}
	/* We used to support no msgs, but quit being lazy and send a 'msg'.  If
	 * the ev_q is a NOMSG, we won't actually memcpy or anything, it'll just be
	 * a vehicle for sending the ev_type. */
	assert(msg);
	post_ev_msg(p, ev_mbox, msg, ev_q->ev_flags);
	wmb();	/* ensure ev_msg write is before alerting the vcore */
	/* Prod/alert a vcore with an IPI or INDIR, if desired.  INDIR will also
	 * call try_notify (IPI) later */
	if (ev_q->ev_flags & EVENT_INDIR) {
		send_indir(p, ev_q, vcoreid);
	} else {
		/* they may want an IPI despite not wanting an INDIR */
		try_notify(p, vcoreid, ev_q->ev_flags);
	}
	/* Fall through */
out:
	/* Return to the old address space. */
	switch_back(p, old_proc);
}
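/* Illustrative sketch, not part of the original source: how kernel code that
 * already has a proc and a userspace-registered ev_q might build a message and
 * hand it to send_event().  The EV_USER_IPI type and the arg fields are only
 * an assumed example payload; vcoreid 0 is just a suggestion that the ev_q's
 * flags (APPRO, RR, SPAM, etc., handled above) may override. */
static void example_send_user_event(struct proc *p, struct event_queue *ev_q)
{
	struct event_msg msg = {0};

	msg.ev_type = EV_USER_IPI;	/* assumed event type for the example */
	msg.ev_arg2 = 42;		/* payload meaningful only to the handler */
	send_event(p, ev_q, &msg, 0);
}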