int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold cycles (count sheep), then sleep
        uint64_t spin_cycles, spin_start = rdtsc();
        while (tg->group_sense !=
               tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                spin_cycles = rdtsc() - spin_start;
                if (spin_cycles >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                        tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = rdtsc();
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
int
__gettimeofday(struct timeval *tv, struct timezone *tz)
{
    struct timespec ts;
    int res;
    int w;

    if (fast_clock == 0 && fast_count++ >= 10) {
        __kpmap_map(&upticksp, &fast_clock, KPTYPE_UPTICKS);
        __kpmap_map(&ts_realtime, &fast_clock, KPTYPE_TS_REALTIME);
        __kpmap_map(&fast_gtod, &fast_clock, KPTYPE_FAST_GTOD);
        __kpmap_map(NULL, &fast_clock, 0);
    }
    if (fast_clock > 0 && *fast_gtod && tz == NULL) {
        do {
            w = *upticksp;
            cpu_lfence();
            ts = ts_realtime[w & 1];
            cpu_lfence();
            w = *upticksp - w;
        } while (w > 1);
        res = 0;
        if (tv) {
            tv->tv_sec = ts.tv_sec;
            tv->tv_usec = ts.tv_nsec / 1000;
        }
    } else {
        res = __sys_gettimeofday(tv, tz);
    }
    return res;
}
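/*
 * A minimal userland sketch (an assumption, not the libc code above) of the
 * same reader-side idiom: snapshot the tick counter, load-fence, read the
 * double-buffered slot selected by the snapshot, fence again, and retry if
 * the counter advanced by more than one (meaning the slot may have been
 * rewritten).  C11 acquire loads/fences stand in for cpu_lfence(); the
 * names publish_time()/read_time() and sketch_* globals are invented for
 * illustration, and the non-atomic slot read is the usual seqlock-style
 * benign race rather than strictly conforming C11.
 */
#include <stdatomic.h>
#include <time.h>

static _Atomic unsigned int sketch_upticks;
static struct timespec sketch_slot[2];

/* Writer (single producer): fill the inactive slot, then bump the counter. */
static void publish_time(const struct timespec *ts)
{
    unsigned int w = atomic_load_explicit(&sketch_upticks,
                                          memory_order_relaxed);

    sketch_slot[(w + 1) & 1] = *ts;                 /* inactive slot */
    atomic_store_explicit(&sketch_upticks, w + 1,
                          memory_order_release);    /* publish it */
}

/* Reader: retry if more than one update happened while we were reading. */
static struct timespec read_time(void)
{
    struct timespec ts;
    unsigned int w;

    do {
        w = atomic_load_explicit(&sketch_upticks, memory_order_acquire);
        ts = sketch_slot[w & 1];
        atomic_thread_fence(memory_order_acquire);  /* order the re-check */
    } while (atomic_load_explicit(&sketch_upticks,
                                  memory_order_relaxed) - w > 1);
    return ts;
}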
/*
 * Get SMP fully working before we start initializing devices.
 */
static void
ap_finish(void)
{
    int i;
    cpumask_t ncpus_mask = 0;

    for (i = 1; i <= ncpus; i++)
        ncpus_mask |= CPUMASK(i);

    mp_finish = 1;
    if (bootverbose)
        kprintf("Finish MP startup\n");

    /* build our map of 'other' CPUs */
    mycpu->gd_other_cpus = smp_startup_mask & ~CPUMASK(mycpu->gd_cpuid);

    /*
     * Let the other cpus finish initializing and build their map
     * of 'other' CPUs.
     */
    rel_mplock();
    while (smp_active_mask != smp_startup_mask) {
        DELAY(100000);
        cpu_lfence();
    }

    while (try_mplock() == 0)
        DELAY(100000);
    if (bootverbose)
        kprintf("Active CPU Mask: %08x\n", smp_active_mask);
}
/*
 * Get SMP fully working before we start initializing devices.
 */
static void
ap_finish(void)
{
    mp_finish = 1;
    if (bootverbose)
        kprintf("Finish MP startup\n");

    /* build our map of 'other' CPUs */
    mycpu->gd_other_cpus = smp_startup_mask;
    CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid);

    /*
     * Let the other cpus finish initializing and build their map
     * of 'other' CPUs.
     */
    rel_mplock();
    while (CPUMASK_CMPMASKNEQ(smp_active_mask, smp_startup_mask)) {
        DELAY(100000);
        cpu_lfence();
    }

    while (try_mplock() == 0)
        DELAY(100000);
    if (bootverbose) {
        kprintf("Active CPU Mask: %08lx\n",
                (long)CPUMASK_LOWMASK(smp_active_mask));
    }
}
int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold ns (count sheep), then sleep
        uint64_t spin_ns;
        uint64_t spin_start = 0;
        while (tg->group_sense !=
               tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                if (!spin_start) {
                    // Lazily initialize spin_start since uv_hrtime is expensive
                    spin_start = uv_hrtime();
                    continue;
                }
                spin_ns = uv_hrtime() - spin_start;
                // In case uv_hrtime is not monotonic, we'll sleep earlier
                if (spin_ns >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                        tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = 0;
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
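/*
 * A minimal sketch (an assumption, not Julia's threadgroup code itself) of
 * the fence pairing used above: the master stores the broadcast envelope,
 * then flips the group sense; a worker spins on the sense and may only read
 * the envelope after a load fence.  C11 release/acquire ordering stands in
 * for cpu_sfence()/cpu_lfence(), and the struct and function names are
 * invented for the example.
 */
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
    void *envelope;                 /* broadcast payload */
    _Atomic uint8_t group_sense;    /* flipped by the master each fork */
} barrier_sketch_t;

/* Master: publish the payload before flipping the sense. */
static void sketch_fork(barrier_sketch_t *bs, void *val, uint8_t new_sense)
{
    bs->envelope = val;
    atomic_store_explicit(&bs->group_sense, new_sense,
                          memory_order_release);    /* like cpu_sfence() */
}

/* Worker: spin until the sense matches, then the payload is safe to read. */
static void *sketch_join(barrier_sketch_t *bs, uint8_t my_sense)
{
    while (atomic_load_explicit(&bs->group_sense,
                                memory_order_acquire) != my_sense)
        ;                                           /* cpu_pause() spot */
    /* the acquire load plays the role of cpu_lfence() */
    return bs->envelope;
}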
static void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
    int p;
    int cpu = mycpu->gd_cpuid;

    /*
     * Don't kprintf() anything if the pmap inval watchdog gets hit.
     * DRM can cause an occasional watchdog hit (at least with a 1/16
     * second watchdog), and attempting to kprintf to the KVM frame buffer
     * from Xinvltlb, which ignores critical sections, can implode the
     * system.
     */
    if (pmap_inval_watchdog_print == 0)
        return;

    cpu_lfence();
#ifdef LOOPRECOVER
    atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
    kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
            "s=%08jx "
#endif
#ifdef LOOPMASK_IN
            "in=%08jx "
#endif
#ifdef LOOPRECOVER
            "smurf=%08jx\n"
#endif
            , msg, cpu, info->mode,
            info->mask.ary[0],
            info->done.ary[0]
#ifdef LOOPRECOVER
            , info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
            , smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
            , smp_smurf_mask.ary[0]
#endif
            );
    kprintf("mdglob ");
    for (p = 0; p < ncpus; ++p)
        kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
    kprintf("\n");
}
static int
sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
{
    struct timespec *bt;
    int error;
    int index;

    /*
     * Because basetime data and index may be updated by another cpu,
     * a load fence is required to ensure that the data we read has
     * not been speculatively read relative to a possibly updated index.
     */
    index = basetime_index;
    cpu_lfence();
    bt = &basetime[index];

    error = SYSCTL_OUT(req, bt, sizeof(*bt));
    return (error);
}
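/*
 * A minimal sketch (assumed, not the kernel code above) of the pairing the
 * comment relies on: the writer fills the slot that is *not* currently
 * indexed, store-fences, then publishes the new index; the reader loads the
 * index, load-fences, then reads the slot.  Unlike the retry loop earlier,
 * this only works if updates are spaced far enough apart that a reader
 * finishes before its slot is reused.  C11 release/acquire stand in for
 * cpu_sfence()/cpu_lfence(); basetime_sketch and its helpers are invented
 * names.
 */
#include <stdatomic.h>
#include <time.h>

#define SKETCH_SLOTS 2

static struct timespec basetime_sketch[SKETCH_SLOTS];
static _Atomic int basetime_sketch_index;

/* Writer: never touches the slot readers were told to use. */
static void set_basetime_sketch(const struct timespec *ts)
{
    int ni = (atomic_load_explicit(&basetime_sketch_index,
                                   memory_order_relaxed) + 1) % SKETCH_SLOTS;

    basetime_sketch[ni] = *ts;                          /* fill spare slot */
    atomic_store_explicit(&basetime_sketch_index, ni,
                          memory_order_release);        /* like cpu_sfence() */
}

/* Reader: the acquire load keeps the slot read from being done early. */
static struct timespec get_basetime_sketch(void)
{
    int index = atomic_load_explicit(&basetime_sketch_index,
                                     memory_order_acquire); /* cpu_lfence() */

    return basetime_sketch[index];
}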
/*
 * (Frontend) collect a response from a running cluster op.
 *
 * Responses are fed from all appropriate nodes concurrently
 * and collected into a cohesive response >= collect_key.
 *
 * The collector will return the instant quorum or other requirements
 * are met, even if some nodes get behind or become non-responsive.
 *
 * HAMMER2_XOP_COLLECT_NOWAIT - Used to 'poll' a completed collection,
 *                              usually called synchronously from the
 *                              node XOPs for the strategy code to
 *                              fake the frontend collection and complete
 *                              the BIO as soon as possible.
 *
 * HAMMER2_XOP_SYNCHRONIZER   - Request synchronization with a particular
 *                              cluster index, prevents looping when that
 *                              index is out of sync so caller can act on
 *                              the out of sync element.  ESRCH and EDEADLK
 *                              can be returned if this flag is specified.
 *
 * Returns 0 on success plus a filled out xop->cluster structure.
 * Return ENOENT on normal termination.
 * Otherwise return an error.
 */
int
hammer2_xop_collect(hammer2_xop_head_t *xop, int flags)
{
    hammer2_xop_fifo_t *fifo;
    hammer2_chain_t *chain;
    hammer2_key_t lokey;
    int error;
    int keynull;
    int adv;        /* advance the element */
    int i;
    uint32_t check_counter;

loop:
    /*
     * First loop tries to advance pieces of the cluster which
     * are out of sync.
     */
    lokey = HAMMER2_KEY_MAX;
    keynull = HAMMER2_CHECK_NULL;
    check_counter = xop->check_counter;
    cpu_lfence();

    for (i = 0; i < xop->cluster.nchains; ++i) {
        chain = xop->cluster.array[i].chain;
        if (chain == NULL) {
            adv = 1;
        } else if (chain->bref.key < xop->collect_key) {
            adv = 1;
        } else {
            keynull &= ~HAMMER2_CHECK_NULL;
            if (lokey > chain->bref.key)
                lokey = chain->bref.key;
            adv = 0;
        }
        if (adv == 0)
            continue;

        /*
         * Advance element if possible, advanced element may be NULL.
         */
        if (chain) {
            hammer2_chain_unlock(chain);
            hammer2_chain_drop(chain);
        }
        fifo = &xop->collect[i];
        if (fifo->ri != fifo->wi) {
            cpu_lfence();
            chain = fifo->array[fifo->ri & HAMMER2_XOPFIFO_MASK];
            ++fifo->ri;
            xop->cluster.array[i].chain = chain;
            if (chain == NULL) {
                /* XXX */
                xop->cluster.array[i].flags |= HAMMER2_CITEM_NULL;
            }
            if (fifo->wi - fifo->ri < HAMMER2_XOPFIFO / 2)
                wakeup(xop);    /* XXX optimize */
            --i;                /* loop on same index */
        } else {
            /*
             * Retain CITEM_NULL flag.  If set just repeat EOF.
             * If not, the NULL,0 combination indicates an
             * operation in-progress.
             */
            xop->cluster.array[i].chain = NULL;
            /* retain any CITEM_NULL setting */
        }
    }

    /*
     * Determine whether the lowest collected key meets clustering
     * requirements.  Returns:
     *
     * 0           - key valid, cluster can be returned.
     *
     * ENOENT      - normal end of scan, return ENOENT.
     *
     * ESRCH       - sufficient elements collected, quorum agreement
     *               that lokey is not a valid element and should be
     *               skipped.
     *
     * EDEADLK     - sufficient elements collected, no quorum agreement
     *               (and no agreement possible).  In this situation a
     *               repair is needed, for now we loop.
     *
     * EINPROGRESS - insufficient elements collected to resolve, wait
     *               for event and loop.
     */
    if ((flags & HAMMER2_XOP_COLLECT_WAITALL) &&
        xop->run_mask != HAMMER2_XOPMASK_VOP) {
        error = EINPROGRESS;
    } else {
        error = hammer2_cluster_check(&xop->cluster, lokey, keynull);
    }
    if (error == EINPROGRESS) {
        if (xop->check_counter == check_counter) {
            if (flags & HAMMER2_XOP_COLLECT_NOWAIT)
                goto done;
            tsleep_interlock(&xop->check_counter, 0);
            cpu_lfence();
            if (xop->check_counter == check_counter) {
                tsleep(&xop->check_counter, PINTERLOCKED,
                       "h2coll", hz*60);
            }
        }
        goto loop;
    }
    if (error == ESRCH) {
        if (lokey != HAMMER2_KEY_MAX) {
            xop->collect_key = lokey + 1;
            goto loop;
        }
        error = ENOENT;
    }
    if (error == EDEADLK) {
        kprintf("hammer2: no quorum possible lokey %016jx\n",
                lokey);
        if (lokey != HAMMER2_KEY_MAX) {
            xop->collect_key = lokey + 1;
            goto loop;
        }
        error = ENOENT;
    }
    if (lokey == HAMMER2_KEY_MAX)
        xop->collect_key = lokey;
    else
        xop->collect_key = lokey + 1;
done:
    return error;
}
/*
 * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 * occurred.
 */
int
mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
    indefinite_info_t info;
    int error;

    indefinite_init(&info, mtx->mtx_ident, 1,
                    ((link->state & MTX_LINK_LINKED_SH) ? 'm' : 'M'));

    /*
     * Sleep.  Handle false wakeups, interruptions, etc.
     * The link may also have been aborted.  The LINKED
     * bit was set by this cpu so we can test it without
     * fences.
     */
    error = 0;
    while (link->state & MTX_LINK_LINKED) {
        tsleep_interlock(link, 0);
        cpu_lfence();
        if (link->state & MTX_LINK_LINKED) {
            error = tsleep(link, flags | PINTERLOCKED,
                           mtx->mtx_ident, to);
            if (error)
                break;
        }
        if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
            indefinite_check(&info);
    }

    /*
     * We need at least a lfence (load fence) to ensure our cpu does not
     * reorder loads (of data outside the lock structure) prior to the
     * remote cpu's release, since the above test may have run without
     * any atomic interactions.
     *
     * If we do not do this then state updated by the other cpu before
     * releasing its lock may not be read cleanly by our cpu when this
     * function returns.  Even though the other cpu ordered its stores,
     * our loads can still be out of order.
     */
    cpu_mfence();

    /*
     * We are done, make sure the link structure is unlinked.
     * It may still be on the list due to e.g. EINTR or
     * EWOULDBLOCK.
     *
     * It is possible for the tsleep to race an ABORT and cause
     * error to be 0.
     *
     * The tsleep() can be woken up for numerous reasons and error
     * might be zero in situations where we intend to return an error.
     *
     * (This is the synchronous case so state cannot be CALLEDBACK)
     */
    switch (link->state) {
    case MTX_LINK_ACQUIRED:
    case MTX_LINK_CALLEDBACK:
        error = 0;
        break;
    case MTX_LINK_ABORTED:
        error = ENOLCK;
        break;
    case MTX_LINK_LINKED_EX:
    case MTX_LINK_LINKED_SH:
        mtx_delete_link(mtx, link);
        /* fall through */
    default:
        if (error == 0)
            error = EWOULDBLOCK;
        break;
    }

    /*
     * Clear state on status returned.
     */
    link->state = MTX_LINK_IDLE;

    if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0)
        indefinite_done(&info);

    return error;
}
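/*
 * A minimal sketch (an assumption, not kern_mutex.c) of why the fence after
 * the wait loop above matters: once this thread observes the releasing
 * thread's store that clears the LINKED state, it must fence before loading
 * data that thread wrote prior to the release, or those loads may be
 * satisfied with stale values.  A C11 acquire fence stands in for the
 * cpu_mfence()/cpu_lfence() used above; the handoff_sketch type and its
 * helpers are invented names.
 */
#include <stdatomic.h>

struct handoff_sketch {
    int payload;            /* written by the releasing thread */
    _Atomic int linked;     /* 1 = still owned, 0 = released */
};

/* Releasing side: publish the payload, then drop the link. */
static void sketch_release(struct handoff_sketch *h, int value)
{
    h->payload = value;
    atomic_store_explicit(&h->linked, 0, memory_order_release);
}

/* Waiting side: the plain polling loop mirrors the unfenced state test. */
static int sketch_wait(struct handoff_sketch *h)
{
    while (atomic_load_explicit(&h->linked, memory_order_relaxed))
        ;   /* in the real code this is the tsleep_interlock()/tsleep() loop */

    /*
     * Same job as the cpu_mfence() above: order our later loads after the
     * observed release so h->payload is read cleanly.
     */
    atomic_thread_fence(memory_order_acquire);
    return h->payload;
}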
/*
 * Called with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
    globaldata_t gd = mycpu;
    pmap_inval_info_t *info;
    int loopme = 0;
    int cpu;
    cpumask_t cpumask;

    /*
     * Check all cpus for invalidations we may need to service.
     */
    cpu_ccfence();
    cpu = gd->gd_cpuid;
    cpumask = *cpumaskp;

    while (CPUMASK_TESTNZERO(cpumask)) {
        int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
        KKASSERT(n >= 0 && n < MAXCPU);
#endif
        CPUMASK_NANDBIT(cpumask, n);
        info = &invinfo[n];

        /*
         * Due to interrupts/races we can catch a new operation
         * in an older interrupt.  A fence is needed once we detect
         * the (not) done bit.
         */
        if (!CPUMASK_TESTBIT(info->done, cpu))
            continue;
        cpu_lfence();
#ifdef LOOPRECOVER
        if (toolong) {
            kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
                    cpu, n, info->done.ary[0], info->mask.ary[0],
                    info->mode);
        }
#endif

        /*
         * info->mask and info->done always contain the originating
         * cpu until the originator is done.  Targets may still be
         * present in info->done after the originator is done (they
         * will be finishing up their loops).
         *
         * Clear info->mask bits on other cpus to indicate that they
         * have quiesced (entered the loop).  Once the other mask bits
         * are clear we can execute the operation on the original,
         * then clear the mask and done bits on the originator.  The
         * targets will then finish up their side and clear their
         * done bits.
         *
         * The command is considered 100% done when all done bits have
         * been cleared.
         */
        if (n != cpu) {
            /*
             * Command state machine for 'other' cpus.
             */
            if (CPUMASK_TESTBIT(info->mask, cpu)) {
                /*
                 * Other cpus indicate to the originator that they
                 * are quiesced.
                 */
                ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                loopme = 1;
            } else if (info->ptep &&
                       CPUMASK_TESTBIT(info->mask, n)) {
                /*
                 * Other cpu must wait for the originator (n)
                 * to complete its command if ptep is not NULL.
                 */
                loopme = 1;
            } else {
                /*
                 * Other cpu detects that the originator has
                 * completed its command, or there was no
                 * command.
                 *
                 * Now that the page table entry has changed,
                 * we can follow up with our own invalidation.
                 */
                vm_offset_t va = info->va;
                int npgs;

                if (va == (vm_offset_t)-1 ||
                    info->npgs > MAX_INVAL_PAGES) {
                    cpu_invltlb();
                } else {
                    for (npgs = info->npgs; npgs; --npgs) {
                        cpu_invlpg((void *)va);
                        va += PAGE_SIZE;
                    }
                }
                ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
                /* info invalid now */
                /* loopme left alone */
            }
        } else if (CPUMASK_TESTBIT(info->mask, cpu)) {
            /*
             * Originator is waiting for other cpus
             */
            if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
                /*
                 * Originator waits for other cpus to enter
                 * their loop (aka quiesce).
                 *
                 * If this bugs out the IPI may have been lost,
                 * try to reissue by resetting our own
                 * reentrancy bit and clearing the smurf mask
                 * for the cpus that did not respond, then
                 * reissuing the IPI.
                 */
                loopme = 1;
#ifdef LOOPRECOVER
                if (loopwdog(info)) {
                    info->failed = 1;
                    loopdebug("C", info);
                    /* XXX recover from possible bug */
                    mdcpu->gd_xinvaltlb = 0;
                    ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
                                            info->mask);
                    cpu_disable_intr();
                    smp_invlpg(&smp_active_mask);

                    /*
                     * Force outer-loop retest of Xinvltlb
                     * requests (see mp_machdep.c).
                     */
                    mdcpu->gd_xinvaltlb = 2;
                    cpu_enable_intr();
                }
#endif
            } else {
                /*
                 * Originator executes operation and clears
                 * mask to allow other cpus to finish.
                 */
                KKASSERT(info->mode != INVDONE);
                if (info->mode == INVSTORE) {
                    if (info->ptep)
                        info->opte = atomic_swap_long(info->ptep,
                                                      info->npte);
                    CHECKSIGMASK(info);
                    ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                    CHECKSIGMASK(info);
                } else {
                    if (atomic_cmpset_long(info->ptep,
                                           info->opte,
                                           info->npte)) {
                        info->success = 1;
                    } else {
                        info->success = 0;
                    }
                    CHECKSIGMASK(info);
                    ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                    CHECKSIGMASK(info);
                }
                loopme = 1;
            }
        } else {
            /*
             * Originator does not have to wait for the other
             * cpus to finish.  It clears its done bit.  A new
             * command will not be initiated by the originator
             * until the other cpus have cleared their done bits
             * (asynchronously).
             */
            vm_offset_t va = info->va;
            int npgs;

            if (va == (vm_offset_t)-1 ||
                info->npgs > MAX_INVAL_PAGES) {
                cpu_invltlb();
            } else {
                for (npgs = info->npgs; npgs; --npgs) {
                    cpu_invlpg((void *)va);
                    va += PAGE_SIZE;
                }
            }

            /* leave loopme alone */
            /* other cpus may still be finishing up */
            /* can't race originator since that's us */
            info->mode = INVDONE;
            ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
        }
    }
    return loopme;
}