int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold cycles (count sheep), then sleep
        uint64_t spin_cycles, spin_start = rdtsc();
        while (tg->group_sense !=
               tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                spin_cycles = rdtsc() - spin_start;
                if (spin_cycles >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                        tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = rdtsc();
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
/*
 * This function completes reply processing for the default case in the
 * context of the originating cpu.
 */
static void
lwkt_thread_replyport_remote(lwkt_msg_t msg)
{
    lwkt_port_t port = msg->ms_reply_port;
    int flags;

    /*
     * Chase any thread migration that occurs
     */
    if (port->mpu_td->td_gd != mycpu) {
        lwkt_send_ipiq(port->mpu_td->td_gd,
                       (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        return;
    }

    /*
     * Cleanup (in critical section, IPI on same cpu, atomic op not needed)
     */
#ifdef INVARIANTS
    KKASSERT(msg->ms_flags & MSGF_INTRANSIT);
    msg->ms_flags &= ~MSGF_INTRANSIT;
#endif
    flags = msg->ms_flags;
    if (msg->ms_flags & MSGF_SYNC) {
        cpu_sfence();
        msg->ms_flags |= MSGF_REPLY | MSGF_DONE;
    } else {
        _lwkt_enqueue_reply(port, msg);
    }
    if (port->mp_flags & MSGPORTF_WAITING)
        _lwkt_schedule_msg(port->mpu_td, flags);
}
static void
kcollect_thread(void *dummy)
{
    uint32_t i;
    int n;

    for (;;) {
        lockmgr(&kcollect_lock, LK_EXCLUSIVE);
        i = kcollect_index % kcollect_samples;
        bzero(&kcollect_ary[i], sizeof(kcollect_ary[i]));
        crit_enter();
        kcollect_ary[i].ticks = ticks;
        getmicrotime(&kcollect_ary[i].realtime);
        crit_exit();
        for (n = 0; n < KCOLLECT_ENTRIES; ++n) {
            if (kcollect_callback[n]) {
                kcollect_ary[i].data[n] =
                    kcollect_callback[n](n);
            }
        }
        cpu_sfence();
        ++kcollect_index;
        lockmgr(&kcollect_lock, LK_RELEASE);
        tsleep(&dummy, 0, "sleep", hz * KCOLLECT_INTERVAL);
    }
}
/*
 * lwkt_thread_replyport() - Backend to lwkt_replymsg()
 *
 * Called with the reply port as an argument but in the context of the
 * original target port.  Completion must occur on the target port's
 * cpu.
 *
 * The critical section protects us from IPIs on this CPU.
 */
static void
lwkt_thread_replyport(lwkt_port_t port, lwkt_msg_t msg)
{
    int flags;

    KKASSERT((msg->ms_flags & (MSGF_DONE|MSGF_QUEUED|MSGF_INTRANSIT)) == 0);

    if (msg->ms_flags & MSGF_SYNC) {
        /*
         * If a synchronous completion has been requested, just wakeup
         * the message without bothering to queue it to the target port.
         *
         * Assume the target thread is non-preemptive, so no critical
         * section is required.
         */
        if (port->mpu_td->td_gd == mycpu) {
            crit_enter();
            flags = msg->ms_flags;
            cpu_sfence();
            msg->ms_flags |= MSGF_DONE | MSGF_REPLY;
            if (port->mp_flags & MSGPORTF_WAITING)
                _lwkt_schedule_msg(port->mpu_td, flags);
            crit_exit();
        } else {
#ifdef INVARIANTS
            atomic_set_int(&msg->ms_flags, MSGF_INTRANSIT);
#endif
            atomic_set_int(&msg->ms_flags, MSGF_REPLY);
            lwkt_send_ipiq(port->mpu_td->td_gd,
                           (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        }
    } else {
        /*
         * If an asynchronous completion has been requested the message
         * must be queued to the reply port.
         *
         * A critical section is required to interlock the port queue.
         */
        if (port->mpu_td->td_gd == mycpu) {
            crit_enter();
            _lwkt_enqueue_reply(port, msg);
            if (port->mp_flags & MSGPORTF_WAITING)
                _lwkt_schedule_msg(port->mpu_td, msg->ms_flags);
            crit_exit();
        } else {
#ifdef INVARIANTS
            atomic_set_int(&msg->ms_flags, MSGF_INTRANSIT);
#endif
            atomic_set_int(&msg->ms_flags, MSGF_REPLY);
            lwkt_send_ipiq(port->mpu_td->td_gd,
                           (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        }
    }
}
/*
 * Chain pending links.  Called on the last release of an exclusive or
 * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 * is passed in with the count left at 1, which we can inherit, and other
 * bits which we must adjust in a single atomic operation.
 *
 * Return non-zero on success, 0 if caller needs to retry.
 *
 * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
 *       acquiring LINKSPIN as all other cases will also need to acquire
 *       LINKSPIN when handling the EXWANTED case.
 */
static int
mtx_chain_link_ex(mtx_t *mtx, u_int olock)
{
    thread_t td = curthread;
    mtx_link_t *link;
    u_int nlock;

    olock &= ~MTX_LINKSPIN;
    nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;   /* upgrade if necessary */
    crit_enter_raw(td);
    if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
        link = mtx->mtx_exlink;
        KKASSERT(link != NULL);
        if (link->next == link) {
            mtx->mtx_exlink = NULL;
            nlock = MTX_LINKSPIN | MTX_EXWANTED;    /* to clear */
        } else {
            mtx->mtx_exlink = link->next;
            link->next->prev = link->prev;
            link->prev->next = link->next;
            nlock = MTX_LINKSPIN;                   /* to clear */
        }
        KKASSERT(link->state == MTX_LINK_LINKED_EX);
        mtx->mtx_owner = link->owner;
        cpu_sfence();

        /*
         * WARNING! The callback can only be safely
         *          made with LINKSPIN still held
         *          and in a critical section.
         *
         * WARNING! The link can go away after the
         *          state is set, or after the
         *          callback.
         */
        if (link->callback) {
            link->state = MTX_LINK_CALLEDBACK;
            link->callback(link, link->arg, 0);
        } else {
            link->state = MTX_LINK_ACQUIRED;
            wakeup(link);
        }
        atomic_clear_int(&mtx->mtx_lock, nlock);
        crit_exit_raw(td);
        return 1;
    }
    /* retry */
    crit_exit_raw(td);
    return 0;
}
int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold ns (count sheep), then sleep
        uint64_t spin_ns;
        uint64_t spin_start = 0;
        while (tg->group_sense !=
               tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                if (!spin_start) {
                    // Lazily initialize spin_start since uv_hrtime is expensive
                    spin_start = uv_hrtime();
                    continue;
                }
                spin_ns = uv_hrtime() - spin_start;
                // In case uv_hrtime is not monotonic, we'll sleep earlier
                if (spin_ns >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                        tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = 0;
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
/*
 * Release a serializing token.
 *
 * WARNING! All tokens must be released in reverse order.  This will be
 *          asserted.
 */
void
lwkt_reltoken(lwkt_token_t tok)
{
    thread_t td = curthread;
    lwkt_tokref_t ref;

    /*
     * Remove ref from thread token list and assert that it matches
     * the token passed in.  Tokens must be released in reverse order.
     */
    ref = td->td_toks_stop - 1;
    KKASSERT(ref >= &td->td_toks_base && ref->tr_tok == tok);
    _lwkt_reltokref(ref, td);
    cpu_sfence();
    td->td_toks_stop = ref;
}
/*
 * Set Transmit Enable bits for the specified queues.
 */
HAL_BOOL
ar5211StartTxDma(struct ath_hal *ah, u_int q)
{
    HALASSERT(q < HAL_NUM_TX_QUEUES);
    HALASSERT(AH5211(ah)->ah_txq[q].tqi_type != HAL_TX_QUEUE_INACTIVE);

    cpu_sfence();

    HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: queue %u\n", __func__, q);

    /* Check to be sure we're not enabling a q that has its TXD bit set. */
    HALASSERT((OS_REG_READ(ah, AR_Q_TXD) & (1 << q)) == 0);

    OS_REG_WRITE(ah, AR_Q_TXE, 1 << q);
    return AH_TRUE;
}
/*
 * This sets the current real time of day.  Timespecs are in seconds and
 * nanoseconds.  We do not mess with gd_time_seconds and gd_cpuclock_base,
 * instead we adjust basetime so basetime + gd_* results in the current
 * time of day.  This way the gd_* fields are guaranteed to represent
 * a monotonically increasing 'uptime' value.
 *
 * When set_timeofday() is called from userland, the system call forces it
 * onto cpu #0 since only cpu #0 can update basetime_index.
 */
void
set_timeofday(struct timespec *ts)
{
    struct timespec *nbt;
    int ni;

    /*
     * XXX SMP / non-atomic basetime updates
     */
    crit_enter();
    ni = (basetime_index + 1) & BASETIME_ARYMASK;
    nbt = &basetime[ni];
    nanouptime(nbt);
    nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
    nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
    if (nbt->tv_nsec < 0) {
        nbt->tv_nsec += 1000000000;
        --nbt->tv_sec;
    }

    /*
     * Note that basetime diverges from boottime as the clock drift is
     * compensated for, so we cannot do away with boottime.  When setting
     * the absolute time of day the drift is 0 (for an instant) and we
     * can simply assign boottime to basetime.
     *
     * Note that nanouptime() is based on gd_time_seconds which is drift
     * compensated up to a point (it is guaranteed to remain monotonically
     * increasing).  gd_time_seconds is thus our best uptime guess and
     * suitable for use in the boottime calculation.  It is already taken
     * into account in the basetime calculation above.
     */
    boottime.tv_sec = nbt->tv_sec;
    ntp_delta = 0;

    /*
     * We now have a new basetime, make sure all other cpus have it,
     * then update the index.
     */
    cpu_sfence();
    basetime_index = ni;
    crit_exit();
}
/*
 * (Backend) Feed chain data through the cluster validator and back to
 * the frontend.  Chains are fed from multiple nodes concurrently
 * and pipelined via per-node FIFOs in the XOP.
 *
 * No xop lock is needed because we are only manipulating fields under
 * our direct control.
 *
 * Returns 0 on success and a hammer error code if sync is permanently
 * lost.  The caller retains a ref on the chain but by convention
 * the lock is typically inherited by the xop (caller loses lock).
 *
 * Returns non-zero on error.  In this situation the caller retains a
 * ref on the chain but loses the lock (we unlock here).
 *
 * WARNING! The chain is moving between two different threads, it must
 *          be locked SHARED to retain its data mapping, not exclusive.
 *          When multiple operations are in progress at once, chains fed
 *          back to the frontend for collection can wind up being locked
 *          in different orders, only a shared lock can prevent a deadlock.
 *
 *          Exclusive locks may only be used by a XOP backend node thread
 *          temporarily, with no direct or indirect dependencies (aka
 *          blocking/waiting) on other nodes.
 */
int
hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain,
                 int clindex, int error)
{
    hammer2_xop_fifo_t *fifo;

    /*
     * Multi-threaded entry into the XOP collector.  We own the
     * fifo->wi for our clindex.
     */
    fifo = &xop->collect[clindex];

    while (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) {
        tsleep_interlock(xop, 0);
        if (hammer2_xop_active(xop) == 0) {
            error = EINTR;
            goto done;
        }
        if (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) {
            tsleep(xop, PINTERLOCKED, "h2feed", hz*60);
        }
    }
    if (chain)
        hammer2_chain_ref(chain);
    fifo->errors[fifo->wi & HAMMER2_XOPFIFO_MASK] = error;
    fifo->array[fifo->wi & HAMMER2_XOPFIFO_MASK] = chain;
    cpu_sfence();
    ++fifo->wi;
    atomic_add_int(&xop->check_counter, 1);
    wakeup(&xop->check_counter);    /* XXX optimize */
    error = 0;

    /*
     * Cleanup.  If an error occurred we eat the lock.  If no error
     * occurred the fifo inherits the lock and gains an additional ref.
     *
     * The caller's ref remains in both cases.
     */
done:
    if (error && chain)
        hammer2_chain_unlock(chain);
    return error;
}
/*
 * Flush waiting shared locks.  The lock's prior state is passed in and must
 * be adjusted atomically only if it matches and LINKSPIN is not set.
 *
 * IMPORTANT! The caller has left one active count on the lock for us to
 *            consume.  We will apply this to the first link, but must add
 *            additional counts for any other links.
 */
static int
mtx_chain_link_sh(mtx_t *mtx, u_int olock)
{
    thread_t td = curthread;
    mtx_link_t *link;
    u_int addcount;
    u_int nlock;

    olock &= ~MTX_LINKSPIN;
    nlock = olock | MTX_LINKSPIN;
    nlock &= ~MTX_EXCLUSIVE;
    crit_enter_raw(td);
    if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
        /*
         * It should not be possible for SHWANTED to be set without
         * any links pending.
         */
        KKASSERT(mtx->mtx_shlink != NULL);

        /*
         * We have to process the count for all shared locks before
         * we process any of the links.  Count the additional shared
         * locks beyond the first link (which is already accounted
         * for) and associate the full count with the lock
         * immediately.
         */
        addcount = 0;
        for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
             link = link->next) {
            ++addcount;
        }
        if (addcount > 0)
            atomic_add_int(&mtx->mtx_lock, addcount);

        /*
         * We can wakeup all waiting shared locks.
         */
        while ((link = mtx->mtx_shlink) != NULL) {
            KKASSERT(link->state == MTX_LINK_LINKED_SH);
            if (link->next == link) {
                mtx->mtx_shlink = NULL;
            } else {
                mtx->mtx_shlink = link->next;
                link->next->prev = link->prev;
                link->prev->next = link->next;
            }
            link->next = NULL;
            link->prev = NULL;
            cpu_sfence();
            if (link->callback) {
                link->state = MTX_LINK_CALLEDBACK;
                link->callback(link, link->arg, 0);
            } else {
                cpu_sfence();
                link->state = MTX_LINK_ACQUIRED;
                wakeup(link);
            }
        }
        atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
                                         MTX_SHWANTED);
        crit_exit_raw(td);
        return 1;
    }
    /* retry */
    crit_exit_raw(td);
    return 0;
}
static void
process_comp_queue (struct nvme_host *host, u16 comp_queue_id,
                    struct nvme_queue_info *h_comp_queue_info,
                    struct nvme_queue_info *g_comp_queue_info)
{
    struct nvme_request_hub *hub;
    hub = host->h_queue.request_hub[comp_queue_id];

    u16 h_cur_head = h_comp_queue_info->cur_pos.head;
    u16 g_cur_head = g_comp_queue_info->cur_pos.head;

    struct nvme_comp first_h_comp = {0}, *first_g_comp = NULL;

    struct nvme_comp *h_comp, *g_comp;
    for (h_comp = nvme_comp_queue_at_idx (h_comp_queue_info, h_cur_head),
         g_comp = nvme_comp_queue_at_idx (g_comp_queue_info, g_cur_head);
         NVME_COMP_GET_PHASE (h_comp) == h_comp_queue_info->phase;
         h_comp = nvme_comp_queue_at_idx (h_comp_queue_info, h_cur_head),
         g_comp = nvme_comp_queue_at_idx (g_comp_queue_info, g_cur_head)) {

        /* This queue ID is submission queue ID */
        u16 subm_queue_id = h_comp->queue_id;

        struct nvme_request *req;
        req = get_request (host, hub, subm_queue_id, h_comp->cmd_id);
        ASSERT (req);

        u64 time_taken = get_time () - req->submit_time;
        if (time_taken > NVME_TIME_TAKEN_WATERMARK) {
            printf ("Long time controller response: %llu\n", time_taken);
            printf ("Submission Queue ID: %u opcode: %u\n",
                    subm_queue_id, req->cmd.std.opcode);
        }

        if (subm_queue_id == 0)
            process_admin_comp (host, h_comp, req);
        else
            process_io_comp (host, h_comp, req);

        h_cur_head++;
        if (h_cur_head >= h_comp_queue_info->n_entries) {
            h_comp_queue_info->phase ^= 1;
            h_cur_head = 0;
        }

        if (!req->is_h_req) {
            struct nvme_comp comp = *h_comp;
            comp.cmd_id = req->orig_cmd_id;
            comp.status &= ~0x1;
            comp.status |= g_comp_queue_info->phase;

            /*
             * Replace with the host value instead of the
             * value reported by the controller.  This is necessary
             * if we mix guest commands and host commands to share
             * queues.
             */
            comp.queue_head = g_subm_cur_tail (host, subm_queue_id);

            if (first_g_comp) {
                *g_comp = comp;
            } else {
                /* Copy the first completion entry later */
                first_g_comp = g_comp;
                first_h_comp = comp;
            }

            g_cur_head++;
            if (g_cur_head >= g_comp_queue_info->n_entries) {
                g_comp_queue_info->phase ^= 1;
                g_cur_head = 0;
            }

            spinlock_lock (&hub->lock);
            g_comp_queue_info->cur_pos.head = g_cur_head;
            h_comp_queue_info->cur_pos.head = h_cur_head;
            spinlock_unlock (&hub->lock);
        } else {
            spinlock_lock (&hub->lock);
            nvme_write_comp_db (host, comp_queue_id, h_cur_head);
            hub->n_not_ack_h_reqs--;
            h_comp_queue_info->cur_pos.head = h_cur_head;
            spinlock_unlock (&hub->lock);
        }

        nvme_free_request (hub, req);
    }

    if (first_g_comp) {
        first_g_comp->cmd_specific = first_h_comp.cmd_specific;
        first_g_comp->rsvd = first_h_comp.rsvd;
        first_g_comp->queue_head = first_h_comp.queue_head;
        first_g_comp->queue_id = first_h_comp.queue_id;
        first_g_comp->cmd_id = first_h_comp.cmd_id;
        /*
         * Make sure everything is stored in memory properly
         * before we copy the status field.  This is to avoid
         * data corruption.
         */
        cpu_sfence ();
        first_g_comp->status = first_h_comp.status;
    }
}
/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
               pt_entry_t *ptep, pt_entry_t npte)
{
    globaldata_t gd = mycpu;
    pmap_inval_info_t *info;
    pt_entry_t opte = 0;
    int cpu = gd->gd_cpuid;
    cpumask_t tmpmask;
    unsigned long rflags;

    /*
     * Initialize invalidation for pmap and enter critical section.
     */
    if (pmap == NULL)
        pmap = &kernel_pmap;
    pmap_inval_init(pmap);

    /*
     * Shortcut single-cpu case if possible.
     */
    if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
        /*
         * Convert to invltlb if there are too many pages to
         * invlpg on.
         */
        if (npgs > MAX_INVAL_PAGES) {
            npgs = 0;
            va = (vm_offset_t)-1;
        }

        /*
         * Invalidate the specified pages, handle invltlb if requested.
         */
        while (npgs) {
            --npgs;
            if (ptep) {
                opte = atomic_swap_long(ptep, npte);
                ++ptep;
            }
            if (va == (vm_offset_t)-1)
                break;
            cpu_invlpg((void *)va);
            va += PAGE_SIZE;
        }
        if (va == (vm_offset_t)-1)
            cpu_invltlb();
        pmap_inval_done(pmap);

        return opte;
    }

    /*
     * We need a critical section to prevent getting preempted while
     * we setup our command.  A preemption might execute its own
     * pmap_inval*() command and create confusion below.
     *
     * tsc_target is our watchdog timeout that will attempt to recover
     * from a lost IPI.  Set to 1/16 second for now.
     */
    info = &invinfo[cpu];
    info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

    /*
     * We must wait for other cpus which may still be finishing up a
     * prior operation that we requested.
     *
     * We do not have to disable interrupts here.  An Xinvltlb can occur
     * at any time (even within a critical section), but it will not
     * act on our command until we set our done bits.
     */
    while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
        if (loopwdog(info)) {
            info->failed = 1;
            loopdebug("A", info);
            /* XXX recover from possible bug */
            CPUMASK_ASSZERO(info->done);
        }
#endif
        cpu_pause();
    }
    KKASSERT(info->mode == INVDONE);

    /*
     * Must set our cpu in the invalidation scan mask before
     * any possibility of [partial] execution (remember, XINVLTLB
     * can interrupt a critical section).
     */
    ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

    info->va = va;
    info->npgs = npgs;
    info->ptep = ptep;
    info->npte = npte;
    info->opte = 0;
#ifdef LOOPRECOVER
    info->failed = 0;
#endif
    info->mode = INVSTORE;

    tmpmask = pmap->pm_active;  /* volatile (bits may be cleared) */
    cpu_ccfence();
    CPUMASK_ANDMASK(tmpmask, smp_active_mask);

    /*
     * If ptep is NULL the operation can be semi-synchronous, which means
     * we can improve performance by flagging and removing idle cpus
     * (see the idleinvlclr function in mp_machdep.c).
     *
     * Typically kernel page table operation is semi-synchronous.
     */
    if (ptep == NULL)
        smp_smurf_idleinvlclr(&tmpmask);
    CPUMASK_ORBIT(tmpmask, cpu);
    info->mask = tmpmask;

    /*
     * Command may start executing the moment 'done' is initialized,
     * disable current cpu interrupt to prevent 'done' field from
     * changing (other cpus can't clear done bits until the originating
     * cpu clears its mask bit, but other cpus CAN start clearing their
     * mask bits).
     */
#ifdef LOOPRECOVER
    info->sigmask = tmpmask;
    CHECKSIGMASK(info);
#endif
    cpu_sfence();
    rflags = read_rflags();
    cpu_disable_intr();

    ATOMIC_CPUMASK_COPY(info->done, tmpmask);
    /* execution can begin here due to races */

    /*
     * Pass our copy of the done bits (so they don't change out from
     * under us) to generate the Xinvltlb interrupt on the targets.
     */
    smp_invlpg(&tmpmask);
    opte = info->opte;
    KKASSERT(info->mode == INVDONE);

    /*
     * Target cpus will be in their loop exiting concurrently with our
     * cleanup.  They will not lose the bitmask they obtained before so
     * we can safely clear this bit.
     */
    ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
    write_rflags(rflags);
    pmap_inval_done(pmap);

    return opte;
}
/*
 * Call this *after* all CPUs Cx states have been attached.
 */
static void
acpi_cst_postattach(void *arg)
{
    struct acpi_cst_softc *sc;
    int i;

    /* Get set of Cx state devices */
    devclass_get_devices(acpi_cst_devclass, &acpi_cst_devices,
                         &acpi_cst_ndevices);

    /*
     * Set up any quirks that might be necessary now that we have probed
     * all the CPUs' Cx states.
     */
    acpi_cst_set_quirks();

    if (acpi_cst_use_fadt) {
        /*
         * We are using Cx mode from FADT, probe for available Cx states
         * for all processors.
         */
        for (i = 0; i < acpi_cst_ndevices; i++) {
            sc = device_get_softc(acpi_cst_devices[i]);
            acpi_cst_cx_probe_fadt(sc);
        }
    } else {
        /*
         * We are using _CST mode, remove C3 state if necessary.
         *
         * As we now know for sure that we will be using _CST mode
         * install our notify handler.
         */
        for (i = 0; i < acpi_cst_ndevices; i++) {
            sc = device_get_softc(acpi_cst_devices[i]);
            if (acpi_cst_quirks & ACPI_CST_QUIRK_NO_C3) {
                /* Free part of unused resources */
                acpi_cst_free_resource(sc, sc->cst_non_c3 + 1);
                sc->cst_cx_count = sc->cst_non_c3 + 1;
            }
            sc->cst_parent->cpu_cst_notify = acpi_cst_notify;
        }
    }
    acpi_cst_global_cx_count();

    /* Perform Cx final initialization. */
    for (i = 0; i < acpi_cst_ndevices; i++) {
        sc = device_get_softc(acpi_cst_devices[i]);
        acpi_cst_startup(sc);

        if (sc->cst_parent->glob_sysctl_tree != NULL) {
            struct acpi_cpu_softc *cpu = sc->cst_parent;

            /* Add a sysctl handler to handle global Cx lowest setting */
            SYSCTL_ADD_PROC(&cpu->glob_sysctl_ctx,
                            SYSCTL_CHILDREN(cpu->glob_sysctl_tree),
                            OID_AUTO, "cx_lowest",
                            CTLTYPE_STRING | CTLFLAG_RW, NULL, 0,
                            acpi_cst_global_lowest_sysctl, "A",
                            "Requested global lowest Cx sleep state");
            SYSCTL_ADD_PROC(&cpu->glob_sysctl_ctx,
                            SYSCTL_CHILDREN(cpu->glob_sysctl_tree),
                            OID_AUTO, "cx_lowest_use",
                            CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
                            acpi_cst_global_lowest_use_sysctl, "A",
                            "Global lowest Cx sleep state to use");
        }
    }

    /* Take over idling from cpu_idle_default(). */
    acpi_cst_cx_lowest = 0;
    acpi_cst_cx_lowest_req = 0;
    acpi_cst_disable_idle = FALSE;
    cpu_sfence();
    cpu_idle_hook = acpi_cst_idle;
}
/*
 * Parse a _CST package and set up its Cx states.  Since the _CST object
 * can change dynamically, our notify handler may call this function
 * to clean up and probe the new _CST package.
 */
static int
acpi_cst_cx_probe_cst(struct acpi_cst_softc *sc, int reprobe)
{
    struct acpi_cst_cx *cx_ptr;
    ACPI_STATUS status;
    ACPI_BUFFER buf;
    ACPI_OBJECT *top;
    ACPI_OBJECT *pkg;
    uint32_t count;
    int i;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

#ifdef INVARIANTS
    if (reprobe)
        KKASSERT(&curthread->td_msgport == netisr_cpuport(sc->cst_cpuid));
#endif

    buf.Pointer = NULL;
    buf.Length = ACPI_ALLOCATE_BUFFER;
    status = AcpiEvaluateObject(sc->cst_handle, "_CST", NULL, &buf);
    if (ACPI_FAILURE(status))
        return (ENXIO);

    /* _CST is a package with a count and at least one Cx package. */
    top = (ACPI_OBJECT *)buf.Pointer;
    if (!ACPI_PKG_VALID(top, 2) || acpi_PkgInt32(top, 0, &count) != 0) {
        device_printf(sc->cst_dev, "invalid _CST package\n");
        AcpiOsFree(buf.Pointer);
        return (ENXIO);
    }
    if (count != top->Package.Count - 1) {
        device_printf(sc->cst_dev, "invalid _CST state count (%d != %d)\n",
                      count, top->Package.Count - 1);
        count = top->Package.Count - 1;
    }
    if (count > MAX_CX_STATES) {
        device_printf(sc->cst_dev, "_CST has too many states (%d)\n", count);
        count = MAX_CX_STATES;
    }

    sc->cst_flags |= ACPI_CST_FLAG_PROBING | ACPI_CST_FLAG_MATCH_HT;
    cpu_sfence();

    /*
     * Free all previously allocated resources
     *
     * NOTE: It is needed for _CST reprobing.
     */
    acpi_cst_free_resource(sc, 0);

    /* Set up all valid states. */
    sc->cst_cx_count = 0;
    cx_ptr = sc->cst_cx_states;
    for (i = 0; i < count; i++) {
        int error;

        pkg = &top->Package.Elements[i + 1];
        if (!ACPI_PKG_VALID(pkg, 4) ||
            acpi_PkgInt32(pkg, 1, &cx_ptr->type) != 0 ||
            acpi_PkgInt32(pkg, 2, &cx_ptr->trans_lat) != 0 ||
            acpi_PkgInt32(pkg, 3, &cx_ptr->power) != 0) {
            device_printf(sc->cst_dev, "skipping invalid Cx state package\n");
            continue;
        }

        /* Validate the state to see if we should use it. */
        switch (cx_ptr->type) {
        case ACPI_STATE_C1:
            sc->cst_non_c3 = i;
            cx_ptr->enter = acpi_cst_c1_halt_enter;
            error = acpi_cst_cx_setup(cx_ptr);
            if (error)
                panic("C1 CST HALT setup failed: %d", error);
            if (sc->cst_cx_count != 0) {
                /*
                 * C1 is not the first C-state; something really stupid
                 * is going on ...
                 */
                sc->cst_flags &= ~ACPI_CST_FLAG_MATCH_HT;
            }
            cx_ptr++;
            sc->cst_cx_count++;
            continue;
        case ACPI_STATE_C2:
            sc->cst_non_c3 = i;
            break;
        case ACPI_STATE_C3:
        default:
            if ((acpi_cst_quirks & ACPI_CST_QUIRK_NO_C3) != 0) {
                ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                    "cpu_cst%d: C3[%d] not available.\n",
                    device_get_unit(sc->cst_dev), i));
                continue;
            }
            break;
        }

        /*
         * Allocate the control register for C2 or C3(+).
         */
        KASSERT(cx_ptr->res == NULL, ("still has res"));
        acpi_PkgRawGas(pkg, 0, &cx_ptr->gas);

        /*
         * We match number of C2/C3 for hyperthreads, only if the
         * register is "Fixed Hardware", e.g. on most of the Intel
         * CPUs.  We don't have much to do for the rest of the
         * register types.
         */
        if (cx_ptr->gas.SpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE)
            sc->cst_flags &= ~ACPI_CST_FLAG_MATCH_HT;

        cx_ptr->rid = sc->cst_parent->cpu_next_rid;
        acpi_bus_alloc_gas(sc->cst_dev, &cx_ptr->res_type, &cx_ptr->rid,
                           &cx_ptr->gas, &cx_ptr->res, RF_SHAREABLE);
        if (cx_ptr->res != NULL) {
            sc->cst_parent->cpu_next_rid++;
            ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                "cpu_cst%d: Got C%d - %d latency\n",
                device_get_unit(sc->cst_dev), cx_ptr->type,
                cx_ptr->trans_lat));
            cx_ptr->enter = acpi_cst_cx_io_enter;
            cx_ptr->btag = rman_get_bustag(cx_ptr->res);
            cx_ptr->bhand = rman_get_bushandle(cx_ptr->res);
            error = acpi_cst_cx_setup(cx_ptr);
            if (error)
                panic("C%d CST I/O setup failed: %d", cx_ptr->type, error);
            cx_ptr++;
            sc->cst_cx_count++;
        } else {
            error = acpi_cst_cx_setup(cx_ptr);
            if (!error) {
                KASSERT(cx_ptr->enter != NULL,
                        ("C%d enter is not set", cx_ptr->type));
                cx_ptr++;
                sc->cst_cx_count++;
            }
        }
    }
    AcpiOsFree(buf.Pointer);

    if (sc->cst_flags & ACPI_CST_FLAG_MATCH_HT) {
        cpumask_t mask;

        mask = get_cpumask_from_level(sc->cst_cpuid, CORE_LEVEL);
        if (CPUMASK_TESTNZERO(mask)) {
            int cpu;

            for (cpu = 0; cpu < ncpus; ++cpu) {
                struct acpi_cst_softc *sc1 = acpi_cst_softc[cpu];

                if (sc1 == NULL || sc1 == sc ||
                    (sc1->cst_flags & ACPI_CST_FLAG_ATTACHED) == 0 ||
                    (sc1->cst_flags & ACPI_CST_FLAG_MATCH_HT) == 0)
                    continue;
                if (!CPUMASK_TESTBIT(mask, sc1->cst_cpuid))
                    continue;

                if (sc1->cst_cx_count != sc->cst_cx_count) {
                    struct acpi_cst_softc *src_sc, *dst_sc;

                    if (bootverbose) {
                        device_printf(sc->cst_dev,
                            "inconsistent C-state count: %d, %s has %d\n",
                            sc->cst_cx_count,
                            device_get_nameunit(sc1->cst_dev),
                            sc1->cst_cx_count);
                    }

                    if (sc1->cst_cx_count > sc->cst_cx_count) {
                        src_sc = sc1;
                        dst_sc = sc;
                    } else {
                        src_sc = sc;
                        dst_sc = sc1;
                    }
                    acpi_cst_copy(dst_sc, src_sc);
                }
            }
        }
    }

    if (reprobe) {
        /* If there are C3(+) states, always enable bus master wakeup */
        if ((acpi_cst_quirks & ACPI_CST_QUIRK_NO_BM) == 0) {
            for (i = 0; i < sc->cst_cx_count; ++i) {
                struct acpi_cst_cx *cx = &sc->cst_cx_states[i];

                if (cx->type >= ACPI_STATE_C3) {
                    AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1);
                    break;
                }
            }
        }

        /* Fix up the lowest Cx being used */
        acpi_cst_set_lowest_oncpu(sc, sc->cst_cx_lowest_req);
    }

    /*
     * Cache the lowest non-C3 state.
     * NOTE: this must be done after cst_cx_lowest is set.
     */
    acpi_cst_non_c3(sc);

    cpu_sfence();
    sc->cst_flags &= ~ACPI_CST_FLAG_PROBING;

    return (0);
}
/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
                      pt_entry_t opte, pt_entry_t npte)
{
    globaldata_t gd = mycpu;
    pmap_inval_info_t *info;
    int success;
    int cpu = gd->gd_cpuid;
    cpumask_t tmpmask;
    unsigned long rflags;

    /*
     * Initialize invalidation for pmap and enter critical section.
     */
    if (pmap == NULL)
        pmap = &kernel_pmap;
    pmap_inval_init(pmap);

    /*
     * Shortcut single-cpu case if possible.
     */
    if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
        if (atomic_cmpset_long(ptep, opte, npte)) {
            if (va == (vm_offset_t)-1)
                cpu_invltlb();
            else
                cpu_invlpg((void *)va);
            pmap_inval_done(pmap);
            return 1;
        } else {
            pmap_inval_done(pmap);
            return 0;
        }
    }

    /*
     * We need a critical section to prevent getting preempted while
     * we setup our command.  A preemption might execute its own
     * pmap_inval*() command and create confusion below.
     */
    info = &invinfo[cpu];
    info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

    /*
     * We must wait for other cpus which may still be finishing
     * up a prior operation.
     */
    while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
        if (loopwdog(info)) {
            info->failed = 1;
            loopdebug("B", info);
            /* XXX recover from possible bug */
            CPUMASK_ASSZERO(info->done);
        }
#endif
        cpu_pause();
    }
    KKASSERT(info->mode == INVDONE);

    /*
     * Must set our cpu in the invalidation scan mask before
     * any possibility of [partial] execution (remember, XINVLTLB
     * can interrupt a critical section).
     */
    ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

    info->va = va;
    info->npgs = 1;             /* unused */
    info->ptep = ptep;
    info->npte = npte;
    info->opte = opte;
#ifdef LOOPRECOVER
    info->failed = 0;
#endif
    info->mode = INVCMPSET;
    info->success = 0;

    tmpmask = pmap->pm_active;  /* volatile */
    cpu_ccfence();
    CPUMASK_ANDMASK(tmpmask, smp_active_mask);
    CPUMASK_ORBIT(tmpmask, cpu);
    info->mask = tmpmask;

    /*
     * Command may start executing the moment 'done' is initialized,
     * disable current cpu interrupt to prevent 'done' field from
     * changing (other cpus can't clear done bits until the originating
     * cpu clears its mask bit).
     */
#ifdef LOOPRECOVER
    info->sigmask = tmpmask;
    CHECKSIGMASK(info);
#endif
    cpu_sfence();
    rflags = read_rflags();
    cpu_disable_intr();

    ATOMIC_CPUMASK_COPY(info->done, tmpmask);

    /*
     * Pass our copy of the done bits (so they don't change out from
     * under us) to generate the Xinvltlb interrupt on the targets.
     */
    smp_invlpg(&tmpmask);
    success = info->success;
    KKASSERT(info->mode == INVDONE);

    ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
    write_rflags(rflags);
    pmap_inval_done(pmap);

    return success;
}
/*
 * Exclusive-lock a mutex, block until acquired unless link is async.
 * Recursion is allowed.
 *
 * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
 * if async.  If immediately successful an async exclusive lock will return 0
 * and not issue the async callback or link the link structure.  The caller
 * must handle this case (typically this is an optimal code path).
 *
 * A tsleep() error can only be returned if PCATCH is specified in the flags.
 */
static __inline int
__mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
    thread_t td;
    u_int lock;
    u_int nlock;
    int error;
    int isasync;

    for (;;) {
        lock = mtx->mtx_lock;
        cpu_ccfence();

        if (lock == 0) {
            nlock = MTX_EXCLUSIVE | 1;
            if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
                mtx->mtx_owner = curthread;
                cpu_sfence();
                link->state = MTX_LINK_ACQUIRED;
                error = 0;
                break;
            }
            continue;
        }
        if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
            KKASSERT((lock & MTX_MASK) != MTX_MASK);
            nlock = lock + 1;
            if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
                cpu_sfence();
                link->state = MTX_LINK_ACQUIRED;
                error = 0;
                break;
            }
            continue;
        }

        /*
         * We need MTX_LINKSPIN to manipulate exlink or
         * shlink.
         *
         * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
         * pending exclusive requests.  It cannot be set as a separate
         * operation prior to acquiring MTX_LINKSPIN.
         *
         * To avoid unnecessary cpu cache traffic we poll
         * for collisions.  It is also possible that EXWANTED
         * state failing the above test was spurious, so all the
         * tests must be repeated if we cannot obtain LINKSPIN
         * with the prior state tests intact (i.e. don't reload
         * the (lock) variable here, for heaven's sake!).
         */
        if (lock & MTX_LINKSPIN) {
            cpu_pause();
            continue;
        }
        td = curthread;
        nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
        crit_enter_raw(td);
        if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
            crit_exit_raw(td);
            continue;
        }

        /*
         * Check for early abort.
         */
        if (link->state == MTX_LINK_ABORTED) {
            if (mtx->mtx_exlink == NULL) {
                atomic_clear_int(&mtx->mtx_lock,
                                 MTX_LINKSPIN |
                                 MTX_EXWANTED);
            } else {
                atomic_clear_int(&mtx->mtx_lock,
                                 MTX_LINKSPIN);
            }
            crit_exit_raw(td);
            link->state = MTX_LINK_IDLE;
            error = ENOLCK;
            break;
        }

        /*
         * Add our link to the exlink list and release LINKSPIN.
         */
        link->owner = td;
        link->state = MTX_LINK_LINKED_EX;
        if (mtx->mtx_exlink) {
            link->next = mtx->mtx_exlink;
            link->prev = link->next->prev;
            link->next->prev = link;
            link->prev->next = link;
        } else {
            link->next = link;
            link->prev = link;
            mtx->mtx_exlink = link;
        }
        isasync = (link->callback != NULL);
        atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
        crit_exit_raw(td);

        /*
         * If asynchronous lock request return without
         * blocking, leave link structure linked.
         */
        if (isasync) {
            error = EINPROGRESS;
            break;
        }

        /*
         * Wait for lock
         */
        error = mtx_wait_link(mtx, link, flags, to);
        break;
    }
    return (error);
}