Example #1
1
int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid,
                        void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
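        // store fence: publish the envelope before forked/group_sense are
        // updated, so a worker that sees the new sense also sees bcast_val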
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold cycles (count sheep), then sleep
        uint64_t spin_cycles, spin_start = rdtsc();
        while (tg->group_sense !=
               tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                spin_cycles = rdtsc() - spin_start;
                if (spin_cycles >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                        tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = rdtsc();
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
Example #2
0
/*
 * This function completes reply processing for the default case in the
 * context of the originating cpu.
 */
static
void
lwkt_thread_replyport_remote(lwkt_msg_t msg)
{
    lwkt_port_t port = msg->ms_reply_port;
    int flags;

    /*
     * Chase any thread migration that occurs
     */
    if (port->mpu_td->td_gd != mycpu) {
        lwkt_send_ipiq(port->mpu_td->td_gd,
                       (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        return;
    }

    /*
     * Cleanup (in critical section, IPI on same cpu, atomic op not needed)
     */
#ifdef INVARIANTS
    KKASSERT(msg->ms_flags & MSGF_INTRANSIT);
    msg->ms_flags &= ~MSGF_INTRANSIT;
#endif
    flags = msg->ms_flags;
    if (msg->ms_flags & MSGF_SYNC) {
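        /*
         * Make prior stores to the message visible before the DONE/REPLY
         * bits are published to the synchronous waiter.
         */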
        cpu_sfence();
        msg->ms_flags |= MSGF_REPLY | MSGF_DONE;
    } else {
        _lwkt_enqueue_reply(port, msg);
    }
    if (port->mp_flags & MSGPORTF_WAITING)
        _lwkt_schedule_msg(port->mpu_td, flags);
}
Example #3
0
static
void
kcollect_thread(void *dummy)
{
	uint32_t i;
	int n;

	for (;;) {
		lockmgr(&kcollect_lock, LK_EXCLUSIVE);
		i = kcollect_index % kcollect_samples;
		bzero(&kcollect_ary[i], sizeof(kcollect_ary[i]));
		crit_enter();
		kcollect_ary[i].ticks = ticks;
		getmicrotime(&kcollect_ary[i].realtime);
		crit_exit();
		for (n = 0; n < KCOLLECT_ENTRIES; ++n) {
			if (kcollect_callback[n]) {
				kcollect_ary[i].data[n] =
					kcollect_callback[n](n);
			}
		}
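		/*
		 * Make the completed sample visible before kcollect_index
		 * is advanced, so readers never see a partially filled entry.
		 */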
		cpu_sfence();
		++kcollect_index;
		lockmgr(&kcollect_lock, LK_RELEASE);
		tsleep(&dummy, 0, "sleep", hz * KCOLLECT_INTERVAL);
	}
}
Example #4
0
/*
 * lwkt_thread_replyport() - Backend to lwkt_replymsg()
 *
 * Called with the reply port as an argument but in the context of the
 * original target port.  Completion must occur on the target port's
 * cpu.
 *
 * The critical section protects us from IPIs on this CPU.
 */
static
void
lwkt_thread_replyport(lwkt_port_t port, lwkt_msg_t msg)
{
    int flags;

    KKASSERT((msg->ms_flags & (MSGF_DONE|MSGF_QUEUED|MSGF_INTRANSIT)) == 0);

    if (msg->ms_flags & MSGF_SYNC) {
        /*
         * If a synchronous completion has been requested, just wakeup
         * the message without bothering to queue it to the target port.
         *
         * Assume the target thread is non-preemptive, so no critical
         * section is required.
         */
        if (port->mpu_td->td_gd == mycpu) {
            crit_enter();
            flags = msg->ms_flags;
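            /*
             * Make earlier stores to the message visible before the
             * DONE/REPLY bits are set.
             */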
            cpu_sfence();
            msg->ms_flags |= MSGF_DONE | MSGF_REPLY;
            if (port->mp_flags & MSGPORTF_WAITING)
                _lwkt_schedule_msg(port->mpu_td, flags);
            crit_exit();
        } else {
#ifdef INVARIANTS
            atomic_set_int(&msg->ms_flags, MSGF_INTRANSIT);
#endif
            atomic_set_int(&msg->ms_flags, MSGF_REPLY);
            lwkt_send_ipiq(port->mpu_td->td_gd,
                           (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        }
    } else {
        /*
         * If an asynchronous completion has been requested the message
         * must be queued to the reply port.
         *
         * A critical section is required to interlock the port queue.
         */
        if (port->mpu_td->td_gd == mycpu) {
            crit_enter();
            _lwkt_enqueue_reply(port, msg);
            if (port->mp_flags & MSGPORTF_WAITING)
                _lwkt_schedule_msg(port->mpu_td, msg->ms_flags);
            crit_exit();
        } else {
#ifdef INVARIANTS
            atomic_set_int(&msg->ms_flags, MSGF_INTRANSIT);
#endif
            atomic_set_int(&msg->ms_flags, MSGF_REPLY);
            lwkt_send_ipiq(port->mpu_td->td_gd,
                           (ipifunc1_t)lwkt_thread_replyport_remote, msg);
        }
    }
}
Example #5
0
/*
 * Chain pending links.  Called on the last release of an exclusive or
 * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 * is passed in with the count left at 1, which we can inherit, and other
 * bits which we must adjust in a single atomic operation.
 *
 * Return non-zero on success, 0 if caller needs to retry.
 *
 * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
 *	 acquiring LINKSPIN as all other cases will also need to acquire
 *	 LINKSPIN when handling the EXWANTED case.
 */
static int
mtx_chain_link_ex(mtx_t *mtx, u_int olock)
{
	thread_t td = curthread;
	mtx_link_t *link;
	u_int	nlock;

	olock &= ~MTX_LINKSPIN;
	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;	/* upgrade if necessary */
	crit_enter_raw(td);
	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
		link = mtx->mtx_exlink;
		KKASSERT(link != NULL);
		if (link->next == link) {
			mtx->mtx_exlink = NULL;
			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
		} else {
			mtx->mtx_exlink = link->next;
			link->next->prev = link->prev;
			link->prev->next = link->next;
			nlock = MTX_LINKSPIN;			/* to clear */
		}
		KKASSERT(link->state == MTX_LINK_LINKED_EX);
		mtx->mtx_owner = link->owner;
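		/*
		 * Make the new owner and the exlink list updates visible
		 * before link->state is changed below.
		 */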
		cpu_sfence();

		/*
		 * WARNING! The callback can only be safely
		 *	    made with LINKSPIN still held
		 *	    and in a critical section.
		 *
		 * WARNING! The link can go away after the
		 *	    state is set, or after the
		 *	    callback.
		 */
		if (link->callback) {
			link->state = MTX_LINK_CALLEDBACK;
			link->callback(link, link->arg, 0);
		} else {
			link->state = MTX_LINK_ACQUIRED;
			wakeup(link);
		}
		atomic_clear_int(&mtx->mtx_lock, nlock);
		crit_exit_raw(td);
		return 1;
	}
	/* retry */
	crit_exit_raw(td);

	return 0;
}
Example #6
0
int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid,
                        void **bcast_val)
{
    if (tg->tid_map[ext_tid] == 0) {
        tg->envelope = bcast_val ? *bcast_val : NULL;
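        // store fence: publish the envelope before forked/group_sense are
        // updated, so a worker that sees the new sense also sees bcast_val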
        cpu_sfence();
        tg->forked = 1;
        tg->group_sense = tg->thread_sense[0]->sense;

        // if it's possible that threads are sleeping, signal them
        if (tg->sleep_threshold) {
            uv_mutex_lock(&tg->alarm_lock);
            uv_cond_broadcast(&tg->alarm);
            uv_mutex_unlock(&tg->alarm_lock);
        }
    }
    else {
        // spin up to threshold ns (count sheep), then sleep
        uint64_t spin_ns;
        uint64_t spin_start = 0;
        while (tg->group_sense !=
                tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
            if (tg->sleep_threshold) {
                if (!spin_start) {
                    // Lazily initialize spin_start since uv_hrtime is expensive
                    spin_start = uv_hrtime();
                    continue;
                }
                spin_ns = uv_hrtime() - spin_start;
                // In case uv_hrtime is not monotonic, we'll sleep earlier
                if (spin_ns >= tg->sleep_threshold) {
                    uv_mutex_lock(&tg->alarm_lock);
                    if (tg->group_sense !=
                            tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
                        uv_cond_wait(&tg->alarm, &tg->alarm_lock);
                    }
                    uv_mutex_unlock(&tg->alarm_lock);
                    spin_start = 0;
                    continue;
                }
            }
            cpu_pause();
        }
        cpu_lfence();
        if (bcast_val)
            *bcast_val = tg->envelope;
    }

    return 0;
}
Example #7
0
/*
 * Release a serializing token.
 *
 * WARNING!  All tokens must be released in reverse order.  This will be
 *	     asserted.
 */
void
lwkt_reltoken(lwkt_token_t tok)
{
	thread_t td = curthread;
	lwkt_tokref_t ref;

	/*
	 * Remove ref from thread token list and assert that it matches
	 * the token passed in.  Tokens must be released in reverse order.
	 */
	ref = td->td_toks_stop - 1;
	KKASSERT(ref >= &td->td_toks_base && ref->tr_tok == tok);
	_lwkt_reltokref(ref, td);
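	/*
	 * Make the tokref release visible before the token stack is shrunk.
	 */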
	cpu_sfence();
	td->td_toks_stop = ref;
}
Example #8
0
/*
 * Set Transmit Enable bits for the specified queues.
 */
HAL_BOOL
ar5211StartTxDma(struct ath_hal *ah, u_int q)
{
	HALASSERT(q < HAL_NUM_TX_QUEUES);
	HALASSERT(AH5211(ah)->ah_txq[q].tqi_type != HAL_TX_QUEUE_INACTIVE);

	cpu_sfence();
	/* Check that queue is not already active */
	HALASSERT((OS_REG_READ(ah, AR_Q_TXD) & (1<<q)) == 0);

	HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: queue %u\n", __func__, q);

	/* Check to be sure we're not enabling a q that has its TXD bit set. */
	HALASSERT((OS_REG_READ(ah, AR_Q_TXD) & (1 << q)) == 0);

	OS_REG_WRITE(ah, AR_Q_TXE, 1 << q);
	return AH_TRUE;
}
Example #9
0
/*
 * This sets the current real time of day.  Timespecs are in seconds and
 * nanoseconds.  We do not mess with gd_time_seconds and gd_cpuclock_base,
 * instead we adjust basetime so basetime + gd_* results in the current
 * time of day.  This way the gd_* fields are guaranteed to represent
 * a monotonically increasing 'uptime' value.
 *
 * When set_timeofday() is called from userland, the system call forces it
 * onto cpu #0 since only cpu #0 can update basetime_index.
 */
void
set_timeofday(struct timespec *ts)
{
	struct timespec *nbt;
	int ni;

	/*
	 * XXX SMP / non-atomic basetime updates
	 */
	crit_enter();
	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	nbt = &basetime[ni];
	nanouptime(nbt);
	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	if (nbt->tv_nsec < 0) {
	    nbt->tv_nsec += 1000000000;
	    --nbt->tv_sec;
	}

	/*
	 * Note that basetime diverges from boottime as the clock drift is
	 * compensated for, so we cannot do away with boottime.  When setting
	 * the absolute time of day the drift is 0 (for an instant) and we
	 * can simply assign boottime to basetime.  
	 *
	 * Note that nanouptime() is based on gd_time_seconds which is drift
 * compensated up to a point (it is guaranteed to remain monotonically
	 * increasing).  gd_time_seconds is thus our best uptime guess and
	 * suitable for use in the boottime calculation.  It is already taken
	 * into account in the basetime calculation above.
	 */
	boottime.tv_sec = nbt->tv_sec;
	ntp_delta = 0;

	/*
	 * We now have a new basetime, make sure all other cpus have it,
	 * then update the index.
	 */
	cpu_sfence();
	basetime_index = ni;

	crit_exit();
}
Example #10
0
/*
 * (Backend) Feed chain data through the cluster validator and back to
 * the frontend.  Chains are fed from multiple nodes concurrently
 * and pipelined via per-node FIFOs in the XOP.
 *
 * No xop lock is needed because we are only manipulating fields under
 * our direct control.
 *
 * Returns 0 on success and a hammer error code if sync is permanently
 * lost.  The caller retains a ref on the chain but by convention
 * the lock is typically inherited by the xop (caller loses lock).
 *
 * Returns non-zero on error.  In this situation the caller retains a
 * ref on the chain but loses the lock (we unlock here).
 *
 * WARNING!  The chain is moving between two different threads; it must
 *	     be locked SHARED to retain its data mapping, not exclusive.
 *	     When multiple operations are in progress at once, chains fed
 *	     back to the frontend for collection can wind up being locked
 *	     in different orders; only a shared lock can prevent a deadlock.
 *
 *	     Exclusive locks may only be used by a XOP backend node thread
 *	     temporarily, with no direct or indirect dependencies (aka
 *	     blocking/waiting) on other nodes.
 */
int
hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain,
		 int clindex, int error)
{
	hammer2_xop_fifo_t *fifo;

	/*
	 * Multi-threaded entry into the XOP collector.  We own the
	 * fifo->wi for our clindex.
	 */
	fifo = &xop->collect[clindex];

	while (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) {
		tsleep_interlock(xop, 0);
		if (hammer2_xop_active(xop) == 0) {
			error = EINTR;
			goto done;
		}
		if (fifo->ri == fifo->wi - HAMMER2_XOPFIFO) {
			tsleep(xop, PINTERLOCKED, "h2feed", hz*60);
		}
	}
	if (chain)
		hammer2_chain_ref(chain);
	fifo->errors[fifo->wi & HAMMER2_XOPFIFO_MASK] = error;
	fifo->array[fifo->wi & HAMMER2_XOPFIFO_MASK] = chain;
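	/*
	 * Make the fifo entry visible before the write index is advanced
	 * for the collector.
	 */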
	cpu_sfence();
	++fifo->wi;
	atomic_add_int(&xop->check_counter, 1);
	wakeup(&xop->check_counter);	/* XXX optimize */
	error = 0;

	/*
	 * Cleanup.  If an error occurred we eat the lock.  If no error
	 * occurred the fifo inherits the lock and gains an additional ref.
	 *
	 * The caller's ref remains in both cases.
	 */
done:
	if (error && chain)
		hammer2_chain_unlock(chain);
	return error;
}
Example #11
0
/*
 * Flush waiting shared locks.  The lock's prior state is passed in and must
 * be adjusted atomically only if it matches and LINKSPIN is not set.
 *
 * IMPORTANT! The caller has left one active count on the lock for us to
 *	      consume.  We will apply this to the first link, but must add
 *	      additional counts for any other links.
 */
static int
mtx_chain_link_sh(mtx_t *mtx, u_int olock)
{
	thread_t td = curthread;
	mtx_link_t *link;
	u_int	addcount;
	u_int	nlock;

	olock &= ~MTX_LINKSPIN;
	nlock = olock | MTX_LINKSPIN;
	nlock &= ~MTX_EXCLUSIVE;
	crit_enter_raw(td);
	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
		/*
		 * It should not be possible for SHWANTED to be set without
		 * any links pending.
		 */
		KKASSERT(mtx->mtx_shlink != NULL);

		/*
		 * We have to process the count for all shared locks before
		 * we process any of the links.  Count the additional shared
		 * locks beyond the first link (which is already accounted
		 * for) and associate the full count with the lock
		 * immediately.
		 */
		addcount = 0;
		for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
		     link = link->next) {
			++addcount;
		}
		if (addcount > 0)
			atomic_add_int(&mtx->mtx_lock, addcount);

		/*
		 * We can wakeup all waiting shared locks.
		 */
		while ((link = mtx->mtx_shlink) != NULL) {
			KKASSERT(link->state == MTX_LINK_LINKED_SH);
			if (link->next == link) {
				mtx->mtx_shlink = NULL;
			} else {
				mtx->mtx_shlink = link->next;
				link->next->prev = link->prev;
				link->prev->next = link->next;
			}
			link->next = NULL;
			link->prev = NULL;
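			/*
			 * Make the unlinking visible before the state is
			 * changed; the link can go away as soon as its
			 * owner sees the new state.
			 */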
			cpu_sfence();
			if (link->callback) {
				link->state = MTX_LINK_CALLEDBACK;
				link->callback(link, link->arg, 0);
			} else {
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				wakeup(link);
			}
		}
		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
						 MTX_SHWANTED);
		crit_exit_raw(td);
		return 1;
	}
	/* retry */
	crit_exit_raw(td);

	return 0;
}
Example #12
0
static void
process_comp_queue (struct nvme_host *host,
		    u16 comp_queue_id,
		    struct nvme_queue_info *h_comp_queue_info,
		    struct nvme_queue_info *g_comp_queue_info)
{
	struct nvme_request_hub *hub;
	hub = host->h_queue.request_hub[comp_queue_id];

	u16 h_cur_head = h_comp_queue_info->cur_pos.head;
	u16 g_cur_head = g_comp_queue_info->cur_pos.head;

	struct nvme_comp first_h_comp = {0}, *first_g_comp = NULL;

	struct nvme_comp *h_comp, *g_comp;
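	/*
	 * Walk host completion entries for as long as their phase bit
	 * matches the phase the host queue expects.
	 */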
	for (h_comp = nvme_comp_queue_at_idx (h_comp_queue_info, h_cur_head),
	     g_comp = nvme_comp_queue_at_idx (g_comp_queue_info, g_cur_head);
	     NVME_COMP_GET_PHASE (h_comp) == h_comp_queue_info->phase;
	     h_comp = nvme_comp_queue_at_idx (h_comp_queue_info, h_cur_head),
	     g_comp = nvme_comp_queue_at_idx (g_comp_queue_info, g_cur_head)) {

		/* This queue ID is the submission queue ID */
		u16 subm_queue_id = h_comp->queue_id;

		struct nvme_request *req;
		req = get_request (host, hub, subm_queue_id, h_comp->cmd_id);

		ASSERT (req);

		u64 time_taken = get_time () - req->submit_time;
		if (time_taken > NVME_TIME_TAKEN_WATERMARK) {
			printf ("Long time controller response: %llu\n",
				time_taken);
			printf ("Submission Queue ID: %u opcode: %u\n",
				subm_queue_id, req->cmd.std.opcode);
		}

		if (subm_queue_id == 0)
			process_admin_comp (host, h_comp, req);
		else
			process_io_comp (host, h_comp, req);

		h_cur_head++;

		if (h_cur_head >= h_comp_queue_info->n_entries) {
			h_comp_queue_info->phase ^= 1;
			h_cur_head = 0;
		}

		if (!req->is_h_req) {
			struct nvme_comp comp = *h_comp;
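			/*
			 * Return the guest's original command ID and
			 * rewrite the phase bit to the guest queue's phase.
			 */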
			comp.cmd_id = req->orig_cmd_id;
			comp.status &= ~0x1;
			comp.status |= g_comp_queue_info->phase;

			/*
			 * Replace with the host value instead of the value
			 * reported by the controller.  This is necessary when
			 * guest commands and host commands are mixed in
			 * shared queues.
			 */
			comp.queue_head = g_subm_cur_tail (host,
							   subm_queue_id);

			if (first_g_comp) {
				*g_comp = comp;
			} else {
				/* Copy the first completion entry later */
				first_g_comp = g_comp;
				first_h_comp = comp;
			}

			g_cur_head++;
			if (g_cur_head >= g_comp_queue_info->n_entries) {
				g_comp_queue_info->phase ^= 1;
				g_cur_head = 0;
			}

			spinlock_lock (&hub->lock);
			g_comp_queue_info->cur_pos.head = g_cur_head;
			h_comp_queue_info->cur_pos.head = h_cur_head;
			spinlock_unlock (&hub->lock);
		} else {
			spinlock_lock (&hub->lock);
			nvme_write_comp_db (host, comp_queue_id, h_cur_head);
			hub->n_not_ack_h_reqs--;
			h_comp_queue_info->cur_pos.head = h_cur_head;
			spinlock_unlock (&hub->lock);
		}

		nvme_free_request (hub, req);
	}

	if (first_g_comp) {
		first_g_comp->cmd_specific = first_h_comp.cmd_specific;
		first_g_comp->rsvd = first_h_comp.rsvd;
		first_g_comp->queue_head = first_h_comp.queue_head;
		first_g_comp->queue_id = first_h_comp.queue_id;
		first_g_comp->cmd_id = first_h_comp.cmd_id;
		/*
		 * Make sure everything is stored in memory properly
		 * before we copy the status field.  This prevents the guest
		 * from seeing a valid phase bit on a partially written entry.
		 */
		cpu_sfence ();
		first_g_comp->status = first_h_comp.status;
	}
}
Example #13
0
/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
               pt_entry_t *ptep, pt_entry_t npte)
{
    globaldata_t gd = mycpu;
    pmap_inval_info_t *info;
    pt_entry_t opte = 0;
    int cpu = gd->gd_cpuid;
    cpumask_t tmpmask;
    unsigned long rflags;

    /*
     * Initialize invalidation for pmap and enter critical section.
     */
    if (pmap == NULL)
        pmap = &kernel_pmap;
    pmap_inval_init(pmap);

    /*
     * Shortcut single-cpu case if possible.
     */
    if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
        /*
         * Convert to invltlb if there are too many pages to
         * invlpg on.
         */
        if (npgs > MAX_INVAL_PAGES) {
            npgs = 0;
            va = (vm_offset_t)-1;
        }

        /*
         * Invalidate the specified pages, handle invltlb if requested.
         */
        while (npgs) {
            --npgs;
            if (ptep) {
                opte = atomic_swap_long(ptep, npte);
                ++ptep;
            }
            if (va == (vm_offset_t)-1)
                break;
            cpu_invlpg((void *)va);
            va += PAGE_SIZE;
        }
        if (va == (vm_offset_t)-1)
            cpu_invltlb();
        pmap_inval_done(pmap);

        return opte;
    }

    /*
     * We need a critical section to prevent getting preempted while
     * we setup our command.  A preemption might execute its own
     * pmap_inval*() command and create confusion below.
     *
     * tsc_target is our watchdog timeout that will attempt to recover
     * from a lost IPI.  Set to 1/16 second for now.
     */
    info = &invinfo[cpu];
    info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

    /*
     * We must wait for other cpus which may still be finishing up a
     * prior operation that we requested.
     *
     * We do not have to disable interrupts here.  An Xinvltlb can occur
     * at any time (even within a critical section), but it will not
     * act on our command until we set our done bits.
     */
    while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
        if (loopwdog(info)) {
            info->failed = 1;
            loopdebug("A", info);
            /* XXX recover from possible bug */
            CPUMASK_ASSZERO(info->done);
        }
#endif
        cpu_pause();
    }
    KKASSERT(info->mode == INVDONE);

    /*
     * Must set our cpu in the invalidation scan mask before
     * any possibility of [partial] execution (remember, XINVLTLB
     * can interrupt a critical section).
     */
    ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

    info->va = va;
    info->npgs = npgs;
    info->ptep = ptep;
    info->npte = npte;
    info->opte = 0;
#ifdef LOOPRECOVER
    info->failed = 0;
#endif
    info->mode = INVSTORE;

    tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
    cpu_ccfence();
    CPUMASK_ANDMASK(tmpmask, smp_active_mask);

    /*
     * If ptep is NULL the operation can be semi-synchronous, which means
     * we can improve performance by flagging and removing idle cpus
     * (see the idleinvlclr function in mp_machdep.c).
     *
     * Typically kernel page table operation is semi-synchronous.
     */
    if (ptep == NULL)
        smp_smurf_idleinvlclr(&tmpmask);
    CPUMASK_ORBIT(tmpmask, cpu);
    info->mask = tmpmask;

    /*
     * Command may start executing the moment 'done' is initialized,
     * disable current cpu interrupt to prevent 'done' field from
     * changing (other cpus can't clear done bits until the originating
     * cpu clears its mask bit, but other cpus CAN start clearing their
     * mask bits).
     */
#ifdef LOOPRECOVER
    info->sigmask = tmpmask;
    CHECKSIGMASK(info);
#endif
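    /*
     * Make the info fields above globally visible before the done mask
     * is published and target cpus can begin executing the command.
     */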
    cpu_sfence();
    rflags = read_rflags();
    cpu_disable_intr();

    ATOMIC_CPUMASK_COPY(info->done, tmpmask);
    /* execution can begin here due to races */

    /*
     * Pass our copy of the done bits (so they don't change out from
     * under us) to generate the Xinvltlb interrupt on the targets.
     */
    smp_invlpg(&tmpmask);
    opte = info->opte;
    KKASSERT(info->mode == INVDONE);

    /*
     * Target cpus will be in their loop exiting concurrently with our
     * cleanup.  They will not lose the bitmask they obtained before so
     * we can safely clear this bit.
     */
    ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
    write_rflags(rflags);
    pmap_inval_done(pmap);

    return opte;
}
Example #14
0
/*
 * Call this *after* all CPUs' Cx states have been attached.
 */
static void
acpi_cst_postattach(void *arg)
{
    struct acpi_cst_softc *sc;
    int i;

    /* Get set of Cx state devices */
    devclass_get_devices(acpi_cst_devclass, &acpi_cst_devices,
	&acpi_cst_ndevices);

    /*
     * Set up any quirks that might be necessary now that we have probed
     * all the CPUs' Cx states.
     */
    acpi_cst_set_quirks();

    if (acpi_cst_use_fadt) {
	/*
	 * We are using Cx mode from FADT, probe for available Cx states
	 * for all processors.
	 */
	for (i = 0; i < acpi_cst_ndevices; i++) {
	    sc = device_get_softc(acpi_cst_devices[i]);
	    acpi_cst_cx_probe_fadt(sc);
	}
    } else {
	/*
	 * We are using _CST mode; remove the C3 state if necessary.
	 *
	 * As we now know for sure that we will be using _CST mode,
	 * install our notify handler.
	 */
	for (i = 0; i < acpi_cst_ndevices; i++) {
	    sc = device_get_softc(acpi_cst_devices[i]);
	    if (acpi_cst_quirks & ACPI_CST_QUIRK_NO_C3) {
		/* Free part of unused resources */
		acpi_cst_free_resource(sc, sc->cst_non_c3 + 1);
		sc->cst_cx_count = sc->cst_non_c3 + 1;
	    }
	    sc->cst_parent->cpu_cst_notify = acpi_cst_notify;
	}
    }
    acpi_cst_global_cx_count();

    /* Perform Cx final initialization. */
    for (i = 0; i < acpi_cst_ndevices; i++) {
	sc = device_get_softc(acpi_cst_devices[i]);
	acpi_cst_startup(sc);

	if (sc->cst_parent->glob_sysctl_tree != NULL) {
	    struct acpi_cpu_softc *cpu = sc->cst_parent;

	    /* Add a sysctl handler to handle global Cx lowest setting */
	    SYSCTL_ADD_PROC(&cpu->glob_sysctl_ctx,
	    		    SYSCTL_CHILDREN(cpu->glob_sysctl_tree),
			    OID_AUTO, "cx_lowest",
			    CTLTYPE_STRING | CTLFLAG_RW, NULL, 0,
			    acpi_cst_global_lowest_sysctl, "A",
			    "Requested global lowest Cx sleep state");
	    SYSCTL_ADD_PROC(&cpu->glob_sysctl_ctx,
	    		    SYSCTL_CHILDREN(cpu->glob_sysctl_tree),
			    OID_AUTO, "cx_lowest_use",
			    CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
			    acpi_cst_global_lowest_use_sysctl, "A",
			    "Global lowest Cx sleep state to use");
	}
    }

    /* Take over idling from cpu_idle_default(). */
    acpi_cst_cx_lowest = 0;
    acpi_cst_cx_lowest_req = 0;
    acpi_cst_disable_idle = FALSE;

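    /*
     * Make the idle-state reset above visible before the new idle hook
     * is installed.
     */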
    cpu_sfence();
    cpu_idle_hook = acpi_cst_idle;
}
Example #15
0
/*
 * Parse a _CST package and set up its Cx states.  Since the _CST object
 * can change dynamically, our notify handler may call this function
 * to clean up and probe the new _CST package.
 */
static int
acpi_cst_cx_probe_cst(struct acpi_cst_softc *sc, int reprobe)
{
    struct	 acpi_cst_cx *cx_ptr;
    ACPI_STATUS	 status;
    ACPI_BUFFER	 buf;
    ACPI_OBJECT	*top;
    ACPI_OBJECT	*pkg;
    uint32_t	 count;
    int		 i;

    ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);

#ifdef INVARIANTS
    if (reprobe)
	KKASSERT(&curthread->td_msgport == netisr_cpuport(sc->cst_cpuid));
#endif

    buf.Pointer = NULL;
    buf.Length = ACPI_ALLOCATE_BUFFER;
    status = AcpiEvaluateObject(sc->cst_handle, "_CST", NULL, &buf);
    if (ACPI_FAILURE(status))
	return (ENXIO);

    /* _CST is a package with a count and at least one Cx package. */
    top = (ACPI_OBJECT *)buf.Pointer;
    if (!ACPI_PKG_VALID(top, 2) || acpi_PkgInt32(top, 0, &count) != 0) {
	device_printf(sc->cst_dev, "invalid _CST package\n");
	AcpiOsFree(buf.Pointer);
	return (ENXIO);
    }
    if (count != top->Package.Count - 1) {
	device_printf(sc->cst_dev, "invalid _CST state count (%d != %d)\n",
	       count, top->Package.Count - 1);
	count = top->Package.Count - 1;
    }
    if (count > MAX_CX_STATES) {
	device_printf(sc->cst_dev, "_CST has too many states (%d)\n", count);
	count = MAX_CX_STATES;
    }

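    /*
     * Set the probing flags and make them visible before the previously
     * allocated Cx states are freed below.
     */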
    sc->cst_flags |= ACPI_CST_FLAG_PROBING | ACPI_CST_FLAG_MATCH_HT;
    cpu_sfence();

    /*
     * Free all previously allocated resources
     *
     * NOTE: This is needed for _CST reprobing.
     */
    acpi_cst_free_resource(sc, 0);

    /* Set up all valid states. */
    sc->cst_cx_count = 0;
    cx_ptr = sc->cst_cx_states;
    for (i = 0; i < count; i++) {
	int error;

	pkg = &top->Package.Elements[i + 1];
	if (!ACPI_PKG_VALID(pkg, 4) ||
	    acpi_PkgInt32(pkg, 1, &cx_ptr->type) != 0 ||
	    acpi_PkgInt32(pkg, 2, &cx_ptr->trans_lat) != 0 ||
	    acpi_PkgInt32(pkg, 3, &cx_ptr->power) != 0) {

	    device_printf(sc->cst_dev, "skipping invalid Cx state package\n");
	    continue;
	}

	/* Validate the state to see if we should use it. */
	switch (cx_ptr->type) {
	case ACPI_STATE_C1:
	    sc->cst_non_c3 = i;
	    cx_ptr->enter = acpi_cst_c1_halt_enter;
	    error = acpi_cst_cx_setup(cx_ptr);
	    if (error)
		panic("C1 CST HALT setup failed: %d", error);
	    if (sc->cst_cx_count != 0) {
		/*
		 * C1 is not the first C-state; something really stupid
		 * is going on ...
		 */
		sc->cst_flags &= ~ACPI_CST_FLAG_MATCH_HT;
	    }
	    cx_ptr++;
	    sc->cst_cx_count++;
	    continue;
	case ACPI_STATE_C2:
	    sc->cst_non_c3 = i;
	    break;
	case ACPI_STATE_C3:
	default:
	    if ((acpi_cst_quirks & ACPI_CST_QUIRK_NO_C3) != 0) {
		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
				 "cpu_cst%d: C3[%d] not available.\n",
				 device_get_unit(sc->cst_dev), i));
		continue;
	    }
	    break;
	}

	/*
	 * Allocate the control register for C2 or C3(+).
	 */
	KASSERT(cx_ptr->res == NULL, ("still has res"));
	acpi_PkgRawGas(pkg, 0, &cx_ptr->gas);

	/*
	 * We match the number of C2/C3 states across hyperthreads, but
	 * only if the register is "Fixed Hardware", e.g. on most Intel
	 * CPUs.  We don't have much to do for the rest of the
	 * register types.
	 */
	if (cx_ptr->gas.SpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE)
	    sc->cst_flags &= ~ACPI_CST_FLAG_MATCH_HT;

	cx_ptr->rid = sc->cst_parent->cpu_next_rid;
	acpi_bus_alloc_gas(sc->cst_dev, &cx_ptr->res_type, &cx_ptr->rid,
	    &cx_ptr->gas, &cx_ptr->res, RF_SHAREABLE);
	if (cx_ptr->res != NULL) {
	    sc->cst_parent->cpu_next_rid++;
	    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
			     "cpu_cst%d: Got C%d - %d latency\n",
			     device_get_unit(sc->cst_dev), cx_ptr->type,
			     cx_ptr->trans_lat));
	    cx_ptr->enter = acpi_cst_cx_io_enter;
	    cx_ptr->btag = rman_get_bustag(cx_ptr->res);
	    cx_ptr->bhand = rman_get_bushandle(cx_ptr->res);
	    error = acpi_cst_cx_setup(cx_ptr);
	    if (error)
		panic("C%d CST I/O setup failed: %d", cx_ptr->type, error);
	    cx_ptr++;
	    sc->cst_cx_count++;
	} else {
	    error = acpi_cst_cx_setup(cx_ptr);
	    if (!error) {
		KASSERT(cx_ptr->enter != NULL,
		    ("C%d enter is not set", cx_ptr->type));
		cx_ptr++;
		sc->cst_cx_count++;
	    }
	}
    }
    AcpiOsFree(buf.Pointer);

    if (sc->cst_flags & ACPI_CST_FLAG_MATCH_HT) {
	cpumask_t mask;

	mask = get_cpumask_from_level(sc->cst_cpuid, CORE_LEVEL);
	if (CPUMASK_TESTNZERO(mask)) {
	    int cpu;

	    for (cpu = 0; cpu < ncpus; ++cpu) {
		struct acpi_cst_softc *sc1 = acpi_cst_softc[cpu];

		if (sc1 == NULL || sc1 == sc ||
		    (sc1->cst_flags & ACPI_CST_FLAG_ATTACHED) == 0 ||
		    (sc1->cst_flags & ACPI_CST_FLAG_MATCH_HT) == 0)
		    continue;
		if (!CPUMASK_TESTBIT(mask, sc1->cst_cpuid))
		    continue;

		if (sc1->cst_cx_count != sc->cst_cx_count) {
		    struct acpi_cst_softc *src_sc, *dst_sc;

		    if (bootverbose) {
			device_printf(sc->cst_dev,
			    "inconstent C-state count: %d, %s has %d\n",
			    sc->cst_cx_count,
			    device_get_nameunit(sc1->cst_dev),
			    sc1->cst_cx_count);
		    }
		    if (sc1->cst_cx_count > sc->cst_cx_count) {
			src_sc = sc1;
			dst_sc = sc;
		    } else {
			src_sc = sc;
			dst_sc = sc1;
		    }
		    acpi_cst_copy(dst_sc, src_sc);
		}
	    }
	}
    }

    if (reprobe) {
	/* If there are C3(+) states, always enable bus master wakeup */
	if ((acpi_cst_quirks & ACPI_CST_QUIRK_NO_BM) == 0) {
	    for (i = 0; i < sc->cst_cx_count; ++i) {
		struct acpi_cst_cx *cx = &sc->cst_cx_states[i];

		if (cx->type >= ACPI_STATE_C3) {
		    AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1);
		    break;
		}
	    }
	}

	/* Fix up the lowest Cx being used */
	acpi_cst_set_lowest_oncpu(sc, sc->cst_cx_lowest_req);
    }

    /*
     * Cache the lowest non-C3 state.
     * NOTE: this must be done after cst_cx_lowest is set.
     */
    acpi_cst_non_c3(sc);

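    /*
     * Make the new Cx state table visible before the probing flag is
     * cleared.
     */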
    cpu_sfence();
    sc->cst_flags &= ~ACPI_CST_FLAG_PROBING;

    return (0);
}
Example #16
0
/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
                      pt_entry_t opte, pt_entry_t npte)
{
    globaldata_t gd = mycpu;
    pmap_inval_info_t *info;
    int success;
    int cpu = gd->gd_cpuid;
    cpumask_t tmpmask;
    unsigned long rflags;

    /*
     * Initialize invalidation for pmap and enter critical section.
     */
    if (pmap == NULL)
        pmap = &kernel_pmap;
    pmap_inval_init(pmap);

    /*
     * Shortcut single-cpu case if possible.
     */
    if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
        if (atomic_cmpset_long(ptep, opte, npte)) {
            if (va == (vm_offset_t)-1)
                cpu_invltlb();
            else
                cpu_invlpg((void *)va);
            pmap_inval_done(pmap);
            return 1;
        } else {
            pmap_inval_done(pmap);
            return 0;
        }
    }

    /*
     * We need a critical section to prevent getting preempted while
     * we setup our command.  A preemption might execute its own
     * pmap_inval*() command and create confusion below.
     */
    info = &invinfo[cpu];
    info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

    /*
     * We must wait for other cpus which may still be finishing
     * up a prior operation.
     */
    while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
        if (loopwdog(info)) {
            info->failed = 1;
            loopdebug("B", info);
            /* XXX recover from possible bug */
            CPUMASK_ASSZERO(info->done);
        }
#endif
        cpu_pause();
    }
    KKASSERT(info->mode == INVDONE);

    /*
     * Must set our cpu in the invalidation scan mask before
     * any possibility of [partial] execution (remember, XINVLTLB
     * can interrupt a critical section).
     */
    ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

    info->va = va;
    info->npgs = 1;			/* unused */
    info->ptep = ptep;
    info->npte = npte;
    info->opte = opte;
#ifdef LOOPRECOVER
    info->failed = 0;
#endif
    info->mode = INVCMPSET;
    info->success = 0;

    tmpmask = pmap->pm_active;	/* volatile */
    cpu_ccfence();
    CPUMASK_ANDMASK(tmpmask, smp_active_mask);
    CPUMASK_ORBIT(tmpmask, cpu);
    info->mask = tmpmask;

    /*
     * Command may start executing the moment 'done' is initialized,
     * disable current cpu interrupt to prevent 'done' field from
     * changing (other cpus can't clear done bits until the originating
     * cpu clears its mask bit).
     */
#ifdef LOOPRECOVER
    info->sigmask = tmpmask;
    CHECKSIGMASK(info);
#endif
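    /*
     * Publish the command fields before the done mask becomes visible
     * to the target cpus.
     */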
    cpu_sfence();
    rflags = read_rflags();
    cpu_disable_intr();

    ATOMIC_CPUMASK_COPY(info->done, tmpmask);

    /*
     * Pass our copy of the done bits (so they don't change out from
     * under us) to generate the Xinvltlb interrupt on the targets.
     */
    smp_invlpg(&tmpmask);
    success = info->success;
    KKASSERT(info->mode == INVDONE);

    ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
    write_rflags(rflags);
    pmap_inval_done(pmap);

    return success;
}
Example #17
0
/*
 * Exclusive-lock a mutex, block until acquired unless link is async.
 * Recursion is allowed.
 *
 * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
 * if async.  If immediately successful an async exclusive lock will return 0
 * and not issue the async callback or link the link structure.  The caller
 * must handle this case (typically this is an optimal code path).
 *
 * A tsleep() error can only be returned if PCATCH is specified in the flags.
 */
static __inline int
__mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
{
	thread_t td;
	u_int	lock;
	u_int	nlock;
	int	error;
	int	isasync;

	for (;;) {
		lock = mtx->mtx_lock;
		cpu_ccfence();

		if (lock == 0) {
			nlock = MTX_EXCLUSIVE | 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
				mtx->mtx_owner = curthread;
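				/*
				 * Make mtx_owner visible before the link
				 * is marked acquired.
				 */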
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				error = 0;
				break;
			}
			continue;
		}
		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
			KKASSERT((lock & MTX_MASK) != MTX_MASK);
			nlock = lock + 1;
			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
				cpu_sfence();
				link->state = MTX_LINK_ACQUIRED;
				error = 0;
				break;
			}
			continue;
		}

		/*
		 * We need MTX_LINKSPIN to manipulate exlink or
		 * shlink.
		 *
		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
		 * pending exclusive requests.  It cannot be set as a separate
		 * operation prior to acquiring MTX_LINKSPIN.
		 *
		 * To avoid unnecessary cpu cache traffic we poll
		 * for collisions.  It is also possible that EXWANTED
		 * state failing the above test was spurious, so all the
		 * tests must be repeated if we cannot obtain LINKSPIN
		 * with the prior state tests intact (i.e. don't reload
		 * the (lock) variable here, for heaven's sake!).
		 */
		if (lock & MTX_LINKSPIN) {
			cpu_pause();
			continue;
		}
		td = curthread;
		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
		crit_enter_raw(td);
		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
			crit_exit_raw(td);
			continue;
		}

		/*
		 * Check for early abort.
		 */
		if (link->state == MTX_LINK_ABORTED) {
			if (mtx->mtx_exlink == NULL) {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN |
						 MTX_EXWANTED);
			} else {
				atomic_clear_int(&mtx->mtx_lock,
						 MTX_LINKSPIN);
			}
			crit_exit_raw(td);
			link->state = MTX_LINK_IDLE;
			error = ENOLCK;
			break;
		}

		/*
		 * Add our link to the exlink list and release LINKSPIN.
		 */
		link->owner = td;
		link->state = MTX_LINK_LINKED_EX;
		if (mtx->mtx_exlink) {
			link->next = mtx->mtx_exlink;
			link->prev = link->next->prev;
			link->next->prev = link;
			link->prev->next = link;
		} else {
			link->next = link;
			link->prev = link;
			mtx->mtx_exlink = link;
		}
		isasync = (link->callback != NULL);
		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
		crit_exit_raw(td);

		/*
		 * If this is an asynchronous lock request, return without
		 * blocking and leave the link structure linked.
		 */
		if (isasync) {
			error = EINPROGRESS;
			break;
		}

		/*
		 * Wait for lock
		 */
		error = mtx_wait_link(mtx, link, flags, to);
		break;
	}
	return (error);
}