Example 1
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that, lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}
Example 2
/*
 * Drop a reference from the cred structure, free it if the reference count
 * reaches 0. 
 *
 * NOTE: because the reference is taken with atomic_add_int() and no
 * spinlock, the release must also be atomic.  atomic_fetchadd_int()
 * returns the old value, so when multiple callers race the refcount
 * to 0 exactly one of them sees 1 and performs the free.
 *
 * MPSAFE
 */
void
crfree(struct ucred *cr)
{
	if (cr->cr_ref <= 0)
		panic("Freeing already free credential! %p", cr);
	if (atomic_fetchadd_int(&cr->cr_ref, -1) == 1) {
		/*
		 * Some callers of crget(), such as nfs_statfs(),
		 * allocate a temporary credential, but don't
		 * allocate a uidinfo structure.
		 */
		if (cr->cr_uidinfo != NULL) {
			uidrop(cr->cr_uidinfo);
			cr->cr_uidinfo = NULL;
		}
		if (cr->cr_ruidinfo != NULL) {
			uidrop(cr->cr_ruidinfo);
			cr->cr_ruidinfo = NULL;
		}

		/*
		 * Destroy empty prisons
		 */
		if (jailed(cr))
			prison_free(cr->cr_prison);
		cr->cr_prison = NULL;	/* safety */

		kfree((caddr_t)cr, M_CRED);
	}
}
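
crfree() works because atomic_fetchadd_int() returns the value the counter held before the decrement, so when several threads race the reference count down, exactly one of them observes 1 and performs the teardown.  A minimal userspace sketch of that 1 -> 0 pattern, using C11 <stdatomic.h> as a stand-in for the kernel atomics (the type and function names are illustrative, not from the kernel):

#include <stdatomic.h>
#include <stdlib.h>

struct ref_obj {
	atomic_int ref;		/* reference count, starts at 1 */
	/* ... payload ... */
};

static struct ref_obj *ref_obj_alloc(void)
{
	struct ref_obj *o = calloc(1, sizeof(*o));

	if (o != NULL)
		atomic_init(&o->ref, 1);
	return o;
}

static void ref_obj_hold(struct ref_obj *o)
{
	atomic_fetch_add(&o->ref, 1);
}

static void ref_obj_drop(struct ref_obj *o)
{
	/* fetch_sub returns the old value: only the 1 -> 0 caller frees. */
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		free(o);
}
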
Example 3
static inline void
l2t_hold(struct l2t_data *d, struct l2t_entry *e)
{

	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
		atomic_subtract_int(&d->nfree, 1);
}
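
l2t_hold() shows the mirror-image transition: because the old value is returned, only the caller that takes the entry from 0 to 1 references adjusts the free-entry count.  A small self-contained sketch of the same idea, with C11 atomics standing in for atomic_fetchadd_int()/atomic_subtract_int() (struct and function names are made up for illustration):

#include <stdatomic.h>
#include <stdio.h>

struct entry {
	atomic_int refcnt;	/* holders of this entry */
};

struct table {
	atomic_int nfree;	/* entries with refcnt == 0 */
	struct entry e;
};

static void entry_hold(struct table *t, struct entry *e)
{
	/* fetch_add returns the old value, so only the 0 -> 1 caller sees 0 */
	if (atomic_fetch_add(&e->refcnt, 1) == 0)
		atomic_fetch_sub(&t->nfree, 1);
}

int main(void)
{
	struct table t;

	atomic_init(&t.nfree, 1);
	atomic_init(&t.e.refcnt, 0);

	entry_hold(&t, &t.e);	/* 0 -> 1: nfree drops to 0 */
	entry_hold(&t, &t.e);	/* 1 -> 2: nfree unchanged */
	printf("refcnt=%d nfree=%d\n",
	    atomic_load(&t.e.refcnt), atomic_load(&t.nfree));
	return 0;
}
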
Example 4
static inline rndis_request *
hv_rndis_request(rndis_device *device, uint32_t message_type,
		 uint32_t message_length)
{
	rndis_request *request;
	rndis_msg *rndis_mesg;
	rndis_set_request *set;

	request = malloc(sizeof(rndis_request), M_NETVSC, M_WAITOK | M_ZERO);

	sema_init(&request->wait_sema, 0, "rndis sema");
	
	rndis_mesg = &request->request_msg;
	rndis_mesg->ndis_msg_type = message_type;
	rndis_mesg->msg_len = message_length;

	/*
	 * Set the request id. This field is always after the rndis header
	 * for request/response packet types so we just use the set_request
	 * as a template.
	 */
	set = &rndis_mesg->msg.set_request;
	set->request_id = atomic_fetchadd_int(&device->new_request_id, 1);
	/* Increment to get the new value (call above returns old value) */
	set->request_id += 1;

	/* Add to the request list */
	mtx_lock(&device->req_lock);
	STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry);
	mtx_unlock(&device->req_lock);

	return (request);
}
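
The two-step idiom above (fetch-and-add, then add 1 to the returned value) simply yields the post-increment value of new_request_id, since atomic_fetchadd_int() hands back the counter's previous value.  A one-function userspace equivalent with C11 atomics (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint request_id_counter;

static uint32_t next_request_id(void)
{
	/* fetch_add returns the old value; +1 gives the id just assigned */
	return atomic_fetch_add(&request_id_counter, 1) + 1;
}
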
Example 5
/*
 * The GPE handler is called when IBE/OBF or SCI events occur.  We are
 * called from an unknown lock context.
 */
static UINT32
EcGpeHandler(ACPI_HANDLE GpeDevice, UINT32 GpeNumber, void *Context)
{
    struct acpi_ec_softc *sc = Context;
    ACPI_STATUS		       Status;
    EC_STATUS		       EcStatus;

    KASSERT(Context != NULL, ("EcGpeHandler called with NULL"));
    CTR0(KTR_ACPI, "ec gpe handler start");

    /*
     * Notify EcWaitEvent() that the status register is now fresh.  If we
     * didn't do this, it wouldn't be possible to distinguish an old IBE
     * from a new one, for example when doing a write transaction (writing
     * address and then data values).
     */
    atomic_add_int(&sc->ec_gencount, 1);
    wakeup(sc);

    /*
     * If the EC_SCI bit of the status register is set, queue a query handler.
     * It will run the query and _Qxx method later, under the lock.
     */
    EcStatus = EC_GET_CSR(sc);
    if ((EcStatus & EC_EVENT_SCI) &&
	atomic_fetchadd_int(&sc->ec_sci_pend, 1) == 0) {
	CTR0(KTR_ACPI, "ec gpe queueing query handler");
	Status = AcpiOsExecute(OSL_GPE_HANDLER, EcGpeQueryHandler, Context);
	if (ACPI_FAILURE(Status)) {
	    printf("EcGpeHandler: queuing GPE query handler failed\n");
	    atomic_store_rel_int(&sc->ec_sci_pend, 0);
	}
    }
    return (ACPI_REENABLE_GPE);
}
Example 6
/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrate the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	/*
	 * We're done, remove ourselves from the accept queue we are
	 * on, if we are on one.
	 */
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}
Example 7
/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
		tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}
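
The FALSE path above is the optimistic increment-then-undo idiom: bump the counter unconditionally, compare the pre-increment value against the limit, and roll the increment back if the limit had already been reached.  A hedged userspace sketch of that pattern with C11 atomics (the limit and names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

#define SLOT_LIMIT	128	/* illustrative maximum */

static atomic_int slots_in_use;

static bool slot_take(void)
{
	/* Increment first; undo if the old value was already at the limit. */
	if (atomic_fetch_add(&slots_in_use, 1) >= SLOT_LIMIT) {
		atomic_fetch_sub(&slots_in_use, 1);
		return false;
	}
	return true;
}

static void slot_release(void)
{
	atomic_fetch_sub(&slots_in_use, 1);
}
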
Example 8
void
drm_gem_object_handle_unreference_unlocked(struct drm_gem_object *obj)
{

	if (obj == NULL ||
	    atomic_load_acq_int(&obj->handle_count) == 0)
		return;

	if (atomic_fetchadd_int(&obj->handle_count, -1) == 1)
		drm_gem_object_handle_free(obj);
	drm_gem_object_unreference_unlocked(obj);
}
Example 9
/*
 | function for freeing external storage for mbuf
 */
static void
ext_free(void *arg)
{
     pduq_t *a = arg;

     if (atomic_fetchadd_int(&a->refcnt, -1) == 1)
	  if (a->buf != NULL) {
	       debug(3, "ou_refcnt=%d a=%p b=%p", ou_refcnt, a, a->buf);
	       kfree(a->buf, M_ISCSI);
	       a->buf = NULL;
	  }
}
Example 10
static __inline uint32_t
hn_rndis_rid(struct hn_softc *sc)
{
	uint32_t rid;

again:
	rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1);
	if (rid == 0)
		goto again;

	/* Use upper 16 bits for non-compat RNDIS messages. */
	return ((rid & 0xffff) << 16);
}
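
hn_rndis_rid() allocates IDs from a monotonically increasing counter and retries whenever the counter comes back as 0, so ID 0 is never handed out (the retry only matters at start-up and after the counter wraps).  A userspace sketch of the same allocator with C11 atomics, keeping the shift into the upper 16 bits from the original (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint next_rid;

static uint32_t rid_alloc(void)
{
	uint32_t rid;

	/* Skip 0 so a zero ID can keep meaning "no request". */
	do {
		rid = atomic_fetch_add(&next_rid, 1);
	} while (rid == 0);

	/* Use the upper 16 bits, as the original does for non-compat IDs. */
	return ((rid & 0xffff) << 16);
}
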
Example 11
/*
 * RNDIS filter halt device
 */
static int
hv_rf_halt_device(rndis_device *device)
{
	rndis_request *request;
	rndis_halt_request *halt;
	int i, ret;

	/* Attempt to do an RNDIS device halt */
	request = hv_rndis_request(device, REMOTE_NDIS_HALT_MSG,
	    RNDIS_MESSAGE_SIZE(rndis_halt_request));
	if (request == NULL) {
		return (-1);
	}

	/* initialize "poor man's semaphore" */
	request->halt_complete_flag = 0;

	/* Set up the rndis set */
	halt = &request->request_msg.msg.halt_request;
	halt->request_id = atomic_fetchadd_int(&device->new_request_id, 1);
	/* Increment to get the new value (call above returns old value) */
	halt->request_id += 1;
	
	ret = hv_rf_send_request(device, request, REMOTE_NDIS_HALT_MSG);
	if (ret != 0) {
		return (-1);
	}

	/*
	 * Wait for halt response from halt callback.  We must wait for
	 * the transaction response before freeing the request and other
	 * resources.
	 */
	for (i=HALT_COMPLETION_WAIT_COUNT; i > 0; i--) {
		if (request->halt_complete_flag != 0) {
			break;
		}
		DELAY(400);
	}
	if (i == 0) {
		return (-1);
	}

	device->state = RNDIS_DEV_UNINITIALIZED;
	
	if (request != NULL) {
		hv_put_rndis_request(device, request);
	}

	return (0);
}
Example 12
int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockcountnb(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}
Example 13
static void
def_lock_release(void *lock)
{
	Lock *l;

	l = (Lock *)lock;
	if ((l->lock & WAFLAG) == 0)
		atomic_add_rel_int(&l->lock, -RC_INCR);
	else {
		assert(wnested > 0);
		atomic_add_rel_int(&l->lock, -WAFLAG);
		if (atomic_fetchadd_int(&wnested, -1) == 1)
			sigprocmask(SIG_SETMASK, &oldsigmask, NULL);
	}
}
Example 14
void
udev_device_unref(struct udev_device *udev_device)
{
	int refcount;

	refcount = atomic_fetchadd_int(&udev_device->refs, -1);

	if (refcount == 1) {
		atomic_subtract_int(&udev_device->refs, 0x400); /* in destruction */
		if (udev_device->dict != NULL)
			prop_object_release(udev_device->dict);

		udev_unref(udev_device->udev_ctx);
		free(udev_device);
	}
}
Example 15
static void
def_wlock_acquire(void *lock)
{
	Lock *l;
	sigset_t tmp_oldsigmask;

	l = (Lock *)lock;
	for (;;) {
		sigprocmask(SIG_BLOCK, &fullsigmask, &tmp_oldsigmask);
		if (atomic_cmpset_acq_int(&l->lock, 0, WAFLAG))
			break;
		sigprocmask(SIG_SETMASK, &tmp_oldsigmask, NULL);
	}
	if (atomic_fetchadd_int(&wnested, 1) == 0)
		oldsigmask = tmp_oldsigmask;
}
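
Examples 13 and 15 pair fetch-and-add with a nesting counter: the first write-lock acquisition (old value 0) records the signal mask to restore, and the last release (old value 1) restores it.  The sketch below isolates just that first-in/last-out bookkeeping with C11 atomics; the saved state is a plain int rather than a real sigset_t, and it assumes the caller already holds the write lock, as the rtld code does:

#include <stdatomic.h>

static atomic_int nest_count;
static int saved_state;		/* stands in for oldsigmask */

/* Called after the write lock has been acquired. */
static void enter_critical(int current_state)
{
	/* Only the outermost acquisition records the state to restore. */
	if (atomic_fetch_add(&nest_count, 1) == 0)
		saved_state = current_state;
}

/*
 * Called just before the write lock is released; returns the state to
 * restore, or -1 while still nested.
 */
static int leave_critical(void)
{
	if (atomic_fetch_sub(&nest_count, 1) == 1)
		return saved_state;
	return -1;
}
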
Example 16
static void
nvme_ns_io_test(void *arg)
{
	struct nvme_io_test_internal	*io_test = arg;
	struct nvme_io_test_thread	*tth;
	struct nvme_completion		cpl;
	int				error;

	tth = malloc(sizeof(*tth), M_NVME, M_WAITOK | M_ZERO);
	tth->ns = io_test->ns;
	tth->opc = io_test->opc;
	memcpy(&tth->start, &io_test->start, sizeof(tth->start));
	tth->buf = malloc(io_test->size, M_NVME, M_WAITOK);
	tth->size = io_test->size;
	tth->time = io_test->time;
	tth->idx = atomic_fetchadd_int(&io_test->td_idx, 1);

	memset(&cpl, 0, sizeof(cpl));

	nvme_ns_io_test_cb(tth, &cpl);

	error = tsleep(tth, 0, "test_wait", tth->time*hz*2);

	if (error)
		printf("%s: error = %d\n", __func__, error);

	io_test->io_completed[tth->idx] = tth->io_completed;
	wakeup_one(io_test);

	free(tth->buf, M_NVME);
	free(tth, M_NVME);

	atomic_subtract_int(&io_test->td_active, 1);
	mb();

#if __FreeBSD_version >= 800004
	kthread_exit();
#else
	kthread_exit(0);
#endif
}
Example 17
/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success
 * and non-zero on error (which may be packet drops or other errors).
 * addr and len identify the netmap buffer, m is the (preallocated)
 * mbuf to use for transmissions.
 *
 * We should add a reference to the mbuf so the m_freem() at the end
 * of the transmission does not consume resources.
 *
 * On FreeBSD, and on multiqueue cards, we can force the queue using
 *      if ((m->m_flags & M_FLOWID) != 0)
 *              i = m->m_pkthdr.flowid % adapter->num_queues;
 *      else
 *              i = curcpu % adapter->num_queues;
 *
 */
int
generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
                   void *addr, u_int len, u_int ring_nr)
{
    int ret;

    m->m_len = m->m_pkthdr.len = 0;

    // copy data to the mbuf
    m_copyback(m, 0, len, addr);

#if 0
    // inc refcount. We are alone, so we can skip the atomic
    atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
    m->m_flags |= M_FLOWID;
#endif
    m->m_pkthdr.hash = ring_nr;	/* XXX probably not accurate */
    m->m_pkthdr.rcvif = ifp; /* used for tx notification */
    ret = ifp->if_transmit(ifp, m);
    return ret;
}
Example 18
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
Example 19
static int mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
#ifndef CONFIG_WQE_FORMAT_1
	u32 txbbs_stamp = 0;
#endif
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	u64 timestamp = 0;
	int done = 0;


	if (!priv->port_up)
		return 0;

	index = cons_index & size_mask;
	cqe = &buf[(index << factor) + factor];
	ring_index = ring->cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			en_err(priv, "CQE completed in error - vendor syndrom: 0x%x syndrom: 0x%x\n",
			       ((struct mlx4_err_cqe *)cqe)->
				       vendor_err_syndrome,
			       ((struct mlx4_err_cqe *)cqe)->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += ring->last_nr_txbb;
			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
			/* free next descriptor */
			ring->last_nr_txbb = mlx4_en_free_tx_desc(
					priv, ring, ring_index,
					!!((ring->cons + txbbs_skipped) &
					ring->size), timestamp);
#ifndef CONFIG_WQE_FORMAT_1
			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring->cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
#endif
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while (ring_index != new_index);

		++cons_index;
		index = cons_index & size_mask;
		cqe = &buf[(index << factor) + factor];
	}


	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();
	ring->cons += txbbs_skipped;

	/* Wakeup Tx queue if it was stopped and ring is not full */
	if (unlikely(ring->blocked) &&
	    (ring->prod - ring->cons) <= ring->full_size) {
		ring->blocked = 0;
#ifdef CONFIG_RATELIMIT
		if (cq->ring < priv->native_tx_ring_num) {
			if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
				atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
			priv->port_stats.wake_queue++;
		}
#else
		if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
			atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
		priv->port_stats.wake_queue++;
#endif
		ring->wake_queue++;
	}
	return done;
}
Example 20
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes(0) < inactivevnodes)
		goto skip;

	for (count = 0; count < maxcount * 2; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_NEXT(&vnode_active_rover, v_list);
		TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vnode_active_list,
					  &vnode_active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vnode_active_list, vp,
					   &vnode_active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vfs_spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vfs_spin);
		vrele(vp);
	}

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	for (count = 0; count < maxcount; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_FIRST(&vnode_inactive_list);
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			break;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vfs_spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vfs_spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vnode_inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vnode_inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vfs_spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vfs_spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vfs_spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		vp->v_state = VS_DYING;
		spin_unlock(&vfs_spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		return(vp);
	}
	return(NULL);
}
Example 21
uint32_t
vmbus_gpadl_alloc(struct vmbus_softc *sc)
{
	return atomic_fetchadd_int(&sc->vmbus_gpadl, 1);
}
Example 22
static int atomic_exchange_and_add(sp_counted_base_atomic_type volatile * pw, int dv)
{
    return atomic_fetchadd_int(&pw->ui,dv);
}
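
This wrapper (from a smart-pointer refcount shim) just re-exports atomic_fetchadd_int() under the name the surrounding code expects: add dv and return the previous value.  A portable C11 rendering of the same wrapper, plus the classic last-reference test a caller might build on it (a sketch, not the original code):

#include <stdatomic.h>
#include <stdbool.h>

static int atomic_exchange_and_add(volatile atomic_int *pw, int dv)
{
	/* Returns the value *pw held before dv was added. */
	return atomic_fetch_add(pw, dv);
}

static bool release_ref(volatile atomic_int *refcount)
{
	/* True when the caller just dropped the last reference. */
	return atomic_exchange_and_add(refcount, -1) == 1;
}
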
Example 23
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * free-ing process.
			 */
			if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
						mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
				 		 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
				 	 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			}
			else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, NULL, 0, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
			("%s: Unexpected flags (%#x) for mbuf",
			__func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		bcopy(M_START(m), n->m_dat,
			m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to this code
		 * path is that we've already taken references to the
		 * maximum number of mbuf clusters we can, and the data
		 * is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}
Example 24
/*
 * Non-directly-exported function to clean up after mbufs with M_EXT
 * storage attached to them if the reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/*
	 * Check if the header is embedded in the cluster.
	 */
	freembuf = (m->m_flags & M_NOFREE) ? 0 : 1;

	switch (m->m_ext.ext_type) {
	case EXT_SFBUF:
		sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
		break;
	default:
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		/* 
		 * Free attached storage if this mbuf is the only
		 * reference to it.
		 */
		if (*(m->m_ext.ext_cnt) != 1) {
			if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1)
				break;
		}

		switch (m->m_ext.ext_type) {
		case EXT_PACKET:	/* The packet zone is special. */
			if (*(m->m_ext.ext_cnt) == 0)
				*(m->m_ext.ext_cnt) = 1;
			uma_zfree(zone_pack, m);
			return;		/* Job done. */
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			break;
		case EXT_NET_DRV:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			*(m->m_ext.ext_cnt) = 0;
			uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
				m->m_ext.ext_cnt));
			/* FALLTHROUGH */
		case EXT_EXTREF:
			KASSERT(m->m_ext.ext_free != NULL,
				("%s: ext_free not set", __func__));
			(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
			    m->m_ext.ext_arg2);
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0,
				("%s: unknown ext_type", __func__));
		}
	}

	if (freembuf)
		uma_zfree(zone_mbuf, m);
}
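
Note the fast path in the default case above: the shared reference count is first read non-atomically, and the atomic decrement is attempted only when that read is not 1.  If the caller holds a reference and observes a count of exactly 1, it must be the sole owner, so the read-modify-write can be skipped and the count is simply left for the free/recycle path to reset.  A hedged sketch of that sole-owner shortcut with C11 atomics (illustrative names):

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Returns true when the caller held the last reference and should free
 * the shared storage.  The relaxed load is safe only because a caller
 * that observes 1 while holding a reference cannot be raced by anyone
 * else; the count is left untouched in that case since the storage is
 * about to be freed or recycled anyway.
 */
static bool ext_ref_release(atomic_uint *cnt)
{
	if (atomic_load_explicit(cnt, memory_order_relaxed) != 1) {
		if (atomic_fetch_sub(cnt, 1) != 1)
			return false;	/* other references remain */
	}
	return true;			/* last reference */
}
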
Example 25
/*
 * bfq_queue(): .queue callback of the bfq policy.
 *
 * A thread calls this function to hand in its I/O requests (bio).
 * Their bios are stored in the per-thread queue, in tdio structure.
 * Currently, the sync/async bios are queued together, which may cause
 * some performance issues.
 *
 * Besides queueing bios, this function also calculates the average
 * thinking time and average seek distance of a thread, using the
 * information in bio structure.
 *
 * If the bfq scheduler is waiting on the calling thread due to
 * the AS feature, this function will cancel the callout alarm
 * and resume the scheduler so that it continues serving this thread.
 *
 * lock:
 *   THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
 *   BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
 *   in bfq_queue() or bfq_dequeue().
 *
 * refcount:
 *   If the scheduler is waiting on the calling thread, the refcount
 *   of the related tdio will decrease by 1 after this function.  The
 *   matching increment is in bfq_dequeue(), before resetting the
 *   callout alarm.
 *
 * Return value:
 *  EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 *  0: bio is queued successfully.
 */
static int
bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
		struct  bio *bio)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
	int original_qlength;

	/* we do not handle flush requests. push it down to dsched */
	if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
		return (EINVAL);

	DSCHED_THREAD_IO_LOCK(tdio);
	KKASSERT(tdio->debug_priv == 0xF00FF00F);
	dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);

	dsched_set_bio_priv(bio, tdio);
	dsched_thread_io_ref(tdio);

	if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
	    (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
		bfq_update_tdio_seek_avg(bfq_tdio, bio);
	}

	bfq_update_tdio_ttime_avg(bfq_tdio);

	/* update last_bio_pushed_time */
	getmicrotime(&bfq_tdio->last_bio_pushed_time);

	if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
	    BFQ_TDIO_SEEKY(bfq_tdio))
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);

	/*
	 * If a tdio takes too long to think, we disable its AS feature.
	 */
	if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
	    (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
	    (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
		bfq_tdio->tdio_as_switch = 0;
	} else {
		bfq_tdio->tdio_as_switch = 1;
	}

	/* insert the bio into the tdio's own queue */
	KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
#if 0
	tdio->qlength++;
#endif
	original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
	DSCHED_THREAD_IO_UNLOCK(tdio);
	/*
	 * A new thread:
	 * In the dequeue function, a thread is removed from the
	 * aug-tree if it has no further bios.  Therefore "new" means
	 * a truly new thread (a newly created thread, or one that
	 * pushed no more bios while the scheduler was waiting for it)
	 * or a thread that was removed from the aug-tree earlier.
	 */
	if (original_qlength == 0) {
		/*
		 * a really new thread
		 */
		BFQ_LOCK(bfq_diskctx);
		if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
			/* insert the tdio into the wf2q queue */
			wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
		} else {
			/*
			 * the thread the scheduler is waiting on
			 */
			if (bfq_diskctx->bfq_blockon == bfq_tdio) {
				/*
				 * XXX: possible race condition here:
				 * if the callout function is triggered when
				 * the following code is executed, then after
				 * releasing the TDIO lock, the callout function
				 * will set the thread inactive and it will never
				 * be inserted into the aug-tree (so its bio pushed
				 * this time will not be dispatched) until it pushes
				 * further bios
				 */
				bfq_diskctx->bfq_as_hit++;
				bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);

				if (callout_pending(&bfq_diskctx->bfq_callout))
					callout_stop(&bfq_diskctx->bfq_callout);
				bfq_diskctx->bfq_blockon = NULL;

				/* ref'ed in dequeue(), before resetting callout */
				dsched_thread_io_unref(&bfq_tdio->head);

				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
			}
		}

		BFQ_UNLOCK(bfq_diskctx);
	}

	helper_msg_dequeue(bfq_diskctx);

	return 0;
}
Example 26
void
_pthread_exit_mask(void *status, sigset_t *mask)
{
    struct pthread *curthread = _get_curthread();

    /* Check if this thread is already in the process of exiting: */
    if (curthread->cancelling) {
        char msg[128];
        snprintf(msg, sizeof(msg), "Thread %p has called "
                 "pthread_exit() from a destructor. POSIX 1003.1 "
                 "1996 s16.2.5.2 does not allow this!", curthread);
        PANIC(msg);
    }

    /* Flag this thread as exiting. */
    curthread->cancelling = 1;
    curthread->no_cancel = 1;
    curthread->cancel_async = 0;
    curthread->cancel_point = 0;
    if (mask != NULL)
        __sys_sigprocmask(SIG_SETMASK, mask, NULL);
    if (curthread->unblock_sigcancel) {
        sigset_t set;

        curthread->unblock_sigcancel = 0;
        SIGEMPTYSET(set);
        SIGADDSET(set, SIGCANCEL);
        __sys_sigprocmask(SIG_UNBLOCK, &set, NULL);
    }

    /* Save the return value: */
    curthread->ret = status;
#ifdef _PTHREAD_FORCED_UNWIND

#ifdef PIC
    thread_uw_init();
#endif /* PIC */

#ifdef PIC
    if (uwl_forcedunwind != NULL) {
#else
    if (_Unwind_ForcedUnwind != NULL) {
#endif
        if (curthread->unwind_disabled) {
            if (message_printed == 0) {
                message_printed = 1;
                _thread_printf(2, "Warning: old _pthread_cleanup_push was called, "
                               "stack unwinding is disabled.\n");
            }
            goto cleanup;
        }
        thread_unwind();

    } else {
cleanup:
        while (curthread->cleanup != NULL) {
            __pthread_cleanup_pop_imp(1);
        }
        exit_thread();
    }

#else
    while (curthread->cleanup != NULL) {
        __pthread_cleanup_pop_imp(1);
    }

    exit_thread();
#endif /* _PTHREAD_FORCED_UNWIND */
}

static void
exit_thread(void)
{
    struct pthread *curthread = _get_curthread();

    /* Check if there is thread specific data: */
    if (curthread->specific != NULL) {
        /* Run the thread-specific data destructors: */
        _thread_cleanupspecific();
    }

    if (!_thr_isthreaded())
        exit(0);

    if (atomic_fetchadd_int(&_thread_active_threads, -1) == 1) {
        exit(0);
        /* Never reach! */
    }

    /* Tell malloc that the thread is exiting. */
    _malloc_thread_cleanup();

    THR_LOCK(curthread);
    curthread->state = PS_DEAD;
    if (curthread->flags & THR_FLAGS_NEED_SUSPEND) {
        curthread->cycle++;
        _thr_umtx_wake(&curthread->cycle, INT_MAX, 0);
    }
    if (!curthread->force_exit && SHOULD_REPORT_EVENT(curthread, TD_DEATH))
        _thr_report_death(curthread);
    /*
     * Thread was created with initial refcount 1, we drop the
     * reference count to allow it to be garbage collected.
     */
    curthread->refcount--;
    _thr_try_gc(curthread, curthread); /* thread lock released */

#if defined(_PTHREADS_INVARIANTS)
    if (THR_IN_CRITICAL(curthread))
        PANIC("thread exits with resources held!");
#endif
    /*
     * Kernel will do wakeup at the address, so joiner thread
     * will be resumed if it is sleeping at the address.
     */
    thr_exit(&curthread->tid);
    PANIC("thr_exit() returned");
    /* Never reach! */
}
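
exit_thread() uses the same old-value test to detect the last application thread: the decrement of _thread_active_threads that returns 1 means no other threads remain, so the process can simply exit().  A minimal userspace sketch of that last-thread detection with C11 atomics (names are illustrative, not the libthr internals):

#include <stdatomic.h>
#include <stdlib.h>

static atomic_int active_threads = 1;	/* the initial thread */

static void thread_created(void)
{
	atomic_fetch_add(&active_threads, 1);
}

static void thread_exiting(void)
{
	/* The thread that drops the count from 1 to 0 ends the process. */
	if (atomic_fetch_sub(&active_threads, 1) == 1)
		exit(0);

	/* otherwise: continue with per-thread teardown */
}
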
Example 27
static void
nvme_ns_bio_test(void *arg)
{
	struct nvme_io_test_internal	*io_test = arg;
	struct cdevsw			*csw;
	struct mtx			*mtx;
	struct bio			*bio;
	struct cdev			*dev;
	void				*buf;
	struct timeval			t;
	uint64_t			offset;
	uint32_t			idx, io_completed = 0;
#if __FreeBSD_version >= 900017
	int				ref;
#endif

	buf = malloc(io_test->size, M_NVME, M_WAITOK);
	idx = atomic_fetchadd_int(&io_test->td_idx, 1);
	dev = io_test->ns->cdev;

	offset = idx * 2048 * nvme_ns_get_sector_size(io_test->ns);

	while (1) {

		bio = g_alloc_bio();

		memset(bio, 0, sizeof(*bio));
		bio->bio_cmd = (io_test->opc == NVME_OPC_READ) ?
		    BIO_READ : BIO_WRITE;
		bio->bio_done = nvme_ns_bio_test_cb;
		bio->bio_dev = dev;
		bio->bio_offset = offset;
		bio->bio_data = buf;
		bio->bio_bcount = io_test->size;

		if (io_test->flags & NVME_TEST_FLAG_REFTHREAD) {
#if __FreeBSD_version >= 900017
			csw = dev_refthread(dev, &ref);
#else
			csw = dev_refthread(dev);
#endif
		} else
			csw = dev->si_devsw;

		mtx = mtx_pool_find(mtxpool_sleep, bio);
		mtx_lock(mtx);
		(*csw->d_strategy)(bio);
		msleep(bio, mtx, PRIBIO, "biotestwait", 0);
		mtx_unlock(mtx);

		if (io_test->flags & NVME_TEST_FLAG_REFTHREAD) {
#if __FreeBSD_version >= 900017
			dev_relthread(dev, ref);
#else
			dev_relthread(dev);
#endif
		}

		if ((bio->bio_flags & BIO_ERROR) || (bio->bio_resid > 0))
			break;

		g_destroy_bio(bio);

		io_completed++;

		getmicrouptime(&t);
		timevalsub(&t, &io_test->start);

		if (t.tv_sec >= io_test->time)
			break;

		offset += io_test->size;
		if ((offset + io_test->size) > nvme_ns_get_size(io_test->ns))
			offset = 0;
	}

	io_test->io_completed[idx] = io_completed;
	wakeup_one(io_test);

	free(buf, M_NVME);

	atomic_subtract_int(&io_test->td_active, 1);
	mb();

#if __FreeBSD_version >= 800000
	kthread_exit();
#else
	kthread_exit(0);
#endif
}
Example 28
/*
 * Retire an XOP.  Used by both the VOP frontend and by the XOP backend.
 */
void
hammer2_xop_retire(hammer2_xop_head_t *xop, uint32_t mask)
{
	hammer2_xop_group_t *xgrp;
	hammer2_chain_t *chain;
	int i;

	xgrp = xop->xgrp;

	/*
	 * Remove the frontend or remove a backend feeder.  When removing
	 * the frontend we must wakeup any backend feeders who are waiting
	 * for FIFO space.
	 *
	 * XXX optimize wakeup.
	 */
	KKASSERT(xop->run_mask & mask);
	if (atomic_fetchadd_int(&xop->run_mask, -mask) != mask) {
		if (mask == HAMMER2_XOPMASK_VOP)
			wakeup(xop);
		return;
	}

	/*
	 * Cleanup the collection cluster.
	 */
	for (i = 0; i < xop->cluster.nchains; ++i) {
		xop->cluster.array[i].flags = 0;
		chain = xop->cluster.array[i].chain;
		if (chain) {
			xop->cluster.array[i].chain = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
	}

	/*
	 * Clean up the fifos, using chk_mask to optimize the loop.
	 */
	mask = xop->chk_mask;
	for (i = 0; mask && i < HAMMER2_MAXCLUSTER; ++i) {
		hammer2_xop_fifo_t *fifo = &xop->collect[i];
		while (fifo->ri != fifo->wi) {
			chain = fifo->array[fifo->ri & HAMMER2_XOPFIFO_MASK];
			if (chain) {
				hammer2_chain_unlock(chain);
				hammer2_chain_drop(chain);
			}
			++fifo->ri;
			if (fifo->wi - fifo->ri < HAMMER2_XOPFIFO / 2)
				wakeup(xop);	/* XXX optimize */
		}
		mask &= ~(1U << i);
	}

	/*
	 * The inode is only held at this point, simply drop it.
	 */
	if (xop->ip) {
		hammer2_inode_drop(xop->ip);
		xop->ip = NULL;
	}
	if (xop->ip2) {
		hammer2_inode_drop(xop->ip2);
		xop->ip2 = NULL;
	}
	if (xop->ip3) {
		hammer2_inode_drop(xop->ip3);
		xop->ip3 = NULL;
	}
	if (xop->name) {
		kfree(xop->name, M_HAMMER2);
		xop->name = NULL;
		xop->name_len = 0;
	}
	if (xop->name2) {
		kfree(xop->name2, M_HAMMER2);
		xop->name2 = NULL;
		xop->name2_len = 0;
	}

	objcache_put(cache_xops, xop);
}
Example 29
static int
acpi_cpu_set_cx_lowest(struct acpi_cpu_softc *sc, int val)
{
    int i, old_lowest, error = 0;
    uint32_t old_type, type;

    get_mplock();

    old_lowest = atomic_swap_int(&sc->cpu_cx_lowest, val);

    old_type = sc->cpu_cx_states[old_lowest].type;
    type = sc->cpu_cx_states[val].type;
    if (old_type == ACPI_STATE_C3 && type != ACPI_STATE_C3) {
	KKASSERT(cpu_c3_ncpus > 0);
	if (atomic_fetchadd_int(&cpu_c3_ncpus, -1) == 1) {
	    /*
	     * All of the CPUs have exited C3 state; use a better
	     * one-shot timer.
	     */
	    error = cputimer_intr_select_caps(CPUTIMER_INTR_CAP_NONE);
	    KKASSERT(!error);
	    cputimer_intr_restart();
    	}
    } else if (type == ACPI_STATE_C3 && old_type != ACPI_STATE_C3) {
	if (atomic_fetchadd_int(&cpu_c3_ncpus, 1) == 0) {
	    /*
	     * When the first CPU enters C3 state, switch
	     * to a one-shot timer that can handle C3 state,
	     * i.e. the timer will not hang.
	     */
	    error = cputimer_intr_select_caps(CPUTIMER_INTR_CAP_PS);
	    if (!error) {
		cputimer_intr_restart();
	    } else {
		kprintf("no suitable intr cputimer found\n");

		/* Restore */
		sc->cpu_cx_lowest = old_lowest;
		atomic_fetchadd_int(&cpu_c3_ncpus, -1);
	    }
	}
    }

    rel_mplock();

    if (error)
	return error;

    /* If not disabling, cache the new lowest non-C3 state. */
    sc->cpu_non_c3 = 0;
    for (i = sc->cpu_cx_lowest; i >= 0; i--) {
	if (sc->cpu_cx_states[i].type < ACPI_STATE_C3) {
	    sc->cpu_non_c3 = i;
	    break;
	}
    }

    /* Reset the statistics counters. */
    bzero(sc->cpu_cx_stats, sizeof(sc->cpu_cx_stats));
    return (0);
}
Example 30
int
fork1(struct thread *td, struct fork_req *fr)
{
	struct proc *p1, *newproc;
	struct thread *td2;
	struct vmspace *vm2;
	struct file *fp_procdesc;
	vm_ooffset_t mem_charged;
	int error, nprocs_new, ok;
	static int curfail;
	static struct timeval lastfail;
	int flags, pages;

	flags = fr->fr_flags;
	pages = fr->fr_pages;

	if ((flags & RFSTOPPED) != 0)
		MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL);
	else
		MPASS(fr->fr_procp == NULL);

	/* Check for the undefined or unimplemented flags. */
	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
		return (EINVAL);

	/* Signal value requires RFTSIGZMB. */
	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
		return (EINVAL);

	/* Can't copy and clear. */
	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

	/* Check the validity of the signal number. */
	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
		return (EINVAL);

	if ((flags & RFPROCDESC) != 0) {
		/* Can't get a process descriptor without also creating a process. */
		if ((flags & RFPROC) == 0)
			return (EINVAL);

		/* Must provide a place to put a procdesc if creating one. */
		if (fr->fr_pd_fd == NULL)
			return (EINVAL);

		/* Check if we are using supported flags. */
		if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0)
			return (EINVAL);
	}

	p1 = td->td_proc;

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {
		if (fr->fr_procp != NULL)
			*fr->fr_procp = NULL;
		else if (fr->fr_pidp != NULL)
			*fr->fr_pidp = 0;
		return (fork_norfproc(td, flags));
	}

	fp_procdesc = NULL;
	newproc = NULL;
	vm2 = NULL;

	/*
	 * Increment the nprocs resource before allocations occur.
	 * Although process entries are dynamically created, we still
	 * keep a global limit on the maximum number we will
	 * create. There are hard-limits as to the number of processes
	 * that can run, established by the KVA and memory usage for
	 * the process data.
	 *
	 * Don't allow a nonprivileged user to use the last ten
	 * processes; don't let root exceed the limit.
	 */
	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
	if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
	    PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
		error = EAGAIN;
		sx_xlock(&allproc_lock);
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("maxproc limit exceeded by uid %u (pid %d); "
			    "see tuning(7) and login.conf(5)\n",
			    td->td_ucred->cr_ruid, p1->p_pid);
		}
		sx_xunlock(&allproc_lock);
		goto fail2;
	}

	/*
	 * If required, create a process descriptor in the parent first; we
	 * will abandon it if something goes wrong. We don't finit() until
	 * later.
	 */
	if (flags & RFPROCDESC) {
		error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd,
		    fr->fr_pd_flags, fr->fr_pd_fcaps);
		if (error != 0)
			goto fail2;
	}

	mem_charged = 0;
	if (pages == 0)
		pages = kstack_pages;
	/* Allocate new proc. */
	newproc = uma_zalloc(proc_zone, M_WAITOK);
	td2 = FIRST_THREAD_IN_PROC(newproc);
	if (td2 == NULL) {
		td2 = thread_alloc(pages);
		if (td2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		proc_linkup(newproc, td2);
	} else {
		if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
			if (td2->td_kstack != 0)
				vm_thread_dispose(td2);
			if (!thread_alloc_stack(td2, pages)) {
				error = ENOMEM;
				goto fail2;
			}
		}
	}

	if ((flags & RFMEM) == 0) {
		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
		if (vm2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		if (!swap_reserve(mem_charged)) {
			/*
			 * The swap reservation failed. The accounting
			 * from the entries of the copied vm2 will be
			 * subtracted in vmspace_free(), so force the
			 * reservation there.
			 */
			swap_reserve_force(mem_charged);
			error = ENOMEM;
			goto fail2;
		}
	} else
		vm2 = NULL;

	/*
	 * XXX: This is ugly; when we copy resource usage, we need to bump
	 *      per-cred resource counters.
	 */
	proc_set_cred_init(newproc, crhold(td->td_ucred));

	/*
	 * Initialize resource accounting for the child process.
	 */
	error = racct_proc_fork(p1, newproc);
	if (error != 0) {
		error = EAGAIN;
		goto fail1;
	}

#ifdef MAC
	mac_proc_init(newproc);
#endif
	newproc->p_klist = knlist_alloc(&newproc->p_mtx);
	STAILQ_INIT(&newproc->p_ktr);

	/* We have to lock the process tree while we look for a pid. */
	sx_slock(&proctree_lock);
	sx_xlock(&allproc_lock);

	/*
	 * Increment the count of procs running with this uid. Don't allow
	 * a nonprivileged user to exceed their current limit.
	 *
	 * XXXRW: Can we avoid privilege here if it's not needed?
	 */
	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
	if (error == 0)
		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
	else {
		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
		    lim_cur(td, RLIMIT_NPROC));
	}
	if (ok) {
		do_fork(td, fr, newproc, td2, vm2, fp_procdesc);
		return (0);
	}

	error = EAGAIN;
	sx_sunlock(&proctree_lock);
	sx_xunlock(&allproc_lock);
#ifdef MAC
	mac_proc_destroy(newproc);
#endif
	racct_proc_exit(newproc);
fail1:
	crfree(newproc->p_ucred);
	newproc->p_ucred = NULL;
fail2:
	if (vm2 != NULL)
		vmspace_free(vm2);
	uma_zfree(proc_zone, newproc);
	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
		fdclose(td, fp_procdesc, *fr->fr_pd_fd);
		fdrop(fp_procdesc, td);
	}
	atomic_add_int(&nprocs, -1);
	pause("fork", hz / 2);
	return (error);
}