Example 1
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
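Every example in this collection leans on highbit(). As a point of reference, the sketch below shows the semantics the kernel and ZFS callers assume (an illustration only, not the production implementation): highbit() returns the 1-based index of the most significant set bit, or 0 when no bit is set, so highbit(x) - 1 equals floor(log2(x)) for x > 0. (The X11 examples near the end define their own 0-based variant; see the note there.)

/*
 * Illustrative sketch of the assumed highbit() semantics:
 * highbit(0) == 0, highbit(1) == 1, highbit(512) == 10,
 * so highbit(x) - 1 == floor(log2(x)) for x > 0.
 */
static int
highbit(unsigned long v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}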
Example 2
int
bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(mos, object, &doi);
	if (err)
		return (err);

	mutex_enter(&bpl->bpl_lock);

	ASSERT(bpl->bpl_dbuf == NULL);
	ASSERT(bpl->bpl_phys == NULL);
	ASSERT(bpl->bpl_cached_dbuf == NULL);
	ASSERT(bpl->bpl_queue == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);

	bpl->bpl_mos = mos;
	bpl->bpl_object = object;
	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));

	mutex_exit(&bpl->bpl_lock);
	return (0);
}
Example 3
template <typename T>	std::ostringstream	&AttributeBitmap::bin(T &value, std::ostringstream &o)
{
  for ( T bit = highbit(value); bit; bit >>= 1 )
    {
      o << ( ( value & bit ) ? '1' : '0' );
    }
  return o;
}
Example 4
/*
 * Initialise the TPI support routines.  Called from strinit().
 */
void
tpi_init()
{
    mutex_init(&tpi_provinfo_lock, NULL, MUTEX_DEFAULT, NULL);

    /*
     * Calculate the right shift for hashing a tpi_provinfo_t.
     */
    tpi_hashshift = highbit(sizeof (tpi_provinfo_t));
}
Example 5
File: zap.c Project: ColdCanuck/zfs
void
fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
	dmu_buf_t *db;
	zap_leaf_t *l;
	int i;
	zap_phys_t *zp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
	zap->zap_ismicro = FALSE;

	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
	    &zap->zap_f.zap_phys, zap_evict);

	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
	zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;

	zp = zap->zap_f.zap_phys;
	/*
	 * explicitly zero it since it might be coming from an
	 * initialized microzap
	 */
	bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
	zp->zap_block_type = ZBT_HEADER;
	zp->zap_magic = ZAP_MAGIC;

	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);

	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
	zp->zap_num_leafs = 1;
	zp->zap_num_entries = 0;
	zp->zap_salt = zap->zap_salt;
	zp->zap_normflags = zap->zap_normflags;
	zp->zap_flags = flags;

	/* block 1 will be the first leaf */
	for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;

	/*
	 * set up block 1 - the first leaf
	 */
	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
	dmu_buf_will_dirty(db, tx);

	l = kmem_zalloc(sizeof (zap_leaf_t), KM_PUSHPAGE);
	l->l_dbuf = db;
	l->l_phys = db->db_data;

	zap_leaf_init(l, zp->zap_normflags != 0);

	kmem_free(l, sizeof (zap_leaf_t));
	dmu_buf_rele(db, FTAG);
}
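For reference, the zap_block_shift assignment above is just a log2 of the dbuf size: assuming, as ZAP block sizes are, that db_size is a power of two, a 16 KB dbuf gives highbit(16384) = 15, so the stored shift is 14 and 1 << zap_block_shift recovers the block size.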
Example 6
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
{
	int i;
	zap_leaf_t l;
	l.l_bs = highbit(size)-1;
	l.l_phys = buf;

	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);

	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);

	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
		struct zap_leaf_entry *le;

		switch (lc->l_free.lf_type) {
		case ZAP_CHUNK_ENTRY:
			le = &lc->l_entry;

			le->le_type =		BSWAP_8(le->le_type);
			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
			le->le_next =		BSWAP_16(le->le_next);
			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
			le->le_name_numints =	BSWAP_16(le->le_name_numints);
			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
			le->le_value_numints =	BSWAP_16(le->le_value_numints);
			le->le_cd =		BSWAP_32(le->le_cd);
			le->le_hash =		BSWAP_64(le->le_hash);
			break;
		case ZAP_CHUNK_FREE:
			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
			break;
		case ZAP_CHUNK_ARRAY:
			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
			/* la_array doesn't need swapping */
			break;
		default:
			ASSERT(!"bad leaf type");
		}
	}
}
Example 7
int xskin_getcolor( Display *d, int r, int g, int b ) {

  int r0,g0,b0;

  sc   = DefaultScreen( d );
  cmap = DefaultColormap( d, sc );

  rshift = 15-highbit(xskin_vis->red_mask);
  gshift = 15-highbit(xskin_vis->green_mask);
  bshift = 15-highbit(xskin_vis->blue_mask);

  if ( iscolorinited==0 ) {
    iscolorinited=1;
    for ( r0=0 ; r0<8 ; r0++ ) {
      for ( g0=0 ; g0<8 ; g0++ ) {
	for ( b0=0 ; b0<8 ; b0++ ) {
	  cols[r0][g0][b0]=-1;
	}
      }
    }
  }

  return GetColor( d, r, g, b );
}
Example 8
void
zap_leaf_init(zap_leaf_t *l)
{
	int i;

	l->l_bs = highbit(l->l_dbuf->db_size)-1;
	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
	}
	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
}
Example 9
/*
 * hermon_cfg_wqe_sizes()
 *    Context: Only called from attach() path context
 */
static void
hermon_cfg_wqe_sizes(hermon_state_t *state, hermon_cfg_profile_t *cp)
{
	uint_t	max_size, log2;
	uint_t	max_sgl, real_max_sgl;

	/*
	 * Get the requested maximum number of SGLs per WQE from the Hermon
	 * patchable variable
	 */
	max_sgl = hermon_wqe_max_sgl;

	/*
	 * Use requested maximum number of SGL to calculate the max descriptor
	 * size (while guaranteeing that the descriptor size is a power-of-2
	 * cachelines).  We have to use the calculation for QP1 MLX transport
	 * because the possibility that we might need to inline a GRH, along
	 * with all the other headers and alignment restrictions, sets the
	 * maximum for the number of SGLs that we can advertise support for.
	 */
	max_size = (HERMON_QP_WQE_MLX_QP1_HDRS + (max_sgl << 4));
	log2 = highbit(max_size);
	if (ISP2(max_size)) {
		log2 = log2 - 1;
	}
	max_size = (1 << log2);

	max_size = min(max_size, state->hs_devlim.max_desc_sz_sq);

	/*
	 * Then use the calculated max descriptor size to determine the "real"
	 * maximum SGL (the number beyond which we would roll over to the next
	 * power-of-2).
	 */
	real_max_sgl = (max_size - HERMON_QP_WQE_MLX_QP1_HDRS) >> 4;

	/* Then save away this configuration information */
	cp->cp_wqe_max_sgl	= max_sgl;
	cp->cp_wqe_real_max_sgl = real_max_sgl;

	/* SRQ SGL gets set to its own patchable variable value */
	cp->cp_srq_max_sgl		= hermon_srq_max_sgl;
}
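The highbit()/ISP2() pair above is the usual illumos idiom for taking the ceiling of log2 so a size can be rounded up to a power of two. A standalone sketch with an illustrative helper name (ISP2() is the sysmacros power-of-two test):

/*
 * Illustrative sketch of the rounding idiom used above: returns
 * ceil(log2(x)) for x > 0.  If x is already a power of two,
 * highbit(x) - 1 is exact; otherwise highbit(x) is the exponent
 * of the next larger power of two.
 */
static uint_t
ceil_log2(uint_t x)
{
	uint_t log2 = highbit(x);

	if (ISP2(x))
		log2 = log2 - 1;
	return (log2);
}

For example, a 96-byte descriptor request gives highbit(96) = 7 and is rounded to 1 << 7 = 128 bytes, while an exact 128 bytes gives highbit(128) = 8, ISP2 true, and stays at 128.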
Example 10
/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}
Example 11
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct vdev_disk *dvd;
	int error;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Open the device if it's not currently open, otherwise just update
	 * the physical size of the device.
	 */
	if (vd->vdev_tsd == NULL) {
		dvd = vd->vdev_tsd = kmem_zalloc(sizeof(struct vdev_disk), KM_SLEEP);

		error = device_open(vd->vdev_path + 5, DO_RDWR, &dvd->device);
		if (error) {
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			return error;
		}
	} else {
		ASSERT(vd->vdev_reopening);
		dvd = vd->vdev_tsd;
	}

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = dvd->device->size;
	*ashift = highbit(MAX(DEV_BSIZE, SPA_MINBLOCKSIZE)) - 1;
	return 0;
}
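The ashift reported here is the base-2 logarithm of the vdev's minimum I/O size: with the usual 512-byte DEV_BSIZE and SPA_MINBLOCKSIZE, highbit(512) - 1 = 9; a 4 KB physical block size (as obtained via DKIOCGMEDIAINFOEXT in a later example) would yield an ashift of 12.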
Example 12
int
nfs4_rnode_init(void)
{
	ulong_t nrnode4_max;
	int i;

	/*
	 * Compute the size of the rnode4 hash table
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	nrnode4_max =
	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode4_max);
		nrnode = nrnode4_max;
	}
	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
	rtable4mask = rtable4size - 1;

	/*
	 * Allocate and initialize the hash buckets
	 */
	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
	for (i = 0; i < rtable4size; i++) {
		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
	}

	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);

	return (0);
}
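The table sizing above uses 1 << highbit(n) to get a power-of-two bucket count, so rtable4mask can serve as a bit mask: for nrnode / rnode4_hashlen = 3000, highbit(3000) = 12 and the table gets 4096 buckets; note that an input which is already a power of two is doubled (4096 would become 8192).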
Example 13
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vnode_t *devvp = NULLVP;
	vfs_context_t context = NULL;
	uint64_t blkcnt;
	uint32_t blksize;
	int fmode = 0;
	int error = 0;
	int isssd;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		if (dvd->vd_offline) {
			/*
			 * If we are opening a device in its offline notify
			 * context, the LDI handle was just closed. Clean
			 * up the LDI event callbacks and free vd->vdev_tsd.
			 */
			vdev_disk_free(vd);
		} else {
			ASSERT(vd->vdev_reopening);
			devvp = dvd->vd_devvp;
			goto skip_open;
		}
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	/* ### APPLE TODO ### */
#ifdef illumos
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
	}
#endif

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		context = vfs_context_create( spl_vfs_context_kernel() );

		/* Obtain an opened/referenced vnode for the device. */
		if ((error = vnode_open(vd->vdev_path, spa_mode(spa), 0, 0,
								&devvp, context))) {
			goto out;
		}
		if (!vnode_isblk(devvp)) {
			error = ENOTBLK;
			goto out;
		}
		/*
		 * ### APPLE TODO ###
		 * vnode_authorize devvp for KAUTH_VNODE_READ_DATA and
		 * KAUTH_VNODE_WRITE_DATA
		 */

		/*
		 * Disallow opening of a device that is currently in use.
		 * Flush out any old buffers remaining from a previous use.
		 */
		if ((error = vfs_mountedon(devvp))) {
			goto out;
		}
		if (VNOP_FSYNC(devvp, MNT_WAIT, context) != 0) {
			error = ENOTBLK;
			goto out;
		}
		if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
			goto out;
		}

	} else {
		goto out;
	}


	int len = MAXPATHLEN;
	if (vn_getpath(devvp, dvd->vd_readlinkname, &len) == 0) {
		dprintf("ZFS: '%s' resolved name is '%s'\n",
			   vd->vdev_path, dvd->vd_readlinkname);
	} else {
		dvd->vd_readlinkname[0] = 0;
	}



skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0,
	    context) != 0 ||
	    VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0,
	    context) != 0) {
		error = EINVAL;
		goto out;
	}

	*psize = blkcnt * (uint64_t)blksize;
	*max_psize = *psize;

	dvd->vd_ashift = highbit(blksize) - 1;
	dprintf("vdev_disk: Device %p ashift set to %d\n", devvp,
	    dvd->vd_ashift);


	*ashift = highbit(MAX(blksize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 *  ### APPLE TODO ###
	 */
#ifdef illumos
	if (vd->vdev_wholedisk == 1) {
		int wce = 1;
		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize = capacity * blksz;
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching.  We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}
#endif

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	vd->vdev_nonrot = B_FALSE;
	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0,
				   context) == 0) {
		if (isssd)
			vd->vdev_nonrot = B_TRUE;
	}
	dprintf("ZFS: vdev_disk(%s) isSSD %d\n", vd->vdev_path ? vd->vdev_path : "",
			isssd);

	dvd->vd_devvp = devvp;
out:
	if (error) {
		if (devvp) {
			vnode_close(devvp, fmode, context);
			dvd->vd_devvp = NULL;
		}
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
	}
	if (context)
		(void) vfs_context_rele(context);

	if (error)
		printf("ZFS: vdev_disk_open('%s') failed error %d\n",
		    vd->vdev_path ? vd->vdev_path : "", error);
	return (error);
}
Example 14
/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t		ibt_srqhdl;
	tavor_pdhdl_t		pd;
	ibt_srq_sizes_t		*sizes;
	ibt_srq_sizes_t		*real_sizes;
	tavor_srqhdl_t		*srqhdl;
	ibt_srq_flags_t		flags;
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		*buf;
	tavor_srqhdl_t		srq;
	tavor_umap_db_entry_t	*umapdb;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_mrhdl_t		mr;
	uint64_t		addr;
	uint64_t		value, srq_desc_off;
	uint32_t		lkey;
	uint32_t		log_srq_size;
	uint32_t		uarpg;
	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
	int			flag, status;
	char			*errormsg;
	uint_t			max_sgl;
	uint_t			wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	TAVOR_TNF_ENTER(tavor_srq_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the SRQ's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes	   = srqinfo->srqi_sizes;
	pd	   = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags	   = srqinfo->srqi_flags;
	srqhdl	   = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto srqalloc_fail3;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	/* Calculate the SRQ number */
	tavor_srq_numcalc(state, srqc->tr_indx, &srq->srq_srqnum);

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes.  This depends on the requested
	 * number of SGLs.  Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_qp_options_t structure.  Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important.  We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size.  That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum).  Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size.  So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint).  But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0, for
	 * a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as when we
	 * perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues.  The memory for the SRQ
	 * must be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the SRQ context later.  Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate.  And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page).  If we
	 * fail here, we still have the bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry.  This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h	   = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds		   = (wqesz >> 4);
	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd		   = pd->pd_pdnum;
	srqc_entry.lkey		   = lkey;
	srqc_entry.wqe_cnt	   = 0;
	if (srq_is_umap) {
		srqc_entry.uar	   = uarpg;
	} else {
		srqc_entry.uar	   = 0;
	}

	/*
	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle.  We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp	   = rsrc;
	srq->srq_mrhdl	   = mr;
	srq->srq_refcnt	   = 0;
	srq->srq_is_umap   = srq_is_umap;
	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
	srq->srq_pdhdl	   = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	srq->srq_wq_buf	   = buf;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
	srq->srq_state	   = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);
}
Example 15
/*
 * hermon_cfg_profile_init_phase2()
 *    Context: Only called from attach() path context
 */
int
hermon_cfg_profile_init_phase2(hermon_state_t *state)
{
	hermon_cfg_profile_t	*cp;
	hermon_hw_querydevlim_t	*devlim;
	hermon_hw_query_port_t	*port;
	uint32_t		num, size;
	int			i;

	/* Read in the device limits */
	devlim = &state->hs_devlim;
	/* and the port information */
	port = &state->hs_queryport;

	/* Read the configuration profile */
	cp = state->hs_cfg_profile;

	/*
	 * We configure all Hermon HCAs with the same profile, which
	 * is based upon the default value assignments above. If we want to
	 * add additional profiles in the future, they can be added here.
	 * Note the reference to "Memfree" is a holdover from Arbel/Sinai
	 */
	if (state->hs_cfg_profile_setting != HERMON_CFG_MEMFREE) {
		return (DDI_FAILURE);
	}

	/*
	 * Note for most configuration parameters, we use the lesser of our
	 * desired configuration value or the device-defined maximum value.
	 */
	cp->cp_log_num_mtt	= min(hermon_log_num_mtt, devlim->log_max_mtt);
	cp->cp_log_num_dmpt = min(hermon_log_num_dmpt, devlim->log_max_dmpt);
	cp->cp_log_num_cmpt	= HERMON_LOG_CMPT_PER_TYPE + 2;	/* times 4, */
								/* per PRM */
	cp->cp_log_max_mrw_sz	= min(hermon_log_max_mrw_sz,
	    devlim->log_max_mrw_sz);
	cp->cp_log_num_pd	= min(hermon_log_num_pd, devlim->log_max_pd);
	cp->cp_log_num_qp	= min(hermon_log_num_qp, devlim->log_max_qp);
	cp->cp_log_num_cq	= min(hermon_log_num_cq, devlim->log_max_cq);
	cp->cp_log_num_srq	= min(hermon_log_num_srq, devlim->log_max_srq);
	cp->cp_log_num_eq	= min(hermon_log_num_eq, devlim->log_max_eq);
	cp->cp_log_eq_sz	= min(hermon_log_eq_sz, devlim->log_max_eq_sz);
	cp->cp_log_num_rdb	= cp->cp_log_num_qp +
	    min(hermon_log_num_rdb_per_qp, devlim->log_max_ra_req_qp);
	cp->cp_hca_max_rdma_in_qp = cp->cp_hca_max_rdma_out_qp =
	    1 << min(hermon_log_num_rdb_per_qp, devlim->log_max_ra_req_qp);
	cp->cp_num_qp_per_mcg	= max(hermon_num_qp_per_mcg,
	    HERMON_NUM_QP_PER_MCG_MIN);
	cp->cp_num_qp_per_mcg	= min(cp->cp_num_qp_per_mcg,
	    (1 << devlim->log_max_qp_mcg) - 8);
	cp->cp_num_qp_per_mcg	= (1 << highbit(cp->cp_num_qp_per_mcg + 7)) - 8;
	cp->cp_log_num_mcg 	= min(hermon_log_num_mcg, devlim->log_max_mcg);
	cp->cp_log_num_mcg_hash	= hermon_log_num_mcg_hash;

	/* until srq_resize is debugged, disable it */
	cp->cp_srq_resize_enabled = 0;

	/* cp->cp_log_num_uar	= hermon_log_num_uar; */
	/*
	 * now, we HAVE to calculate the number of UAR pages, so that we can
	 * get the blueflame stuff correct as well
	 */

	size = devlim->log_max_uar_sz;
	/* 1MB (2^^20) times size (2^^size) / sparc_pg (2^^13) */
	num = (20 + size) - 13;		/* XXX - consider using PAGESHIFT */
	if (devlim->blu_flm)
		num -= 1;	/* if blueflame, only half the size for UARs */
	cp->cp_log_num_uar	= min(hermon_log_num_uar, num);


	/* while we're at it, calculate the index of the kernel uar page */
	/* either the number of reserved UARs or 128, whichever is larger */
	state->hs_kernel_uar_index = (devlim->num_rsvd_uar > 128) ?
	    devlim->num_rsvd_uar : 128;

	cp->cp_log_max_pkeytbl	= port->log_max_pkey;

	cp->cp_log_max_qp_sz	= devlim->log_max_qp_sz;
	cp->cp_log_max_cq_sz	= devlim->log_max_cq_sz;
	cp->cp_log_max_srq_sz	= devlim->log_max_srq_sz;
	cp->cp_log_max_gidtbl	= port->log_max_gid;
	cp->cp_max_mtu		= port->ib_mtu;	/* XXX now from query_port */
	cp->cp_max_port_width	= port->ib_port_wid;  /* now from query_port */
	cp->cp_max_vlcap	= port->max_vl;
	cp->cp_log_num_ah	= hermon_log_num_ah;

	/* Paranoia, ensure no arrays indexed by port_num are out of bounds */
	cp->cp_num_ports	= devlim->num_ports;
	if (cp->cp_num_ports > HERMON_MAX_PORTS) {
		cmn_err(CE_CONT, "device has more ports (%d) than are "
		    "supported; Using %d ports\n",
		    cp->cp_num_ports, HERMON_MAX_PORTS);
		cp->cp_num_ports = HERMON_MAX_PORTS;
	}

	/* allocate variable sized arrays */
	for (i = 0; i < HERMON_MAX_PORTS; i++) {
		state->hs_pkey[i] = kmem_zalloc((1 << cp->cp_log_max_pkeytbl) *
		    sizeof (ib_pkey_t), KM_SLEEP);
		state->hs_guid[i] = kmem_zalloc((1 << cp->cp_log_max_gidtbl) *
		    sizeof (ib_guid_t), KM_SLEEP);
	}

	/* Determine WQE sizes from requested max SGLs */
	hermon_cfg_wqe_sizes(state, cp);

	/* Set whether to use MSIs or not */
	cp->cp_use_msi_if_avail = hermon_use_msi_if_avail;

#if !defined(_ELF64)
	/*
	 * Need to reduce the hermon kernel virtual memory footprint
	 * on 32-bit kernels.
	 */
	cp->cp_log_num_mtt	-= 6;
	cp->cp_log_num_dmpt	-= 6;
	cp->cp_log_num_pd	-= 6;
	cp->cp_log_num_qp	-= 6;
	cp->cp_log_num_cq	-= 6;
	cp->cp_log_num_srq	-= 6;
	cp->cp_log_num_rdb	= cp->cp_log_num_qp +
	    min(hermon_log_num_rdb_per_qp, devlim->log_max_ra_req_qp);
	cp->cp_hca_max_rdma_in_qp = cp->cp_hca_max_rdma_out_qp =
	    1 << min(hermon_log_num_rdb_per_qp, devlim->log_max_ra_req_qp);
#endif

	return (DDI_SUCCESS);
}
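One line in the phase-2 code worth spelling out: (1 << highbit(cp_num_qp_per_mcg + 7)) - 8 rounds (count + 8) up to the next power of two and then subtracts the 8 again, so a count of 24 stays 24 (highbit(31) = 5, 32 - 8) while 25 becomes 56 (highbit(32) = 6, 64 - 8); the preceding min() against (1 << devlim->log_max_qp_mcg) - 8 keeps the rounded result within the device limit.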
Example 16
void
vpm_init()
{
	long  npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : number of freelist "
		    "vpm_nfreelist %d using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
				KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk thru a large number of
		 * these data structures. We just use the smap's prefetch
		 * routine as it does the same. This should work fine
		 * for x64 (this needs to be modified when enabled on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}
Example 17
static int
vdev_disk_open(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift)
{
	vdev_disk_t *dvd = NULL;
	vnode_t *devvp = NULLVP;
	vfs_context_t context = NULL;
	uint64_t blkcnt;
	uint32_t blksize;
	int fmode = 0;
	int error = 0;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	dvd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (dvd == NULL)
		return (ENOMEM);

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 *
	 */

	/* ### APPLE TODO ### */
	/* ddi_devid_str_decode */

	context = vfs_context_create((vfs_context_t)0);

	/* Obtain an opened/referenced vnode for the device. */
	error = vnode_open(vd->vdev_path, spa_mode(vd->vdev_spa), 0, 0, &devvp, context);
	if (error) {
		goto out;
	}

	if (!vnode_isblk(devvp)) {
		error = ENOTBLK;
		goto out;
	}

	/* ### APPLE TODO ### */
	/* vnode_authorize devvp for KAUTH_VNODE_READ_DATA and
	 * KAUTH_VNODE_WRITE_DATA
	 */

	/*
	 * Disallow opening of a device that is currently in use.
	 * Flush out any old buffers remaining from a previous use.
	 */
	if ((error = vfs_mountedon(devvp))) {
		goto out;
	}
	if (VNOP_FSYNC(devvp, MNT_WAIT, context) != 0) {
		error = ENOTBLK;
		goto out;
	}
	if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/*
	 * Determine the actual size of the device.
	 */
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, context)
	       	!= 0 || VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt,
		0, context) != 0) {

		error = EINVAL;
		goto out;
	}
	*size = blkcnt * (uint64_t)blksize;

	/*
	 *  ### APPLE TODO ###
	 * If we own the whole disk, try to enable disk write caching.
	 */

	/*
	 * Take the device's minimum transfer size into account.
	 */
	*ashift = highbit(MAX(blksize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * Setting the vdev_ashift did in fact break the pool for import
	 * on ZEVO. This puts the logic into question. It appears that vdev_top
	 * will also then change. It then panics in space_map from metaslab_alloc
	 */
	//vd->vdev_ashift = *ashift;
	dvd->vd_ashift = *ashift;


	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;
	vd->vdev_tsd = dvd;
	dvd->vd_devvp = devvp;
out:
	if (error) {
		if (devvp)
			vnode_close(devvp, fmode, context);
		if (dvd)
			kmem_free(dvd, sizeof (vdev_disk_t));

		/*
		 * Since the open has failed, vd->vdev_tsd should
		 * be NULL when we get here, signaling to the
		 * rest of the spa not to try and reopen or close this device
		 */
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
	}
	if (context) {
		(void) vfs_context_rele(context);
	}
	return (error);
}
Example 18
void
cpu_intrq_cleanup(struct cpu *cpu)
{
	struct machcpu *mcpup = &cpu->cpu_m;
	int cpu_list_size;
	uint64_t cpu_q_size;
	uint64_t dev_q_size;
	uint64_t cpu_rq_size;
	uint64_t cpu_nrq_size;

	/*
	 * Free mondo data for xcalls.
	 */
	if (mcpup->mondo_data) {
		contig_mem_free(mcpup->mondo_data, INTR_REPORT_SIZE);
		mcpup->mondo_data = NULL;
		mcpup->mondo_data_ra = NULL;
	}

	/*
	 *  Free per-cpu list of ncpu_guest_max for xcalls
	 */
	cpu_list_size = ncpu_guest_max * sizeof (uint16_t);
	if (cpu_list_size < INTR_REPORT_SIZE)
		cpu_list_size = INTR_REPORT_SIZE;

	/*
	 * contig_mem_alloc() requires size to be a power of 2.
	 * Increase size to a power of 2 if necessary.
	 */
	if ((cpu_list_size & (cpu_list_size - 1)) != 0) {
		cpu_list_size = 1 << highbit(cpu_list_size);
	}

	if (mcpup->cpu_list) {
		contig_mem_free(mcpup->cpu_list, cpu_list_size);
		mcpup->cpu_list = NULL;
		mcpup->cpu_list_ra = NULL;
	}

	/*
	 * Free sun4v interrupt and error queues.
	 */
	if (mcpup->cpu_q_va) {
		cpu_q_size = cpu_q_entries * INTR_REPORT_SIZE;
		contig_mem_free(mcpup->cpu_q_va, cpu_q_size);
		mcpup->cpu_q_va = NULL;
		mcpup->cpu_q_base_pa = NULL;
		mcpup->cpu_q_size = 0;
	}

	if (mcpup->dev_q_va) {
		dev_q_size = dev_q_entries * INTR_REPORT_SIZE;
		contig_mem_free(mcpup->dev_q_va, dev_q_size);
		mcpup->dev_q_va = NULL;
		mcpup->dev_q_base_pa = NULL;
		mcpup->dev_q_size = 0;
	}

	if (mcpup->cpu_rq_va) {
		cpu_rq_size = cpu_rq_entries * Q_ENTRY_SIZE;
		contig_mem_free(mcpup->cpu_rq_va, 2 * cpu_rq_size);
		mcpup->cpu_rq_va = NULL;
		mcpup->cpu_rq_base_pa = NULL;
		mcpup->cpu_rq_size = 0;
	}

	if (mcpup->cpu_nrq_va) {
		cpu_nrq_size = cpu_nrq_entries * Q_ENTRY_SIZE;
		contig_mem_free(mcpup->cpu_nrq_va, 2 * cpu_nrq_size);
		mcpup->cpu_nrq_va = NULL;
		mcpup->cpu_nrq_base_pa = NULL;
		mcpup->cpu_nrq_size = 0;
	}
}
Example 19
int
cpu_intrq_setup(struct cpu *cpu)
{
	struct machcpu *mcpup = &cpu->cpu_m;
	size_t size;

	/*
	 * This routine will return with an error return if any
	 * contig_mem_alloc() fails.  It is expected that the caller will
	 * call cpu_intrq_cleanup() (or cleanup_cpu_common() which will).
	 * That will cleanly free only those blocks that were alloc'd.
	 */

	/*
	 * Allocate mondo data for xcalls.
	 */
	mcpup->mondo_data = contig_mem_alloc(INTR_REPORT_SIZE);

	if (mcpup->mondo_data == NULL) {
		cmn_err(CE_NOTE, "cpu%d: cpu mondo_data allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	/*
	 * va_to_pa() is too expensive to call for every crosscall
	 * so we do it here at init time and save it in machcpu.
	 */
	mcpup->mondo_data_ra = va_to_pa(mcpup->mondo_data);

	/*
	 *  Allocate a per-cpu list of ncpu_guest_max for xcalls
	 */
	size = ncpu_guest_max * sizeof (uint16_t);
	if (size < INTR_REPORT_SIZE)
		size = INTR_REPORT_SIZE;

	/*
	 * contig_mem_alloc() requires size to be a power of 2.
	 * Increase size to a power of 2 if necessary.
	 */
	if ((size & (size - 1)) != 0) {
		size = 1 << highbit(size);
	}

	mcpup->cpu_list = contig_mem_alloc(size);

	if (mcpup->cpu_list == NULL) {
		cmn_err(CE_NOTE, "cpu%d: cpu cpu_list allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	mcpup->cpu_list_ra = va_to_pa(mcpup->cpu_list);

	/*
	 * Allocate sun4v interrupt and error queues.
	 */
	size = cpu_q_entries * INTR_REPORT_SIZE;

	mcpup->cpu_q_va = contig_mem_alloc(size);

	if (mcpup->cpu_q_va == NULL) {
		cmn_err(CE_NOTE, "cpu%d: cpu intrq allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	mcpup->cpu_q_base_pa = va_to_pa(mcpup->cpu_q_va);
	mcpup->cpu_q_size = size;

	/*
	 * Allocate device queues
	 */
	size = dev_q_entries * INTR_REPORT_SIZE;

	mcpup->dev_q_va = contig_mem_alloc(size);

	if (mcpup->dev_q_va == NULL) {
		cmn_err(CE_NOTE, "cpu%d: dev intrq allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	mcpup->dev_q_base_pa = va_to_pa(mcpup->dev_q_va);
	mcpup->dev_q_size = size;

	/*
	 * Allocate resumable queue and its kernel buffer
	 */
	size = cpu_rq_entries * Q_ENTRY_SIZE;

	mcpup->cpu_rq_va = contig_mem_alloc(2 * size);

	if (mcpup->cpu_rq_va == NULL) {
		cmn_err(CE_NOTE, "cpu%d: resumable queue allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	mcpup->cpu_rq_base_pa = va_to_pa(mcpup->cpu_rq_va);
	mcpup->cpu_rq_size = size;
	/* zero out the memory */
	bzero(mcpup->cpu_rq_va, 2 * size);

	/*
	 * Allocate non-resumable queues
	 */
	size = cpu_nrq_entries * Q_ENTRY_SIZE;

	mcpup->cpu_nrq_va = contig_mem_alloc(2 * size);

	if (mcpup->cpu_nrq_va == NULL) {
		cmn_err(CE_NOTE, "cpu%d: nonresumable queue allocation failed",
		    cpu->cpu_id);
		return (ENOMEM);
	}
	mcpup->cpu_nrq_base_pa = va_to_pa(mcpup->cpu_nrq_va);
	mcpup->cpu_nrq_size = size;
	/* zero out the memory */
	bzero(mcpup->cpu_nrq_va, 2 * size);

	return (0);
}
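Both cpu_intrq_setup() above and cpu_intrq_cleanup() in the previous example size their contig_mem_alloc()/contig_mem_free() buffers with the same power-of-two round-up. A standalone sketch (illustrative helper name, not kernel code):

/*
 * Illustrative sketch of the sizing idiom used above: an exact
 * power of two is left alone, anything else is rounded up to the
 * next one, e.g. 4096 stays 4096 and 4097 becomes 8192
 * (highbit(4097) == 13).
 */
static size_t
roundup_pow2(size_t size)
{
	if ((size & (size - 1)) != 0)
		size = (size_t)1 << highbit(size);
	return (size);
}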
Example 20
static void XSetupDisplay(int nframes) {
  XGCValues xgcv;
  XtAccelerators keys;

  /* Had to do it this way since embedding the keystrokes in the fallback
     resources failed to work properly -- what a kludge. */

  keys = XtParseAcceleratorTable(ckeys);

  xi.depth = DefaultDepthOfScreen(DefaultScreenOfDisplay(xi.disp));

  // give me TrueColor
  if (!XMatchVisualInfo(xi.disp, xi.screenno, xi.depth, TrueColor, &(xi.vi)))
    ErrorExit(ERROR_BADPARM, "Could not find a TrueColor visual");

  xi.vis = xi.vi.visual;
  xi.root = RootWindow(xi.disp, xi.screenno);

  // AllocNone -- clients can allocate the colormap entries
  // For TrueColor, alloc must be AllocNone
  xi.colormap = XCreateColormap(xi.disp, xi.root, xi.vis, AllocNone);

  toplevel = XtVaAppCreateShell("NMovie", "NMovie",
                                applicationShellWidgetClass,
                                xi.disp,
                                XtNvisual, xi.vis,
                                XtNcolormap, xi.colormap,
                                NULL);

  XtAppAddActions(xi.context,actions,XtNumber(actions));

  // frame
  frame = XtVaCreateManagedWidget("Frame", formWidgetClass, toplevel, NULL);
  // create buttons
  buttons = XtVaCreateManagedWidget("Buttons", formWidgetClass, frame, NULL );
  loop_bt = XtVaCreateManagedWidget("Loop", commandWidgetClass,
                                    buttons, NULL);
  swing_bt = XtVaCreateManagedWidget("Swing", commandWidgetClass,
                                     buttons, NULL);
  fast_bt = XtVaCreateManagedWidget("Faster", commandWidgetClass,
                                    buttons, NULL);
  slow_bt = XtVaCreateManagedWidget("Slower", commandWidgetClass,
                                    buttons, NULL);
  stop_bt = XtVaCreateManagedWidget("Stop", commandWidgetClass,
                                    buttons, NULL);
  back_bt = XtVaCreateManagedWidget("Back", commandWidgetClass,
                                    buttons, NULL);
  forward_bt = XtVaCreateManagedWidget("Forward", commandWidgetClass,
                                       buttons, NULL);
  quit_bt = XtVaCreateManagedWidget("Quit", commandWidgetClass,
                                    buttons, NULL);
  // canvas
  canvas = XtVaCreateManagedWidget("Canvas", simpleWidgetClass, frame,
                                   XtNwidth, cols,
                                   XtNheight, rows,
                                   XtNaccelerators, keys, NULL);
  XtInstallAllAccelerators(canvas,toplevel);
  XtRealizeWidget(toplevel);
  xi.canvas = XtWindow(canvas);

  xi.theGC = XCreateGC(xi.disp, xi.canvas, 0L, &xgcv);

  xi.rmask = xi.vis->red_mask;   // 0xFF0000
  xi.gmask = xi.vis->green_mask; // 0x00FF00
  xi.bmask = xi.vis->blue_mask;  // 0x0000FF

  xi.rshift = 7 - highbit(xi.rmask); // -16
  xi.gshift = 7 - highbit(xi.gmask); //  -8
  xi.bshift = 7 - highbit(xi.bmask); //   0

  // format is ZPixmap                                offset,data
  ximg = XCreateImage(xi.disp,xi.vis,xi.depth,ZPixmap, 0,    NULL,
                      // bytes_per_line = 0: assume contiguous rows and let XCreateImage calculate it
                      cols, rows, 32, 0);

  if ((imgdata = (char *)calloc((size_t)(rows*ximg->bytes_per_line*nframes),
                                sizeof(byte)))
      ==NULL)
    ErrorExit(ERROR_NO_MEMORY,"Failed to allocate image buffer");

  ximg->data = (char *) imgdata;
}
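Note that the highbit() used in the X11 examples (xskin_getcolor, XSetupDisplay, xskin_loadBMP) is a locally defined helper with different semantics from the kernel routine: judging by the inline comments above (7 - highbit(0xFF0000) yielding -16), it returns the 0-based index of the most significant set bit. The resulting rshift/gshift/bshift values are presumably applied later when scaling 8-bit (or, in the TiMidity skin code, 16-bit) color components into the visual's red/green/blue masks.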
Example 21
Pixmap xskin_loadBMP( Display *d, Window w, char *filename,
		int *width, int *height ) {

  Pixmap ret=0;
  BMPHeader *bmp;
  struct timidity_file *fp;

  GC gc;

  if ( width!=NULL )
    *width=-1;
  if ( height!=NULL )
    *height=-1;

  sc   = DefaultScreen( d );
  gc   = DefaultGC( d, sc );
  cmap = DefaultColormap( d, sc );

  rshift = 15-highbit(xskin_vis->red_mask);
  gshift = 15-highbit(xskin_vis->green_mask);
  bshift = 15-highbit(xskin_vis->blue_mask);

  fp = open_file( filename, 1, OF_SILENT );
  if ( fp == NULL ) return ret;
  if ( fp->url->url_tell == NULL ) {
    fp->url = url_buff_open( fp->url, 1 );
  }

  bmp = loadBMPHeader( fp );
  if ( bmp==NULL ) goto finish1;

  if ( !loadBMPColors( d, bmp, fp ) ) goto finish1;

  ret = XCreatePixmap( d, w, bmp->w, bmp->h, xskin_depth );
  XSetForeground( d, gc, 0 );
  XFillRectangle( d, ret, gc, 0, 0, bmp->w, bmp->h );
  XSetForeground( d, gc, WhitePixel( d, sc ));

  switch( bmp->bitcounts ) {
  case 4:
    if ( bmp->compress == 0 )
      Draw4bit( d, ret, gc, bmp, fp );
    else if ( bmp->compress == 2 )
      DrawCompressed4bit( d, ret, gc, bmp, fp );
    break;
  case 8:
    if ( bmp->compress == 0 )
      Draw8bit( d, ret, gc, bmp, fp );
    else if ( bmp->compress == 1 )
      DrawCompressed8bit( d, ret, gc, bmp, fp );
    break;
  case 24:
    Draw24bit( d, ret, gc, bmp, fp );
    break;
  default:
    break;
  }

  if ( width!=NULL )
    *width = bmp->w;
  if ( height!=NULL )
    *height = bmp->h;

finish1:
  close_file( fp );
  return ret;
}
Example 22
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd;
	struct dk_minfo_ext dkmext;
	int error;
	dev_t dev;
	int otyp;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (vd->vdev_tsd != NULL) {
		ASSERT(vd->vdev_reopening);
		dvd = vd->vdev_tsd;
		goto skip_open;
	}

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {
		ddi_devid_t devid;

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);
			ldi_handle_t lh;

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &lh, zfs_li) == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
				(void) ldi_close(lh, spa_mode(spa), kcred);
			} else {
				kmem_free(buf, len);
			}
		}

		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
		    &dvd->vd_lh, zfs_li);

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = EINVAL;
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL)
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check).  While not
	 * as reliable as the devid, this will give us something, and the higher
	 * level vdev validation will prevent us from opening the wrong device.
	 */
	if (error) {
		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above.  This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (EINVAL);
	}

	/*
	 * If we own the whole disk, try to enable disk write caching.
	 * We ignore errors because it's OK if we can't do it.
	 */
	if (vd->vdev_wholedisk == 1) {
		int wce = 1;
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
	    FKIOCTL, kcred, NULL) != 0)
		dkmext.dki_pbsize = DEV_BSIZE;

	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}
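
The examples in this collection all rely on the same contract: highbit() returns
the 1-based index of the most significant set bit of its argument (and 0 when the
argument is 0), so "highbit(x) - 1" is floor(log2(x)).  The sketch below is only a
user-space illustration of that contract under that assumption, not the kernel
implementation; the helper name highbit_sketch() and the sample values are made up
for the example.

#include <stdio.h>

/* Illustrative stand-in for the kernel's highbit(): 1-based MSB index. */
static int
highbit_sketch(unsigned long long v)
{
	int bit = 0;

	while (v != 0) {
		bit++;
		v >>= 1;
	}
	return (bit);
}

int
main(void)
{
	/* ashift derivation as in vdev_disk_open(): log2 of the block size. */
	printf("512-byte sectors  -> ashift %d\n", highbit_sketch(512) - 1);
	printf("4096-byte sectors -> ashift %d\n", highbit_sketch(4096) - 1);

	/* Round a queue size up to a power of two, as tavor_srq_modify() does. */
	unsigned int size = 1000;
	int log = highbit_sketch(size);
	if ((size & (size - 1)) == 0)
		log--;
	printf("%u rounds up to %u entries\n", size, 1U << log);

	/* Round a CPU count down to a power of two, as taskq_create_common() does. */
	unsigned int ncpus = 6;
	printf("%u CPUs -> %u buckets\n", ncpus,
	    1U << (highbit_sketch(ncpus) - 1));

	return (0);
}
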
Esempio n. 23
0
/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
	tavor_bind_info_t	bind;
	tavor_bind_info_t	old_bind;
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_mrhdl_t		mr;
	tavor_hw_mpt_t		mpt_entry;
	tavor_wrid_entry_t	*wre_new, *wre_old;
	uint64_t		mtt_ddrbaseaddr, mtt_addr;
	uint64_t		srq_desc_off;
	uint32_t		*buf, srq_old_bufsz;
	uint32_t		wqesz;
	uint_t			max_srq_size;
	uint_t			dma_xfer_mode, mtt_pgsize_bits;
	uint_t			srq_sync, log_srq_size, maxprot;
	uint_t			wq_location;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_srq_modify);

	/*
	 * Check the "inddr" flag.  This flag tells the driver whether the
	 * SRQ's work queues should come from normal system memory or be
	 * allocated from HCA-attached DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_modify);
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if ((size & (size - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}
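	/*
	 * Worked example (illustrative): a request for 1000 WQEs gives
	 * highbit(1000) == 10, and 1000 is not a power-of-2, so the SRQ is
	 * rounded up to 1 << 10 == 1024 entries; a request for exactly 1024
	 * gives highbit(1024) == 11 and is decremented back to 10, leaving
	 * the size at 1024 rather than doubling it.
	 */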

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align  = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list.  This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
		    "failed wre_new alloc");
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
	bind.bi_len   = new_srqinfo.qa_size;
	bind.bi_as    = NULL;
	bind.bi_flags = (sleepflag == TAVOR_SLEEP ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len	= bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Now we grab the SRQ lock.  Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
		    srq->srq_mrhdl->mr_mttrsrcp);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo	   = srq->srq_wqinfo;
	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo	   = new_srqinfo;
	srq->srq_wq_buf	   = buf;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist already is setup on an SRQ (the QP associated with an
	 * SRQ has moved "from_reset") then we must update this wridlist based
	 * on the new SRQ size.  We allocate a Work Request ID Entry list of
	 * the new size, copy the old entries over to it, and re-initialize
	 * the SRQ wridlist in the non-umap case.
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory.  We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: The remap really shouldn't ever fail.  If it does, it is an
	 * indication that something has gone seriously wrong, so we print a
	 * warning message and return an error (knowing, of course, that the
	 * "old" SRQ memory will be leaked).
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			TAVOR_TNF_EXIT(tavor_srq_modify);
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed to unbind (old)");
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new SRQ size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (DDI_SUCCESS);

srqmodify_fail:
	TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (status);
}
Esempio n. 24
0
/* ARGSUSED */
int
socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags,
    struct cred *cr, caller_context_t *ct)
{
	dev_t		fsid;
	struct sonode 	*so;
	static int	sonode_shift = 0;

	/*
	 * Calculate how far a sonode pointer can be shifted while still
	 * keeping the result unique.  See below.
	 */
	if (sonode_shift == 0)
		sonode_shift = highbit(sizeof (struct sonode));
	ASSERT(sonode_shift > 0);
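	/*
	 * Illustrative note: the shift is roughly log2(sizeof (struct
	 * sonode)), which turns a byte address into an object-sized index,
	 * so the 16 bits kept below span on the order of 2^16 sonodes
	 * rather than only 64KB of address space.
	 */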

	so = VTOSO(vp);
	fsid = sockdev;

	if (so->so_version == SOV_STREAM) {
		/*
		 * The imaginary "sockmod" has been popped - act
		 * as a stream
		 */
		vap->va_type = VCHR;
		vap->va_mode = 0;
	} else {
		vap->va_type = vp->v_type;
		vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
		    S_IROTH|S_IWOTH;
	}
	vap->va_uid = vap->va_gid = 0;
	vap->va_fsid = fsid;
	/*
	 * If the va_nodeid is > MAX_USHORT, then i386 stats might fail.
	 * So we shift down the sonode pointer to try and get the most
	 * uniqueness into 16-bits.
	 */
	vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF;
	vap->va_nlink = 0;
	vap->va_size = 0;

	/*
	 * We need to zero out the va_rdev to avoid some fstats getting
	 * EOVERFLOW.  This also mimics SunOS 4.x and BSD behavior.
	 */
	vap->va_rdev = (dev_t)0;
	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = btod(vap->va_size);

	if (!SOCK_IS_NONSTR(so)) {
		sotpi_info_t *sti = SOTOTPI(so);

		mutex_enter(&so->so_lock);
		vap->va_atime.tv_sec = sti->sti_atime;
		vap->va_mtime.tv_sec = sti->sti_mtime;
		vap->va_ctime.tv_sec = sti->sti_ctime;
		mutex_exit(&so->so_lock);
	} else {
		vap->va_atime.tv_sec = 0;
		vap->va_mtime.tv_sec = 0;
		vap->va_ctime.tv_sec = 0;
	}

	vap->va_atime.tv_nsec = 0;
	vap->va_mtime.tv_nsec = 0;
	vap->va_ctime.tv_nsec = 0;
	vap->va_seq = 0;

	return (0);
}
Esempio n. 25
0
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
	zap_t *winner;
	zap_t *zap;
	int i;

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, 0, 0);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = os;
	zap->zap_object = obj;
	zap->zap_dbuf = db;

	if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, 0, 0);
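		/*
		 * For a power-of-2 block size this is log2(db_size), e.g.
		 * highbit(131072) - 1 == 17 for a 128K block.
		 */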
		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);

	if (winner != NULL) {
#ifdef __APPLE__
		if (!zap->zap_ismicro)
			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
#endif
		kmem_free(zap, sizeof (zap_t));
		return (winner);
	}

	if (zap->zap_ismicro) {
		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
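		/*
		 * The first MZAP_ENT_LEN-sized chunk of the block holds the
		 * microzap header, hence the "- 1" above.
		 */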
		avl_create(&zap->zap_m.zap_avl, mze_compare,
		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));

		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap->zap_m.zap_phys->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				mze_insert(zap, i,
				    zap_hash(zap, mze->mze_name), mze);
			}
		}
	} else {
		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap->zap_f.zap_phys->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap->zap_f.zap_phys, ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);
}
Esempio n. 26
0
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	ddi_devid_t devid;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
			/*
			 * If we are opening a device in its offline notify
			 * context, the LDI handle was just closed. Clean
			 * up the LDI event callbacks and free vd->vdev_tsd.
			 */
			vdev_disk_free(vd);
		} else {
			ASSERT(vd->vdev_reopening);
			goto skip_open;
		}
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by the
		 * specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check).  While not
	 * as reliable as the devid, this will give us something, and the higher
	 * level vdev validation will prevent us from opening the wrong device.
	 */
	if (error) {
		if (vd->vdev_devid != NULL)
			validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't support the legacy auto-wholedisk
		 * behavior as above.  It hasn't been used in a very long time
		 * and we don't need to propagate its oddities to this edge
		 * condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid && spa_writeable(spa) &&
	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
			char *vd_devid;

			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
			zfs_dbgmsg("vdev %s: update devid from %s, "
			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
			spa_strfree(vd->vdev_devid);
			vd->vdev_devid = spa_strdup(vd_devid);
			ddi_devid_str_free(vd_devid);
		}
		ddi_devid_free(devid);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}
skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

	*ashift = highbit(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
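	/*
	 * Illustrative note: a 4096-byte pbsize gives an ashift of 12 here;
	 * the SPA_MINBLOCKSIZE floor (512 bytes) gives an ashift of 9.
	 */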

	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize += vdev_disk_get_space(vd, capacity, blksz);
			zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
			    "max_psize %llu", vd->vdev_path, *psize,
			    *max_psize);
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching.  We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}
Esempio n. 27
0
static taskq_t *
taskq_create_common(const char *name, int instance, int nthreads, pri_t pri,
                    int minalloc, int maxalloc, uint_t flags)
{
    taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_NOSLEEP);
    uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus);
    uint_t bsize;	/* # of buckets - always power of 2 */

    ASSERT(instance == 0);
    ASSERT(flags == (TASKQ_PREPOPULATE | TASKQ_NOINSTANCE));

    /*
     * TASKQ_CPR_SAFE and TASKQ_DYNAMIC flags are mutually exclusive.
     */
    ASSERT((flags & (TASKQ_DYNAMIC | TASKQ_CPR_SAFE)) !=
           ((TASKQ_DYNAMIC | TASKQ_CPR_SAFE)));

    ASSERT(tq->tq_buckets == NULL);

    bsize = 1 << (highbit(ncpus) - 1);
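    /* e.g. 6 CPUs: highbit(6) == 3, so bsize == 1 << 2 == 4 buckets. */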
    ASSERT(bsize >= 1);
    bsize = MIN(bsize, taskq_maxbuckets);

    tq->tq_maxsize = nthreads;

    (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
    tq->tq_name[TASKQ_NAMELEN] = '\0';
    /* Make sure the name conforms to the rules for C identifiers */
    strident_canon(tq->tq_name, TASKQ_NAMELEN);

    tq->tq_flags = flags | TASKQ_ACTIVE;
    tq->tq_active = nthreads;
    tq->tq_nthreads = nthreads;
    tq->tq_minalloc = minalloc;
    tq->tq_maxalloc = maxalloc;
    tq->tq_nbuckets = bsize;
    tq->tq_pri = pri;

    if (flags & TASKQ_PREPOPULATE) {
        mutex_enter(&tq->tq_lock);
        while (minalloc-- > 0)
            taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
        mutex_exit(&tq->tq_lock);
    }

    if (nthreads == 1) {
        tq->tq_thread = thread_create(NULL, 0, taskq_thread, tq,
                                      0, NULL, TS_RUN, pri);
    } else {
        kthread_t **tpp = kmem_alloc(sizeof (kthread_t *) * nthreads,
                                     KM_SLEEP);

        tq->tq_threadlist = tpp;

        mutex_enter(&tq->tq_lock);
        while (nthreads-- > 0) {
            *tpp = thread_create(NULL, 0, taskq_thread, tq,
                                 0, NULL, TS_RUN, pri);
            tpp++;
        }
        mutex_exit(&tq->tq_lock);
    }

    return (tq);
}