Пример #1
0
/*
 * BT_RROOT -- Fix up the recno root page after it has been split.
 *
 * Parameters:
 *	t:	tree
 *	h:	root page
 *	l:	left page
 *	r:	right page
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
static int
bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
{
	char *dest;
	uint32_t sz;
	size_t temp;

	temp = t->bt_psize - NRINTERNAL;
	_DBFIT(temp, uint32_t);
	sz = (uint32_t)temp;

	/* Insert the left and right keys, set the header information. */
	_DBFIT(sz, indx_t);
	h->linp[0] = h->upper = (indx_t)sz;
	dest = (char *)(void *)h + h->upper;
	WR_RINTERNAL(dest,
	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);

	h->linp[1] = h->upper -= NRINTERNAL;
	dest = (char *)(void *)h + h->upper;
	WR_RINTERNAL(dest,
	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);

	h->lower = BTDATAOFF + 2 * sizeof(indx_t);

	/* Unpin the root page, set to recno internal page. */
	h->flags &= ~P_TYPE;
	h->flags |= P_RINTERNAL;
	mpool_put(t->bt_mp, h, MPOOL_DIRTY);

	return (RET_SUCCESS);
}
Пример #2
0
/*
 * __OVFL_PUT -- Store an overflow key/data item.
 *
 * Parameters:
 *	t:	tree
 *	data:	DBT to store
 *	pgno:	storage page number
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__ovfl_put(BTREE *t, const DBT *dbt, pgno_t *pg)
{
	PAGE *h, *last;
	void *p;
	pgno_t npg;
	uint32_t sz, nb, plen;
	size_t temp;

	/*
	 * Allocate pages and copy the key/data record into them.  Store the
	 * number of the first page in the chain.
	 */
	temp = t->bt_psize - BTDATAOFF;
	_DBFIT(temp, uint32_t);
	plen = (uint32_t)temp;
	last = NULL;
	p = dbt->data;
	temp = dbt->size;
	_DBFIT(temp, uint32_t);
	sz = temp;
	for (;; p = (char *)p + plen, last = h) {
		if ((h = __bt_new(t, &npg)) == NULL)
			return (RET_ERROR);

		h->pgno = npg;
		h->nextpg = h->prevpg = P_INVALID;
		h->flags = P_OVERFLOW;
		h->lower = h->upper = 0;

		nb = MIN(sz, plen);
		(void)memmove((char *)(void *)h + BTDATAOFF, p, (size_t)nb);

		if (last) {
			last->nextpg = h->pgno;
			mpool_put(t->bt_mp, last, MPOOL_DIRTY);
		} else
			*pg = h->pgno;

		if ((sz -= nb) == 0) {
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			break;
		}
	}
	return (RET_SUCCESS);
}
Пример #3
0
/*
 * __OVFL_GET -- Get an overflow key/data item.
 *
 * Parameters:
 *	t:	tree
 *	p:	pointer to { pgno_t, uint32_t }
 *	buf:	storage address
 *	bufsz:	storage size
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__ovfl_get(BTREE *t, void *p, size_t *ssz, void **buf, size_t *bufsz)
{
	PAGE *h;
	pgno_t pg;
	uint32_t sz, nb, plen;
	size_t temp;

	memmove(&pg, p, sizeof(pg));
	memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t));
	*ssz = sz;

#ifdef DEBUG
	if (pg == P_INVALID || sz == 0)
		abort();
#endif
	/* Make the buffer bigger as necessary. */
	if (*bufsz < sz) {
		void *nbuf = realloc(*buf, sz);
		if (nbuf == NULL)
			return (RET_ERROR);
		*buf = nbuf;
		*bufsz = sz;
	}

	/*
	 * Step through the linked list of pages, copying the data on each one
	 * into the buffer.  Never copy more than the data's length.
	 */
	temp = t->bt_psize - BTDATAOFF;
	_DBFIT(temp, uint32_t);
	plen = (uint32_t)temp;
	for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) {
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			return (RET_ERROR);

		nb = MIN(sz, plen);
		memmove(p, (char *)(void *)h + BTDATAOFF, nb);
		mpool_put(t->bt_mp, h, 0);

		if ((sz -= nb) == 0)
			break;
	}
	return (RET_SUCCESS);
}
Пример #4
0
/*
 * This calls alloc_segs which may run out of memory.  Alloc_segs will destroy
 * the table and set errno, so we just pass the error information along.
 *
 * Returns 0 on No Error
 */
static int
init_htab(HTAB *hashp, size_t nelem)
{
	int nbuckets;
	uint32_t nsegs;
	int l2;

	/*
	 * Divide number of elements by the fill factor and determine a
	 * desired number of buckets.  Allocate space for the next greater
	 * power of two number of buckets.
	 */
	nelem = (nelem - 1) / hashp->FFACTOR + 1;

	_DBFIT(nelem, uint32_t);
	l2 = __log2(MAX((uint32_t)nelem, 2));
	nbuckets = 1 << l2;

	hashp->SPARES[l2] = l2 + 1;
	hashp->SPARES[l2 + 1] = l2 + 1;
	hashp->OVFL_POINT = l2;
	hashp->LAST_FREED = 2;

	/* First bitmap page is at: splitpoint l2 page offset 1 */
	if (__ibitmap(hashp, (int)OADDR_OF(l2, 1), l2 + 1, 0))
		return (-1);

	hashp->MAX_BUCKET = hashp->LOW_MASK = nbuckets - 1;
	hashp->HIGH_MASK = (nbuckets << 1) - 1;
	/* LINTED constant in conditional context */
	hashp->HDRPAGES = ((MAX(sizeof(HASHHDR), MINHDRSIZE) - 1) >>
	    hashp->BSHIFT) + 1;

	nsegs = (nbuckets - 1) / hashp->SGSIZE + 1;
	nsegs = 1 << __log2(nsegs);

	if (nsegs > hashp->DSIZE)
		hashp->DSIZE = nsegs;
	return (alloc_segs(hashp, (int)nsegs));
}
Пример #5
0
/*
 * __OVFL_DELETE -- Delete an overflow chain.
 *
 * Parameters:
 *	t:	tree
 *	p:	pointer to { pgno_t, uint32_t }
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__ovfl_delete(BTREE *t, void *p)
{
	PAGE *h;
	pgno_t pg;
	uint32_t sz, plen;
	size_t temp;

	(void)memmove(&pg, p, sizeof(pgno_t));
	(void)memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t));

#ifdef DEBUG
	if (pg == P_INVALID || sz == 0)
		abort();
#endif
	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
		return (RET_ERROR);

	/* Don't delete chains used by internal pages. */
	if (h->flags & P_PRESERVE) {
		mpool_put(t->bt_mp, h, 0);
		return (RET_SUCCESS);
	}

	/* Step through the chain, calling the free routine for each page. */
	temp = t->bt_psize - BTDATAOFF;
	_DBFIT(temp, uint32_t);
	plen = (uint32_t)temp;
	for (;; sz -= plen) {
		pg = h->nextpg;
		__bt_free(t, h);
		if (sz <= plen)
			break;
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			return (RET_ERROR);
	}
	return (RET_SUCCESS);
}
Пример #6
0
/*
 * __BT_OPEN -- Open a btree.
 *
 * Creates and fills a DB struct, and calls the routine that actually
 * opens the btree.
 *
 * Parameters:
 *	fname:	filename (NULL for in-memory trees)
 *	flags:	open flag bits
 *	mode:	open permission bits
 *	b:	BTREEINFO pointer
 *
 * Returns:
 *	NULL on failure, pointer to DB on success.
 *
 */
DB *
__bt_open(const char *fname, int flags, mode_t mode, const BTREEINFO *openinfo,
    int dflags)
{
	struct stat sb;
	BTMETA m;
	BTREE *t;
	BTREEINFO b;
	DB *dbp;
	pgno_t ncache;
	ssize_t nr;
	size_t temp;
	int machine_lorder;

	t = NULL;

	/*
	 * Intention is to make sure all of the user's selections are okay
	 * here and then use them without checking.  Can't be complete, since
	 * we don't know the right page size, lorder or flags until the backing
	 * file is opened.  Also, the file's page size can cause the cachesize
	 * to change.
	 */
	machine_lorder = byteorder();
	if (openinfo) {
		b = *openinfo;

		/* Flags: R_DUP. */
		if (b.flags & ~(R_DUP))
			goto einval;

		/*
		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
		 * page size is set farther on, based on the underlying file
		 * transfer size.
		 */
		if (b.psize &&
		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
		    b.psize & (sizeof(indx_t) - 1)))
			goto einval;

		/* Minimum number of keys per page; absolute minimum is 2. */
		if (b.minkeypage) {
			if (b.minkeypage < 2)
				goto einval;
		} else
			b.minkeypage = DEFMINKEYPAGE;

		/* If no comparison, use default comparison and prefix. */
		if (b.compare == NULL) {
			b.compare = __bt_defcmp;
			if (b.prefix == NULL)
				b.prefix = __bt_defpfx;
		}

		if (b.lorder == 0)
			b.lorder = machine_lorder;
	} else {
		b.compare = __bt_defcmp;
		b.cachesize = 0;
		b.flags = 0;
		b.lorder = machine_lorder;
		b.minkeypage = DEFMINKEYPAGE;
		b.prefix = __bt_defpfx;
		b.psize = 0;
	}

	/* Check for the ubiquitous PDP-11. */
	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
		goto einval;

	/* Allocate and initialize DB and BTREE structures. */
	if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
		goto err;
	memset(t, 0, sizeof(BTREE));
	t->bt_fd = -1;			/* Don't close unopened fd on error. */
	t->bt_lorder = b.lorder;
	t->bt_order = NOT;
	t->bt_cmp = b.compare;
	t->bt_pfx = b.prefix;
	t->bt_rfd = -1;

	if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
		goto err;
	memset(t->bt_dbp, 0, sizeof(DB));
	if (t->bt_lorder != machine_lorder)
		F_SET(t, B_NEEDSWAP);

	dbp->type = DB_BTREE;
	dbp->internal = t;
	dbp->close = __bt_close;
	dbp->del = __bt_delete;
	dbp->fd = __bt_fd;
	dbp->get = __bt_get;
	dbp->put = __bt_put;
	dbp->seq = __bt_seq;
	dbp->sync = __bt_sync;

	/*
	 * If no file name was supplied, this is an in-memory btree and we
	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
	 */
	if (fname) {
		switch (flags & O_ACCMODE) {
		case O_RDONLY:
			F_SET(t, B_RDONLY);
			break;
		case O_RDWR:
			break;
		case O_WRONLY:
		default:
			goto einval;
		}
		
		if ((t->bt_fd = open(fname, flags, mode)) == -1)
			goto err;
		if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1)
			goto err;
	} else {
		if ((flags & O_ACCMODE) != O_RDWR)
			goto einval;
		if ((t->bt_fd = tmp()) == -1)
			goto err;
		F_SET(t, B_INMEM);
	}

	if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1)
		goto err;

	if (fstat(t->bt_fd, &sb))
		goto err;
	if (sb.st_size) {
		if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
			goto err;
		if (nr != sizeof(BTMETA))
			goto eftype;

		/*
		 * Read in the meta-data.  This can change the notion of what
		 * the lorder, page size and flags are, and, when the page size
		 * changes, the cachesize value can change too.  If the user
		 * specified the wrong byte order for an existing database, we
		 * don't bother to return an error, we just clear the NEEDSWAP
		 * bit.
		 */
		if (m.magic == BTREEMAGIC)
			F_CLR(t, B_NEEDSWAP);
		else {
			F_SET(t, B_NEEDSWAP);
			M_32_SWAP(m.magic);
			M_32_SWAP(m.version);
			M_32_SWAP(m.psize);
			M_32_SWAP(m.free);
			M_32_SWAP(m.nrecs);
			M_32_SWAP(m.flags);
		}
		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
			goto eftype;
		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
		    m.psize & (sizeof(indx_t) - 1))
			goto eftype;
		if (m.flags & ~SAVEMETA)
			goto eftype;
		b.psize = m.psize;
		F_SET(t, m.flags);
		t->bt_free = m.free;
		t->bt_nrecs = m.nrecs;
	} else {
		/*
		 * Set the page size to the best value for I/O to this file.
		 * Don't overflow the page offset type.
		 */
		if (b.psize == 0) {
			b.psize = sb.st_blksize;
			if (b.psize < MINPSIZE)
				b.psize = MINPSIZE;
			if (b.psize > MAX_PAGE_OFFSET + 1)
				b.psize = MAX_PAGE_OFFSET + 1;
		}

		/* Set flag if duplicates permitted. */
		if (!(b.flags & R_DUP))
			F_SET(t, B_NODUPS);

		t->bt_free = P_INVALID;
		t->bt_nrecs = 0;
		F_SET(t, B_METADIRTY);
	}

	t->bt_psize = b.psize;

	/* Set the cache size; must be a multiple of the page size. */
	if (b.cachesize && b.cachesize & (b.psize - 1))
		b.cachesize += (~b.cachesize & (b.psize - 1)) + 1;
	if (b.cachesize < b.psize * MINCACHE)
		b.cachesize = b.psize * MINCACHE;

	/* Calculate number of pages to cache. */
	ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;

	/*
	 * The btree data structure requires that at least two keys can fit on
	 * a page, but other than that there's no fixed requirement.  The user
	 * specified a minimum number per page, and we translated that into the
	 * number of bytes a key/data pair can use before being placed on an
	 * overflow page.  This calculation includes the page header, the size
	 * of the index referencing the leaf item and the size of the leaf item
	 * structure.  Also, don't let the user specify a minkeypage such that
	 * a key/data pair won't fit even if both key and data are on overflow
	 * pages.
	 */
	temp = (t->bt_psize - BTDATAOFF) / b.minkeypage -
	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
	_DBFIT(temp, indx_t);
	t->bt_ovflsize = (indx_t)temp;
	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
		t->bt_ovflsize =
		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);

	/* Initialize the buffer pool. */
	if ((t->bt_mp =
	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
		goto err;
	if (!F_ISSET(t, B_INMEM))
		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);

	/* Create a root page if new tree. */
	if (nroot(t) == RET_ERROR)
		goto err;

	/* Global flags. */
	if (dflags & DB_LOCK)
		F_SET(t, B_DB_LOCK);
	if (dflags & DB_SHMEM)
		F_SET(t, B_DB_SHMEM);
	if (dflags & DB_TXN)
		F_SET(t, B_DB_TXN);

	return (dbp);

einval:	errno = EINVAL;
	goto err;

eftype:	errno = EFTYPE;
	goto err;

err:	if (t) {
		if (t->bt_dbp)
			free(t->bt_dbp);
		if (t->bt_fd != -1)
			(void)close(t->bt_fd);
		free(t);
	}
	return (NULL);
}
Пример #7
0
/*
 * __REC_IPUT -- Add a recno item to the tree.
 *
 * Parameters:
 *	t:	tree
 *	nrec:	record number
 *	data:	data
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__rec_iput(BTREE *t, recno_t nrec, const DBT *data, u_int flags)
{
    DBT tdata;
    EPG *e;
    PAGE *h;
    indx_t idx, nxtindex;
    pgno_t pg;
    uint32_t nbytes;
    int dflags, status;
    char *dest, db[NOVFLSIZE];

    /*
     * If the data won't fit on a page, store it on indirect pages.
     *
     * XXX
     * If the insert fails later on, these pages aren't recovered.
     */
    if (data->size > t->bt_ovflsize) {
        if (__ovfl_put(t, data, &pg) == RET_ERROR)
            return (RET_ERROR);
        tdata.data = db;
        tdata.size = NOVFLSIZE;
        *(pgno_t *)(void *)db = pg;
        _DBFIT(data->size, uint32_t);
        *(uint32_t *)(void *)(db + sizeof(pgno_t)) =
            (uint32_t)data->size;
        dflags = P_BIGDATA;
        data = &tdata;
    } else
        dflags = 0;

    /* __rec_search pins the returned page. */
    if ((e = __rec_search(t, nrec,
                          nrec > t->bt_nrecs || flags == R_IAFTER || flags == R_IBEFORE ?
                          SINSERT : SEARCH)) == NULL)
        return (RET_ERROR);

    h = e->page;
    idx = e->index;

    /*
     * Add the specified key/data pair to the tree.  The R_IAFTER and
     * R_IBEFORE flags insert the key after/before the specified key.
     *
     * Pages are split as required.
     */
    switch (flags) {
    case R_IAFTER:
        ++idx;
        break;
    case R_IBEFORE:
        break;
    default:
        if (nrec < t->bt_nrecs &&
                __rec_dleaf(t, h, (uint32_t)idx) == RET_ERROR) {
            mpool_put(t->bt_mp, h, 0);
            return (RET_ERROR);
        }
        break;
    }

    /*
     * If not enough room, split the page.  The split code will insert
     * the key and data and unpin the current page.  If inserting into
     * the offset array, shift the pointers up.
     */
    nbytes = NRLEAFDBT(data->size);
    if ((uint32_t) (h->upper - h->lower) < nbytes + sizeof(indx_t)) {
        status = __bt_split(t, h, NULL, data, dflags, nbytes,
                            (uint32_t)idx);
        if (status == RET_SUCCESS)
            ++t->bt_nrecs;
        return (status);
    }

    if (idx < (nxtindex = NEXTINDEX(h)))
        memmove(h->linp + idx + 1, h->linp + idx,
                (nxtindex - idx) * sizeof(indx_t));
    h->lower += sizeof(indx_t);

    h->linp[idx] = h->upper -= nbytes;
    dest = (char *)(void *)h + h->upper;
    WR_RLEAF(dest, data, dflags);

    ++t->bt_nrecs;
    F_SET(t, B_MODIFIED);
    mpool_put(t->bt_mp, h, MPOOL_DIRTY);

    return (RET_SUCCESS);
}
Пример #8
0
/*
 * Big_insert
 *
 * You need to do an insert and the key/data pair is too big
 *
 * Returns:
 * 0 ==> OK
 *-1 ==> ERROR
 */
int
__big_insert(HTAB *hashp, BUFHEAD *bufp, const DBT *key, const DBT *val)
{
	uint16_t *p, n;
	size_t key_size, val_size;
	uint16_t space, move_bytes, off;
	char *cp, *key_data, *val_data;
	size_t temp;

	cp = bufp->page;		/* Character pointer of p. */
	p = (uint16_t *)(void *)cp;

	key_data = (char *)key->data;
	_DBFIT(key->size, int);
	key_size = key->size;
	val_data = (char *)val->data;
	_DBFIT(val->size, int);
	val_size = val->size;

	/* First move the Key */
	
	temp = FREESPACE(p) - BIGOVERHEAD;
	_DBFIT(temp, uint16_t);
	space = (uint16_t)temp;
	while (key_size) {
		move_bytes = MIN(space, key_size);
		off = OFFSET(p) - move_bytes;
		memmove(cp + off, key_data, (size_t)move_bytes);
		key_size -= move_bytes;
		key_data += move_bytes;
		n = p[0];
		p[++n] = off;
		p[0] = ++n;
		temp = off - PAGE_META(n);
		_DBFIT(temp, uint16_t);
		FREESPACE(p) = (uint16_t)temp;
		OFFSET(p) = off;
		p[n] = PARTIAL_KEY;
		bufp = __add_ovflpage(hashp, bufp);
		if (!bufp)
			return (-1);
		n = p[0];
		if (!key_size) {
			space = FREESPACE(p);
			if (space) {
				move_bytes = MIN(space, val_size);
				/*
				 * If the data would fit exactly in the
				 * remaining space, we must overflow it to the
				 * next page; otherwise the invariant that the
				 * data must end on a page with FREESPACE
				 * non-zero would fail.
				 */
				if (space == val_size && val_size == val->size)
					goto toolarge;
				off = OFFSET(p) - move_bytes;
				memmove(cp + off, val_data, (size_t)move_bytes);
				val_data += move_bytes;
				val_size -= move_bytes;
				p[n] = off;
				p[n - 2] = FULL_KEY_DATA;
				FREESPACE(p) = FREESPACE(p) - move_bytes;
				OFFSET(p) = off;
			} else {
			toolarge:
				p[n - 2] = FULL_KEY;
			}
		}
		p = (uint16_t *)(void *)bufp->page;
		cp = bufp->page;
		bufp->flags |= BUF_MOD;
		temp = FREESPACE(p) - BIGOVERHEAD;
		_DBFIT(temp, uint16_t);
		space = (uint16_t)temp;
	}

	/* Now move the data */
	temp = FREESPACE(p) - BIGOVERHEAD;
	_DBFIT(temp, uint16_t);
	space = (uint16_t)temp;
	while (val_size) {
		move_bytes = MIN(space, val_size);
		/*
		 * Here's the hack to make sure that if the data ends on the
		 * same page as the key ends, FREESPACE is at least one.
		 */
		if (space == val_size && val_size == val->size)
			move_bytes--;
		off = OFFSET(p) - move_bytes;
		memmove(cp + off, val_data, (size_t)move_bytes);
		val_size -= move_bytes;
		val_data += move_bytes;
		n = p[0];
		p[++n] = off;
		p[0] = ++n;
		temp = off - PAGE_META(n);
		_DBFIT(temp, uint16_t);
		FREESPACE(p) = (uint16_t)temp;
		OFFSET(p) = off;
		if (val_size) {
			p[n] = FULL_KEY;
			bufp = __add_ovflpage(hashp, bufp);
			if (!bufp)
				return (-1);
			cp = bufp->page;
			p = (uint16_t *)(void *)cp;
		} else
			p[n] = FULL_KEY_DATA;
		bufp->flags |= BUF_MOD;
		temp = FREESPACE(p) - BIGOVERHEAD;
		_DBFIT(temp, uint16_t);
		space = (uint16_t)temp;
	}
	return (0);
}
Пример #9
0
/*
 * Returns:
 *  0 => OK
 * -1 => error
 */
int
__big_split(
	HTAB *hashp,
	BUFHEAD *op,	/* Pointer to where to put keys that go in old bucket */
	BUFHEAD *np,	/* Pointer to new bucket page */
			/* Pointer to first page containing the big key/data */
	BUFHEAD *big_keyp,
	int addr,	/* Address of big_keyp */
	uint32_t   obucket,/* Old Bucket */
	SPLIT_RETURN *ret
)
{
	BUFHEAD *tmpp;
	uint16_t *tp;
	BUFHEAD *bp;
	DBT key, val;
	uint32_t change;
	uint16_t free_space, n, off;
	size_t temp;

	bp = big_keyp;

	/* Now figure out where the big key/data goes */
	if (__big_keydata(hashp, big_keyp, &key, &val, 0))
		return (-1);
	change = (__call_hash(hashp, key.data, (int)key.size) != obucket);

	if ((ret->next_addr = __find_last_page(hashp, &big_keyp)) != 0) {
		if (!(ret->nextp =
		    __get_buf(hashp, (uint32_t)ret->next_addr, big_keyp, 0)))
			return (-1);
	} else
		ret->nextp = NULL;

	/* Now make one of np/op point to the big key/data pair */
	_DIAGASSERT(np->ovfl == NULL);
	if (change)
		tmpp = np;
	else
		tmpp = op;

	tmpp->flags |= BUF_MOD;
#ifdef DEBUG1
	(void)fprintf(stderr,
	    "BIG_SPLIT: %d->ovfl was %d is now %d\n", tmpp->addr,
	    (tmpp->ovfl ? tmpp->ovfl->addr : 0), (bp ? bp->addr : 0));
#endif
	tmpp->ovfl = bp;	/* one of op/np point to big_keyp */
	tp = (uint16_t *)(void *)tmpp->page;
	_DIAGASSERT(FREESPACE(tp) >= OVFLSIZE);
	n = tp[0];
	off = OFFSET(tp);
	free_space = FREESPACE(tp);
	tp[++n] = (uint16_t)addr;
	tp[++n] = OVFLPAGE;
	tp[0] = n;
	OFFSET(tp) = off;
	temp = free_space - OVFLSIZE;
	_DBFIT(temp, uint16_t);
	FREESPACE(tp) = (uint16_t)temp;

	/*
	 * Finally, set the new and old return values. BIG_KEYP contains a
	 * pointer to the last page of the big key_data pair. Make sure that
	 * big_keyp has no following page (2 elements) or create an empty
	 * following page.
	 */

	ret->newp = np;
	ret->oldp = op;

	tp = (uint16_t *)(void *)big_keyp->page;
	big_keyp->flags |= BUF_MOD;
	if (tp[0] > 2) {
		/*
		 * There may be either one or two offsets on this page.  If
		 * there is one, then the overflow page is linked on normally
		 * and tp[4] is OVFLPAGE.  If there are two, tp[4] contains
		 * the second offset and needs to get stuffed in after the
		 * next overflow page is added.
		 */
		n = tp[4];
		free_space = FREESPACE(tp);
		off = OFFSET(tp);
		tp[0] -= 2;
		temp = free_space + OVFLSIZE;
		_DBFIT(temp, uint16_t);
		FREESPACE(tp) = (uint16_t)temp;
		OFFSET(tp) = off;
		tmpp = __add_ovflpage(hashp, big_keyp);
		if (!tmpp)
			return (-1);
		tp[4] = n;
	} else
		tmpp = big_keyp;

	if (change)
		ret->newp = tmpp;
	else
		ret->oldp = tmpp;
	return (0);
}
Пример #10
0
/*
 * Called when bufp's page  contains a partial key (index should be 1)
 *
 * All pages in the big key/data pair except bufp are freed.  We cannot
 * free bufp because the page pointing to it is lost and we can't get rid
 * of its pointer.
 *
 * Returns:
 * 0 => OK
 *-1 => ERROR
 */
int
__big_delete(HTAB *hashp, BUFHEAD *bufp)
{
	BUFHEAD *last_bfp, *rbufp;
	uint16_t *bp, pageno;
	int key_done, n;
	size_t temp;

	rbufp = bufp;
	last_bfp = NULL;
	bp = (uint16_t *)(void *)bufp->page;
	pageno = 0;
	key_done = 0;

	while (!key_done || (bp[2] != FULL_KEY_DATA)) {
		if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA)
			key_done = 1;

		/*
		 * If there is freespace left on a FULL_KEY_DATA page, then
		 * the data is short and fits entirely on this page, and this
		 * is the last page.
		 */
		if (bp[2] == FULL_KEY_DATA && FREESPACE(bp))
			break;
		pageno = bp[bp[0] - 1];
		rbufp->flags |= BUF_MOD;
		rbufp = __get_buf(hashp, (uint32_t)pageno, rbufp, 0);
		if (last_bfp)
			__free_ovflpage(hashp, last_bfp);
		last_bfp = rbufp;
		if (!rbufp)
			return (-1);		/* Error. */
		bp = (uint16_t *)(void *)rbufp->page;
	}

	/*
	 * If we get here then rbufp points to the last page of the big
	 * key/data pair.  Bufp points to the first one -- it should now be
	 * empty pointing to the next page after this pair.  Can't free it
	 * because we don't have the page pointing to it.
	 */

	/* This is information from the last page of the pair. */
	n = bp[0];
	pageno = bp[n - 1];

	/* Now, bp is the first page of the pair. */
	bp = (uint16_t *)(void *)bufp->page;
	if (n > 2) {
		/* There is an overflow page. */
		bp[1] = pageno;
		bp[2] = OVFLPAGE;
		bufp->ovfl = rbufp->ovfl;
	} else
		/* This is the last page. */
		bufp->ovfl = NULL;
	n -= 2;
	bp[0] = n;
	temp = hashp->BSIZE - PAGE_META(n);
	_DBFIT(temp, uint16_t);
	FREESPACE(bp) = (uint16_t)temp;
	OFFSET(bp) = hashp->BSIZE;

	bufp->flags |= BUF_MOD;
	if (rbufp)
		__free_ovflpage(hashp, rbufp);
	if (last_bfp && last_bfp != rbufp)
		__free_ovflpage(hashp, last_bfp);

	hashp->NKEYS--;
	return (0);
}
Пример #11
0
/*
 * __BT_SPLIT -- Split the tree.
 *
 * Parameters:
 *	t:	tree
 *	sp:	page to split
 *	key:	key to insert
 *	data:	data to insert
 *	flags:	BIGKEY/BIGDATA flags
 *	ilen:	insert length
 *	skip:	index to leave open
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
    size_t ilen, uint32_t argskip)
{
	BINTERNAL *bi = NULL;	/* pacify gcc */
	BLEAF *bl = NULL, *tbl;	/* pacify gcc */
	DBT a, b;
	EPGNO *parent;
	PAGE *h, *l, *r, *lchild, *rchild;
	indx_t nxtindex;
	uint16_t skip;
	uint32_t n, nbytes, nksize = 0; /* pacify gcc */
	int parentsplit;
	char *dest;

	/*
	 * Split the page into two pages, l and r.  The split routines return
	 * a pointer to the page into which the key should be inserted and with
	 * skip set to the offset which should be used.  Additionally, l and r
	 * are pinned.
	 */
	skip = argskip;
	h = sp->pgno == P_ROOT ?
	    bt_root(t, sp, &l, &r, &skip, ilen) :
	    bt_page(t, sp, &l, &r, &skip, ilen);
	if (h == NULL)
		return (RET_ERROR);

	/*
	 * Insert the new key/data pair into the leaf page.  (Key inserts
	 * always cause a leaf page to split first.)
	 */
	_DBFIT(ilen, indx_t);
	h->upper -= (indx_t)ilen;
	h->linp[skip] = h->upper;
	dest = (char *)(void *)h + h->upper;
	if (F_ISSET(t, R_RECNO))
		WR_RLEAF(dest, data, flags);
	else
		WR_BLEAF(dest, key, data, flags);

	/* If the root page was split, make it look right. */
	if (sp->pgno == P_ROOT &&
	    (F_ISSET(t, R_RECNO) ?
	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
		goto err2;

	/*
	 * Now we walk the parent page stack -- a LIFO stack of the pages that
	 * were traversed when we searched for the page that split.  Each stack
	 * entry is a page number and a page index offset.  The offset is for
	 * the page traversed on the search.  We've just split a page, so we
	 * have to insert a new key into the parent page.
	 *
	 * If the insert into the parent page causes it to split, may have to
	 * continue splitting all the way up the tree.  We stop if the root
	 * splits or the page inserted into didn't have to split to hold the
	 * new key.  Some algorithms replace the key for the old page as well
	 * as the new page.  We don't, as there's no reason to believe that the
	 * first key on the old page is any better than the key we have, and,
	 * in the case of a key being placed at index 0 causing the split, the
	 * key is unavailable.
	 *
	 * There are a maximum of 5 pages pinned at any time.  We keep the left
	 * and right pages pinned while working on the parent.   The 5 are the
	 * two children, left parent and right parent (when the parent splits)
	 * and the root page or the overflow key page when calling bt_preserve.
	 * This code must make sure that all pins are released other than the
	 * root page or overflow page which is unlocked elsewhere.
	 */
	while ((parent = BT_POP(t)) != NULL) {
		lchild = l;
		rchild = r;

		/* Get the parent page. */
		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
			goto err2;

	 	/*
		 * The new key goes ONE AFTER the index, because the split
		 * was to the right.
		 */
		skip = parent->index + 1;

		/*
		 * Calculate the space needed on the parent page.
		 *
		 * Prefix trees: space hack when inserting into BINTERNAL
		 * pages.  Retain only what's needed to distinguish between
		 * the new entry and the LAST entry on the page to its left.
		 * If the keys compare equal, retain the entire key.  Note,
		 * we don't touch overflow keys, and the entire key must be
		 * retained for the next-to-left most key on the leftmost
		 * page of each level, or the search will fail.  Applicable
		 * ONLY to internal pages that have leaf pages as children.
		 * Further reduction of the key between pairs of internal
		 * pages loses too much information.
		 */
		switch (rchild->flags & P_TYPE) {
		case P_BINTERNAL:
			bi = GETBINTERNAL(rchild, 0);
			nbytes = NBINTERNAL(bi->ksize);
			break;
		case P_BLEAF:
			bl = GETBLEAF(rchild, 0);
			nbytes = NBINTERNAL(bl->ksize);
			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
			    (h->prevpg != P_INVALID || skip > 1)) {
				size_t temp;
				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
				a.size = tbl->ksize;
				a.data = tbl->bytes;
				b.size = bl->ksize;
				b.data = bl->bytes;
				temp = t->bt_pfx(&a, &b);
				_DBFIT(temp, uint32_t);
				nksize = (uint32_t)temp;
				n = NBINTERNAL(nksize);
				if (n < nbytes) {
#ifdef STATISTICS
					bt_pfxsaved += nbytes - n;
#endif
					nbytes = n;
				} else
					nksize = 0;
			} else
				nksize = 0;
			break;
		case P_RINTERNAL:
		case P_RLEAF:
			nbytes = NRINTERNAL;
			break;
		default:
			abort();
		}

		/* Split the parent page if necessary or shift the indices. */
		if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) {
			sp = h;
			h = h->pgno == P_ROOT ?
			    bt_root(t, h, &l, &r, &skip, nbytes) :
			    bt_page(t, h, &l, &r, &skip, nbytes);
			if (h == NULL)
				goto err1;
			parentsplit = 1;
		} else {
			if (skip < (nxtindex = NEXTINDEX(h)))
				memmove(h->linp + skip + 1, h->linp + skip,
				    (nxtindex - skip) * sizeof(indx_t));
			h->lower += sizeof(indx_t);
			parentsplit = 0;
		}

		/* Insert the key into the parent page. */
		switch (rchild->flags & P_TYPE) {
		case P_BINTERNAL:
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)(void *)h + h->linp[skip];
			memmove(dest, bi, nbytes);
			((BINTERNAL *)(void *)dest)->pgno = rchild->pgno;
			break;
		case P_BLEAF:
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)(void *)h + h->linp[skip];
			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
			    rchild->pgno, bl->flags & P_BIGKEY);
			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
			if (bl->flags & P_BIGKEY &&
			    bt_preserve(t, *(pgno_t *)(void *)bl->bytes) ==
			    RET_ERROR)
				goto err1;
			break;
		case P_RINTERNAL:
			/*
			 * Update the left page count.  If split
			 * added at index 0, fix the correct page.
			 */
			if (skip > 0)
				dest = (char *)(void *)h + h->linp[skip - 1];
			else
				dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
			((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild);
			((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;

			/* Update the right page count. */
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)(void *)h + h->linp[skip];
			((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild);
			((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
			break;
		case P_RLEAF:
			/*
			 * Update the left page count.  If split
			 * added at index 0, fix the correct page.
			 */
			if (skip > 0)
				dest = (char *)(void *)h + h->linp[skip - 1];
			else
				dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
			((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild);
			((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;

			/* Update the right page count. */
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)(void *)h + h->linp[skip];
			((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild);
			((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
			break;
		default:
			abort();
		}

		/* Unpin the held pages. */
		if (!parentsplit) {
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			break;
		}

		/* If the root page was split, make it look right. */
		if (sp->pgno == P_ROOT &&
		    (F_ISSET(t, R_RECNO) ?
		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
			goto err1;

		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
	}

	/* Unpin the held pages. */
	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
	mpool_put(t->bt_mp, r, MPOOL_DIRTY);

	/* Clear any pages left on the stack. */
	return (RET_SUCCESS);

	/*
	 * If something fails in the above loop we were already walking back
	 * up the tree and the tree is now inconsistent.  Nothing much we can
	 * do about it but release any memory we're holding.
	 */
err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);

err2:	mpool_put(t->bt_mp, l, 0);
	mpool_put(t->bt_mp, r, 0);
	__dbpanic(t->bt_dbp);
	return (RET_ERROR);
}
Пример #12
0
/*
 * BT_PSPLIT -- Do the real work of splitting the page.
 *
 * Parameters:
 *	t:	tree
 *	h:	page to be split
 *	l:	page to put lower half of data
 *	r:	page to put upper half of data
 *	pskip:	pointer to index to leave open
 *	ilen:	insert length
 *
 * Returns:
 *	Pointer to page in which to insert.
 */
static PAGE *
bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
{
	BINTERNAL *bi;
	BLEAF *bl;
	CURSOR *c;
	RLEAF *rl;
	PAGE *rval;
	void *src = NULL;	/* pacify gcc */
	indx_t full, half, nxt, off, skip, top, used;
	uint32_t nbytes;
	size_t temp;
	int bigkeycnt, isbigkey;

	/*
	 * Split the data to the left and right pages.  Leave the skip index
	 * open.  Additionally, make some effort not to split on an overflow
	 * key.  This makes internal page processing faster and can save
	 * space as overflow keys used by internal pages are never deleted.
	 */
	bigkeycnt = 0;
	skip = *pskip;
	temp = t->bt_psize - BTDATAOFF;
	_DBFIT(temp, indx_t);
	full = (indx_t)temp;
	half = full / 2;
	used = 0;
	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
		if (skip == off) {
			_DBFIT(ilen, uint32_t);
			nbytes = (uint32_t)ilen;
			isbigkey = 0;		/* XXX: not really known. */
		} else
			switch (h->flags & P_TYPE) {
			case P_BINTERNAL:
				src = bi = GETBINTERNAL(h, nxt);
				nbytes = NBINTERNAL(bi->ksize);
				isbigkey = bi->flags & P_BIGKEY;
				break;
			case P_BLEAF:
				src = bl = GETBLEAF(h, nxt);
				nbytes = NBLEAF(bl);
				isbigkey = bl->flags & P_BIGKEY;
				break;
			case P_RINTERNAL:
				src = GETRINTERNAL(h, nxt);
				nbytes = NRINTERNAL;
				isbigkey = 0;
				break;
			case P_RLEAF:
				src = rl = GETRLEAF(h, nxt);
				nbytes = NRLEAF(rl);
				isbigkey = 0;
				break;
			default:
				abort();
			}

		/*
		 * If the key/data pairs are substantial fractions of the max
		 * possible size for the page, it's possible to get situations
		 * where we decide to try and copy too much onto the left page.
		 * Make sure that doesn't happen.
		 */
		if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
		    nxt == top - 1) {
			--off;
			break;
		}

		/* Copy the key/data pair, if not the skipped index. */
		if (skip != off) {
			++nxt;

			l->linp[off] = l->upper -= nbytes;
			memmove((char *)(void *)l + l->upper, src, nbytes);
		}

		temp = nbytes + sizeof(indx_t);
		_DBFIT(temp, indx_t);
		used += (indx_t)temp;
		if (used >= half) {
			if (!isbigkey || bigkeycnt == 3)
				break;
			else
				++bigkeycnt;
		}
	}

	/*
	 * Off is the last offset that's valid for the left page.
	 * Nxt is the first offset to be placed on the right page.
	 */
	temp = (off + 1) * sizeof(indx_t);
	_DBFIT(temp, indx_t);
	l->lower += (indx_t)temp;

	/*
	 * If splitting the page that the cursor was on, the cursor has to be
	 * adjusted to point to the same record as before the split.  If the
	 * cursor is at or past the skipped slot, the cursor is incremented by
	 * one.  If the cursor is on the right page, it is decremented by the
	 * number of records split to the left page.
	 */
	c = &t->bt_cursor;
	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
		if (c->pg.index >= skip)
			++c->pg.index;
		if (c->pg.index < nxt)			/* Left page. */
			c->pg.pgno = l->pgno;
		else {					/* Right page. */
			c->pg.pgno = r->pgno;
			c->pg.index -= nxt;
		}
	}

	/*
	 * If the skipped index was on the left page, just return that page.
	 * Otherwise, adjust the skip index to reflect the new position on
	 * the right page.
	 */
	if (skip <= off) {
		skip = MAX_PAGE_OFFSET;
		rval = l;
	} else {
		rval = r;
		*pskip -= nxt;
	}

	for (off = 0; nxt < top; ++off) {
		if (skip == nxt) {
			++off;
			skip = MAX_PAGE_OFFSET;
		}
		switch (h->flags & P_TYPE) {
		case P_BINTERNAL:
			src = bi = GETBINTERNAL(h, nxt);
			nbytes = NBINTERNAL(bi->ksize);
			break;
		case P_BLEAF:
			src = bl = GETBLEAF(h, nxt);
			nbytes = NBLEAF(bl);
			break;
		case P_RINTERNAL:
			src = GETRINTERNAL(h, nxt);
			nbytes = NRINTERNAL;
			break;
		case P_RLEAF:
			src = rl = GETRLEAF(h, nxt);
			nbytes = NRLEAF(rl);
			break;
		default:
			abort();
		}
		++nxt;
		r->linp[off] = r->upper -= nbytes;
		memmove((char *)(void *)r + r->upper, src, nbytes);
	}
	temp = off * sizeof(indx_t);
	_DBFIT(temp, indx_t);
	r->lower += (indx_t)temp;

	/* If the key is being appended to the page, adjust the index. */
	if (skip == top)
		r->lower += sizeof(indx_t);

	return (rval);
}