/* * BT_BROOT -- Fix up the btree root page after it has been split. * * Parameters: * t: tree * h: root page * l: left page * r: right page * * Returns: * RET_ERROR, RET_SUCCESS */ static int bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) { BINTERNAL *bi; BLEAF *bl; u_int32_t nbytes; char *dest; /* * If the root page was a leaf page, change it into an internal page. * We copy the key we split on (but not the key's data, in the case of * a leaf page) to the new root page. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ nbytes = NBINTERNAL(0); h->linp[0] = h->upper = t->bt_psize - nbytes; dest = (char *)h + h->upper; WR_BINTERNAL(dest, 0, l->pgno, 0); switch (h->flags & P_TYPE) { case P_BLEAF: bl = GETBLEAF(r, 0); nbytes = NBINTERNAL(bl->ksize); __PAST_END(h->linp, 1) = h->upper -= nbytes; dest = (char *)h + h->upper; WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); memmove(dest, bl->bytes, bl->ksize); /* * If the key is on an overflow page, mark the overflow chain * so it isn't deleted when the leaf copy of the key is deleted. */ if (bl->flags & P_BIGKEY && bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) return (RET_ERROR); break; case P_BINTERNAL: bi = GETBINTERNAL(r, 0); nbytes = NBINTERNAL(bi->ksize); __PAST_END(h->linp, 1) = h->upper -= nbytes; dest = (char *)h + h->upper; memmove(dest, bi, nbytes); ((BINTERNAL *)dest)->pgno = r->pgno; break; default: abort(); } /* There are two keys on the page. */ h->lower = BTDATAOFF + 2 * sizeof(indx_t); /* Unpin the root page, set to btree internal page. */ h->flags &= ~P_TYPE; h->flags |= P_BINTERNAL; mpool_put(t->bt_mp, h, MPOOL_DIRTY); return (RET_SUCCESS); }
/* * __BT_CMP -- Compare a key to a given record. * * Parameters: * t: tree * k1: DBT pointer of first arg to comparison * e: pointer to EPG for comparison * * Returns: * < 0 if k1 is < record * = 0 if k1 is = record * > 0 if k1 is > record */ int __bt_cmp(BTREE *t, const DBT *k1, EPG *e) { BINTERNAL *bi; BLEAF *bl; DBT k2; PAGE *h; void *bigkey; /* * The left-most key on internal pages, at any level of the tree, is * guaranteed by the following code to be less than any user key. * This saves us from having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything we've yet seen. */ h = e->page; if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF)) return (1); bigkey = NULL; if (h->flags & P_BLEAF) { bl = GETBLEAF(h, e->index); if (bl->flags & P_BIGKEY) bigkey = bl->bytes; else { k2.data = bl->bytes; k2.size = bl->ksize; } } else { bi = GETBINTERNAL(h, e->index); if (bi->flags & P_BIGKEY) bigkey = bi->bytes; else { k2.data = bi->bytes; k2.size = bi->ksize; } } if (bigkey) { if (__ovfl_get(t, bigkey, &k2.size, &t->bt_rdata.data, &t->bt_rdata.size)) return (RET_ERROR); k2.data = t->bt_rdata.data; } return ((*t->bt_cmp)(k1, &k2)); }
/* * __bt_seqset -- * Set the sequential scan to a specific key. * * Parameters: * t: tree * ep: storage for returned key * key: key for initial scan position * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV * * Side effects: * Pins the page the cursor references. * * Returns: * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. */ static int __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags) { PAGE *h; pgno_t pg; int exact; /* * Find the first, last or specific key in the tree and point the * cursor at it. The cursor may not be moved until a new key has * been found. */ switch (flags) { case R_CURSOR: /* Keyed scan. */ /* * Find the first instance of the key or the smallest key * which is greater than or equal to the specified key. */ if (key->data == NULL || key->size == 0) { errno = EINVAL; return (RET_ERROR); } return (__bt_first(t, key, ep, &exact)); case R_FIRST: /* First record. */ case R_NEXT: /* Walk down the left-hand side of the tree. */ for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (RET_ERROR); /* Check for an empty tree. */ if (NEXTINDEX(h) == 0) { mpool_put(t->bt_mp, h, 0); return (RET_SPECIAL); } if (h->flags & (P_BLEAF | P_RLEAF)) break; pg = GETBINTERNAL(h, 0)->pgno; mpool_put(t->bt_mp, h, 0); } ep->page = h; ep->index = 0; break; case R_LAST: /* Last record. */ case R_PREV: /* Walk down the right-hand side of the tree. */ for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (RET_ERROR); /* Check for an empty tree. */ if (NEXTINDEX(h) == 0) { mpool_put(t->bt_mp, h, 0); return (RET_SPECIAL); } if (h->flags & (P_BLEAF | P_RLEAF)) break; pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; mpool_put(t->bt_mp, h, 0); } ep->page = h; ep->index = NEXTINDEX(h) - 1; break; } return (RET_SUCCESS); }
/* * __bt_pdelete -- * Delete a single page from the tree. * * Parameters: * t: tree * h: leaf page * * Returns: * RET_SUCCESS, RET_ERROR. * * Side-effects: * mpool_put's the page */ static int __bt_pdelete(BTREE *t, PAGE *h) { BINTERNAL *bi; PAGE *pg; EPGNO *parent; indx_t cnt, idx, *ip, offset; u_int32_t nksize; char *from; /* * Walk the parent page stack -- a LIFO stack of the pages that were * traversed when we searched for the page where the delete occurred. * Each stack entry is a page number and a page index offset. The * offset is for the page traversed on the search. We've just deleted * a page, so we have to delete the key from the parent page. * * If the delete from the parent page makes it empty, this process may * continue all the way up the tree. We stop if we reach the root page * (which is never deleted, it's just not worth the effort) or if the * delete does not empty the page. */ while ((parent = BT_POP(t)) != NULL) { /* Get the parent page. */ if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) return (RET_ERROR); idx = parent->index; bi = GETBINTERNAL(pg, idx); /* Free any overflow pages. */ if (bi->flags & P_BIGKEY && __ovfl_delete(t, bi->bytes) == RET_ERROR) { mpool_put(t->bt_mp, pg, 0); return (RET_ERROR); } /* * Free the parent if it has only the one key and it's not the * root page. If it's the rootpage, turn it back into an empty * leaf page. */ if (NEXTINDEX(pg) == 1) { if (pg->pgno == P_ROOT) { pg->lower = BTDATAOFF; pg->upper = t->bt_psize; pg->flags = P_BLEAF; } else { if (__bt_relink(t, pg) || __bt_free(t, pg)) return (RET_ERROR); continue; } } else { /* Pack remaining key items at the end of the page. */ nksize = NBINTERNAL(bi->ksize); from = (char *)pg + pg->upper; memmove(from + nksize, from, (char *)bi - from); pg->upper += nksize; /* Adjust indices' offsets, shift the indices down. */ offset = pg->linp[idx]; for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) if (ip[0] < offset) ip[0] += nksize; for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; pg->lower -= sizeof(indx_t); } mpool_put(t->bt_mp, pg, MPOOL_DIRTY); break; } /* Free the leaf page, as long as it wasn't the root. */ if (h->pgno == P_ROOT) { mpool_put(t->bt_mp, h, MPOOL_DIRTY); return (RET_SUCCESS); } return (__bt_relink(t, h) || __bt_free(t, h)); }
/* * __bt_stkacq -- * Acquire a stack so we can delete a cursor entry. * * Parameters: * t: tree * hp: pointer to current, pinned PAGE pointer * c: pointer to the cursor * * Returns: * 0 on success, 1 on failure */ static int __bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c) { BINTERNAL *bi; EPG *e; EPGNO *parent; PAGE *h; indx_t idx; pgno_t pgno; recno_t nextpg, prevpg; int exact, level; /* * Find the first occurrence of the key in the tree. Toss the * currently locked page so we don't hit an already-locked page. */ h = *hp; mpool_put(t->bt_mp, h, 0); if ((e = __bt_search(t, &c->key, &exact)) == NULL) return (1); h = e->page; /* See if we got it in one shot. */ if (h->pgno == c->pg.pgno) goto ret; /* * Move right, looking for the page. At each move we have to move * up the stack until we don't have to move to the next page. If * we have to change pages at an internal level, we have to fix the * stack back up. */ while (h->pgno != c->pg.pgno) { if ((nextpg = h->nextpg) == P_INVALID) break; mpool_put(t->bt_mp, h, 0); /* Move up the stack. */ for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { /* Get the parent page. */ if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) return (1); /* Move to the next index. */ if (parent->index != NEXTINDEX(h) - 1) { idx = parent->index + 1; BT_PUSH(t, h->pgno, idx); break; } mpool_put(t->bt_mp, h, 0); } /* Restore the stack. */ while (level--) { /* Push the next level down onto the stack. */ bi = GETBINTERNAL(h, idx); pgno = bi->pgno; BT_PUSH(t, pgno, 0); /* Lose the currently pinned page. */ mpool_put(t->bt_mp, h, 0); /* Get the next level down. */ if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) return (1); idx = 0; } mpool_put(t->bt_mp, h, 0); if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) return (1); } if (h->pgno == c->pg.pgno) goto ret; /* Reacquire the original stack. */ mpool_put(t->bt_mp, h, 0); if ((e = __bt_search(t, &c->key, &exact)) == NULL) return (1); h = e->page; /* * Move left, looking for the page. At each move we have to move * up the stack until we don't have to change pages to move to the * next page. If we have to change pages at an internal level, we * have to fix the stack back up. */ while (h->pgno != c->pg.pgno) { if ((prevpg = h->prevpg) == P_INVALID) break; mpool_put(t->bt_mp, h, 0); /* Move up the stack. */ for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { /* Get the parent page. */ if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) return (1); /* Move to the next index. */ if (parent->index != 0) { idx = parent->index - 1; BT_PUSH(t, h->pgno, idx); break; } mpool_put(t->bt_mp, h, 0); } /* Restore the stack. */ while (level--) { /* Push the next level down onto the stack. */ bi = GETBINTERNAL(h, idx); pgno = bi->pgno; /* Lose the currently pinned page. */ mpool_put(t->bt_mp, h, 0); /* Get the next level down. */ if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) return (1); idx = NEXTINDEX(h) - 1; BT_PUSH(t, pgno, idx); } mpool_put(t->bt_mp, h, 0); if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) return (1); } ret: mpool_put(t->bt_mp, h, 0); return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); }
/* * __BT_BPGIN, __BT_BPGOUT -- * Convert host-specific number layout to/from the host-independent * format stored on disk. * * Parameters: * t: tree * pg: page number * h: page to convert */ void __bt_pgin(void *t, pgno_t pg, void *pp) { PAGE *h; indx_t i, top; u_char flags; char *p; if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) return; if (pg == P_META) { mswap(pp); return; } h = pp; M_32_SWAP(h->pgno); M_32_SWAP(h->prevpg); M_32_SWAP(h->nextpg); M_32_SWAP(h->flags); M_16_SWAP(h->lower); M_16_SWAP(h->upper); top = NEXTINDEX(h); if ((h->flags & P_TYPE) == P_BINTERNAL) for (i = 0; i < top; i++) { M_16_SWAP(h->linp[i]); p = (char *)GETBINTERNAL(h, i); P_32_SWAP(p); p += sizeof(u_int32_t); P_32_SWAP(p); p += sizeof(pgno_t); if (*(u_char *)p & P_BIGKEY) { p += sizeof(u_char); P_32_SWAP(p); p += sizeof(pgno_t); P_32_SWAP(p); } } else if ((h->flags & P_TYPE) == P_BLEAF) for (i = 0; i < top; i++) { M_16_SWAP(h->linp[i]); p = (char *)GETBLEAF(h, i); P_32_SWAP(p); p += sizeof(u_int32_t); P_32_SWAP(p); p += sizeof(u_int32_t); flags = *(u_char *)p; if (flags & (P_BIGKEY | P_BIGDATA)) { p += sizeof(u_char); if (flags & P_BIGKEY) { P_32_SWAP(p); p += sizeof(pgno_t); P_32_SWAP(p); } if (flags & P_BIGDATA) { p += sizeof(u_int32_t); P_32_SWAP(p); p += sizeof(pgno_t); P_32_SWAP(p); } } } }
/* * __BT_SPLIT -- Split the tree. * * Parameters: * t: tree * sp: page to split * key: key to insert * data: data to insert * flags: BIGKEY/BIGDATA flags * ilen: insert length * skip: index to leave open * * Returns: * RET_ERROR, RET_SUCCESS */ int __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, size_t ilen, u_int32_t argskip) { BINTERNAL *bi; BLEAF *bl, *tbl; DBT a, b; EPGNO *parent; PAGE *h, *l, *r, *lchild, *rchild; indx_t nxtindex; u_int16_t skip; u_int32_t n, nbytes, nksize; int parentsplit; char *dest; /* * Split the page into two pages, l and r. The split routines return * a pointer to the page into which the key should be inserted and with * skip set to the offset which should be used. Additionally, l and r * are pinned. */ skip = argskip; h = sp->pgno == P_ROOT ? bt_root(t, sp, &l, &r, &skip, ilen) : bt_page(t, sp, &l, &r, &skip, ilen); if (h == NULL) return (RET_ERROR); /* * Insert the new key/data pair into the leaf page. (Key inserts * always cause a leaf page to split first.) */ h->linp[skip] = h->upper -= ilen; dest = (char *)h + h->upper; if (F_ISSET(t, R_RECNO)) WR_RLEAF(dest, data, flags) else WR_BLEAF(dest, key, data, flags) /* If the root page was split, make it look right. */ if (sp->pgno == P_ROOT && (F_ISSET(t, R_RECNO) ? bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) goto err2; /* * Now we walk the parent page stack -- a LIFO stack of the pages that * were traversed when we searched for the page that split. Each stack * entry is a page number and a page index offset. The offset is for * the page traversed on the search. We've just split a page, so we * have to insert a new key into the parent page. * * If the insert into the parent page causes it to split, may have to * continue splitting all the way up the tree. We stop if the root * splits or the page inserted into didn't have to split to hold the * new key. Some algorithms replace the key for the old page as well * as the new page. We don't, as there's no reason to believe that the * first key on the old page is any better than the key we have, and, * in the case of a key being placed at index 0 causing the split, the * key is unavailable. * * There are a maximum of 5 pages pinned at any time. We keep the left * and right pages pinned while working on the parent. The 5 are the * two children, left parent and right parent (when the parent splits) * and the root page or the overflow key page when calling bt_preserve. * This code must make sure that all pins are released other than the * root page or overflow page which is unlocked elsewhere. */ while ((parent = BT_POP(t)) != NULL) { lchild = l; rchild = r; /* Get the parent page. */ if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) goto err2; /* * The new key goes ONE AFTER the index, because the split * was to the right. */ skip = parent->index + 1; /* * Calculate the space needed on the parent page. * * Prefix trees: space hack when inserting into BINTERNAL * pages. Retain only what's needed to distinguish between * the new entry and the LAST entry on the page to its left. * If the keys compare equal, retain the entire key. Note, * we don't touch overflow keys, and the entire key must be * retained for the next-to-left most key on the leftmost * page of each level, or the search will fail. Applicable * ONLY to internal pages that have leaf pages as children. * Further reduction of the key between pairs of internal * pages loses too much information. */ switch (rchild->flags & P_TYPE) { case P_BINTERNAL: bi = GETBINTERNAL(rchild, 0); nbytes = NBINTERNAL(bi->ksize); break; case P_BLEAF: bl = GETBLEAF(rchild, 0); nbytes = NBINTERNAL(bl->ksize); if (t->bt_pfx && !(bl->flags & P_BIGKEY) && (h->prevpg != P_INVALID || skip > 1)) { tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); a.size = tbl->ksize; a.data = tbl->bytes; b.size = bl->ksize; b.data = bl->bytes; nksize = t->bt_pfx(&a, &b); n = NBINTERNAL(nksize); if (n < nbytes) { #ifdef STATISTICS bt_pfxsaved += nbytes - n; #endif nbytes = n; } else nksize = 0; } else nksize = 0; break; case P_RINTERNAL: case P_RLEAF: nbytes = NRINTERNAL; break; default: abort(); } /* Split the parent page if necessary or shift the indices. */ if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) { sp = h; h = h->pgno == P_ROOT ? bt_root(t, h, &l, &r, &skip, nbytes) : bt_page(t, h, &l, &r, &skip, nbytes); if (h == NULL) goto err1; parentsplit = 1; } else { if (skip < (nxtindex = NEXTINDEX(h))) memmove(h->linp + skip + 1, h->linp + skip, (nxtindex - skip) * sizeof(indx_t)); h->lower += sizeof(indx_t); parentsplit = 0; } /* Insert the key into the parent page. */ switch (rchild->flags & P_TYPE) { case P_BINTERNAL: h->linp[skip] = h->upper -= nbytes; dest = (char *)h + h->linp[skip]; memmove(dest, bi, nbytes); ((BINTERNAL *)dest)->pgno = rchild->pgno; break; case P_BLEAF: h->linp[skip] = h->upper -= nbytes; dest = (char *)h + h->linp[skip]; WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, rchild->pgno, bl->flags & P_BIGKEY); memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); if (bl->flags & P_BIGKEY) { pgno_t pgno; memcpy(&pgno, bl->bytes, sizeof(pgno)); if (bt_preserve(t, pgno) == RET_ERROR) goto err1; } break; case P_RINTERNAL: /* * Update the left page count. If split * added at index 0, fix the correct page. */ if (skip > 0) dest = (char *)h + h->linp[skip - 1]; else dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; ((RINTERNAL *)dest)->nrecs = rec_total(lchild); ((RINTERNAL *)dest)->pgno = lchild->pgno; /* Update the right page count. */ h->linp[skip] = h->upper -= nbytes; dest = (char *)h + h->linp[skip]; ((RINTERNAL *)dest)->nrecs = rec_total(rchild); ((RINTERNAL *)dest)->pgno = rchild->pgno; break; case P_RLEAF: /* * Update the left page count. If split * added at index 0, fix the correct page. */ if (skip > 0) dest = (char *)h + h->linp[skip - 1]; else dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); ((RINTERNAL *)dest)->pgno = lchild->pgno; /* Update the right page count. */ h->linp[skip] = h->upper -= nbytes; dest = (char *)h + h->linp[skip]; ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); ((RINTERNAL *)dest)->pgno = rchild->pgno; break; default: abort(); } /* Unpin the held pages. */ if (!parentsplit) { mpool_put(t->bt_mp, h, MPOOL_DIRTY); break; } /* If the root page was split, make it look right. */ if (sp->pgno == P_ROOT && (F_ISSET(t, R_RECNO) ? bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) goto err1; mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); } /* Unpin the held pages. */ mpool_put(t->bt_mp, l, MPOOL_DIRTY); mpool_put(t->bt_mp, r, MPOOL_DIRTY); /* Clear any pages left on the stack. */ return (RET_SUCCESS); /* * If something fails in the above loop we were already walking back * up the tree and the tree is now inconsistent. Nothing much we can * do about it but release any memory we're holding. */ err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); err2: mpool_put(t->bt_mp, l, 0); mpool_put(t->bt_mp, r, 0); __dbpanic(t->bt_dbp); return (RET_ERROR); }
/* * BT_PSPLIT -- Do the real work of splitting the page. * * Parameters: * t: tree * h: page to be split * l: page to put lower half of data * r: page to put upper half of data * pskip: pointer to index to leave open * ilen: insert length * * Returns: * Pointer to page in which to insert. */ static PAGE * bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) { BINTERNAL *bi; BLEAF *bl; CURSOR *c; RLEAF *rl; PAGE *rval; void *src; indx_t full, half, nxt, off, skip, top, used; u_int32_t nbytes; int bigkeycnt, isbigkey; /* * Split the data to the left and right pages. Leave the skip index * open. Additionally, make some effort not to split on an overflow * key. This makes internal page processing faster and can save * space as overflow keys used by internal pages are never deleted. */ bigkeycnt = 0; skip = *pskip; full = t->bt_psize - BTDATAOFF; half = full / 2; used = 0; for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { if (skip == off) { nbytes = ilen; isbigkey = 0; /* XXX: not really known. */ } else switch (h->flags & P_TYPE) { case P_BINTERNAL: src = bi = GETBINTERNAL(h, nxt); nbytes = NBINTERNAL(bi->ksize); isbigkey = bi->flags & P_BIGKEY; break; case P_BLEAF: src = bl = GETBLEAF(h, nxt); nbytes = NBLEAF(bl); isbigkey = bl->flags & P_BIGKEY; break; case P_RINTERNAL: src = GETRINTERNAL(h, nxt); nbytes = NRINTERNAL; isbigkey = 0; break; case P_RLEAF: src = rl = GETRLEAF(h, nxt); nbytes = NRLEAF(rl); isbigkey = 0; break; default: abort(); } /* * If the key/data pairs are substantial fractions of the max * possible size for the page, it's possible to get situations * where we decide to try and copy too much onto the left page. * Make sure that doesn't happen. */ if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || nxt == top - 1) { --off; break; } /* Copy the key/data pair, if not the skipped index. */ if (skip != off) { ++nxt; l->linp[off] = l->upper -= nbytes; memmove((char *)l + l->upper, src, nbytes); } used += nbytes + sizeof(indx_t); if (used >= half) { if (!isbigkey || bigkeycnt == 3) break; else ++bigkeycnt; } } /* * Off is the last offset that's valid for the left page. * Nxt is the first offset to be placed on the right page. */ l->lower += (off + 1) * sizeof(indx_t); /* * If splitting the page that the cursor was on, the cursor has to be * adjusted to point to the same record as before the split. If the * cursor is at or past the skipped slot, the cursor is incremented by * one. If the cursor is on the right page, it is decremented by the * number of records split to the left page. */ c = &t->bt_cursor; if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { if (c->pg.index >= skip) ++c->pg.index; if (c->pg.index < nxt) /* Left page. */ c->pg.pgno = l->pgno; else { /* Right page. */ c->pg.pgno = r->pgno; c->pg.index -= nxt; } } /* * If the skipped index was on the left page, just return that page. * Otherwise, adjust the skip index to reflect the new position on * the right page. */ if (skip <= off) { skip = MAX_PAGE_OFFSET; rval = l; } else { rval = r; *pskip -= nxt; } for (off = 0; nxt < top; ++off) { if (skip == nxt) { ++off; skip = MAX_PAGE_OFFSET; } switch (h->flags & P_TYPE) { case P_BINTERNAL: src = bi = GETBINTERNAL(h, nxt); nbytes = NBINTERNAL(bi->ksize); break; case P_BLEAF: src = bl = GETBLEAF(h, nxt); nbytes = NBLEAF(bl); break; case P_RINTERNAL: src = GETRINTERNAL(h, nxt); nbytes = NRINTERNAL; break; case P_RLEAF: src = rl = GETRLEAF(h, nxt); nbytes = NRLEAF(rl); break; default: abort(); } ++nxt; r->linp[off] = r->upper -= nbytes; memmove((char *)r + r->upper, src, nbytes); } r->lower += off * sizeof(indx_t); /* If the key is being appended to the page, adjust the index. */ if (skip == top) r->lower += sizeof(indx_t); return (rval); }
/* * BT_STAT -- Gather/print the tree statistics * * Parameters: * dbp: pointer to the DB */ void __bt_stat(DB *dbp) { extern u_long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit; extern u_long bt_sortsplit, bt_split; BTREE *t; PAGE *h; pgno_t i, pcont, pinternal, pleaf; u_long ifree, lfree, nkeys; int levels; t = dbp->internal; pcont = pinternal = pleaf = 0; nkeys = ifree = lfree = 0; for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, MPOOL_IGNOREPIN)) != NULL; ++i) switch (h->flags & P_TYPE) { case P_BINTERNAL: case P_RINTERNAL: ++pinternal; ifree += h->upper - h->lower; break; case P_BLEAF: case P_RLEAF: ++pleaf; lfree += h->upper - h->lower; nkeys += NEXTINDEX(h); break; case P_OVERFLOW: ++pcont; break; } /* Count the levels of the tree. */ for (i = P_ROOT, levels = 0 ;; ++levels) { h = mpool_get(t->bt_mp, i, MPOOL_IGNOREPIN); if (h->flags & (P_BLEAF|P_RLEAF)) { if (levels == 0) levels = 1; break; } i = F_ISSET(t, R_RECNO) ? GETRINTERNAL(h, 0)->pgno : GETBINTERNAL(h, 0)->pgno; } (void)fprintf(stderr, "%d level%s with %lu keys", levels, levels == 1 ? "" : "s", nkeys); if (F_ISSET(t, R_RECNO)) (void)fprintf(stderr, " (%u header count)", t->bt_nrecs); (void)fprintf(stderr, "\n%u pages (leaf %u, internal %u, overflow %u)\n", pinternal + pleaf + pcont, pleaf, pinternal, pcont); (void)fprintf(stderr, "%lu cache hits, %lu cache misses\n", bt_cache_hit, bt_cache_miss); (void)fprintf(stderr, "%lu splits (%lu root splits, %lu sort splits)\n", bt_split, bt_rootsplit, bt_sortsplit); pleaf *= t->bt_psize - BTDATAOFF; if (pleaf) (void)fprintf(stderr, "%.0f%% leaf fill (%lu bytes used, %lu bytes free)\n", ((double)(pleaf - lfree) / pleaf) * 100, pleaf - lfree, lfree); pinternal *= t->bt_psize - BTDATAOFF; if (pinternal) (void)fprintf(stderr, "%.0f%% internal fill (%lu bytes used, %lu bytes free\n", ((double)(pinternal - ifree) / pinternal) * 100, pinternal - ifree, ifree); if (bt_pfxsaved) (void)fprintf(stderr, "prefix checking removed %lu bytes.\n", bt_pfxsaved); }
/* * BT_DPAGE -- Dump the page * * Parameters: * h: pointer to the PAGE */ void __bt_dpage(PAGE *h) { BINTERNAL *bi; BLEAF *bl; RINTERNAL *ri; RLEAF *rl; indx_t cur, top; char *sep; (void)fprintf(stderr, " page %u: (", h->pgno); #undef X #define X(flag, name) \ if (h->flags & flag) { \ (void)fprintf(stderr, "%s%s", sep, name); \ sep = ", "; \ } sep = ""; X(P_BINTERNAL, "BINTERNAL") /* types */ X(P_BLEAF, "BLEAF") X(P_RINTERNAL, "RINTERNAL") /* types */ X(P_RLEAF, "RLEAF") X(P_OVERFLOW, "OVERFLOW") X(P_PRESERVE, "PRESERVE"); (void)fprintf(stderr, ")\n"); #undef X (void)fprintf(stderr, "\tprev %2u next %2u", h->prevpg, h->nextpg); if (h->flags & P_OVERFLOW) return; top = NEXTINDEX(h); (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n", h->lower, h->upper, top); for (cur = 0; cur < top; cur++) { (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]); switch (h->flags & P_TYPE) { case P_BINTERNAL: bi = GETBINTERNAL(h, cur); (void)fprintf(stderr, "size %03d pgno %03d", bi->ksize, bi->pgno); if (bi->flags & P_BIGKEY) (void)fprintf(stderr, " (indirect)"); else if (bi->ksize) (void)fprintf(stderr, " {%.*s}", (int)bi->ksize, bi->bytes); break; case P_RINTERNAL: ri = GETRINTERNAL(h, cur); (void)fprintf(stderr, "entries %03d pgno %03d", ri->nrecs, ri->pgno); break; case P_BLEAF: bl = GETBLEAF(h, cur); if (bl->flags & P_BIGKEY) (void)fprintf(stderr, "big key page %u size %u/", *(pgno_t *)bl->bytes, *(u_int32_t *)(bl->bytes + sizeof(pgno_t))); else if (bl->ksize) (void)fprintf(stderr, "%s/", bl->bytes); if (bl->flags & P_BIGDATA) (void)fprintf(stderr, "big data page %u size %u", *(pgno_t *)(bl->bytes + bl->ksize), *(u_int32_t *)(bl->bytes + bl->ksize + sizeof(pgno_t))); else if (bl->dsize) (void)fprintf(stderr, "%.*s", (int)bl->dsize, bl->bytes + bl->ksize); break; case P_RLEAF: rl = GETRLEAF(h, cur); if (rl->flags & P_BIGDATA) (void)fprintf(stderr, "big data page %u size %u", *(pgno_t *)rl->bytes, *(u_int32_t *)(rl->bytes + sizeof(pgno_t))); else if (rl->dsize) (void)fprintf(stderr, "%.*s", (int)rl->dsize, rl->bytes); break; } (void)fprintf(stderr, "\n"); } }
/* * __bt_search -- * Search a btree for a key. * * Parameters: * t: tree to search * key: key to find * exactp: pointer to exact match flag * * Returns: * The EPG for matching record, if any, or the EPG for the location * of the key, if it were inserted into the tree, is entered into * the bt_cur field of the tree. A pointer to the field is returned. */ EPG * __bt_search(BTREE *t, const DBT *key, int *exactp) { PAGE *h; indx_t base, idx, lim; pgno_t pg; int cmp; BT_CLR(t); for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (NULL); /* Do a binary search on the current page. */ t->bt_cur.page = h; for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) { t->bt_cur.index = idx = base + (lim >> 1); if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) { if (h->flags & P_BLEAF) { *exactp = 1; return (&t->bt_cur); } goto next; } if (cmp > 0) { base = idx + 1; --lim; } } /* * If it's a leaf page, we're almost done. If no duplicates * are allowed, or we have an exact match, we're done. Else, * it's possible that there were matching keys on this page, * which later deleted, and we're on a page with no matches * while there are matches on other pages. If at the start or * end of a page, check the adjacent page. */ if (h->flags & P_BLEAF) { if (!F_ISSET(t, B_NODUPS)) { if (base == 0 && h->prevpg != P_INVALID && __bt_sprev(t, h, key, exactp)) return (&t->bt_cur); if (base == NEXTINDEX(h) && h->nextpg != P_INVALID && __bt_snext(t, h, key, exactp)) return (&t->bt_cur); } *exactp = 0; t->bt_cur.index = base; return (&t->bt_cur); } /* * No match found. Base is the smallest index greater than * key and may be zero or a last + 1 index. If it's non-zero, * decrement by one, and record the internal page which should * be a parent page for the key. If a split later occurs, the * inserted page will be to the right of the saved page. */ idx = base ? base - 1 : base; next: BT_PUSH(t, h->pgno, idx); pg = GETBINTERNAL(h, idx)->pgno; mpool_put(t->bt_mp, h, 0); } }
/* * __bt_search -- * Search a btree for a key. * * Parameters: * t: tree to search * key: key to find * exactp: pointer to exact match flag * * Returns: * The EPG for matching record, if any, or the EPG for the location * of the key, if it were inserted into the tree, is entered into * the bt_cur field of the tree. A pointer to the field is returned. */ EPG * __bt_search_st(BTREE *t,const DBT *key,int *exactp) { /* * 1.Read from root of the btree in NTT * 2.reconstruct the node */ PAGE *h=NULL; /* h is a logical B-Tree node, either a disk mode node or a virtual node in memory construct by log */ indx_t base, index, lim; pgno_t pg; //node id of the page int cmp; BT_CLR(t); /* @mx it initializes t->bt_sp */ err_debug(("Searh Btree")); for (pg = P_ROOT;;) { err_debug(("~^")); err_debug(("Read Node %ud",pg)); h = read_node(t,pg); //__bt_dpage(h); err_debug(("~$End Read")); if(h==NULL) return (NULL); /* ??? not so clear about the binary search */ /* Do a binary search on the current page. */ t->bt_cur.page = h; for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) { t->bt_cur.index = index = base + (lim >> 1); if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) { if (h->flags & P_BLEAF) { *exactp = 1; err_debug(("End Search")); return (&t->bt_cur); } goto next; } if (cmp > 0) { base = index + 1; --lim; } } /* * If it's a leaf page, we're almost done. If no duplicates * are allowed, or we have an exact match, we're done. Else, * it's possible that there were matching keys on this page, * which later deleted, and we're on a page with no matches * while there are matches on other pages. If at the start or * end of a page, check the adjacent page. * * TODO: what about this condition for log mode ? */ if (h->flags & P_BLEAF) { #if 0 if (!F_ISSET(t, B_NODUPS)) { if (base == 0 && h->prevpg != P_INVALID && __bt_sprev(t, h, key, exactp)) err_debug(("End Search")); return (&t->bt_cur); if (base == NEXTINDEX(h) && h->nextpg != P_INVALID && __bt_snext(t, h, key, exactp)) err_debug(("End Search\n")); return (&t->bt_cur); } #endif *exactp = 0; t->bt_cur.index = base; err_debug(("End Search")); return (&t->bt_cur); } /* * No match found. Base is the smallest index greater than * key and may be zero or a last + 1 index. If it's non-zero, * decrement by one, and record the internal page which should * be a parent page for the key. If a split later occurs, the * inserted page will be to the right of the saved page. */ index = base ? base - 1 : base; next: BT_PUSH(t, pg, index); pg = GETBINTERNAL(h, index)->pgno; Mpool_put(t->bt_mp, h, 0); } }