DB *
__rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo,
    int dflags)
{
	BTREE *t;
	BTREEINFO btopeninfo;
	DB *dbp;
	PAGE *h;
	struct stat sb;
	int rfd = -1;		/* pacify gcc */
	int sverrno;

	dbp = NULL;

	/* Open the user's file -- if this fails, we're done. */
	if (fname != NULL) {
		if ((rfd = __dbopen(fname, flags, mode, NULL)) == -1)
			return NULL;
	}

	/* Create a btree in memory (backed by disk). */
	if (openinfo) {
		if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT))
			goto einval;
		btopeninfo.flags = 0;
		btopeninfo.cachesize = openinfo->cachesize;
		btopeninfo.maxkeypage = 0;
		btopeninfo.minkeypage = 0;
		btopeninfo.psize = openinfo->psize;
		btopeninfo.compare = NULL;
		btopeninfo.prefix = NULL;
		btopeninfo.lorder = openinfo->lorder;
		dbp = __bt_open(openinfo->bfname,
		    O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags);
	} else
		dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags);
	if (dbp == NULL)
		goto err;

	/*
	 * Some fields in the tree structure are recno specific.  Fill them
	 * in and make the btree structure look like a recno structure.  We
	 * don't change the bt_ovflsize value, it's close enough and slightly
	 * bigger.
	 */
	t = dbp->internal;
	if (openinfo) {
		if (openinfo->flags & R_FIXEDLEN) {
			F_SET(t, R_FIXLEN);
			t->bt_reclen = openinfo->reclen;
			if (t->bt_reclen == 0)
				goto einval;
		}
		t->bt_bval = openinfo->bval;
	} else
		t->bt_bval = '\n';
	F_SET(t, R_RECNO);
	if (fname == NULL)
		F_SET(t, R_EOF | R_INMEM);
	else
		t->bt_rfd = rfd;

	if (fname != NULL) {
		/*
		 * In 4.4BSD, stat(2) returns true for ISSOCK on pipes.
		 * Unfortunately, that's not portable, so we use lseek
		 * and check the errno values.
		 */
		errno = 0;
		if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			default:
				goto einval;
			}
slow:			if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
				goto err;
			F_SET(t, R_CLOSEFP);
			t->bt_irec =
			    F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe;
		} else {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			case O_RDWR:
				break;
			default:
				goto einval;
			}

			if (fstat(rfd, &sb))
				goto err;
			/*
			 * Kluge -- we'd like to test to see if the file is too
			 * big to mmap.  Since we don't know what size or type
			 * off_t's or size_t's are, what the largest unsigned
			 * integral type is, or what random insanity the local
			 * C compiler will perpetrate, doing the comparison in
			 * a portable way is flatly impossible.  Hope that mmap
			 * fails if the file is too large.
			 */
			if (sb.st_size == 0)
				F_SET(t, R_EOF);
			else {
#ifdef MMAP_NOT_AVAILABLE
				/*
				 * XXX
				 * Mmap doesn't work correctly on many current
				 * systems.  In particular, it can fail subtly,
				 * with cache coherency problems.  Don't use it
				 * for now.
				 */
				t->bt_msize = sb.st_size;
				if ((t->bt_smap = mmap(NULL, t->bt_msize,
				    PROT_READ, MAP_FILE | MAP_PRIVATE, rfd,
				    (off_t)0)) == (caddr_t)-1)
					goto slow;
				t->bt_cmap = t->bt_smap;
				t->bt_emap = t->bt_smap + sb.st_size;
				t->bt_irec = F_ISSET(t, R_FIXLEN) ?
				    __rec_fmap : __rec_vmap;
				F_SET(t, R_MEMMAPPED);
#else
				goto slow;
#endif
			}
		}
	}

	/* Use the recno routines. */
	dbp->close = __rec_close;
	dbp->del = __rec_delete;
	dbp->fd = __rec_fd;
	dbp->get = __rec_get;
	dbp->put = __rec_put;
	dbp->seq = __rec_seq;
	dbp->sync = __rec_sync;

	/* If the root page was created, reset the flags. */
	if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
		goto err;
	if ((h->flags & P_TYPE) == P_BLEAF) {
		F_CLR(h, P_TYPE);
		F_SET(h, P_RLEAF);
		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
	} else
		mpool_put(t->bt_mp, h, 0);

	if (openinfo && openinfo->flags & R_SNAPSHOT &&
	    !F_ISSET(t, R_EOF | R_INMEM) &&
	    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
		goto err;
	return (dbp);

einval:	errno = EINVAL;
err:	sverrno = errno;
	if (dbp != NULL)
		(void)__bt_close(dbp);
	if (fname != NULL)
		(void)close(rfd);
	errno = sverrno;
	return (NULL);
}
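
/*
 * Example (an illustrative sketch, not part of the library): applications
 * reach __rec_open through the public dbopen(3) entry point with the
 * DB_RECNO type and an optional RECNOINFO structure.  The file name used
 * here is a placeholder.
 */
#include <db.h>
#include <fcntl.h>
#include <string.h>

static DB *
open_notes(void)
{
	RECNOINFO ri;

	memset(&ri, 0, sizeof(ri));	/* 0 flags/cachesize/psize: defaults */
	ri.bval = '\n';			/* variable-length, newline-terminated */

	return (dbopen("notes.txt", O_RDWR | O_CREAT, 0644, DB_RECNO, &ri));
}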
/*
 * __REC_SYNC -- sync the recno tree to disk.
 *
 * Parameters:
 *	dbp:	pointer to access method
 *
 * Returns:
 *	RET_SUCCESS, RET_ERROR.
 */
int
__rec_sync(const DB *dbp, u_int flags)
{
	struct iovec iov[2];
	BTREE *t;
	DBT data, key;
	off_t off;
	recno_t scursor, trec;
	int status;

	t = dbp->internal;

	/* Toss any page pinned across calls. */
	if (t->bt_pinned != NULL) {
		mpool_put(t->bt_mp, t->bt_pinned, 0);
		t->bt_pinned = NULL;
	}

	if (flags == R_RECNOSYNC)
		return (__bt_sync(dbp, 0));

	if (F_ISSET(t, R_RDONLY | R_INMEM) || !F_ISSET(t, R_MODIFIED))
		return (RET_SUCCESS);

	/* Read any remaining records into the tree. */
	if (!F_ISSET(t, R_EOF) && t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
		return (RET_ERROR);

	/* Rewind the file descriptor. */
	if (lseek(t->bt_rfd, (off_t)0, SEEK_SET) != 0)
		return (RET_ERROR);

	/* Save the cursor. */
	scursor = t->bt_cursor.rcursor;

	key.size = sizeof(recno_t);
	key.data = &trec;

	if (F_ISSET(t, R_FIXLEN)) {
		/*
		 * We assume that fixed length records are all fixed length.
		 * Any that aren't are either EINVAL'd or corrected by the
		 * record put code.
		 */
		status = (dbp->seq)(dbp, &key, &data, R_FIRST);
		while (status == RET_SUCCESS) {
			if (_write(t->bt_rfd, data.data, data.size) !=
			    (ssize_t)data.size)
				return (RET_ERROR);
			status = (dbp->seq)(dbp, &key, &data, R_NEXT);
		}
	} else {
		iov[1].iov_base = &t->bt_bval;
		iov[1].iov_len = 1;
		status = (dbp->seq)(dbp, &key, &data, R_FIRST);
		while (status == RET_SUCCESS) {
			iov[0].iov_base = data.data;
			iov[0].iov_len = data.size;
			if (_writev(t->bt_rfd, iov, 2) !=
			    (ssize_t)(data.size + 1))
				return (RET_ERROR);
			status = (dbp->seq)(dbp, &key, &data, R_NEXT);
		}
	}

	/* Restore the cursor. */
	t->bt_cursor.rcursor = scursor;

	if (status == RET_ERROR)
		return (RET_ERROR);
	if ((off = lseek(t->bt_rfd, (off_t)0, SEEK_CUR)) == -1)
		return (RET_ERROR);
	if (ftruncate(t->bt_rfd, off))
		return (RET_ERROR);
	F_CLR(t, R_MODIFIED);
	return (RET_SUCCESS);
}
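
/*
 * Example (an illustrative sketch, not part of the library): from the
 * application side the two sync modes are selected by the flags argument
 * to the sync method -- 0 rewrites the flat-text recno file, while
 * R_RECNOSYNC syncs the underlying btree file instead (useful when
 * RECNOINFO.bfname was set at open time).
 */
#include <db.h>

static int
flush_recno(DB *db)
{
	if ((db->sync)(db, 0) == -1)		/* rewrite the text file */
		return (-1);
	return ((db->sync)(db, R_RECNOSYNC));	/* sync the backing btree */
}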
/*
 * __bt_curdel --
 *	Delete the cursor.
 *
 * Parameters:
 *	t:	tree
 *	key:	referenced key (or NULL)
 *	h:	page
 *	idx:	index on page to delete
 *
 * Returns:
 *	RET_SUCCESS, RET_ERROR.
 */
static int
__bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx)
{
	CURSOR *c;
	EPG e;
	PAGE *pg;
	int curcopy, status;

	/*
	 * If there are duplicates, move forward or backward to one.
	 * Otherwise, copy the key into the cursor area.
	 */
	c = &t->bt_cursor;
	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);

	curcopy = 0;
	if (!F_ISSET(t, B_NODUPS)) {
		/*
		 * We're going to have to do comparisons.  If we weren't
		 * provided a copy of the key, i.e. the user is deleting
		 * the current cursor position, get one.
		 */
		if (key == NULL) {
			e.page = h;
			e.index = idx;
			if ((status = __bt_ret(t, &e,
			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
				return (status);
			curcopy = 1;
			key = &c->key;
		}
		/* Check previous key, if not at the beginning of the page. */
		if (idx > 0) {
			e.page = h;
			e.index = idx - 1;
			if (__bt_cmp(t, key, &e) == 0) {
				F_SET(c, CURS_BEFORE);
				goto dup2;
			}
		}
		/* Check next key, if not at the end of the page. */
		if (idx < NEXTINDEX(h) - 1) {
			e.page = h;
			e.index = idx + 1;
			if (__bt_cmp(t, key, &e) == 0) {
				F_SET(c, CURS_AFTER);
				goto dup2;
			}
		}
		/* Check previous key if at the beginning of the page. */
		if (idx == 0 && h->prevpg != P_INVALID) {
			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
				return (RET_ERROR);
			e.page = pg;
			e.index = NEXTINDEX(pg) - 1;
			if (__bt_cmp(t, key, &e) == 0) {
				F_SET(c, CURS_BEFORE);
				goto dup1;
			}
			mpool_put(t->bt_mp, pg, 0);
		}
		/* Check next key if at the end of the page. */
		if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
				return (RET_ERROR);
			e.page = pg;
			e.index = 0;
			if (__bt_cmp(t, key, &e) == 0) {
				F_SET(c, CURS_AFTER);
dup1:				mpool_put(t->bt_mp, pg, 0);
dup2:				c->pg.pgno = e.page->pgno;
				c->pg.index = e.index;
				return (RET_SUCCESS);
			}
			mpool_put(t->bt_mp, pg, 0);
		}
	}
	e.page = h;
	e.index = idx;
	if (curcopy || (status = __bt_ret(t, &e,
	    &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
		F_SET(c, CURS_ACQUIRE);
		return (RET_SUCCESS);
	}
	return (status);
}
/*
 * __bt_delete
 *	Delete the item(s) referenced by a key.
 *
 * Return RET_SPECIAL if the key is not found.
 */
int
__bt_delete(const DB *dbp, const DBT *key, u_int flags)
{
	BTREE *t;
	CURSOR *c;
	PAGE *h;
	int status;

	t = dbp->internal;

	/* Toss any page pinned across calls. */
	if (t->bt_pinned != NULL) {
		mpool_put(t->bt_mp, t->bt_pinned, 0);
		t->bt_pinned = NULL;
	}

	/* Check for change to a read-only tree. */
	if (F_ISSET(t, B_RDONLY)) {
		errno = EPERM;
		return (RET_ERROR);
	}

	switch (flags) {
	case 0:
		status = __bt_bdelete(t, key);
		break;
	case R_CURSOR:
		/*
		 * If flags is R_CURSOR, delete the cursor.  Must already
		 * have started a scan and not have already deleted it.
		 */
		c = &t->bt_cursor;
		if (F_ISSET(c, CURS_INIT)) {
			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
				return (RET_SPECIAL);
			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
				return (RET_ERROR);

			/*
			 * If the page is about to be emptied, we'll need to
			 * delete it, which means we have to acquire a stack.
			 */
			if (NEXTINDEX(h) == 1)
				if (__bt_stkacq(t, &h, &t->bt_cursor))
					return (RET_ERROR);

			status = __bt_dleaf(t, NULL, h, c->pg.index);

			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
				if (__bt_pdelete(t, h))
					return (RET_ERROR);
			} else
				mpool_put(t->bt_mp, h,
				    status == RET_SUCCESS ? MPOOL_DIRTY : 0);
			break;
		}
		/* FALLTHROUGH */
	default:
		errno = EINVAL;
		return (RET_ERROR);
	}
	if (status == RET_SUCCESS)
		F_SET(t, B_MODIFIED);
	return (status);
}
/*
 * __bt_search --
 *	Search a btree for a key.
 *
 * Parameters:
 *	t:	tree to search
 *	key:	key to find
 *	exactp:	pointer to exact match flag
 *
 * Returns:
 *	The EPG for matching record, if any, or the EPG for the location
 *	of the key, if it were inserted into the tree, is entered into
 *	the bt_cur field of the tree.  A pointer to the field is returned.
 */
EPG *
__bt_search(BTREE *t, const DBT *key, int *exactp)
{
	PAGE *h;
	indx_t base, idx, lim;
	pgno_t pg;
	int cmp;

	BT_CLR(t);
	for (pg = P_ROOT;;) {
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			return (NULL);

		/* Do a binary search on the current page. */
		t->bt_cur.page = h;
		for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
			t->bt_cur.index = idx = base + (lim >> 1);
			if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) {
				if (h->flags & P_BLEAF) {
					*exactp = 1;
					return (&t->bt_cur);
				}
				goto next;
			}
			if (cmp > 0) {
				base = idx + 1;
				--lim;
			}
		}

		/*
		 * If it's a leaf page, we're almost done.  If no duplicates
		 * are allowed, or we have an exact match, we're done.  Else,
		 * it's possible that there were matching keys on this page,
		 * which were later deleted, and we're on a page with no
		 * matches while there are matches on other pages.  If at the
		 * start or end of a page, check the adjacent page.
		 */
		if (h->flags & P_BLEAF) {
			if (!F_ISSET(t, B_NODUPS)) {
				if (base == 0 &&
				    h->prevpg != P_INVALID &&
				    __bt_sprev(t, h, key, exactp))
					return (&t->bt_cur);
				if (base == NEXTINDEX(h) &&
				    h->nextpg != P_INVALID &&
				    __bt_snext(t, h, key, exactp))
					return (&t->bt_cur);
			}
			*exactp = 0;
			t->bt_cur.index = base;
			return (&t->bt_cur);
		}

		/*
		 * No match found.  Base is the smallest index greater than
		 * key and may be zero or a last + 1 index.  If it's non-zero,
		 * decrement by one, and record the internal page which should
		 * be a parent page for the key.  If a split later occurs, the
		 * inserted page will be to the right of the saved page.
		 */
		idx = base ? base - 1 : base;

next:		BT_PUSH(t, h->pgno, idx);
		pg = GETBINTERNAL(h, idx)->pgno;
		mpool_put(t->bt_mp, h, 0);
	}
}
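
/*
 * Example (an illustrative sketch, not part of the library): the same
 * base/lim halving idiom that __bt_search uses on a page's index array,
 * shown on a plain sorted int array.  On exit without an exact match,
 * "base" is the number of elements less than the key, i.e. the position
 * where the key would be inserted.
 */
#include <stddef.h>

static size_t
page_style_search(const int *a, size_t n, int key, int *exact)
{
	size_t base, idx, lim;

	*exact = 0;
	for (base = 0, lim = n; lim; lim >>= 1) {
		idx = base + (lim >> 1);
		if (a[idx] == key) {
			*exact = 1;	/* analogous to *exactp = 1 above */
			return (idx);
		}
		if (a[idx] < key) {	/* key sorts after the probed entry */
			base = idx + 1;
			--lim;
		}
	}
	return (base);
}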
/*
 * __bt_pdelete --
 *	Delete a single page from the tree.
 *
 * Parameters:
 *	t:	tree
 *	h:	leaf page
 *
 * Returns:
 *	RET_SUCCESS, RET_ERROR.
 *
 * Side-effects:
 *	mpool_put's the page
 */
static int
__bt_pdelete(BTREE *t, PAGE *h)
{
	BINTERNAL *bi;
	PAGE *pg;
	EPGNO *parent;
	indx_t cnt, idx, *ip, offset;
	u_int32_t nksize;
	char *from;

	/*
	 * Walk the parent page stack -- a LIFO stack of the pages that were
	 * traversed when we searched for the page where the delete occurred.
	 * Each stack entry is a page number and a page index offset.  The
	 * offset is for the page traversed on the search.  We've just deleted
	 * a page, so we have to delete the key from the parent page.
	 *
	 * If the delete from the parent page makes it empty, this process may
	 * continue all the way up the tree.  We stop if we reach the root page
	 * (which is never deleted, it's just not worth the effort) or if the
	 * delete does not empty the page.
	 */
	while ((parent = BT_POP(t)) != NULL) {
		/* Get the parent page. */
		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
			return (RET_ERROR);

		idx = parent->index;
		bi = GETBINTERNAL(pg, idx);

		/* Free any overflow pages. */
		if (bi->flags & P_BIGKEY &&
		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
			mpool_put(t->bt_mp, pg, 0);
			return (RET_ERROR);
		}

		/*
		 * Free the parent if it has only the one key and it's not the
		 * root page.  If it's the rootpage, turn it back into an empty
		 * leaf page.
		 */
		if (NEXTINDEX(pg) == 1) {
			if (pg->pgno == P_ROOT) {
				pg->lower = BTDATAOFF;
				pg->upper = t->bt_psize;
				pg->flags = P_BLEAF;
			} else {
				if (__bt_relink(t, pg) || __bt_free(t, pg))
					return (RET_ERROR);
				continue;
			}
		} else {
			/* Pack remaining key items at the end of the page. */
			nksize = NBINTERNAL(bi->ksize);
			from = (char *)pg + pg->upper;
			memmove(from + nksize, from, (char *)bi - from);
			pg->upper += nksize;

			/* Adjust indices' offsets, shift the indices down. */
			offset = pg->linp[idx];
			for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip)
				if (ip[0] < offset)
					ip[0] += nksize;
			for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip)
				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
			pg->lower -= sizeof(indx_t);
		}

		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
		break;
	}

	/* Free the leaf page, as long as it wasn't the root. */
	if (h->pgno == P_ROOT) {
		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
		return (RET_SUCCESS);
	}
	return (__bt_relink(t, h) || __bt_free(t, h));
}
/*
 * __REC_SEARCH -- Search a btree for a key.
 *
 * Parameters:
 *	t:	tree to search
 *	recno:	key to find
 *	op:	search operation
 *
 * Returns:
 *	The EPG for matching record, if any, or the EPG for the location
 *	of the key, if it were inserted into the tree, is entered into
 *	the bt_cur field of the tree.  A pointer to the field is returned.
 */
EPG *
__rec_search(BTREE *t, recno_t recno, enum SRCHOP op)
{
	indx_t idx;
	PAGE *h;
	EPGNO *parent;
	RINTERNAL *r;
	pgno_t pg;
	indx_t top;
	recno_t total;
	int sverrno;

	BT_CLR(t);
	for (pg = P_ROOT, total = 0;;) {
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			goto err;
		if (h->flags & P_RLEAF) {
			t->bt_cur.page = h;
			t->bt_cur.index = recno - total;
			return (&t->bt_cur);
		}
		for (idx = 0, top = NEXTINDEX(h);;) {
			r = GETRINTERNAL(h, idx);
			if (++idx == top || total + r->nrecs > recno)
				break;
			total += r->nrecs;
		}

		BT_PUSH(t, pg, idx - 1);

		pg = r->pgno;
		switch (op) {
		case SDELETE:
			--GETRINTERNAL(h, (idx - 1))->nrecs;
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			break;
		case SINSERT:
			++GETRINTERNAL(h, (idx - 1))->nrecs;
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			break;
		case SEARCH:
			mpool_put(t->bt_mp, h, 0);
			break;
		}
	}
	/* Try and recover the tree. */
err:	sverrno = errno;
	if (op != SEARCH)
		while ((parent = BT_POP(t)) != NULL) {
			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
				break;
			if (op == SINSERT)
				--GETRINTERNAL(h, parent->index)->nrecs;
			else
				++GETRINTERNAL(h, parent->index)->nrecs;
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
		}
	errno = sverrno;
	return (NULL);
}
/*
 * __bt_seqset --
 *	Set the sequential scan to a specific key.
 *
 * Parameters:
 *	t:	tree
 *	ep:	storage for returned key
 *	key:	key for initial scan position
 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
 *
 * Side effects:
 *	Pins the page the cursor references.
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
 */
static int
__bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
{
	PAGE *h;
	pgno_t pg;
	int exact;

	/*
	 * Find the first, last or specific key in the tree and point the
	 * cursor at it.  The cursor may not be moved until a new key has
	 * been found.
	 */
	switch (flags) {
	case R_CURSOR:				/* Keyed scan. */
		/*
		 * Find the first instance of the key or the smallest key
		 * which is greater than or equal to the specified key.
		 */
		if (key->data == NULL || key->size == 0) {
			errno = EINVAL;
			return (RET_ERROR);
		}
		return (__bt_first(t, key, ep, &exact));
	case R_FIRST:				/* First record. */
	case R_NEXT:
		/* Walk down the left-hand side of the tree. */
		for (pg = P_ROOT;;) {
			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
				return (RET_ERROR);

			/* Check for an empty tree. */
			if (NEXTINDEX(h) == 0) {
				mpool_put(t->bt_mp, h, 0);
				return (RET_SPECIAL);
			}

			if (h->flags & (P_BLEAF | P_RLEAF))
				break;
			pg = GETBINTERNAL(h, 0)->pgno;
			mpool_put(t->bt_mp, h, 0);
		}
		ep->page = h;
		ep->index = 0;
		break;
	case R_LAST:				/* Last record. */
	case R_PREV:
		/* Walk down the right-hand side of the tree. */
		for (pg = P_ROOT;;) {
			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
				return (RET_ERROR);

			/* Check for an empty tree. */
			if (NEXTINDEX(h) == 0) {
				mpool_put(t->bt_mp, h, 0);
				return (RET_SPECIAL);
			}

			if (h->flags & (P_BLEAF | P_RLEAF))
				break;
			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
			mpool_put(t->bt_mp, h, 0);
		}
		ep->page = h;
		ep->index = NEXTINDEX(h) - 1;
		break;
	}
	return (RET_SUCCESS);
}
/*
 * __REC_IPUT -- Add a recno item to the tree.
 *
 * Parameters:
 *	t:	tree
 *	nrec:	record number
 *	data:	data
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__rec_iput(BTREE *t, recno_t nrec, const DBT *data, u_int flags)
{
	DBT tdata;
	EPG *e;
	PAGE *h;
	indx_t idx, nxtindex;
	pgno_t pg;
	uint32_t nbytes;
	int dflags, status;
	char *dest, db[NOVFLSIZE];

	/*
	 * If the data won't fit on a page, store it on indirect pages.
	 *
	 * XXX
	 * If the insert fails later on, these pages aren't recovered.
	 */
	if (data->size > t->bt_ovflsize) {
		if (__ovfl_put(t, data, &pg) == RET_ERROR)
			return (RET_ERROR);
		tdata.data = db;
		tdata.size = NOVFLSIZE;
		*(pgno_t *)(void *)db = pg;
		_DBFIT(data->size, uint32_t);
		*(uint32_t *)(void *)(db + sizeof(pgno_t)) =
		    (uint32_t)data->size;
		dflags = P_BIGDATA;
		data = &tdata;
	} else
		dflags = 0;

	/* __rec_search pins the returned page. */
	if ((e = __rec_search(t, nrec,
	    nrec > t->bt_nrecs || flags == R_IAFTER || flags == R_IBEFORE ?
	    SINSERT : SEARCH)) == NULL)
		return (RET_ERROR);

	h = e->page;
	idx = e->index;

	/*
	 * Add the specified key/data pair to the tree.  The R_IAFTER and
	 * R_IBEFORE flags insert the key after/before the specified key.
	 *
	 * Pages are split as required.
	 */
	switch (flags) {
	case R_IAFTER:
		++idx;
		break;
	case R_IBEFORE:
		break;
	default:
		if (nrec < t->bt_nrecs &&
		    __rec_dleaf(t, h, (uint32_t)idx) == RET_ERROR) {
			mpool_put(t->bt_mp, h, 0);
			return (RET_ERROR);
		}
		break;
	}

	/*
	 * If not enough room, split the page.  The split code will insert
	 * the key and data and unpin the current page.  If inserting into
	 * the offset array, shift the pointers up.
	 */
	nbytes = NRLEAFDBT(data->size);
	if ((uint32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
		status = __bt_split(t, h, NULL, data, dflags, nbytes,
		    (uint32_t)idx);
		if (status == RET_SUCCESS)
			++t->bt_nrecs;
		return (status);
	}

	if (idx < (nxtindex = NEXTINDEX(h)))
		memmove(h->linp + idx + 1, h->linp + idx,
		    (nxtindex - idx) * sizeof(indx_t));
	h->lower += sizeof(indx_t);

	h->linp[idx] = h->upper -= nbytes;
	dest = (char *)(void *)h + h->upper;
	WR_RLEAF(dest, data, dflags);

	++t->bt_nrecs;
	F_SET(t, B_MODIFIED);
	mpool_put(t->bt_mp, h, MPOOL_DIRTY);

	return (RET_SUCCESS);
}
/*
 * __REC_PUT -- Add a recno item to the tree.
 *
 * Parameters:
 *	dbp:	pointer to access method
 *	key:	key
 *	data:	data
 *	flag:	R_CURSOR, R_IAFTER, R_IBEFORE, R_NOOVERWRITE
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is
 *	already in the tree and R_NOOVERWRITE specified.
 */
int
__rec_put(const DB *dbp, DBT *key, const DBT *data, u_int flags)
{
	BTREE *t;
	DBT fdata, tdata;
	recno_t nrec;
	int status;

	t = dbp->internal;

	/* Toss any page pinned across calls. */
	if (t->bt_pinned != NULL) {
		mpool_put(t->bt_mp, t->bt_pinned, 0);
		t->bt_pinned = NULL;
	}

	/*
	 * If using fixed-length records, and the record is long, return
	 * EINVAL.  If it's short, pad it out.  Use the record data return
	 * memory, it's only short-term.
	 */
	if (F_ISSET(t, R_FIXLEN) && data->size != t->bt_reclen) {
		if (data->size > t->bt_reclen)
			goto einval;

		if (t->bt_rdata.size < t->bt_reclen) {
			t->bt_rdata.data = t->bt_rdata.data == NULL ?
			    malloc(t->bt_reclen) :
			    realloc(t->bt_rdata.data, t->bt_reclen);
			if (t->bt_rdata.data == NULL)
				return (RET_ERROR);
			t->bt_rdata.size = t->bt_reclen;
		}
		memmove(t->bt_rdata.data, data->data, data->size);
		memset((char *)t->bt_rdata.data + data->size,
		    t->bt_bval, t->bt_reclen - data->size);
		fdata.data = t->bt_rdata.data;
		fdata.size = t->bt_reclen;
	} else {
		fdata.data = data->data;
		fdata.size = data->size;
	}

	switch (flags) {
	case R_CURSOR:
		if (!F_ISSET(&t->bt_cursor, CURS_INIT))
			goto einval;
		nrec = t->bt_cursor.rcursor;
		break;
	case R_SETCURSOR:
		if ((nrec = *(recno_t *)key->data) == 0)
			goto einval;
		break;
	case R_IAFTER:
		if ((nrec = *(recno_t *)key->data) == 0) {
			nrec = 1;
			flags = R_IBEFORE;
		}
		break;
	case 0:
	case R_IBEFORE:
		if ((nrec = *(recno_t *)key->data) == 0)
			goto einval;
		break;
	case R_NOOVERWRITE:
		if ((nrec = *(recno_t *)key->data) == 0)
			goto einval;
		if (nrec <= t->bt_nrecs)
			return (RET_SPECIAL);
		break;
	default:
einval:		errno = EINVAL;
		return (RET_ERROR);
	}

	/*
	 * Make sure that records up to and including the put record are
	 * already in the database.  If skipping records, create empty ones.
	 */
	if (nrec > t->bt_nrecs) {
		if (!F_ISSET(t, R_EOF | R_INMEM) &&
		    t->bt_irec(t, nrec) == RET_ERROR)
			return (RET_ERROR);
		if (nrec > t->bt_nrecs + 1) {
			if (F_ISSET(t, R_FIXLEN)) {
				if ((tdata.data =
				    (void *)malloc(t->bt_reclen)) == NULL)
					return (RET_ERROR);
				tdata.size = t->bt_reclen;
				memset(tdata.data, t->bt_bval, tdata.size);
			} else {
				tdata.data = NULL;
				tdata.size = 0;
			}
			while (nrec > t->bt_nrecs + 1)
				if (__rec_iput(t,
				    t->bt_nrecs, &tdata, 0) != RET_SUCCESS)
					return (RET_ERROR);
			if (F_ISSET(t, R_FIXLEN))
				free(tdata.data);
		}
	}

	if ((status = __rec_iput(t, nrec - 1, &fdata, flags)) != RET_SUCCESS)
		return (status);

	if (flags == R_SETCURSOR)
		t->bt_cursor.rcursor = nrec;

	F_SET(t, R_MODIFIED);
	return (__rec_ret(t, NULL, nrec, key, NULL));
}
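
/*
 * Example (an illustrative sketch, not part of the library): storing a
 * record through the public put interface that dispatches to __rec_put.
 * Recno keys are recno_t record numbers (1-based); flag 0 stores or
 * overwrites the record.
 */
#include <db.h>
#include <string.h>

static int
put_record(DB *db, recno_t rec, char *text)
{
	DBT key, data;

	key.data = &rec;
	key.size = sizeof(rec);
	data.data = text;
	data.size = strlen(text);	/* bval terminator is added on sync */
	return ((db->put)(db, &key, &data, 0));
}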
/*
 * __BT_SPLIT -- Split the tree.
 *
 * Parameters:
 *	t:	tree
 *	sp:	page to split
 *	key:	key to insert
 *	data:	data to insert
 *	flags:	BIGKEY/BIGDATA flags
 *	ilen:	insert length
 *	skip:	index to leave open
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
int
__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
    size_t ilen, u_int32_t argskip)
{
	BINTERNAL *bi;
	BLEAF *bl, *tbl;
	DBT a, b;
	EPGNO *parent;
	PAGE *h, *l, *r, *lchild, *rchild;
	indx_t nxtindex;
	u_int16_t skip;
	u_int32_t n, nbytes, nksize;
	int parentsplit;
	char *dest;

	/*
	 * Split the page into two pages, l and r.  The split routines return
	 * a pointer to the page into which the key should be inserted and with
	 * skip set to the offset which should be used.  Additionally, l and r
	 * are pinned.
	 */
	skip = argskip;
	h = sp->pgno == P_ROOT ?
	    bt_root(t, sp, &l, &r, &skip, ilen) :
	    bt_page(t, sp, &l, &r, &skip, ilen);
	if (h == NULL)
		return (RET_ERROR);

	/*
	 * Insert the new key/data pair into the leaf page.  (Key inserts
	 * always cause a leaf page to split first.)
	 */
	h->linp[skip] = h->upper -= ilen;
	dest = (char *)h + h->upper;
	if (F_ISSET(t, R_RECNO))
		WR_RLEAF(dest, data, flags)
	else
		WR_BLEAF(dest, key, data, flags)

	/* If the root page was split, make it look right. */
	if (sp->pgno == P_ROOT &&
	    (F_ISSET(t, R_RECNO) ?
	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
		goto err2;

	/*
	 * Now we walk the parent page stack -- a LIFO stack of the pages that
	 * were traversed when we searched for the page that split.  Each stack
	 * entry is a page number and a page index offset.  The offset is for
	 * the page traversed on the search.  We've just split a page, so we
	 * have to insert a new key into the parent page.
	 *
	 * If the insert into the parent page causes it to split, may have to
	 * continue splitting all the way up the tree.  We stop if the root
	 * splits or the page inserted into didn't have to split to hold the
	 * new key.  Some algorithms replace the key for the old page as well
	 * as the new page.  We don't, as there's no reason to believe that the
	 * first key on the old page is any better than the key we have, and,
	 * in the case of a key being placed at index 0 causing the split, the
	 * key is unavailable.
	 *
	 * There are a maximum of 5 pages pinned at any time.  We keep the left
	 * and right pages pinned while working on the parent.  The 5 are the
	 * two children, left parent and right parent (when the parent splits)
	 * and the root page or the overflow key page when calling bt_preserve.
	 * This code must make sure that all pins are released other than the
	 * root page or overflow page which is unlocked elsewhere.
	 */
	while ((parent = BT_POP(t)) != NULL) {
		lchild = l;
		rchild = r;

		/* Get the parent page. */
		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
			goto err2;

		/*
		 * The new key goes ONE AFTER the index, because the split
		 * was to the right.
		 */
		skip = parent->index + 1;

		/*
		 * Calculate the space needed on the parent page.
		 *
		 * Prefix trees: space hack when inserting into BINTERNAL
		 * pages.  Retain only what's needed to distinguish between
		 * the new entry and the LAST entry on the page to its left.
		 * If the keys compare equal, retain the entire key.  Note,
		 * we don't touch overflow keys, and the entire key must be
		 * retained for the next-to-left most key on the leftmost
		 * page of each level, or the search will fail.  Applicable
		 * ONLY to internal pages that have leaf pages as children.
		 * Further reduction of the key between pairs of internal
		 * pages loses too much information.
		 */
		switch (rchild->flags & P_TYPE) {
		case P_BINTERNAL:
			bi = GETBINTERNAL(rchild, 0);
			nbytes = NBINTERNAL(bi->ksize);
			break;
		case P_BLEAF:
			bl = GETBLEAF(rchild, 0);
			nbytes = NBINTERNAL(bl->ksize);
			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
			    (h->prevpg != P_INVALID || skip > 1)) {
				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
				a.size = tbl->ksize;
				a.data = tbl->bytes;
				b.size = bl->ksize;
				b.data = bl->bytes;
				nksize = t->bt_pfx(&a, &b);
				n = NBINTERNAL(nksize);
				if (n < nbytes) {
#ifdef STATISTICS
					bt_pfxsaved += nbytes - n;
#endif
					nbytes = n;
				} else
					nksize = 0;
			} else
				nksize = 0;
			break;
		case P_RINTERNAL:
		case P_RLEAF:
			nbytes = NRINTERNAL;
			break;
		default:
			abort();
		}

		/* Split the parent page if necessary or shift the indices. */
		if ((u_int32_t)(h->upper - h->lower) <
		    nbytes + sizeof(indx_t)) {
			sp = h;
			h = h->pgno == P_ROOT ?
			    bt_root(t, h, &l, &r, &skip, nbytes) :
			    bt_page(t, h, &l, &r, &skip, nbytes);
			if (h == NULL)
				goto err1;
			parentsplit = 1;
		} else {
			if (skip < (nxtindex = NEXTINDEX(h)))
				memmove(h->linp + skip + 1, h->linp + skip,
				    (nxtindex - skip) * sizeof(indx_t));
			h->lower += sizeof(indx_t);
			parentsplit = 0;
		}

		/* Insert the key into the parent page. */
		switch (rchild->flags & P_TYPE) {
		case P_BINTERNAL:
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)h + h->linp[skip];
			memmove(dest, bi, nbytes);
			((BINTERNAL *)dest)->pgno = rchild->pgno;
			break;
		case P_BLEAF:
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)h + h->linp[skip];
			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
			    rchild->pgno, bl->flags & P_BIGKEY);
			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
			if (bl->flags & P_BIGKEY) {
				pgno_t pgno;
				memcpy(&pgno, bl->bytes, sizeof(pgno));
				if (bt_preserve(t, pgno) == RET_ERROR)
					goto err1;
			}
			break;
		case P_RINTERNAL:
			/*
			 * Update the left page count.  If split
			 * added at index 0, fix the correct page.
			 */
			if (skip > 0)
				dest = (char *)h + h->linp[skip - 1];
			else
				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
			((RINTERNAL *)dest)->pgno = lchild->pgno;

			/* Update the right page count. */
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)h + h->linp[skip];
			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
			((RINTERNAL *)dest)->pgno = rchild->pgno;
			break;
		case P_RLEAF:
			/*
			 * Update the left page count.  If split
			 * added at index 0, fix the correct page.
			 */
			if (skip > 0)
				dest = (char *)h + h->linp[skip - 1];
			else
				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
			((RINTERNAL *)dest)->pgno = lchild->pgno;

			/* Update the right page count. */
			h->linp[skip] = h->upper -= nbytes;
			dest = (char *)h + h->linp[skip];
			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
			((RINTERNAL *)dest)->pgno = rchild->pgno;
			break;
		default:
			abort();
		}

		/* Unpin the held pages. */
		if (!parentsplit) {
			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			break;
		}

		/* If the root page was split, make it look right. */
		if (sp->pgno == P_ROOT &&
		    (F_ISSET(t, R_RECNO) ?
		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
			goto err1;

		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
	}

	/* Unpin the held pages. */
	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
	mpool_put(t->bt_mp, r, MPOOL_DIRTY);

	/* Clear any pages left on the stack. */
	return (RET_SUCCESS);

	/*
	 * If something fails in the above loop we were already walking back
	 * up the tree and the tree is now inconsistent.  Nothing much we can
	 * do about it but release any memory we're holding.
	 */
err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);

err2:	mpool_put(t->bt_mp, l, 0);
	mpool_put(t->bt_mp, r, 0);
	__dbpanic(t->bt_dbp);
	return (RET_ERROR);
}
/*
 * BT_BROOT -- Fix up the btree root page after it has been split.
 *
 * Parameters:
 *	t:	tree
 *	h:	root page
 *	l:	left page
 *	r:	right page
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS
 */
static int
bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
{
	BINTERNAL *bi;
	BLEAF *bl;
	u_int32_t nbytes;
	char *dest;

	/*
	 * If the root page was a leaf page, change it into an internal page.
	 * We copy the key we split on (but not the key's data, in the case of
	 * a leaf page) to the new root page.
	 *
	 * The btree comparison code guarantees that the left-most key on any
	 * level of the tree is never used, so it doesn't need to be filled in.
	 */
	nbytes = NBINTERNAL(0);
	h->linp[0] = h->upper = t->bt_psize - nbytes;
	dest = (char *)h + h->upper;
	WR_BINTERNAL(dest, 0, l->pgno, 0);

	switch (h->flags & P_TYPE) {
	case P_BLEAF:
		bl = GETBLEAF(r, 0);
		nbytes = NBINTERNAL(bl->ksize);
		__PAST_END(h->linp, 1) = h->upper -= nbytes;
		dest = (char *)h + h->upper;
		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
		memmove(dest, bl->bytes, bl->ksize);

		/*
		 * If the key is on an overflow page, mark the overflow chain
		 * so it isn't deleted when the leaf copy of the key is deleted.
		 */
		if (bl->flags & P_BIGKEY) {
			pgno_t pgno;
			memcpy(&pgno, bl->bytes, sizeof(pgno));
			if (bt_preserve(t, pgno) == RET_ERROR)
				return (RET_ERROR);
		}
		break;
	case P_BINTERNAL:
		bi = GETBINTERNAL(r, 0);
		nbytes = NBINTERNAL(bi->ksize);
		__PAST_END(h->linp, 1) = h->upper -= nbytes;
		dest = (char *)h + h->upper;
		memmove(dest, bi, nbytes);
		((BINTERNAL *)dest)->pgno = r->pgno;
		break;
	default:
		abort();
	}

	/* There are two keys on the page. */
	h->lower = BTDATAOFF + 2 * sizeof(indx_t);

	/* Unpin the root page, set to btree internal page. */
	h->flags &= ~P_TYPE;
	h->flags |= P_BINTERNAL;
	mpool_put(t->bt_mp, h, MPOOL_DIRTY);

	return (RET_SUCCESS);
}
/*
 * BT_PAGE -- Split a non-root page of a btree.
 *
 * Parameters:
 *	t:	tree
 *	h:	page to split
 *	lp:	pointer to left page pointer
 *	rp:	pointer to right page pointer
 *	skip:	pointer to index to leave open
 *	ilen:	insert length
 *
 * Returns:
 *	Pointer to page in which to insert or NULL on error.
 */
static PAGE *
bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
{
	PAGE *l, *r, *tp;
	pgno_t npg;

#ifdef STATISTICS
	++bt_split;
#endif
	/* Put the new right page for the split into place. */
	if ((r = __bt_new(t, &npg)) == NULL)
		return (NULL);
	r->pgno = npg;
	r->lower = BTDATAOFF;
	r->upper = t->bt_psize;
	r->nextpg = h->nextpg;
	r->prevpg = h->pgno;
	r->flags = h->flags & P_TYPE;

	/*
	 * If we're splitting the last page on a level because we're appending
	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
	 * sorted.  Adding an empty page on the side of the level is less work
	 * and can push the fill factor much higher than normal.  If we're
	 * wrong it's no big deal, we'll just do the split the right way next
	 * time.  It may look like it's equally easy to do a similar hack for
	 * reverse sorted data, that is, split the tree left, but it's not.
	 * Don't even try.
	 */
	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
#ifdef STATISTICS
		++bt_sortsplit;
#endif
		h->nextpg = r->pgno;
		r->lower = BTDATAOFF + sizeof(indx_t);
		*skip = 0;
		*lp = h;
		*rp = r;
		return (r);
	}

	/* Put the new left page for the split into place. */
	if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
		mpool_put(t->bt_mp, r, 0);
		return (NULL);
	}
	l->pgno = h->pgno;
	l->nextpg = r->pgno;
	l->prevpg = h->prevpg;
	l->lower = BTDATAOFF;
	l->upper = t->bt_psize;
	l->flags = h->flags & P_TYPE;

	/* Fix up the previous pointer of the page after the split page. */
	if (h->nextpg != P_INVALID) {
		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
			free(l);
			/* XXX mpool_free(t->bt_mp, r->pgno); */
			return (NULL);
		}
		tp->prevpg = r->pgno;
		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
	}

	/*
	 * Split right.  The key/data pairs aren't sorted in the btree page so
	 * it's simpler to copy the data from the split page onto two new pages
	 * instead of copying half the data to the right page and compacting
	 * the left page in place.  Since the left page can't change, we have
	 * to swap the original and the allocated left page after the split.
	 */
	tp = bt_psplit(t, h, l, r, skip, ilen);

	/* Move the new left page onto the old left page. */
	memmove(h, l, t->bt_psize);
	if (tp == l)
		tp = h;
	free(l);

	*lp = h;
	*rp = r;
	return (tp);
}
/*
 * __bt_first --
 *	Find the first entry.
 *
 * Parameters:
 *	t:	the tree
 *	key:	the key
 *	erval:	return EPG
 *	exactp:	pointer to exact match flag
 *
 * Returns:
 *	The first entry in the tree greater than or equal to key,
 *	or RET_SPECIAL if no such key exists.
 */
static int
__bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
{
	PAGE *h;
	EPG *ep, save;
	pgno_t pg;

	/*
	 * Find any matching record; __bt_search pins the page.
	 *
	 * If it's an exact match and duplicates are possible, walk backwards
	 * in the tree until we find the first one.  Otherwise, make sure it's
	 * a valid key (__bt_search may return an index just past the end of a
	 * page) and return it.
	 */
	if ((ep = __bt_search(t, key, exactp)) == NULL)
		return (0);
	if (*exactp) {
		if (F_ISSET(t, B_NODUPS)) {
			*erval = *ep;
			return (RET_SUCCESS);
		}

		/*
		 * Walk backwards, as long as the entry matches and there are
		 * keys left in the tree.  Save a copy of each match in case
		 * we go too far.
		 */
		save = *ep;
		h = ep->page;
		do {
			if (save.page->pgno != ep->page->pgno) {
				mpool_put(t->bt_mp, save.page, 0);
				save = *ep;
			} else
				save.index = ep->index;

			/*
			 * Don't unpin the page the last (or original) match
			 * was on, but make sure it's unpinned if an error
			 * occurs.
			 */
			if (ep->index == 0) {
				if (h->prevpg == P_INVALID)
					break;
				if (h->pgno != save.page->pgno)
					mpool_put(t->bt_mp, h, 0);
				if ((h = mpool_get(t->bt_mp,
				    h->prevpg, 0)) == NULL) {
					if (h->pgno == save.page->pgno)
						mpool_put(t->bt_mp,
						    save.page, 0);
					return (RET_ERROR);
				}
				ep->page = h;
				ep->index = NEXTINDEX(h);
			}
			--ep->index;
		} while (__bt_cmp(t, key, ep) == 0);

		/*
		 * Reach here with the last page that was looked at pinned,
		 * which may or may not be the same as the last (or original)
		 * match page.  If it's not useful, release it.
		 */
		if (h->pgno != save.page->pgno)
			mpool_put(t->bt_mp, h, 0);

		*erval = save;
		return (RET_SUCCESS);
	}

	/* If at the end of a page, find the next entry. */
	if (ep->index == NEXTINDEX(ep->page)) {
		h = ep->page;
		pg = h->nextpg;
		mpool_put(t->bt_mp, h, 0);
		if (pg == P_INVALID)
			return (RET_SPECIAL);
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			return (RET_ERROR);
		ep->index = 0;
		ep->page = h;
	}
	*erval = *ep;
	return (RET_SUCCESS);
}
/*
 * __bt_stkacq --
 *	Acquire a stack so we can delete a cursor entry.
 *
 * Parameters:
 *	t:	tree
 *	hp:	pointer to current, pinned PAGE pointer
 *	c:	pointer to the cursor
 *
 * Returns:
 *	0 on success, 1 on failure
 */
static int
__bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c)
{
	BINTERNAL *bi;
	EPG *e;
	EPGNO *parent;
	PAGE *h;
	indx_t idx;
	pgno_t pgno;
	recno_t nextpg, prevpg;
	int exact, level;

	/*
	 * Find the first occurrence of the key in the tree.  Toss the
	 * currently locked page so we don't hit an already-locked page.
	 */
	h = *hp;
	mpool_put(t->bt_mp, h, 0);
	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
		return (1);
	h = e->page;

	/* See if we got it in one shot. */
	if (h->pgno == c->pg.pgno)
		goto ret;

	/*
	 * Move right, looking for the page.  At each move we have to move
	 * up the stack until we don't have to move to the next page.  If
	 * we have to change pages at an internal level, we have to fix the
	 * stack back up.
	 */
	while (h->pgno != c->pg.pgno) {
		if ((nextpg = h->nextpg) == P_INVALID)
			break;
		mpool_put(t->bt_mp, h, 0);

		/* Move up the stack. */
		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
			/* Get the parent page. */
			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
				return (1);

			/* Move to the next index. */
			if (parent->index != NEXTINDEX(h) - 1) {
				idx = parent->index + 1;
				BT_PUSH(t, h->pgno, idx);
				break;
			}
			mpool_put(t->bt_mp, h, 0);
		}

		/* Restore the stack. */
		while (level--) {
			/* Push the next level down onto the stack. */
			bi = GETBINTERNAL(h, idx);
			pgno = bi->pgno;
			BT_PUSH(t, pgno, 0);

			/* Lose the currently pinned page. */
			mpool_put(t->bt_mp, h, 0);

			/* Get the next level down. */
			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
				return (1);
			idx = 0;
		}
		mpool_put(t->bt_mp, h, 0);
		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
			return (1);
	}

	if (h->pgno == c->pg.pgno)
		goto ret;

	/* Reacquire the original stack. */
	mpool_put(t->bt_mp, h, 0);
	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
		return (1);
	h = e->page;

	/*
	 * Move left, looking for the page.  At each move we have to move
	 * up the stack until we don't have to change pages to move to the
	 * next page.  If we have to change pages at an internal level, we
	 * have to fix the stack back up.
	 */
	while (h->pgno != c->pg.pgno) {
		if ((prevpg = h->prevpg) == P_INVALID)
			break;
		mpool_put(t->bt_mp, h, 0);

		/* Move up the stack. */
		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
			/* Get the parent page. */
			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
				return (1);

			/* Move to the next index. */
			if (parent->index != 0) {
				idx = parent->index - 1;
				BT_PUSH(t, h->pgno, idx);
				break;
			}
			mpool_put(t->bt_mp, h, 0);
		}

		/* Restore the stack. */
		while (level--) {
			/* Push the next level down onto the stack. */
			bi = GETBINTERNAL(h, idx);
			pgno = bi->pgno;

			/* Lose the currently pinned page. */
			mpool_put(t->bt_mp, h, 0);

			/* Get the next level down. */
			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
				return (1);

			idx = NEXTINDEX(h) - 1;
			BT_PUSH(t, pgno, idx);
		}
		mpool_put(t->bt_mp, h, 0);
		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
			return (1);
	}

ret:	mpool_put(t->bt_mp, h, 0);
	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
}
/*
 * __REC_SEQ -- Recno sequential scan interface.
 *
 * Parameters:
 *	dbp:	pointer to access method
 *	key:	key for positioning and return value
 *	data:	data return value
 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
 */
int
__rec_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
{
	BTREE *t;
	EPG *e;
	recno_t nrec;
	int status;

	t = dbp->internal;

	/* Toss any page pinned across calls. */
	if (t->bt_pinned != NULL) {
		mpool_put(t->bt_mp, t->bt_pinned, 0);
		t->bt_pinned = NULL;
	}

	switch (flags) {
	case R_CURSOR:
		if ((nrec = *(recno_t *)key->data) == 0)
			goto einval;
		break;
	case R_NEXT:
		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
			nrec = t->bt_cursor.rcursor + 1;
			break;
		}
		/* FALLTHROUGH */
	case R_FIRST:
		nrec = 1;
		break;
	case R_PREV:
		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
			if ((nrec = t->bt_cursor.rcursor - 1) == 0)
				return (RET_SPECIAL);
			break;
		}
		/* FALLTHROUGH */
	case R_LAST:
		if (!F_ISSET(t, R_EOF | R_INMEM) &&
		    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
			return (RET_ERROR);
		nrec = t->bt_nrecs;
		break;
	default:
einval:		errno = EINVAL;
		return (RET_ERROR);
	}

	if (t->bt_nrecs == 0 || nrec > t->bt_nrecs) {
		if (!F_ISSET(t, R_EOF | R_INMEM) &&
		    (status = t->bt_irec(t, nrec)) != RET_SUCCESS)
			return (status);
		if (t->bt_nrecs == 0 || nrec > t->bt_nrecs)
			return (RET_SPECIAL);
	}

	if ((e = __rec_search(t, nrec - 1, SEARCH)) == NULL)
		return (RET_ERROR);

	F_SET(&t->bt_cursor, CURS_INIT);
	t->bt_cursor.rcursor = nrec;

	status = __rec_ret(t, e, nrec, key, data);
	if (F_ISSET(t, B_DB_LOCK))
		mpool_put(t->bt_mp, e->page, 0);
	else
		t->bt_pinned = e->page;
	return (status);
}
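
/*
 * Example (an illustrative sketch, not part of the library): walking every
 * record through the public seq method that dispatches to __rec_seq.  The
 * method returns 0 on success, 1 when the scan is exhausted, -1 on error.
 */
#include <db.h>
#include <stdio.h>

static void
dump_records(DB *db)
{
	DBT key, data;
	int st;

	for (st = (db->seq)(db, &key, &data, R_FIRST); st == 0;
	    st = (db->seq)(db, &key, &data, R_NEXT))
		printf("%lu: %.*s\n", (unsigned long)*(recno_t *)key.data,
		    (int)data.size, (char *)data.data);
}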
/*
 * __bt_bdelete --
 *	Delete all key/data pairs matching the specified key.
 *
 * Parameters:
 *	t:	tree
 *	key:	key to delete
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is not found.
 */
static int
__bt_bdelete(BTREE *t, const DBT *key)
{
	EPG *e;
	PAGE *h;
	int deleted, exact, redo;

	deleted = 0;

	/* Find any matching record; __bt_search pins the page. */
loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
		return (deleted ? RET_SUCCESS : RET_ERROR);
	if (!exact) {
		mpool_put(t->bt_mp, e->page, 0);
		return (deleted ? RET_SUCCESS : RET_SPECIAL);
	}

	/*
	 * Delete forward, then delete backward, from the found key.  If
	 * there are duplicates and we reach either side of the page, do
	 * the key search again, so that we get them all.
	 */
	redo = 0;
	h = e->page;
	do {
		if (__bt_dleaf(t, key, h, e->index)) {
			mpool_put(t->bt_mp, h, 0);
			return (RET_ERROR);
		}
		if (F_ISSET(t, B_NODUPS)) {
			if (NEXTINDEX(h) == 0) {
				if (__bt_pdelete(t, h))
					return (RET_ERROR);
			} else
				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
			return (RET_SUCCESS);
		}
		deleted = 1;
	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);

	/* Check for right-hand edge of the page. */
	if (e->index == NEXTINDEX(h))
		redo = 1;

	/* Delete from the key to the beginning of the page. */
	while (e->index-- > 0) {
		if (__bt_cmp(t, key, e) != 0)
			break;
		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
			mpool_put(t->bt_mp, h, 0);
			return (RET_ERROR);
		}
		if (e->index == 0)
			redo = 1;
	}

	/* Check for an empty page. */
	if (NEXTINDEX(h) == 0) {
		if (__bt_pdelete(t, h))
			return (RET_ERROR);
		goto loop;
	}

	/* Put the page. */
	mpool_put(t->bt_mp, h, MPOOL_DIRTY);

	if (redo)
		goto loop;

	return (RET_SUCCESS);
}
/*
 * __bt_seqadv --
 *	Advance the sequential scan.
 *
 * Parameters:
 *	t:	tree
 *	flags:	R_NEXT, R_PREV
 *
 * Side effects:
 *	Pins the page the new key/data record is on.
 *
 * Returns:
 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
 */
static int
__bt_seqadv(BTREE *t, EPG *ep, int flags)
{
	CURSOR *c;
	PAGE *h;
	indx_t idx;
	pgno_t pg;
	int exact;

	/*
	 * There are a couple of states that we can be in.  The cursor has
	 * been initialized by the time we get here, but that's all we know.
	 */
	c = &t->bt_cursor;

	/*
	 * The cursor was deleted where there weren't any duplicate records,
	 * so the key was saved.  Find out where that key would go in the
	 * current tree.  It doesn't matter if the returned key is an exact
	 * match or not -- if it's an exact match, the record was added after
	 * the delete so we can just return it.  If not, as long as there's
	 * a record there, return it.
	 */
	if (F_ISSET(c, CURS_ACQUIRE))
		return (__bt_first(t, &c->key, ep, &exact));

	/* Get the page referenced by the cursor. */
	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
		return (RET_ERROR);

	/*
	 * Find the next/previous record in the tree and point the cursor at
	 * it.  The cursor may not be moved until a new key has been found.
	 */
	switch (flags) {
	case R_NEXT:			/* Next record. */
		/*
		 * The cursor was deleted in duplicate records, and moved
		 * forward to a record that has yet to be returned.  Clear
		 * that flag, and return the record.
		 */
		if (F_ISSET(c, CURS_AFTER))
			goto usecurrent;
		idx = c->pg.index;
		if (++idx == NEXTINDEX(h)) {
			pg = h->nextpg;
			mpool_put(t->bt_mp, h, 0);
			if (pg == P_INVALID)
				return (RET_SPECIAL);
			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
				return (RET_ERROR);
			idx = 0;
		}
		break;
	case R_PREV:			/* Previous record. */
		/*
		 * The cursor was deleted in duplicate records, and moved
		 * backward to a record that has yet to be returned.  Clear
		 * that flag, and return the record.
		 */
		if (F_ISSET(c, CURS_BEFORE)) {
usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
			ep->page = h;
			ep->index = c->pg.index;
			return (RET_SUCCESS);
		}
		idx = c->pg.index;
		if (idx == 0) {
			pg = h->prevpg;
			mpool_put(t->bt_mp, h, 0);
			if (pg == P_INVALID)
				return (RET_SPECIAL);
			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
				return (RET_ERROR);
			idx = NEXTINDEX(h) - 1;
		} else
			--idx;
		break;
	}

	ep->page = h;
	ep->index = idx;
	return (RET_SUCCESS);
}
/*
 * BT_STAT -- Gather/print the tree statistics
 *
 * Parameters:
 *	dbp:	pointer to the DB
 */
void
__bt_stat(DB *dbp)
{
	extern unsigned long bt_cache_hit, bt_cache_miss, bt_pfxsaved,
	    bt_rootsplit;
	extern unsigned long bt_sortsplit, bt_split;
	BTREE *t;
	PAGE *h;
	pgno_t i, pcont, pinternal, pleaf;
	unsigned long ifree, lfree, nkeys;
	int levels;

	t = dbp->internal;
	pcont = pinternal = pleaf = 0;
	nkeys = ifree = lfree = 0;
	for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
		switch (h->flags & P_TYPE) {
		case P_BINTERNAL:
		case P_RINTERNAL:
			++pinternal;
			ifree += h->upper - h->lower;
			break;
		case P_BLEAF:
		case P_RLEAF:
			++pleaf;
			lfree += h->upper - h->lower;
			nkeys += NEXTINDEX(h);
			break;
		case P_OVERFLOW:
			++pcont;
			break;
		}
		(void)mpool_put(t->bt_mp, h, 0);
	}

	/* Count the levels of the tree. */
	for (i = P_ROOT, levels = 0;; ++levels) {
		h = mpool_get(t->bt_mp, i, 0);
		if (h->flags & (P_BLEAF|P_RLEAF)) {
			if (levels == 0)
				levels = 1;
			(void)mpool_put(t->bt_mp, h, 0);
			break;
		}
		i = F_ISSET(t, R_RECNO) ?
		    GETRINTERNAL(h, 0)->pgno :
		    GETBINTERNAL(h, 0)->pgno;
		(void)mpool_put(t->bt_mp, h, 0);
	}

	(void)fprintf(stderr, "%d level%s with %ld keys",
	    levels, levels == 1 ? "" : "s", nkeys);
	if (F_ISSET(t, R_RECNO))
		(void)fprintf(stderr, " (%ld header count)",
		    (long)t->bt_nrecs);
	(void)fprintf(stderr,
	    "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n",
	    (long)pinternal + pleaf + pcont, (long)pleaf, (long)pinternal,
	    (long)pcont);
	(void)fprintf(stderr, "%ld cache hits, %ld cache misses\n",
	    bt_cache_hit, bt_cache_miss);
	(void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n",
	    bt_split, bt_rootsplit, bt_sortsplit);
	pleaf *= t->bt_psize - BTDATAOFF;
	if (pleaf)
		(void)fprintf(stderr,
		    "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n",
		    ((double)(pleaf - lfree) / pleaf) * 100,
		    pleaf - lfree, lfree);
	pinternal *= t->bt_psize - BTDATAOFF;
	if (pinternal)
		(void)fprintf(stderr,
		    "%.0f%% internal fill (%ld bytes used, %ld bytes free)\n",
		    ((double)(pinternal - ifree) / pinternal) * 100,
		    pinternal - ifree, ifree);
	if (bt_pfxsaved)
		(void)fprintf(stderr, "prefix checking removed %lu bytes.\n",
		    bt_pfxsaved);
}