/* * BT_RROOT -- Fix up the recno root page after it has been split. * * Parameters: * t: tree * h: root page * l: left page * r: right page * * Returns: * RET_ERROR, RET_SUCCESS */ static int bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) { char *dest; uint32_t sz; size_t temp; temp = t->bt_psize - NRINTERNAL; _DBFIT(temp, uint32_t); sz = (uint32_t)temp; /* Insert the left and right keys, set the header information. */ _DBFIT(sz, indx_t); h->linp[0] = h->upper = (indx_t)sz; dest = (char *)(void *)h + h->upper; WR_RINTERNAL(dest, l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); h->linp[1] = h->upper -= NRINTERNAL; dest = (char *)(void *)h + h->upper; WR_RINTERNAL(dest, r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); h->lower = BTDATAOFF + 2 * sizeof(indx_t); /* Unpin the root page, set to recno internal page. */ h->flags &= ~P_TYPE; h->flags |= P_RINTERNAL; mpool_put(t->bt_mp, h, MPOOL_DIRTY); return (RET_SUCCESS); }
/* * __OVFL_PUT -- Store an overflow key/data item. * * Parameters: * t: tree * data: DBT to store * pgno: storage page number * * Returns: * RET_ERROR, RET_SUCCESS */ int __ovfl_put(BTREE *t, const DBT *dbt, pgno_t *pg) { PAGE *h, *last; void *p; pgno_t npg; uint32_t sz, nb, plen; size_t temp; /* * Allocate pages and copy the key/data record into them. Store the * number of the first page in the chain. */ temp = t->bt_psize - BTDATAOFF; _DBFIT(temp, uint32_t); plen = (uint32_t)temp; last = NULL; p = dbt->data; temp = dbt->size; _DBFIT(temp, uint32_t); sz = temp; for (;; p = (char *)p + plen, last = h) { if ((h = __bt_new(t, &npg)) == NULL) return (RET_ERROR); h->pgno = npg; h->nextpg = h->prevpg = P_INVALID; h->flags = P_OVERFLOW; h->lower = h->upper = 0; nb = MIN(sz, plen); (void)memmove((char *)(void *)h + BTDATAOFF, p, (size_t)nb); if (last) { last->nextpg = h->pgno; mpool_put(t->bt_mp, last, MPOOL_DIRTY); } else *pg = h->pgno; if ((sz -= nb) == 0) { mpool_put(t->bt_mp, h, MPOOL_DIRTY); break; } } return (RET_SUCCESS); }
/* * __OVFL_GET -- Get an overflow key/data item. * * Parameters: * t: tree * p: pointer to { pgno_t, uint32_t } * buf: storage address * bufsz: storage size * * Returns: * RET_ERROR, RET_SUCCESS */ int __ovfl_get(BTREE *t, void *p, size_t *ssz, void **buf, size_t *bufsz) { PAGE *h; pgno_t pg; uint32_t sz, nb, plen; size_t temp; memmove(&pg, p, sizeof(pg)); memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t)); *ssz = sz; #ifdef DEBUG if (pg == P_INVALID || sz == 0) abort(); #endif /* Make the buffer bigger as necessary. */ if (*bufsz < sz) { void *nbuf = realloc(*buf, sz); if (nbuf == NULL) return (RET_ERROR); *buf = nbuf; *bufsz = sz; } /* * Step through the linked list of pages, copying the data on each one * into the buffer. Never copy more than the data's length. */ temp = t->bt_psize - BTDATAOFF; _DBFIT(temp, uint32_t); plen = (uint32_t)temp; for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (RET_ERROR); nb = MIN(sz, plen); memmove(p, (char *)(void *)h + BTDATAOFF, nb); mpool_put(t->bt_mp, h, 0); if ((sz -= nb) == 0) break; } return (RET_SUCCESS); }
/* * This calls alloc_segs which may run out of memory. Alloc_segs will destroy * the table and set errno, so we just pass the error information along. * * Returns 0 on No Error */ static int init_htab(HTAB *hashp, size_t nelem) { int nbuckets; uint32_t nsegs; int l2; /* * Divide number of elements by the fill factor and determine a * desired number of buckets. Allocate space for the next greater * power of two number of buckets. */ nelem = (nelem - 1) / hashp->FFACTOR + 1; _DBFIT(nelem, uint32_t); l2 = __log2(MAX((uint32_t)nelem, 2)); nbuckets = 1 << l2; hashp->SPARES[l2] = l2 + 1; hashp->SPARES[l2 + 1] = l2 + 1; hashp->OVFL_POINT = l2; hashp->LAST_FREED = 2; /* First bitmap page is at: splitpoint l2 page offset 1 */ if (__ibitmap(hashp, (int)OADDR_OF(l2, 1), l2 + 1, 0)) return (-1); hashp->MAX_BUCKET = hashp->LOW_MASK = nbuckets - 1; hashp->HIGH_MASK = (nbuckets << 1) - 1; /* LINTED constant in conditional context */ hashp->HDRPAGES = ((MAX(sizeof(HASHHDR), MINHDRSIZE) - 1) >> hashp->BSHIFT) + 1; nsegs = (nbuckets - 1) / hashp->SGSIZE + 1; nsegs = 1 << __log2(nsegs); if (nsegs > hashp->DSIZE) hashp->DSIZE = nsegs; return (alloc_segs(hashp, (int)nsegs)); }
/* * __OVFL_DELETE -- Delete an overflow chain. * * Parameters: * t: tree * p: pointer to { pgno_t, uint32_t } * * Returns: * RET_ERROR, RET_SUCCESS */ int __ovfl_delete(BTREE *t, void *p) { PAGE *h; pgno_t pg; uint32_t sz, plen; size_t temp; (void)memmove(&pg, p, sizeof(pgno_t)); (void)memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(uint32_t)); #ifdef DEBUG if (pg == P_INVALID || sz == 0) abort(); #endif if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (RET_ERROR); /* Don't delete chains used by internal pages. */ if (h->flags & P_PRESERVE) { mpool_put(t->bt_mp, h, 0); return (RET_SUCCESS); } /* Step through the chain, calling the free routine for each page. */ temp = t->bt_psize - BTDATAOFF; _DBFIT(temp, uint32_t); plen = (uint32_t)temp; for (;; sz -= plen) { pg = h->nextpg; __bt_free(t, h); if (sz <= plen) break; if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (RET_ERROR); } return (RET_SUCCESS); }
/* * __BT_OPEN -- Open a btree. * * Creates and fills a DB struct, and calls the routine that actually * opens the btree. * * Parameters: * fname: filename (NULL for in-memory trees) * flags: open flag bits * mode: open permission bits * b: BTREEINFO pointer * * Returns: * NULL on failure, pointer to DB on success. * */ DB * __bt_open(const char *fname, int flags, mode_t mode, const BTREEINFO *openinfo, int dflags) { struct stat sb; BTMETA m; BTREE *t; BTREEINFO b; DB *dbp; pgno_t ncache; ssize_t nr; size_t temp; int machine_lorder; t = NULL; /* * Intention is to make sure all of the user's selections are okay * here and then use them without checking. Can't be complete, since * we don't know the right page size, lorder or flags until the backing * file is opened. Also, the file's page size can cause the cachesize * to change. */ machine_lorder = byteorder(); if (openinfo) { b = *openinfo; /* Flags: R_DUP. */ if (b.flags & ~(R_DUP)) goto einval; /* * Page size must be indx_t aligned and >= MINPSIZE. Default * page size is set farther on, based on the underlying file * transfer size. */ if (b.psize && (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || b.psize & (sizeof(indx_t) - 1))) goto einval; /* Minimum number of keys per page; absolute minimum is 2. */ if (b.minkeypage) { if (b.minkeypage < 2) goto einval; } else b.minkeypage = DEFMINKEYPAGE; /* If no comparison, use default comparison and prefix. */ if (b.compare == NULL) { b.compare = __bt_defcmp; if (b.prefix == NULL) b.prefix = __bt_defpfx; } if (b.lorder == 0) b.lorder = machine_lorder; } else { b.compare = __bt_defcmp; b.cachesize = 0; b.flags = 0; b.lorder = machine_lorder; b.minkeypage = DEFMINKEYPAGE; b.prefix = __bt_defpfx; b.psize = 0; } /* Check for the ubiquitous PDP-11. */ if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) goto einval; /* Allocate and initialize DB and BTREE structures. */ if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) goto err; memset(t, 0, sizeof(BTREE)); t->bt_fd = -1; /* Don't close unopened fd on error. */ t->bt_lorder = b.lorder; t->bt_order = NOT; t->bt_cmp = b.compare; t->bt_pfx = b.prefix; t->bt_rfd = -1; if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) goto err; memset(t->bt_dbp, 0, sizeof(DB)); if (t->bt_lorder != machine_lorder) F_SET(t, B_NEEDSWAP); dbp->type = DB_BTREE; dbp->internal = t; dbp->close = __bt_close; dbp->del = __bt_delete; dbp->fd = __bt_fd; dbp->get = __bt_get; dbp->put = __bt_put; dbp->seq = __bt_seq; dbp->sync = __bt_sync; /* * If no file name was supplied, this is an in-memory btree and we * open a backing temporary file. Otherwise, it's a disk-based tree. */ if (fname) { switch (flags & O_ACCMODE) { case O_RDONLY: F_SET(t, B_RDONLY); break; case O_RDWR: break; case O_WRONLY: default: goto einval; } if ((t->bt_fd = open(fname, flags, mode)) == -1) goto err; if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1) goto err; } else { if ((flags & O_ACCMODE) != O_RDWR) goto einval; if ((t->bt_fd = tmp()) == -1) goto err; F_SET(t, B_INMEM); } if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1) goto err; if (fstat(t->bt_fd, &sb)) goto err; if (sb.st_size) { if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) goto err; if (nr != sizeof(BTMETA)) goto eftype; /* * Read in the meta-data. This can change the notion of what * the lorder, page size and flags are, and, when the page size * changes, the cachesize value can change too. If the user * specified the wrong byte order for an existing database, we * don't bother to return an error, we just clear the NEEDSWAP * bit. */ if (m.magic == BTREEMAGIC) F_CLR(t, B_NEEDSWAP); else { F_SET(t, B_NEEDSWAP); M_32_SWAP(m.magic); M_32_SWAP(m.version); M_32_SWAP(m.psize); M_32_SWAP(m.free); M_32_SWAP(m.nrecs); M_32_SWAP(m.flags); } if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) goto eftype; if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || m.psize & (sizeof(indx_t) - 1)) goto eftype; if (m.flags & ~SAVEMETA) goto eftype; b.psize = m.psize; F_SET(t, m.flags); t->bt_free = m.free; t->bt_nrecs = m.nrecs; } else { /* * Set the page size to the best value for I/O to this file. * Don't overflow the page offset type. */ if (b.psize == 0) { b.psize = sb.st_blksize; if (b.psize < MINPSIZE) b.psize = MINPSIZE; if (b.psize > MAX_PAGE_OFFSET + 1) b.psize = MAX_PAGE_OFFSET + 1; } /* Set flag if duplicates permitted. */ if (!(b.flags & R_DUP)) F_SET(t, B_NODUPS); t->bt_free = P_INVALID; t->bt_nrecs = 0; F_SET(t, B_METADIRTY); } t->bt_psize = b.psize; /* Set the cache size; must be a multiple of the page size. */ if (b.cachesize && b.cachesize & (b.psize - 1)) b.cachesize += (~b.cachesize & (b.psize - 1)) + 1; if (b.cachesize < b.psize * MINCACHE) b.cachesize = b.psize * MINCACHE; /* Calculate number of pages to cache. */ ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; /* * The btree data structure requires that at least two keys can fit on * a page, but other than that there's no fixed requirement. The user * specified a minimum number per page, and we translated that into the * number of bytes a key/data pair can use before being placed on an * overflow page. This calculation includes the page header, the size * of the index referencing the leaf item and the size of the leaf item * structure. Also, don't let the user specify a minkeypage such that * a key/data pair won't fit even if both key and data are on overflow * pages. */ temp = (t->bt_psize - BTDATAOFF) / b.minkeypage - (sizeof(indx_t) + NBLEAFDBT(0, 0)); _DBFIT(temp, indx_t); t->bt_ovflsize = (indx_t)temp; if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) t->bt_ovflsize = NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); /* Initialize the buffer pool. */ if ((t->bt_mp = mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) goto err; if (!F_ISSET(t, B_INMEM)) mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); /* Create a root page if new tree. */ if (nroot(t) == RET_ERROR) goto err; /* Global flags. */ if (dflags & DB_LOCK) F_SET(t, B_DB_LOCK); if (dflags & DB_SHMEM) F_SET(t, B_DB_SHMEM); if (dflags & DB_TXN) F_SET(t, B_DB_TXN); return (dbp); einval: errno = EINVAL; goto err; eftype: errno = EFTYPE; goto err; err: if (t) { if (t->bt_dbp) free(t->bt_dbp); if (t->bt_fd != -1) (void)close(t->bt_fd); free(t); } return (NULL); }
/* * __REC_IPUT -- Add a recno item to the tree. * * Parameters: * t: tree * nrec: record number * data: data * * Returns: * RET_ERROR, RET_SUCCESS */ int __rec_iput(BTREE *t, recno_t nrec, const DBT *data, u_int flags) { DBT tdata; EPG *e; PAGE *h; indx_t idx, nxtindex; pgno_t pg; uint32_t nbytes; int dflags, status; char *dest, db[NOVFLSIZE]; /* * If the data won't fit on a page, store it on indirect pages. * * XXX * If the insert fails later on, these pages aren't recovered. */ if (data->size > t->bt_ovflsize) { if (__ovfl_put(t, data, &pg) == RET_ERROR) return (RET_ERROR); tdata.data = db; tdata.size = NOVFLSIZE; *(pgno_t *)(void *)db = pg; _DBFIT(data->size, uint32_t); *(uint32_t *)(void *)(db + sizeof(pgno_t)) = (uint32_t)data->size; dflags = P_BIGDATA; data = &tdata; } else dflags = 0; /* __rec_search pins the returned page. */ if ((e = __rec_search(t, nrec, nrec > t->bt_nrecs || flags == R_IAFTER || flags == R_IBEFORE ? SINSERT : SEARCH)) == NULL) return (RET_ERROR); h = e->page; idx = e->index; /* * Add the specified key/data pair to the tree. The R_IAFTER and * R_IBEFORE flags insert the key after/before the specified key. * * Pages are split as required. */ switch (flags) { case R_IAFTER: ++idx; break; case R_IBEFORE: break; default: if (nrec < t->bt_nrecs && __rec_dleaf(t, h, (uint32_t)idx) == RET_ERROR) { mpool_put(t->bt_mp, h, 0); return (RET_ERROR); } break; } /* * If not enough room, split the page. The split code will insert * the key and data and unpin the current page. If inserting into * the offset array, shift the pointers up. */ nbytes = NRLEAFDBT(data->size); if ((uint32_t) (h->upper - h->lower) < nbytes + sizeof(indx_t)) { status = __bt_split(t, h, NULL, data, dflags, nbytes, (uint32_t)idx); if (status == RET_SUCCESS) ++t->bt_nrecs; return (status); } if (idx < (nxtindex = NEXTINDEX(h))) memmove(h->linp + idx + 1, h->linp + idx, (nxtindex - idx) * sizeof(indx_t)); h->lower += sizeof(indx_t); h->linp[idx] = h->upper -= nbytes; dest = (char *)(void *)h + h->upper; WR_RLEAF(dest, data, dflags); ++t->bt_nrecs; F_SET(t, B_MODIFIED); mpool_put(t->bt_mp, h, MPOOL_DIRTY); return (RET_SUCCESS); }
/* * Big_insert * * You need to do an insert and the key/data pair is too big * * Returns: * 0 ==> OK *-1 ==> ERROR */ int __big_insert(HTAB *hashp, BUFHEAD *bufp, const DBT *key, const DBT *val) { uint16_t *p, n; size_t key_size, val_size; uint16_t space, move_bytes, off; char *cp, *key_data, *val_data; size_t temp; cp = bufp->page; /* Character pointer of p. */ p = (uint16_t *)(void *)cp; key_data = (char *)key->data; _DBFIT(key->size, int); key_size = key->size; val_data = (char *)val->data; _DBFIT(val->size, int); val_size = val->size; /* First move the Key */ temp = FREESPACE(p) - BIGOVERHEAD; _DBFIT(temp, uint16_t); space = (uint16_t)temp; while (key_size) { move_bytes = MIN(space, key_size); off = OFFSET(p) - move_bytes; memmove(cp + off, key_data, (size_t)move_bytes); key_size -= move_bytes; key_data += move_bytes; n = p[0]; p[++n] = off; p[0] = ++n; temp = off - PAGE_META(n); _DBFIT(temp, uint16_t); FREESPACE(p) = (uint16_t)temp; OFFSET(p) = off; p[n] = PARTIAL_KEY; bufp = __add_ovflpage(hashp, bufp); if (!bufp) return (-1); n = p[0]; if (!key_size) { space = FREESPACE(p); if (space) { move_bytes = MIN(space, val_size); /* * If the data would fit exactly in the * remaining space, we must overflow it to the * next page; otherwise the invariant that the * data must end on a page with FREESPACE * non-zero would fail. */ if (space == val_size && val_size == val->size) goto toolarge; off = OFFSET(p) - move_bytes; memmove(cp + off, val_data, (size_t)move_bytes); val_data += move_bytes; val_size -= move_bytes; p[n] = off; p[n - 2] = FULL_KEY_DATA; FREESPACE(p) = FREESPACE(p) - move_bytes; OFFSET(p) = off; } else { toolarge: p[n - 2] = FULL_KEY; } } p = (uint16_t *)(void *)bufp->page; cp = bufp->page; bufp->flags |= BUF_MOD; temp = FREESPACE(p) - BIGOVERHEAD; _DBFIT(temp, uint16_t); space = (uint16_t)temp; } /* Now move the data */ temp = FREESPACE(p) - BIGOVERHEAD; _DBFIT(temp, uint16_t); space = (uint16_t)temp; while (val_size) { move_bytes = MIN(space, val_size); /* * Here's the hack to make sure that if the data ends on the * same page as the key ends, FREESPACE is at least one. */ if (space == val_size && val_size == val->size) move_bytes--; off = OFFSET(p) - move_bytes; memmove(cp + off, val_data, (size_t)move_bytes); val_size -= move_bytes; val_data += move_bytes; n = p[0]; p[++n] = off; p[0] = ++n; temp = off - PAGE_META(n); _DBFIT(temp, uint16_t); FREESPACE(p) = (uint16_t)temp; OFFSET(p) = off; if (val_size) { p[n] = FULL_KEY; bufp = __add_ovflpage(hashp, bufp); if (!bufp) return (-1); cp = bufp->page; p = (uint16_t *)(void *)cp; } else p[n] = FULL_KEY_DATA; bufp->flags |= BUF_MOD; temp = FREESPACE(p) - BIGOVERHEAD; _DBFIT(temp, uint16_t); space = (uint16_t)temp; } return (0); }
/* * Returns: * 0 => OK * -1 => error */ int __big_split( HTAB *hashp, BUFHEAD *op, /* Pointer to where to put keys that go in old bucket */ BUFHEAD *np, /* Pointer to new bucket page */ /* Pointer to first page containing the big key/data */ BUFHEAD *big_keyp, int addr, /* Address of big_keyp */ uint32_t obucket,/* Old Bucket */ SPLIT_RETURN *ret ) { BUFHEAD *tmpp; uint16_t *tp; BUFHEAD *bp; DBT key, val; uint32_t change; uint16_t free_space, n, off; size_t temp; bp = big_keyp; /* Now figure out where the big key/data goes */ if (__big_keydata(hashp, big_keyp, &key, &val, 0)) return (-1); change = (__call_hash(hashp, key.data, (int)key.size) != obucket); if ((ret->next_addr = __find_last_page(hashp, &big_keyp)) != 0) { if (!(ret->nextp = __get_buf(hashp, (uint32_t)ret->next_addr, big_keyp, 0))) return (-1); } else ret->nextp = NULL; /* Now make one of np/op point to the big key/data pair */ _DIAGASSERT(np->ovfl == NULL); if (change) tmpp = np; else tmpp = op; tmpp->flags |= BUF_MOD; #ifdef DEBUG1 (void)fprintf(stderr, "BIG_SPLIT: %d->ovfl was %d is now %d\n", tmpp->addr, (tmpp->ovfl ? tmpp->ovfl->addr : 0), (bp ? bp->addr : 0)); #endif tmpp->ovfl = bp; /* one of op/np point to big_keyp */ tp = (uint16_t *)(void *)tmpp->page; _DIAGASSERT(FREESPACE(tp) >= OVFLSIZE); n = tp[0]; off = OFFSET(tp); free_space = FREESPACE(tp); tp[++n] = (uint16_t)addr; tp[++n] = OVFLPAGE; tp[0] = n; OFFSET(tp) = off; temp = free_space - OVFLSIZE; _DBFIT(temp, uint16_t); FREESPACE(tp) = (uint16_t)temp; /* * Finally, set the new and old return values. BIG_KEYP contains a * pointer to the last page of the big key_data pair. Make sure that * big_keyp has no following page (2 elements) or create an empty * following page. */ ret->newp = np; ret->oldp = op; tp = (uint16_t *)(void *)big_keyp->page; big_keyp->flags |= BUF_MOD; if (tp[0] > 2) { /* * There may be either one or two offsets on this page. If * there is one, then the overflow page is linked on normally * and tp[4] is OVFLPAGE. If there are two, tp[4] contains * the second offset and needs to get stuffed in after the * next overflow page is added. */ n = tp[4]; free_space = FREESPACE(tp); off = OFFSET(tp); tp[0] -= 2; temp = free_space + OVFLSIZE; _DBFIT(temp, uint16_t); FREESPACE(tp) = (uint16_t)temp; OFFSET(tp) = off; tmpp = __add_ovflpage(hashp, big_keyp); if (!tmpp) return (-1); tp[4] = n; } else tmpp = big_keyp; if (change) ret->newp = tmpp; else ret->oldp = tmpp; return (0); }
/* * Called when bufp's page contains a partial key (index should be 1) * * All pages in the big key/data pair except bufp are freed. We cannot * free bufp because the page pointing to it is lost and we can't get rid * of its pointer. * * Returns: * 0 => OK *-1 => ERROR */ int __big_delete(HTAB *hashp, BUFHEAD *bufp) { BUFHEAD *last_bfp, *rbufp; uint16_t *bp, pageno; int key_done, n; size_t temp; rbufp = bufp; last_bfp = NULL; bp = (uint16_t *)(void *)bufp->page; pageno = 0; key_done = 0; while (!key_done || (bp[2] != FULL_KEY_DATA)) { if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA) key_done = 1; /* * If there is freespace left on a FULL_KEY_DATA page, then * the data is short and fits entirely on this page, and this * is the last page. */ if (bp[2] == FULL_KEY_DATA && FREESPACE(bp)) break; pageno = bp[bp[0] - 1]; rbufp->flags |= BUF_MOD; rbufp = __get_buf(hashp, (uint32_t)pageno, rbufp, 0); if (last_bfp) __free_ovflpage(hashp, last_bfp); last_bfp = rbufp; if (!rbufp) return (-1); /* Error. */ bp = (uint16_t *)(void *)rbufp->page; } /* * If we get here then rbufp points to the last page of the big * key/data pair. Bufp points to the first one -- it should now be * empty pointing to the next page after this pair. Can't free it * because we don't have the page pointing to it. */ /* This is information from the last page of the pair. */ n = bp[0]; pageno = bp[n - 1]; /* Now, bp is the first page of the pair. */ bp = (uint16_t *)(void *)bufp->page; if (n > 2) { /* There is an overflow page. */ bp[1] = pageno; bp[2] = OVFLPAGE; bufp->ovfl = rbufp->ovfl; } else /* This is the last page. */ bufp->ovfl = NULL; n -= 2; bp[0] = n; temp = hashp->BSIZE - PAGE_META(n); _DBFIT(temp, uint16_t); FREESPACE(bp) = (uint16_t)temp; OFFSET(bp) = hashp->BSIZE; bufp->flags |= BUF_MOD; if (rbufp) __free_ovflpage(hashp, rbufp); if (last_bfp && last_bfp != rbufp) __free_ovflpage(hashp, last_bfp); hashp->NKEYS--; return (0); }
/* * __BT_SPLIT -- Split the tree. * * Parameters: * t: tree * sp: page to split * key: key to insert * data: data to insert * flags: BIGKEY/BIGDATA flags * ilen: insert length * skip: index to leave open * * Returns: * RET_ERROR, RET_SUCCESS */ int __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, size_t ilen, uint32_t argskip) { BINTERNAL *bi = NULL; /* pacify gcc */ BLEAF *bl = NULL, *tbl; /* pacify gcc */ DBT a, b; EPGNO *parent; PAGE *h, *l, *r, *lchild, *rchild; indx_t nxtindex; uint16_t skip; uint32_t n, nbytes, nksize = 0; /* pacify gcc */ int parentsplit; char *dest; /* * Split the page into two pages, l and r. The split routines return * a pointer to the page into which the key should be inserted and with * skip set to the offset which should be used. Additionally, l and r * are pinned. */ skip = argskip; h = sp->pgno == P_ROOT ? bt_root(t, sp, &l, &r, &skip, ilen) : bt_page(t, sp, &l, &r, &skip, ilen); if (h == NULL) return (RET_ERROR); /* * Insert the new key/data pair into the leaf page. (Key inserts * always cause a leaf page to split first.) */ _DBFIT(ilen, indx_t); h->upper -= (indx_t)ilen; h->linp[skip] = h->upper; dest = (char *)(void *)h + h->upper; if (F_ISSET(t, R_RECNO)) WR_RLEAF(dest, data, flags); else WR_BLEAF(dest, key, data, flags); /* If the root page was split, make it look right. */ if (sp->pgno == P_ROOT && (F_ISSET(t, R_RECNO) ? bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) goto err2; /* * Now we walk the parent page stack -- a LIFO stack of the pages that * were traversed when we searched for the page that split. Each stack * entry is a page number and a page index offset. The offset is for * the page traversed on the search. We've just split a page, so we * have to insert a new key into the parent page. * * If the insert into the parent page causes it to split, may have to * continue splitting all the way up the tree. We stop if the root * splits or the page inserted into didn't have to split to hold the * new key. Some algorithms replace the key for the old page as well * as the new page. We don't, as there's no reason to believe that the * first key on the old page is any better than the key we have, and, * in the case of a key being placed at index 0 causing the split, the * key is unavailable. * * There are a maximum of 5 pages pinned at any time. We keep the left * and right pages pinned while working on the parent. The 5 are the * two children, left parent and right parent (when the parent splits) * and the root page or the overflow key page when calling bt_preserve. * This code must make sure that all pins are released other than the * root page or overflow page which is unlocked elsewhere. */ while ((parent = BT_POP(t)) != NULL) { lchild = l; rchild = r; /* Get the parent page. */ if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) goto err2; /* * The new key goes ONE AFTER the index, because the split * was to the right. */ skip = parent->index + 1; /* * Calculate the space needed on the parent page. * * Prefix trees: space hack when inserting into BINTERNAL * pages. Retain only what's needed to distinguish between * the new entry and the LAST entry on the page to its left. * If the keys compare equal, retain the entire key. Note, * we don't touch overflow keys, and the entire key must be * retained for the next-to-left most key on the leftmost * page of each level, or the search will fail. Applicable * ONLY to internal pages that have leaf pages as children. * Further reduction of the key between pairs of internal * pages loses too much information. */ switch (rchild->flags & P_TYPE) { case P_BINTERNAL: bi = GETBINTERNAL(rchild, 0); nbytes = NBINTERNAL(bi->ksize); break; case P_BLEAF: bl = GETBLEAF(rchild, 0); nbytes = NBINTERNAL(bl->ksize); if (t->bt_pfx && !(bl->flags & P_BIGKEY) && (h->prevpg != P_INVALID || skip > 1)) { size_t temp; tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); a.size = tbl->ksize; a.data = tbl->bytes; b.size = bl->ksize; b.data = bl->bytes; temp = t->bt_pfx(&a, &b); _DBFIT(temp, uint32_t); nksize = (uint32_t)temp; n = NBINTERNAL(nksize); if (n < nbytes) { #ifdef STATISTICS bt_pfxsaved += nbytes - n; #endif nbytes = n; } else nksize = 0; } else nksize = 0; break; case P_RINTERNAL: case P_RLEAF: nbytes = NRINTERNAL; break; default: abort(); } /* Split the parent page if necessary or shift the indices. */ if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) { sp = h; h = h->pgno == P_ROOT ? bt_root(t, h, &l, &r, &skip, nbytes) : bt_page(t, h, &l, &r, &skip, nbytes); if (h == NULL) goto err1; parentsplit = 1; } else { if (skip < (nxtindex = NEXTINDEX(h))) memmove(h->linp + skip + 1, h->linp + skip, (nxtindex - skip) * sizeof(indx_t)); h->lower += sizeof(indx_t); parentsplit = 0; } /* Insert the key into the parent page. */ switch (rchild->flags & P_TYPE) { case P_BINTERNAL: h->linp[skip] = h->upper -= nbytes; dest = (char *)(void *)h + h->linp[skip]; memmove(dest, bi, nbytes); ((BINTERNAL *)(void *)dest)->pgno = rchild->pgno; break; case P_BLEAF: h->linp[skip] = h->upper -= nbytes; dest = (char *)(void *)h + h->linp[skip]; WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, rchild->pgno, bl->flags & P_BIGKEY); memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); if (bl->flags & P_BIGKEY && bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == RET_ERROR) goto err1; break; case P_RINTERNAL: /* * Update the left page count. If split * added at index 0, fix the correct page. */ if (skip > 0) dest = (char *)(void *)h + h->linp[skip - 1]; else dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; ((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild); ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; /* Update the right page count. */ h->linp[skip] = h->upper -= nbytes; dest = (char *)(void *)h + h->linp[skip]; ((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild); ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; break; case P_RLEAF: /* * Update the left page count. If split * added at index 0, fix the correct page. */ if (skip > 0) dest = (char *)(void *)h + h->linp[skip - 1]; else dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild); ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; /* Update the right page count. */ h->linp[skip] = h->upper -= nbytes; dest = (char *)(void *)h + h->linp[skip]; ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild); ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; break; default: abort(); } /* Unpin the held pages. */ if (!parentsplit) { mpool_put(t->bt_mp, h, MPOOL_DIRTY); break; } /* If the root page was split, make it look right. */ if (sp->pgno == P_ROOT && (F_ISSET(t, R_RECNO) ? bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) goto err1; mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); } /* Unpin the held pages. */ mpool_put(t->bt_mp, l, MPOOL_DIRTY); mpool_put(t->bt_mp, r, MPOOL_DIRTY); /* Clear any pages left on the stack. */ return (RET_SUCCESS); /* * If something fails in the above loop we were already walking back * up the tree and the tree is now inconsistent. Nothing much we can * do about it but release any memory we're holding. */ err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); err2: mpool_put(t->bt_mp, l, 0); mpool_put(t->bt_mp, r, 0); __dbpanic(t->bt_dbp); return (RET_ERROR); }
/* * BT_PSPLIT -- Do the real work of splitting the page. * * Parameters: * t: tree * h: page to be split * l: page to put lower half of data * r: page to put upper half of data * pskip: pointer to index to leave open * ilen: insert length * * Returns: * Pointer to page in which to insert. */ static PAGE * bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) { BINTERNAL *bi; BLEAF *bl; CURSOR *c; RLEAF *rl; PAGE *rval; void *src = NULL; /* pacify gcc */ indx_t full, half, nxt, off, skip, top, used; uint32_t nbytes; size_t temp; int bigkeycnt, isbigkey; /* * Split the data to the left and right pages. Leave the skip index * open. Additionally, make some effort not to split on an overflow * key. This makes internal page processing faster and can save * space as overflow keys used by internal pages are never deleted. */ bigkeycnt = 0; skip = *pskip; temp = t->bt_psize - BTDATAOFF; _DBFIT(temp, indx_t); full = (indx_t)temp; half = full / 2; used = 0; for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { if (skip == off) { _DBFIT(ilen, uint32_t); nbytes = (uint32_t)ilen; isbigkey = 0; /* XXX: not really known. */ } else switch (h->flags & P_TYPE) { case P_BINTERNAL: src = bi = GETBINTERNAL(h, nxt); nbytes = NBINTERNAL(bi->ksize); isbigkey = bi->flags & P_BIGKEY; break; case P_BLEAF: src = bl = GETBLEAF(h, nxt); nbytes = NBLEAF(bl); isbigkey = bl->flags & P_BIGKEY; break; case P_RINTERNAL: src = GETRINTERNAL(h, nxt); nbytes = NRINTERNAL; isbigkey = 0; break; case P_RLEAF: src = rl = GETRLEAF(h, nxt); nbytes = NRLEAF(rl); isbigkey = 0; break; default: abort(); } /* * If the key/data pairs are substantial fractions of the max * possible size for the page, it's possible to get situations * where we decide to try and copy too much onto the left page. * Make sure that doesn't happen. */ if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || nxt == top - 1) { --off; break; } /* Copy the key/data pair, if not the skipped index. */ if (skip != off) { ++nxt; l->linp[off] = l->upper -= nbytes; memmove((char *)(void *)l + l->upper, src, nbytes); } temp = nbytes + sizeof(indx_t); _DBFIT(temp, indx_t); used += (indx_t)temp; if (used >= half) { if (!isbigkey || bigkeycnt == 3) break; else ++bigkeycnt; } } /* * Off is the last offset that's valid for the left page. * Nxt is the first offset to be placed on the right page. */ temp = (off + 1) * sizeof(indx_t); _DBFIT(temp, indx_t); l->lower += (indx_t)temp; /* * If splitting the page that the cursor was on, the cursor has to be * adjusted to point to the same record as before the split. If the * cursor is at or past the skipped slot, the cursor is incremented by * one. If the cursor is on the right page, it is decremented by the * number of records split to the left page. */ c = &t->bt_cursor; if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { if (c->pg.index >= skip) ++c->pg.index; if (c->pg.index < nxt) /* Left page. */ c->pg.pgno = l->pgno; else { /* Right page. */ c->pg.pgno = r->pgno; c->pg.index -= nxt; } } /* * If the skipped index was on the left page, just return that page. * Otherwise, adjust the skip index to reflect the new position on * the right page. */ if (skip <= off) { skip = MAX_PAGE_OFFSET; rval = l; } else { rval = r; *pskip -= nxt; } for (off = 0; nxt < top; ++off) { if (skip == nxt) { ++off; skip = MAX_PAGE_OFFSET; } switch (h->flags & P_TYPE) { case P_BINTERNAL: src = bi = GETBINTERNAL(h, nxt); nbytes = NBINTERNAL(bi->ksize); break; case P_BLEAF: src = bl = GETBLEAF(h, nxt); nbytes = NBLEAF(bl); break; case P_RINTERNAL: src = GETRINTERNAL(h, nxt); nbytes = NRINTERNAL; break; case P_RLEAF: src = rl = GETRLEAF(h, nxt); nbytes = NRLEAF(rl); break; default: abort(); } ++nxt; r->linp[off] = r->upper -= nbytes; memmove((char *)(void *)r + r->upper, src, nbytes); } temp = off * sizeof(indx_t); _DBFIT(temp, indx_t); r->lower += (indx_t)temp; /* If the key is being appended to the page, adjust the index. */ if (skip == top) r->lower += sizeof(indx_t); return (rval); }