//	step the cursor forward to the next live key
//
//	an unpositioned cursor (CursorNone) is first placed by btree1LeftKey;
//	a cursor already at CursorRightEof stays there.
//	on success dbCursor->key / keyLen describe the key (length prefix
//	skipped), the state becomes CursorPosAt and DB_OK is returned.
//	when the right sibling chain is exhausted the state becomes
//	CursorRightEof and DB_CURSOR_eof is returned.

DbStatus btree1NextKey (DbCursor *dbCursor, DbMap *map) {
	Btree1Cursor *cursor = (Btree1Cursor *)((char *)dbCursor + dbCursor->xtra);

	if (dbCursor->state == CursorRightEof)
		return DB_CURSOR_eof;

	if (dbCursor->state == CursorNone)
		btree1LeftKey(dbCursor, map);

	for (;;) {
		Btree1Page *page = cursor->page;
		uint32_t lastIdx = page->cnt;

		// a page with no right sibling does not expose its final slot
		// (presumably the stopper key -- confirm against btree1 page layout)

		if (!page->right.bits)
			lastIdx--;

		// scan forward on this page, skipping dead slots

		while (++cursor->slotIdx <= lastIdx) {
			Btree1Slot *slot = slotptr(page, cursor->slotIdx);
			uint8_t *key;

			if (slot->dead)
				continue;

			key = keyaddr(page, slot->off);
			dbCursor->key = key + keypre(key);
			dbCursor->keyLen = keylen(key);
			dbCursor->state = CursorPosAt;
			return DB_OK;
		}

		// page exhausted: move to the right sibling, if there is one

		if (!page->right.bits)
			break;

		cursor->page = getObj(map, page->right);
		cursor->slotIdx = 0;
	}

	dbCursor->state = CursorRightEof;
	return DB_CURSOR_eof;
}
//	step the cursor backward to the previous live key
//
//	an unpositioned cursor (CursorNone) is first placed by btree1RightKey;
//	a cursor already at CursorLeftEof stays there.
//	on success dbCursor->key / keyLen describe the key (length prefix
//	skipped), the state becomes CursorPosAt and DB_OK is returned.
//	when the left sibling chain is exhausted the state becomes
//	CursorLeftEof and DB_CURSOR_eof is returned.

DbStatus btree1PrevKey (DbCursor *dbCursor, DbMap *map) {
	Btree1Cursor *cursor = (Btree1Cursor *)((char *)dbCursor + dbCursor->xtra);

	if (dbCursor->state == CursorLeftEof)
		return DB_CURSOR_eof;

	if (dbCursor->state == CursorNone)
		btree1RightKey(dbCursor, map);

	for (;;) {
		// walk down the slots of the current page, skipping dead ones

		while (cursor->slotIdx > 1) {
			Btree1Slot *slot = slotptr(cursor->page, --cursor->slotIdx);
			uint8_t *key;

			if (slot->dead)
				continue;

			key = keyaddr(cursor->page, slot->off);
			dbCursor->key = key + keypre(key);
			dbCursor->keyLen = keylen(key);
			dbCursor->state = CursorPosAt;
			return DB_OK;
		}

		// page exhausted: move to the left sibling, if there is one

		if (!cursor->page->left.bits)
			break;

		cursor->page = getObj(map, cursor->page->left);

		// start one past the last slot so the pre-decrement lands on it

		cursor->slotIdx = cursor->page->cnt + 1;
	}

	dbCursor->state = CursorLeftEof;
	return DB_CURSOR_eof;
}
//	find and lock the page at level lvl that covers the given key
//
//	descends from the root, holding a read lock on every level above
//	lvl and the caller's lock mode at lvl.  The previously visited page
//	(parent or left sibling) is only released after the next page
//	pointer has been resolved.
//
//	on DB_OK, set->page / set->pageNo identify the locked page and
//	set->slotIdx is the slot returned by btree1FindSlot.
//	DB_BTREE_error is returned for a page marked free, for a run of dead
//	slots reaching past the end of a page, or when the right sibling
//	chain ends without a match.

DbStatus btree1LoadPage(DbMap *map, Btree1Set *set, void *key, uint32_t keyLen, uint8_t lvl, Btree1Lock lock, bool stopper) {
	Btree1Index *btree1 = btree1index(map);
	Btree1Lock curMode, heldMode;
	Btree1Page *heldPage = NULL;
	uint8_t depth = 0xff;		// 0xff: level of the root not known yet
	uint8_t *childKey;
	DbAddr heldAddr;

	heldAddr.bits = 0;
	set->pageNo.bits = btree1->root.bits;

	// start at our idea of the root level of the btree1 and drill down

	do {
		// the caller's lock mode applies only at the target level

		curMode = (depth == lvl) ? lock : Btree1_lockRead;
		set->page = getObj(map, set->pageNo);

		// let go of the parent or left sibling visited last time around

		if (heldAddr.bits) {
			btree1UnlockPage(heldPage, heldMode);
			heldAddr.bits = 0;
		}

		btree1LockPage(set->page, curMode);

		if (set->page->free)
			return DB_BTREE_error;

		// first visit to the root: learn its actual level, and if the
		// root itself is the target of a non-read request, drop the read
		// lock and go around again to take the requested mode

		if (set->page->lvl != depth) {
			assert(depth == 0xff);
			depth = set->page->lvl;

			if (lock != Btree1_lockRead && depth == lvl) {
				btree1UnlockPage(set->page, curMode);
				continue;
			}
		}

		assert(lvl <= set->page->lvl);

		heldAddr.bits = set->pageNo.bits;
		heldPage = set->page;
		heldMode = curMode;

		// look for the key on this page unless the page is being killed

		if (!set->page->kill) {
			set->slotIdx = btree1FindSlot(set->page, key, keyLen, stopper);

			if (set->slotIdx) {
				if (depth == lvl)
					return DB_OK;

				// advance to the next non-dead slot --
				// the fence key if nothing else

				while (slotptr(set->page, set->slotIdx)->dead) {
					if (set->slotIdx++ >= set->page->cnt)
						return DB_BTREE_error;
				}

				// the child page number is read out of the slot's key

				childKey = keyptr(set->page, set->slotIdx);
				set->pageNo.bits = btree1GetPageNo(childKey + keypre(childKey), keylen(childKey));

				assert(depth > 0);
				depth--;
				continue;
			}
		}

		// no slot here (or page being killed): slide right

		set->pageNo.bits = set->page->right.bits;
	} while (set->pageNo.bits);

	// ran off the end of the right chain

	return DB_BTREE_error;
}
//	function form of the slotptr macro:
//	returns the address of slot idx on the given page

Btree1Slot *btree1Slot(Btree1Page *page, uint32_t idx) {
	Btree1Slot *slot = slotptr(page, idx);

	return slot;
}
//	make room on a page for a key of totKeyLen bytes by squeezing out
//	dead slots
//
//	index		handle of the btree1 index
//	set			page set; set->page is compacted in place and
//				set->slotIdx is rewritten to the insert position
//				on the compacted page
//	totKeyLen	bytes of key storage the caller wants to add
//
//	returns DB_OK when the page has (or now has) room for one more slot
//	plus totKeyLen bytes, DB_BTREE_needssplit when it does not (either
//	because less than a fifth of the page is garbage, or because
//	compaction did not free enough), and DB_ERROR_outofmemory when the
//	temporary frame cannot be allocated.

DbStatus btree1CleanPage(Handle *index, Btree1Set *set, uint32_t totKeyLen) {
	Btree1Index *btree1 = btree1index(index->map);
	Btree1Slot librarian, *source, *dest;
	uint32_t size = btree1->pageSize;
	Btree1Page *page = set->page;
	uint32_t max = page->cnt;
	uint32_t len, cnt, idx;
	uint32_t newSlot = max;
	Btree1PageType type;
	Btree1Page *frame;
	uint8_t *key;
	DbAddr addr;

	// template for the dead place-holder slot put in front of each key

	librarian.bits = 0;
	librarian.type = Btree1_librarian;
	librarian.dead = 1;

	// leaf pages are larger than interior pages by leafXtra bits

	if( !page->lvl ) {
		size <<= btree1->leafXtra;
		type = Btree1_leafPage;
	} else {
		type = Btree1_interior;
	}

	// nothing to do when the new key and its slot already fit

	if( page->min >= (max+1) * sizeof(Btree1Slot) + sizeof(*page) + totKeyLen )
		return DB_OK;

	// skip cleanup and proceed directly to split
	// if there's not enough garbage
	// to bother with.

	if( page->garbage < size / 5 )
		return DB_BTREE_needssplit;

	// work from a scratch copy of the page

	if( (addr.bits = allocObj(index->map, listFree(index, type), NULL, type, size, false)) )
		frame = getObj(index->map, addr);
	else
		return DB_ERROR_outofmemory;

	memcpy (frame, page, size);

	// skip page info and set rest of page to zero

	memset (page+1, 0, size - sizeof(*page));
	page->garbage = 0;
	page->act = 0;

	cnt = 0;
	idx = 0;

	source = slotptr(frame, cnt);
	dest = slotptr(page, idx);

	// clean up page first by
	// removing deleted keys

	while( source++, cnt++ < max ) {
		// remember where the caller's slot lands on the rebuilt page

		if( cnt == set->slotIdx )
			newSlot = idx + 2;

		if( source->dead )
			continue;

		// copy the active key across; keys are packed downward from
		// the end of the page

		key = keyaddr(frame, source->off);
		len = keylen(key) + keypre(key);
		size -= len;

		memcpy ((uint8_t *)page + size, key, len);

		// make a librarian slot in front of every key except the last.
		// It is given the offset of the key it precedes, as
		// btree1SplitPage does, so that the slot never refers to
		// offset zero (the page header).

		if (cnt < max) {
			(++dest)->bits = librarian.bits;
			dest->off = size;
			++idx;
		}

		// set up the slot

		(++dest)->bits = source->bits;
		dest->off = size;
		idx++;

		page->act++;
	}

	page->min = size;
	page->cnt = idx;

	// update insert slot index
	// for newly cleaned-up page

	set->slotIdx = newSlot;

	// return temporary frame

	addSlotToFrame(index->map, listFree(index,addr.type), NULL, addr.bits);

	// see if page has enough space now, or does it still need splitting?

	if( page->min >= (idx+1) * sizeof(Btree1Slot) + sizeof(*page) + totKeyLen )
		return DB_OK;

	return DB_BTREE_needssplit;
}
//	grow the tree by one level after the root page has been split
//
//	index		handle of the btree1 index
//	root		page set of the root; its page is rewritten in place and
//				its write lock is released before a successful return
//	right		address of the page holding the upper half of the keys
//	leftKey		length-prefixed fence key of the lower half; its final
//				sizeof(uint64_t) bytes are replaced here by the address
//				of the new left page
//
//	the current root contents move to a freshly allocated left page and
//	the root is rebuilt with two keys: the left fence key and a stopper
//	key pointing at the right page.
//	returns DB_OK, or DB_ERROR_outofmemory if no page can be obtained.

DbStatus btree1SplitRoot(Handle *index, Btree1Set *root, DbAddr right, uint8_t *leftKey) {
	Btree1Index *btree1 = btree1index(index->map);
	Btree1Page *rootPage = root->page;
	Btree1Page *lowerPage, *upperPage;
	uint32_t fenceLen, prefix, bodyLen;
	uint32_t top = btree1->pageSize;
	Btree1Slot *entry;
	uint8_t *dst;
	DbAddr left;

	// Obtain an empty page to use, and copy the current
	// root contents into it, e.g. lower keys

	left.bits = btree1NewPage(index, rootPage->lvl);

	if( !left.bits )
		return DB_ERROR_outofmemory;

	lowerPage = getObj(index->map, left);

	// copy everything after the first latch element, so the new
	// page's own latch is left as btree1NewPage set it

	memcpy (lowerPage->latch + 1, rootPage->latch + 1, btree1->pageSize - sizeof(*lowerPage->latch));

	upperPage = getObj(index->map, right);
	upperPage->left.bits = left.bits;

	// keep the root's page header and wipe the rest

	memset(rootPage + 1, 0, btree1->pageSize - sizeof(*rootPage));

	// slot 2: stopper key made of one length byte followed by
	// the address of the right page

	top -= 1 + sizeof(uint64_t);

	entry = slotptr(rootPage, 2);
	entry->type = Btree1_stopper;
	entry->off = top;

	dst = keyaddr(rootPage, top);
	btree1PutPageNo(dst + 1, 0, right.bits);
	dst[0] = sizeof(uint64_t);

	// slot 1: fence key of the lower half, stored below the stopper

	fenceLen = keylen(leftKey);
	prefix = keypre(leftKey);
	bodyLen = fenceLen - sizeof(uint64_t);
	top -= fenceLen + prefix;

	entry = slotptr(rootPage, 1);
	entry->type = Btree1_indexed;
	entry->off = top;

	// key bytes first, then the address of the new left page

	dst = keyaddr(rootPage, top);
	memcpy (dst + prefix, leftKey + prefix, bodyLen);
	btree1PutPageNo(dst + prefix, bodyLen, left.bits);

	// one-byte length prefix, or two bytes with the high bit set

	if (prefix == 1) {
		dst[0] = fenceLen;
	} else {
		dst[0] = fenceLen / 256 | 0x80;
		dst[1] = fenceLen;
	}

	// the root has no sibling and is now one level taller

	rootPage->right.bits = 0;
	rootPage->min = top;
	rootPage->cnt = 2;
	rootPage->act = 2;
	rootPage->lvl++;

	// release root page

	btree1UnlockPage(rootPage, Btree1_lockWrite);
	return DB_OK;
}
//	split a full page in two
//
//	index	handle of the btree1 index
//	set		page set of the page to split; the code releases a write
//			lock on set->page, so the caller is expected to hold one
//
//	the upper half of the slots moves to a new right page and the lower
//	half is rebuilt in place; dead slots are dropped and a librarian
//	slot is placed in front of every key except the last on each page.
//	For the root page the work is finished by btree1SplitRoot;
//	otherwise a fence key for the left page is inserted into, and the
//	fence for the right page is fixed in, the parent level.
//
//	returns DB_OK, DB_ERROR_outofmemory when a page cannot be allocated,
//	DB_BTREE_error when the upper half holds no live key, or the status
//	of the failing parent update.
//
//	both pages needed by the split are allocated before anything is
//	modified, so an allocation failure leaves the tree untouched and
//	does not leak the right page.

DbStatus btree1SplitPage (Handle *index, Btree1Set *set) {
	uint8_t leftKey[Btree1_maxkey], rightKey[Btree1_maxkey];
	Btree1Index *btree1 = btree1index(index->map);
	uint32_t cnt = 0, idx = 0, max, nxt, off;
	Btree1Slot librarian, *source, *dest;
	uint32_t size = btree1->pageSize;
	Btree1Page *frame, *rightPage;
	uint8_t lvl = set->page->lvl;
	uint32_t totLen, keyLen;
	uint8_t *key = NULL;
	DbAddr right, addr;
	bool stopper;
	DbStatus stat;

#ifdef DEBUG
	atomicAdd32(&Splits, 1);
#endif

	// template for the dead place-holder slot put in front of each key

	librarian.bits = 0;
	librarian.type = Btree1_librarian;
	librarian.dead = 1;

	// leaf pages are larger than interior pages by leafXtra bits

	if( !set->page->lvl )
		size <<= btree1->leafXtra;

	// get new page and write higher keys to it.

	if( (right.bits = btree1NewPage(index, lvl)) )
		rightPage = getObj(index->map, right);
	else
		return DB_ERROR_outofmemory;

	// get the temporary frame now, while nothing has been changed yet;
	// on failure hand the right page back instead of leaking it

	if( (addr.bits = btree1NewPage(index, lvl)) )
		frame = getObj(index->map, addr);
	else {
		addSlotToFrame(index->map, listFree(index, right.type), NULL, right.bits);
		return DB_ERROR_outofmemory;
	}

	max = set->page->cnt;
	cnt = max / 2;
	nxt = size;
	idx = 0;

	source = slotptr(set->page, cnt);
	dest = slotptr(rightPage, 0);

	// copy the live slots of the upper half onto the right page

	while( source++, cnt++ < max ) {
		if( source->dead )
			continue;

		key = keyaddr(set->page, source->off);
		totLen = keylen(key) + keypre(key);
		nxt -= totLen;

		memcpy (keyaddr(rightPage, nxt), key, totLen);
		rightPage->act++;

		// add librarian slot

		if (cnt < max) {
			(++dest)->bits = librarian.bits;
			dest->off = nxt;
			idx++;
		}

		// add actual slot

		(++dest)->bits = source->bits;
		dest->off = nxt;
		idx++;
	}

	// without a live key in the upper half there is no right fence key
	// to build; give both pages back and report the page as damaged

	if( !key ) {
		addSlotToFrame(index->map, listFree(index, addr.type), NULL, addr.bits);
		addSlotToFrame(index->map, listFree(index, right.type), NULL, right.bits);
		return DB_BTREE_error;
	}

	// remember right fence key for larger page
	// extend right leaf fence key with
	// the right page number on leaf page.

	stopper = dest->type == Btree1_stopper;
	keyLen = keylen(key);

	if( set->page->lvl)
		keyLen -= sizeof(uint64_t);		// strip off pageNo

	// one length byte for short keys, two otherwise

	if( keyLen + sizeof(uint64_t) < 128 )
		off = 1;
	else
		off = 2;

	// copy key and add pageNo

	memcpy (rightKey + off, key + keypre(key), keyLen);
	btree1PutPageNo(rightKey + off, keyLen, right.bits);
	keyLen += sizeof(uint64_t);

	if (off == 1)
		rightKey[0] = keyLen;
	else
		rightKey[0] = keyLen / 256 | 0x80, rightKey[1] = keyLen;

	rightPage->min = nxt;
	rightPage->cnt = idx;
	rightPage->lvl = lvl;

	// link right node

	if( set->pageNo.type != Btree1_rootPage ) {
		rightPage->right.bits = set->page->right.bits;
		rightPage->left.bits = set->pageNo.bits;

		// on leaf level, point the old right neighbour back at the new page

		if( !lvl && rightPage->right.bits ) {
			Btree1Page *farRight = getObj(index->map, rightPage->right);
			btree1LockPage (farRight, Btree1_lockLink);
			farRight->left.bits = right.bits;
			btree1UnlockPage (farRight, Btree1_lockLink);
		}
	}

	// copy lower keys from temporary frame back into old page

	memcpy (frame, set->page, size);
	memset (set->page+1, 0, size - sizeof(*set->page));

	set->page->garbage = 0;
	set->page->act = 0;
	nxt = size;
	max /= 2;
	cnt = 0;
	idx = 0;

	// ignore librarian max key

	if( slotptr(frame, max)->type == Btree1_librarian )
		max--;

	source = slotptr(frame, 0);
	dest = slotptr(set->page, 0);

#ifdef DEBUG
	key = keyaddr(frame, source[2].off);
	assert(keylen(key) > 0);
#endif

	// assemble page of smaller keys from temporary frame copy

	while( source++, cnt++ < max ) {
		if( source->dead )
			continue;

		key = keyaddr(frame, source->off);
		totLen = keylen(key) + keypre(key);
		nxt -= totLen;

		memcpy (keyaddr(set->page, nxt), key, totLen);

		// add librarian slot, except before fence key

		if (cnt < max) {
			(++dest)->bits = librarian.bits;
			dest->off = nxt;
			idx++;
		}

		// add actual slot

		(++dest)->bits = source->bits;
		dest->off = nxt;
		idx++;

		set->page->act++;
	}

	set->page->right.bits = right.bits;
	set->page->min = nxt;
	set->page->cnt = idx;

	// remember left fence key for smaller page
	// extend left leaf fence key with
	// the left page number.

	keyLen = keylen(key);

	if( set->page->lvl)
		keyLen -= sizeof(uint64_t);		// strip off pageNo

	if( keyLen + sizeof(uint64_t) < 128 )
		off = 1;
	else
		off = 2;

	// copy key and add pageNo

	memcpy (leftKey + off, key + keypre(key), keyLen);
	btree1PutPageNo(leftKey + off, keyLen, set->pageNo.bits);
	keyLen += sizeof(uint64_t);

	if (off == 1)
		leftKey[0] = keyLen;
	else
		leftKey[0] = keyLen / 256 | 0x80, leftKey[1] = keyLen;

	// return temporary frame

	addSlotToFrame(index->map, listFree(index, addr.type), NULL, addr.bits);

	// if current page is the root page, split it

	if( set->pageNo.type == Btree1_rootPage )
		return btree1SplitRoot (index, set, right, leftKey);

	// insert new fences in their parent pages

	btree1LockPage (rightPage, Btree1_lockParent);
	btree1LockPage (set->page, Btree1_lockParent);
	btree1UnlockPage (set->page, Btree1_lockWrite);

	// NOTE(review): the two error returns below leave both Parent locks
	// held -- confirm that callers expect this

	// insert new fence for reformulated left block of smaller keys

	if( (stat = btree1InsertKey(index, leftKey + keypre(leftKey), keylen(leftKey), lvl+1, Btree1_indexed) ))
		return stat;

	// switch fence for right block of larger keys to new right page

	if( (stat = btree1FixKey(index, rightKey, lvl+1, stopper) ))
		return stat;

	btree1UnlockPage (set->page, Btree1_lockParent);
	btree1UnlockPage (rightPage, Btree1_lockParent);
	return DB_OK;
}
//	open or create a b-tree file and build its buffer pool manager
//
//	name		path of the index file
//	mode		BT_rw opens read/write and creates the file if needed;
//				any other value (after masking with 0x7fff) opens read-only
//	bits		page size as a power of two, clamped to
//				BT_minbits..BT_maxbits; replaced by the value stored in
//				an existing file
//	poolmax		number of buffer pool entries, must be non-zero
//	segsize		log2 of the pages per memory-mapped segment
//	hashsize	number of hash table / latch entries
//
//	an empty file is initialized with an allocation page followed by
//	MIN_lvl pages each holding a single 0xff,0xff stopper key, and is
//	extended to the end of its first segment.
//
//	returns the new manager, or NULL on any failure.  Every failure
//	after the file has been opened releases the scratch page buffer and
//	closes the manager through bt_mgrclose.

BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize)
{
uint lvl, attr, cacheblk, last;
BtPage alloc = NULL;
int lockmode;
off64_t size;
uint amt[1];
BtMgr* mgr;
BtKey key;

#ifndef unix
SYSTEM_INFO sysinfo[1];
#endif

	// determine sanity of page size and buffer pool

	if( bits > BT_maxbits )
		bits = BT_maxbits;
	else if( bits < BT_minbits )
		bits = BT_minbits;

	if( !poolmax )
		return NULL;	// must have buffer pool

	// lockmode records whether the file was opened writable (1) or
	// read-only (0)

#ifdef unix
	mgr = calloc (1, sizeof(BtMgr));

	if( !mgr )
		return NULL;

	switch (mode & 0x7fff)
	{
	case BT_rw:
		mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
		lockmode = 1;
		break;

	case BT_ro:
	default:
		mgr->idx = open ((char*)name, O_RDONLY);
		lockmode = 0;
		break;
	}

	if( mgr->idx == -1 )
		return free(mgr), NULL;

	cacheblk = 4096;	// minimum mmap segment size for unix

#else
	mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));

	if( !mgr )
		return NULL;

	attr = FILE_ATTRIBUTE_NORMAL;

	switch (mode & 0x7fff)
	{
	case BT_rw:
		mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
		lockmode = 1;
		break;

	case BT_ro:
	default:
		mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL);
		lockmode = 0;
		break;
	}

	if( mgr->idx == INVALID_HANDLE_VALUE )
		return GlobalFree(mgr), NULL;

	// normalize cacheblk to multiple of sysinfo->dwAllocationGranularity

	GetSystemInfo(sysinfo);
	cacheblk = sysinfo->dwAllocationGranularity;
#endif

#ifdef unix
	alloc = malloc (BT_maxpage);

	if( !alloc )
		goto mgrerr;

	*amt = 0;

	// read minimum page size to get root info

	size = lseek (mgr->idx, 0L, SEEK_END);

	if( size ) {
		if( pread(mgr->idx, alloc, BT_minpage, 0) != BT_minpage )
			goto mgrerr;

		bits = alloc->bits;

		// the stored value drives shifts below: reject a damaged header

		if( bits < BT_minbits || bits > BT_maxbits )
			goto mgrerr;
	} else if( !lockmode )
		goto mgrerr;	// an empty file cannot be initialized read-only
#else
	alloc = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);

	if( !alloc )
		goto mgrerr;

	size = GetFileSize(mgr->idx, amt);

	if( size || *amt ) {
		if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) )
			goto mgrerr;

		if( *amt < BT_minpage )
			goto mgrerr;

		bits = alloc->bits;

		// the stored value drives shifts below: reject a damaged header

		if( bits < BT_minbits || bits > BT_maxbits )
			goto mgrerr;
	} else if( !lockmode )
		goto mgrerr;	// an empty file cannot be initialized read-only
#endif

	mgr->page_size = 1 << bits;
	mgr->page_bits = bits;

	mgr->poolmax = poolmax;
	mgr->mode = mode;

	if( cacheblk < mgr->page_size )
		cacheblk = mgr->page_size;

	// mask for partial memmaps

	mgr->poolmask = (cacheblk >> bits) - 1;

	// see if requested size of pages per memmap is greater

	if( (1 << segsize) > mgr->poolmask )
		mgr->poolmask = (1 << segsize) - 1;

	// number of bits needed to hold poolmask

	mgr->seg_bits = 0;

	while( (1 << mgr->seg_bits) <= mgr->poolmask )
		mgr->seg_bits++;

	mgr->hashsize = hashsize;

	// NOTE(review): the pool, hash and latch allocations below are not
	// checked here, as before; a failure surfaces on first use

#ifdef unix
	mgr->pool = calloc (poolmax, sizeof(BtPool));
	mgr->hash = calloc (hashsize, sizeof(ushort));
	mgr->latch = calloc (hashsize, sizeof(BtLatch));
	mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8);
#else
	mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool));
	mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort));
	mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch));
#endif

	// an existing file needs no initialization

	if( size || *amt )
		goto mgrxit;

	// initializes an empty b-tree with root page and page of leaves

	memset (alloc, 0, 1 << bits);
	bt_putid(alloc->right, MIN_lvl+1);
	alloc->bits = mgr->page_bits;

#ifdef unix
	if( write (mgr->idx, alloc, mgr->page_size) < (ssize_t)mgr->page_size )
		goto mgrerr;
#else
	if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) )
		goto mgrerr;

	if( *amt < mgr->page_size )
		goto mgrerr;
#endif

	memset (alloc, 0, 1 << bits);
	alloc->bits = mgr->page_bits;

	// write one page per level, highest level first; each holds a
	// single stopper key whose id is the page number one level down
	// (zero on the lowest level)

	for( lvl=MIN_lvl; lvl--; ) {
		slotptr(alloc, 1)->off = mgr->page_size - 3;
		bt_putid(slotptr(alloc, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0);		// next(lower) page number
		key = keyptr(alloc, 1);
		key->len = 2;			// create stopper key
		key->key[0] = 0xff;
		key->key[1] = 0xff;
		alloc->min = mgr->page_size - 3;
		alloc->lvl = lvl;
		alloc->cnt = 1;
		alloc->act = 1;
#ifdef unix
		if( write (mgr->idx, alloc, mgr->page_size) < (ssize_t)mgr->page_size )
			goto mgrerr;
#else
		if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) )
			goto mgrerr;

		if( *amt < mgr->page_size )
			goto mgrerr;
#endif
	}

	// create empty page area by writing last page of first
	// segment area (other pages are zeroed by O/S)

	if( mgr->poolmask ) {
		memset(alloc, 0, mgr->page_size);
		last = mgr->poolmask;

		while( last < MIN_lvl + 1 )
			last += mgr->poolmask + 1;

#ifdef unix
		// widen before shifting so the byte offset is not cut to 32 bits

		if( pwrite(mgr->idx, alloc, mgr->page_size, (off64_t)last << mgr->page_bits) < (ssize_t)mgr->page_size )
			goto mgrerr;
#else
		SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN);

		if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) )
			goto mgrerr;

		if( *amt < mgr->page_size )
			goto mgrerr;
#endif
	}

mgrxit:
#ifdef unix
	free (alloc);
#else
	VirtualFree (alloc, 0, MEM_RELEASE);
#endif
	return mgr;

	// common failure exit: release the scratch page, then let
	// bt_mgrclose dispose of the manager

mgrerr:
#ifdef unix
	free (alloc);
#else
	if( alloc )
		VirtualFree (alloc, 0, MEM_RELEASE);
#endif
	bt_mgrclose (mgr);
	return NULL;
}