/**
 * Allocate space in storage for the content referenced by record->data
 * (with length record->size) and write it.
 *
 * Depending on the blob size and the environment's runtime flags, the
 * data is written through the cache or directly to the device.
 *
 * The content is, of course, prefixed by a BLOB header.
 *
 * Partial writes are handled in this function.
 */
ham_status_t
blob_allocate(ham_env_t *env, ham_db_t *db, ham_record_t *record,
        ham_u32_t flags, ham_offset_t *blobid)
{
    ham_status_t st;
    ham_page_t *page=0;
    ham_offset_t addr;
    blob_t hdr;
    ham_u8_t *chunk_data[2];
    ham_size_t alloc_size;
    ham_size_t chunk_size[2];
    ham_device_t *device=env_get_device(env);
    ham_bool_t freshly_created = HAM_FALSE;

    *blobid=0;

    /*
     * PARTIAL WRITE
     *
     * if offset+partial_size covers the full record size, there are no
     * gaps to fill; in this case we just write the full record and
     * ignore the partial parameters
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset==0
                && record->partial_offset+record->partial_size==record->size)
            flags&=~HAM_PARTIAL;
    }

    /*
     * in-memory-database: the blobid is actually a pointer to the memory
     * buffer in which the blob (including the blob header) is stored
     */
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        blob_t *hdr;
        ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env),
                record->size+sizeof(blob_t));
        if (!p)
            return HAM_OUT_OF_MEMORY;

        /* initialize the header */
        hdr=(blob_t *)p;
        memset(hdr, 0, sizeof(*hdr));
        blob_set_self(hdr, (ham_offset_t)PTR_TO_U64(p));
        blob_set_alloc_size(hdr, record->size+sizeof(blob_t));
        blob_set_size(hdr, record->size);

        /* do we have gaps? if yes, fill them with zeroes */
        if (flags&HAM_PARTIAL) {
            ham_u8_t *s=p+sizeof(blob_t);
            if (record->partial_offset)
                memset(s, 0, record->partial_offset);
            memcpy(s+record->partial_offset,
                    record->data, record->partial_size);
            if (record->partial_offset+record->partial_size<record->size)
                memset(s+record->partial_offset+record->partial_size, 0,
                        record->size-(record->partial_offset
                            +record->partial_size));
        }
        else {
            memcpy(p+sizeof(blob_t), record->data, record->size);
        }

        *blobid=(ham_offset_t)PTR_TO_U64(p);
        return (0);
    }

    memset(&hdr, 0, sizeof(hdr));

    /*
     * blobs are CHUNKSIZE-allocated: round the allocation up to the
     * next multiple of DB_CHUNKSIZE
     */
    alloc_size=sizeof(blob_t)+record->size;
    alloc_size += DB_CHUNKSIZE - 1;
    alloc_size -= alloc_size % DB_CHUNKSIZE;
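    /*
     * Worked example of the rounding above (editorial; the figures are
     * illustrative and assume DB_CHUNKSIZE==32 and a 28-byte blob_t):
     * a 100-byte record needs 28+100==128 bytes, already a multiple of
     * 32, so alloc_size stays 128; a 101-byte record needs 129 bytes,
     * which is rounded up to 160.
     */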
    /*
     * check if we have space in the freelist
     */
    st = freel_alloc_area(&addr, env, db, alloc_size);
    if (!addr) {
        if (st)
            return st;

        /*
         * if the blob is small AND if logging is disabled: load the page
         * through the cache
         */
        if (__blob_from_cache(env, alloc_size)) {
            st = db_alloc_page(&page, db, PAGE_TYPE_BLOB,
                    PAGE_IGNORE_FREELIST);
            ham_assert(st ? page == NULL : 1, (0));
            ham_assert(!st ? page != NULL : 1, (0));
            if (st)
                return st;
            /* blob pages don't have a page header */
            page_set_npers_flags(page,
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            addr=page_get_self(page);
            /* move the remaining space to the freelist */
            (void)freel_mark_free(env, db, addr+alloc_size,
                    env_get_pagesize(env)-alloc_size, HAM_FALSE);
            blob_set_alloc_size(&hdr, alloc_size);
        }
        else {
            /*
             * otherwise use direct IO to allocate the space
             */
            ham_size_t aligned=alloc_size;
            aligned += env_get_pagesize(env) - 1;
            aligned -= aligned % env_get_pagesize(env);

            st=device->alloc(device, aligned, &addr);
            if (st)
                return (st);

            /* if aligned!=size, and the remaining chunk is large enough:
             * move it to the freelist */
            {
                ham_size_t diff=aligned-alloc_size;
                if (diff > SMALLEST_CHUNK_SIZE) {
                    (void)freel_mark_free(env, db, addr+alloc_size,
                            diff, HAM_FALSE);
                    blob_set_alloc_size(&hdr, aligned-diff);
                }
                else {
                    blob_set_alloc_size(&hdr, aligned);
                }
            }
            freshly_created = HAM_TRUE;
        }

        ham_assert(HAM_SUCCESS == freel_check_area_is_allocated(env, db,
                    addr, alloc_size), (0));
    }
    else {
        ham_assert(!st, (0));
        blob_set_alloc_size(&hdr, alloc_size);
    }

    blob_set_size(&hdr, record->size);
    blob_set_self(&hdr, addr);

    /*
     * PARTIAL WRITE
     *
     * are there gaps at the beginning? if yes, fill them with zeroes
     */
    if ((flags&HAM_PARTIAL) && (record->partial_offset)) {
        ham_u8_t *ptr;
        ham_size_t gapsize=record->partial_offset;

        ptr=allocator_calloc(env_get_allocator(env),
                gapsize > env_get_pagesize(env)
                    ? env_get_pagesize(env)
                    : gapsize);
        if (!ptr)
            return (HAM_OUT_OF_MEMORY);

        /*
         * first: write the header
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created,
                chunk_data, chunk_size, 1);
        if (st) {
            allocator_free(env_get_allocator(env), ptr);
            return (st);
        }

        addr+=sizeof(hdr);

        /* now fill the gap; if the gap is bigger than a pagesize we'll
         * split the gap into smaller chunks */
        while (gapsize>=env_get_pagesize(env)) {
            chunk_data[0]=ptr;
            chunk_size[0]=env_get_pagesize(env);
            st=__write_chunks(env, page, addr, HAM_TRUE,
                    freshly_created, chunk_data, chunk_size, 1);
            if (st)
                break;
            gapsize-=env_get_pagesize(env);
            addr+=env_get_pagesize(env);
        }
        if (st) {
            allocator_free(env_get_allocator(env), ptr);
            return (st);
        }

        /* fill the remaining gap */
        if (gapsize) {
            chunk_data[0]=ptr;
            chunk_size[0]=gapsize;

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created,
                    chunk_data, chunk_size, 1);
            if (st) {
                allocator_free(env_get_allocator(env), ptr);
                return (st);
            }
            addr+=gapsize;
        }

        allocator_free(env_get_allocator(env), ptr);

        /* now write the "real" data */
        chunk_data[0]=(ham_u8_t *)record->data;
        chunk_size[0]=record->partial_size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created,
                chunk_data, chunk_size, 1);
        if (st)
            return (st);
        addr+=record->partial_size;
    }
    else {
        /*
         * not writing partially: write header and data, then we're done
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        chunk_data[1]=(ham_u8_t *)record->data;
        chunk_size[1]=(flags&HAM_PARTIAL)
                        ? record->partial_size
                        : record->size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created,
                chunk_data, chunk_size, 2);
        if (st)
            return (st);
        addr+=sizeof(hdr)+((flags&HAM_PARTIAL)
                        ? record->partial_size
                        : record->size);
    }

    /*
     * store the blobid; it will be returned to the caller
     */
    *blobid=blob_get_self(&hdr);
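    /*
     * Layout recap (editorial note): after the code above, a partially
     * written blob looks like this on storage; the trailing gap is
     * appended by the code below:
     *
     *   +--------+------------------+--------------+-----------------+
     *   | blob_t | zero-filled gap  | partial data | zero-filled gap |
     *   +--------+------------------+--------------+-----------------+
     *            |<-partial_offset->|<partial_size>|
     */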
    /*
     * PARTIAL WRITES:
     *
     * if we have gaps at the end of the blob: just append more chunks to
     * fill these gaps. Since they can be pretty large we split them into
     * smaller chunks if necessary.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset+record->partial_size < record->size) {
            ham_u8_t *ptr;
            ham_size_t gapsize=record->size
                    - (record->partial_offset+record->partial_size);

            /* now fill the gap; if the gap is bigger than a pagesize
             * we'll split the gap into smaller chunks
             *
             * we split this loop in two - the outer loop will allocate
             * the memory buffer, thus saving some allocations */
            while (gapsize>=env_get_pagesize(env)) {
                ham_u8_t *buf=allocator_calloc(env_get_allocator(env),
                        env_get_pagesize(env));
                if (!buf)
                    return (HAM_OUT_OF_MEMORY);
                while (gapsize>=env_get_pagesize(env)) {
                    chunk_data[0]=buf;
                    chunk_size[0]=env_get_pagesize(env);
                    st=__write_chunks(env, page, addr, HAM_TRUE,
                            freshly_created, chunk_data, chunk_size, 1);
                    if (st)
                        break;
                    gapsize-=env_get_pagesize(env);
                    addr+=env_get_pagesize(env);
                }
                allocator_free(env_get_allocator(env), buf);
                if (st)
                    return (st);
            }

            /* now write the remainder, which is less than a pagesize */
            ham_assert(gapsize<env_get_pagesize(env), (""));
            if (gapsize) {
                chunk_size[0]=gapsize;
                ptr=chunk_data[0]=allocator_calloc(env_get_allocator(env),
                        gapsize);
                if (!ptr)
                    return (HAM_OUT_OF_MEMORY);

                st=__write_chunks(env, page, addr, HAM_TRUE,
                        freshly_created, chunk_data, chunk_size, 1);
                allocator_free(env_get_allocator(env), ptr);
                if (st)
                    return (st);
            }
        }
    }

    return (0);
}
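/*
 * Editorial usage sketch (not part of the original source, hence #if 0):
 * shows how a caller might use blob_allocate() for a full write and for
 * a partial write. 'env' and 'db' are assumed to be set up elsewhere.
 */
#if 0
static ham_status_t
example_allocate_blobs(ham_env_t *env, ham_db_t *db)
{
    ham_status_t st;
    ham_offset_t blobid;
    ham_u8_t buffer[512];
    ham_record_t record;

    /* full write: the complete record content is supplied */
    memset(&record, 0, sizeof(record));
    record.data=buffer;
    record.size=sizeof(buffer);
    st=blob_allocate(env, db, &record, 0, &blobid);
    if (st)
        return (st);

    /* partial write: only partial_size bytes starting at partial_offset
     * are supplied; blob_allocate() zero-fills the gaps before and
     * after the supplied data */
    memset(&record, 0, sizeof(record));
    record.data=buffer;
    record.size=2048;
    record.partial_offset=512;
    record.partial_size=sizeof(buffer);
    return (blob_allocate(env, db, &record, HAM_PARTIAL, &blobid));
}
#endif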
static ham_status_t
__insert_cursor(ham_btree_t *be, ham_key_t *key, ham_record_t *record,
        ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st;
    ham_page_t *root;
    ham_db_t *db=be_get_db(be);
    ham_env_t *env = db_get_env(db);
    insert_scratchpad_t scratchpad;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));

    /*
     * initialize the scratchpad
     */
    memset(&scratchpad, 0, sizeof(scratchpad));
    scratchpad.be=be;
    scratchpad.record=record;
    scratchpad.cursor=cursor;

    /*
     * get the root-page...
     */
    ham_assert(btree_get_rootpage(be)!=0, ("btree has no root page"));
    st=db_fetch_page(&root, db, btree_get_rootpage(be), 0);
    ham_assert(st ? root == NULL : 1, (0));
    if (st)
        return st;

    /*
     * ... and start the recursion
     */
    st=__insert_recursive(root, key, 0, &scratchpad, hints);

    /*
     * if the root page was split, we have to create a new root page
     */
    if (st==SPLIT) {
        ham_page_t *newroot;
        btree_node_t *node;

        /*
         * the root-page will be changed...
         */
        st=ham_log_add_page_before(root);
        if (st) {
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }

        /*
         * allocate a new root page
         */
        st=db_alloc_page(&newroot, db, PAGE_TYPE_B_ROOT, 0);
        ham_assert(st ? newroot == NULL : 1, (0));
        if (st) {
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }
        ham_assert(page_get_owner(newroot), (""));
        /* clear the node header */
        memset(page_get_payload(newroot), 0, sizeof(btree_node_t));

        stats_page_is_nuked(db, root, HAM_TRUE);

        /*
         * insert the pivot element and the ptr_left
         */
        node=ham_page_get_btree_node(newroot);
        btree_node_set_ptr_left(node, btree_get_rootpage(be));
        st=__insert_nosplit(newroot, &scratchpad.key,
                scratchpad.rid, scratchpad.record, scratchpad.cursor,
                hints);
        ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
        scratchpad.cursor=0; /* don't overwrite cursor if __insert_nosplit
                                is called again */
        if (st) {
            ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }

        /*
         * set the new root page
         *
         * !!
         * do NOT delete the old root page - it's still in use!
         *
         * also don't forget to flush the backend - otherwise the header
         * page of the database will not contain the updated information.
         * The backend is flushed when the database is closed, but if
         * recovery is enabled then the flush here is critical.
         */
        btree_set_rootpage(be, page_get_self(newroot));
        be_set_dirty(be, HAM_TRUE);
        be->_fun_flush(be);

        /*
         * as we re-purpose a page, we also reset its page counter to
         * signal its first use as the new type assigned here
         */
        if (env_get_cache(env)
                && (page_get_type(root)!=PAGE_TYPE_B_INDEX))
            cache_update_page_access_counter(root, env_get_cache(env), 0);

        page_set_type(root, PAGE_TYPE_B_INDEX);
        page_set_dirty(root, env);
        page_set_dirty(newroot, env);

        /* the root page was modified (btree_set_rootpage) - make sure
         * that it's logged */
        if (env_get_rt_flags(env)&HAM_ENABLE_RECOVERY) {
            st=txn_add_page(env_get_txn(env), env_get_header_page(env),
                    HAM_TRUE);
            if (st) {
                if (scratchpad.key.data)
                    allocator_free(env_get_allocator(env),
                            scratchpad.key.data);
                return (st);
            }
        }
    }

    /*
     * release the scratchpad-memory and return to caller
     */
    ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad.key.data)
        allocator_free(env_get_allocator(env), scratchpad.key.data);

    return (st);
}
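/*
 * Editorial worked example for __insert_split() below: assume a page
 * with count==7 keys k0..k6 and a middle split (pivot==count/2==3).
 * If the page is a leaf, k0..k2 stay, k3..k6 (count-pivot==4 keys) move
 * to the new page, and a copy of k3 is also propagated to the parent.
 * If it is an internal node, k3 becomes the ptr_left of the new page,
 * so only k4..k6 (count-pivot-1==3 keys) are copied. For sequential
 * inserts, pivot==count-2 keeps the old page almost full and moves only
 * the tail of the page.
 */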
static ham_status_t
__insert_split(ham_page_t *page, ham_key_t *key,
        ham_offset_t rid, insert_scratchpad_t *scratchpad,
        insert_hints_t *hints)
{
    int cmp;
    ham_status_t st;
    ham_page_t *newpage, *oldsib;
    int_key_t *nbte, *obte;
    btree_node_t *nbtp, *obtp, *sbtp;
    ham_size_t count, keysize;
    ham_db_t *db=page_get_owner(page);
    ham_env_t *env = db_get_env(db);
    ham_key_t pivotkey, oldkey;
    ham_offset_t pivotrid;
    ham_u16_t pivot;
    ham_bool_t pivot_at_end=HAM_FALSE;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page))
            == db_get_env(page_get_owner(page)), (0));

    ham_assert(hints->force_append == HAM_FALSE, (0));

    keysize=db_get_keysize(db);

    /*
     * allocate a new page
     */
    hints->cost++;
    st=db_alloc_page(&newpage, db, PAGE_TYPE_B_INDEX, 0);
    ham_assert(st ? newpage == NULL : 1, (0));
    ham_assert(!st ? newpage != NULL : 1, (0));
    if (st)
        return st;
    ham_assert(page_get_owner(newpage), (""));
    /* clear the node header */
    memset(page_get_payload(newpage), 0, sizeof(btree_node_t));

    stats_page_is_nuked(db, page, HAM_TRUE);

    /*
     * move half of the key/rid-tuples to the new page
     *
     * !! recno: keys are sorted; we do a "lazy split"
     */
    nbtp=ham_page_get_btree_node(newpage);
    nbte=btree_node_get_key(db, nbtp, 0);
    obtp=ham_page_get_btree_node(page);
    obte=btree_node_get_key(db, obtp, 0);
    count=btree_node_get_count(obtp);

    /*
     * for databases with sequential access (this includes recno
     * databases): do not split in the middle, but at the very end of
     * the page
     *
     * if this page is the right-most page in the index, and the new key
     * is inserted at the very end, then we select the same pivot as for
     * sequential access
     */
    if (db_get_data_access_mode(db)&HAM_DAM_SEQUENTIAL_INSERT)
        pivot_at_end=HAM_TRUE;
    else if (btree_node_get_right(obtp)==0) {
        cmp=key_compare_pub_to_int(db, page, key,
                btree_node_get_count(obtp)-1);
        if (cmp>0)
            pivot_at_end=HAM_TRUE;
    }

    /*
     * internal pages set the count of the new page to count-pivot-1
     * (because the pivot element will become ptr_left of the new page).
     * by using pivot=count-2 we make sure that at least 1 element will
     * remain in the new node.
     */
    if (pivot_at_end) {
        pivot=count-2;
    }
    else {
        pivot=count/2;
    }

    /*
     * uncouple all cursors
     */
    st=bt_uncouple_all_cursors(page, pivot);
    if (st)
        return (st);
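    /*
     * Editorial note: "coupled" cursors point directly at a key slot in
     * this page. The memcpy below moves half of the slots to a different
     * page, which would leave such cursors dangling; uncoupling makes
     * each cursor keep a private copy of its key so it can re-couple to
     * the correct page afterwards.
     */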
    /*
     * if we split a leaf, we'll insert the pivot element in the leaf
     * page, too. in internal nodes, we don't insert it, but propagate
     * it to the parent node only.
     */
    if (btree_node_is_leaf(obtp)) {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()
                    +keysize)*(count-pivot));
        memcpy((char *)nbte,
                ((char *)obte)+(db_get_int_key_header_size()+keysize)*pivot,
                (db_get_int_key_header_size()+keysize)*(count-pivot));
    }
    else {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()
                    +keysize)*(count-pivot-1));
        memcpy((char *)nbte,
                ((char *)obte)
                    +(db_get_int_key_header_size()+keysize)*(pivot+1),
                (db_get_int_key_header_size()+keysize)*(count-pivot-1));
    }

    /*
     * store the pivot element; we'll need it later to propagate it
     * to the parent page
     */
    nbte=btree_node_get_key(db, obtp, pivot);
    memset(&pivotkey, 0, sizeof(pivotkey));
    memset(&oldkey, 0, sizeof(oldkey));
    oldkey.data=key_get_key(nbte);
    oldkey.size=key_get_size(nbte);
    oldkey._flags=key_get_flags(nbte);
    st = util_copy_key(db, &oldkey, &pivotkey);
    if (st) {
        (void)db_free_page(newpage, DB_MOVE_TO_FREELIST);
        goto fail_dramatically;
    }
    pivotrid=page_get_self(newpage);

    /*
     * adjust the page count
     */
    if (btree_node_is_leaf(obtp)) {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot);
    }
    else {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot-1);
    }

    /*
     * if we're in an internal page: fix the ptr_left of the new page
     * (it points to the ptr of the pivot key)
     */
    if (!btree_node_is_leaf(obtp)) {
        /*
         * nbte still contains the pivot key
         */
        btree_node_set_ptr_left(nbtp, key_get_ptr(nbte));
    }

    /*
     * insert the new element
     */
    hints->cost++;
    cmp=key_compare_pub_to_int(db, page, key, pivot);
    /* values below -1 are error codes, not comparison results */
    if (cmp < -1) {
        st = (ham_status_t)cmp;
        goto fail_dramatically;
    }

    if (cmp>=0)
        st=__insert_nosplit(newpage, key, rid,
                scratchpad->record, scratchpad->cursor, hints);
    else
        st=__insert_nosplit(page, key, rid,
                scratchpad->record, scratchpad->cursor, hints);
    if (st) {
        goto fail_dramatically;
    }
    scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                             is called again */

    /*
     * fix the double-linked list of pages, and mark the pages as dirty
     */
    if (btree_node_get_right(obtp)) {
        st=db_fetch_page(&oldsib, db, btree_node_get_right(obtp), 0);
        if (st)
            goto fail_dramatically;
    }
    else {
        oldsib=0;
    }

    if (oldsib) {
        st=ham_log_add_page_before(oldsib);
        if (st)
            goto fail_dramatically;
    }

    btree_node_set_left (nbtp, page_get_self(page));
    btree_node_set_right(nbtp, btree_node_get_right(obtp));
    btree_node_set_right(obtp, page_get_self(newpage));
    if (oldsib) {
        sbtp=ham_page_get_btree_node(oldsib);
        btree_node_set_left(sbtp, page_get_self(newpage));
        page_set_dirty(oldsib, env);
    }
    page_set_dirty(newpage, env);
    page_set_dirty(page, env);

    /*
     * propagate the pivot key to the parent page
     */
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad->key.data)
        allocator_free(env_get_allocator(env), scratchpad->key.data);
    scratchpad->key=pivotkey;
    scratchpad->rid=pivotrid;
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));

    return (SPLIT);

fail_dramatically:
    ham_assert(!(pivotkey.flags & HAM_KEY_USER_ALLOC), (0));
    if (pivotkey.data)
        allocator_free(env_get_allocator(env), pivotkey.data);
    return st;
}
/**
 * create and initialize a new backend
 *
 * @remark this function is called after the @a ham_db_t structure
 * and the file were created
 *
 * the @a flags are stored in the database; only transfer
 * the persistent flags!
 *
 * @note This is a B+-tree 'backend' method.
 */
static ham_status_t
my_fun_create(ham_btree_t *be, ham_u16_t keysize, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *root;
    ham_size_t maxkeys;
    ham_db_t *db=be_get_db(be);
    db_indexdata_t *indexdata=env_get_indexdata_ptr(db_get_env(db),
                        db_get_indexdata_offset(db));

    if (be_is_active(be)) {
        ham_trace(("backend has already been initialized before!"));
        /* not quite HAM_INTERNAL_ERROR - custom backends must be
         * kept in mind */
        return HAM_ALREADY_INITIALIZED;
    }

    /*
     * prevent overflow - maxkeys only has 16 bit!
     */
    maxkeys=btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (maxkeys>MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    else if (maxkeys==0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    /*
     * allocate a new root page
     */
    st=db_alloc_page(&root, db, PAGE_TYPE_B_ROOT, PAGE_IGNORE_FREELIST);
    ham_assert(st ? root == NULL : 1, (0));
    ham_assert(!st ? root != NULL : 1, (0));
    if (!root)
        return st ? st : HAM_INTERNAL_ERROR;

    memset(page_get_raw_payload(root), 0,
            sizeof(btree_node_t)+sizeof(ham_perm_page_union_t));

    /*
     * store the maximum number of keys for this page; this number
     * must be even
     */
    btree_set_maxkeys(be, (ham_u16_t)maxkeys);
    be_set_dirty(be, HAM_TRUE);
    be_set_keysize(be, keysize);
    be_set_flags(be, flags);

    btree_set_rootpage(be, page_get_self(root));

    index_clear_reserved(indexdata);
    index_set_max_keys(indexdata, (ham_u16_t)maxkeys);
    index_set_keysize(indexdata, keysize);
    index_set_self(indexdata, page_get_self(root));
    index_set_flags(indexdata, flags);
    index_set_recno(indexdata, 0);

    env_set_dirty(db_get_env(db));
    be_set_active(be, HAM_TRUE);

    return (0);
}
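/*
 * Editorial sketch of the overflow guard in my_fun_create() (the figures
 * are assumptions for illustration, not the exact formula used by
 * btree_calc_maxkeys): maxkeys is roughly
 *
 *     pagesize / (per-key header size + keysize)
 *
 * and is stored in a 16-bit field. With, say, an 11-byte key header and
 * keysize==8, a 1 MB page yields roughly 55000 keys and still fits,
 * while a 2 MB page would exceed 65535 and must be rejected.
 */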