/*
 * Dump basic information about an environment to stdout.
 *
 * A scratch database handle is created and opened (db name 0xf001)
 * because the environment accessors require an open database; the
 * handle is closed and deleted again before returning.
 *
 * On any hamsterdb error, error() is invoked with the failing call's
 * name and status code.
 */
static void
print_environment(ham_env_t *env)
{
    ham_status_t status;
    ham_db_t *scratch_db;

    /* we need a temporary database handle */
    status = ham_new(&scratch_db);
    if (status)
        error("ham_new", status);
    status = ham_env_open_db(env, scratch_db, 0xf001, 0, 0);
    if (status)
        error("ham_env_open_db", status);

    printf("environment\n");
    printf(" pagesize: %u\n", env_get_pagesize(env));
    printf(" version: %u.%u.%u.%u\n",
            env_get_version(env, 0),
            env_get_version(env, 1),
            env_get_version(env, 2),
            env_get_version(env, 3));
    printf(" serialno: %u\n", env_get_serialno(env));
    printf(" max databases: %u\n", env_get_max_databases(env));

    /* tear the scratch handle down again */
    status = ham_close(scratch_db, 0);
    if (status)
        error("ham_close", status);
    ham_delete(scratch_db);
}
/**
 * estimate the number of keys per page, given the keysize
 *
 * @remark this function is only available when
 * hamsterdb is compiled with HAM_ENABLE_INTERNAL turned on.
 *
 * @note This is a B+-tree 'backend' method.
 */
static ham_status_t
my_fun_calc_keycount_per_page(ham_btree_t *be, ham_size_t *maxkeys,
        ham_u16_t keysize)
{
    ham_db_t *db = be_get_db(be);

    /* no keysize given: report the backend's configured maximum */
    if (!keysize) {
        *maxkeys = btree_get_maxkeys(be);
        return (0);
    }

    /*
     * prevent overflow - maxkeys only has 16 bit!
     */
    *maxkeys = btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (*maxkeys > MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    if (*maxkeys == 0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    return (0);
}
/*
 * Check whether the cache has outgrown its configured capacity.
 *
 * The current usage is estimated as (number of cached elements) *
 * (environment pagesize); returns HAM_TRUE when that exceeds the
 * cache's capacity, HAM_FALSE otherwise.
 */
ham_bool_t
cache_too_big(ham_cache_t *cache)
{
    ham_size_t used = cache_get_cur_elements(cache)
                        * env_get_pagesize(cache_get_env(cache));

    return (used > cache_get_capacity(cache) ? HAM_TRUE : HAM_FALSE);
}
/**
 * write a series of data chunks to storage at file offset 'addr'.
 *
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip. 'chunk_data'/'chunk_size'
 * are arrays of 'chunks' entries; both arrays are consumed in place
 * (the pointers are advanced and the sizes decremented as data is
 * written).
 *
 * Writing is performed on a per-page basis, where special conditions
 * will decide whether or not the write operation is performed
 * through the page cache or directly to device; such is determined
 * on a per-page basis.
 *
 * @param allocated        the caller freshly allocated this storage area
 * @param freshly_created  the underlying pages never existed before
 *                         (implies 'allocated' - see the assert below)
 *
 * Returns 0 on success or a ham_status_t error code.
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr,
        ham_bool_t allocated, ham_bool_t freshly_created,
        ham_u8_t **chunk_data, ham_size_t *chunk_size, ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
    /* loop-invariant: cache the pagesize once */
    ham_size_t pagesize = env_get_pagesize(env);

    /* 'freshly created' pages must also have been 'allocated' */
    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * get the page-ID from this chunk
             */
            pageid = addr - (addr % pagesize);

            /*
             * is this the current page? if not, drop the stale pointer
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the 'edges' of
                 * the blob, as they MAY be accessed for different data.
                 * Of course, when a blob is small, there's only one (partial)
                 * page accessed anyhow, so that one should end up in cache
                 * then.
                 *
                 * When transaction logging is turned on, it's the same story,
                 * really. We _could_ keep all those pages in cache now,
                 * but this would be thrashing the cache with blob data that's
                 * accessed once only and for transaction abort (or commit)
                 * the amount of effort does not change.
                 *
                 * THOUGHT:
                 *
                 * Do we actually care what was in that page, which is going
                 * to be overwritten in its entirety, BEFORE we do this, i.e.
                 * before the transaction?
                 *
                 * Answer: NO (and YES in special circumstances).
                 *
                 * Elaboration: As this would have been free space before, the
                 * actual content does not matter, so it's not required to add
                 * the FULL pages written by the blob write action here to the
                 * transaction log: even on transaction abort, that lingering
                 * data is marked as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer running transaction, where this
                 * page was freed during a previous action WITHIN
                 * the transaction, well, than the transaction log should
                 * already carry this page's previous content as instructed
                 * by the erase operation. HOWEVER, the erase operation would
                 * not have a particular NEED to edit this page, as an erase op
                 * is complete by just marking this space as free in the
                 * freelist, resulting in the freelist pages (and the btree
                 * pages) being the only ones being edited and ending up in
                 * the transaction log then.
                 *
                 * Which means we'll have to log the previous content of these
                 * pages to the transaction log anyhow. UNLESS, that is, when
                 * WE allocated these pages in the first place: then there
                 * cannot be any 'pre-transaction' state of these pages
                 * except that of 'not existing', i.e. 'free'. In which case,
                 * their actual content doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but it's
                 * not about an active transaction here?
                 * In that case, the recovery log would only log the OLD page
                 * content, which we've concluded is insignificant, ever. Of
                 * course, that's assuming (again!) that we're writing to
                 * freshly created pages, which no-one has seen before.
                 *
                 * Just as long as we can prevent this section from thrashing
                 * the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                                        || (addr % pagesize) != 0
                                        || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge
                                        && (!env_get_log(env)
                                            || freshly_created));
                //ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));

                st=env_fetch_page(&page, env, pageid,
                        cacheonly ? DB_ONLY_FROM_CACHE
                                : at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
                /* on error the fetch must not hand back a page */
                ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page)
                {
                    page_set_npers_flags(page,
                        page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly
                            && allocated
                            && addr==page_get_self(page)
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page, txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
                /* copy only up to the end of this page */
                ham_size_t writestart=
                        (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                        (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
                /* log the pre-image before modifying the page */
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart], chunk_data[i],
                            writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);

                /* direct device writes bypass the log; only legal for
                 * freshly created pages when logging is active */
                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
/** * Allocate space in storage for and write the content references by 'data' * (and length 'size') to storage. * * Conditions will apply whether the data is written through cache or direct * to device. * * The content is, of course, prefixed by a BLOB header. * * Partial writes are handled in this function. */ ham_status_t blob_allocate(ham_env_t *env, ham_db_t *db, ham_record_t *record, ham_u32_t flags, ham_offset_t *blobid) { ham_status_t st; ham_page_t *page=0; ham_offset_t addr; blob_t hdr; ham_u8_t *chunk_data[2]; ham_size_t alloc_size; ham_size_t chunk_size[2]; ham_device_t *device=env_get_device(env); ham_bool_t freshly_created = HAM_FALSE; *blobid=0; /* * PARTIAL WRITE * * if offset+partial_size equals the full record size, then we won't * have any gaps. In this case we just write the full record and ignore * the partial parameters. */ if (flags&HAM_PARTIAL) { if (record->partial_offset==0 && record->partial_offset+record->partial_size==record->size) flags&=~HAM_PARTIAL; } /* * in-memory-database: the blobid is actually a pointer to the memory * buffer, in which the blob (with the blob-header) is stored */ if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) { blob_t *hdr; ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env), record->size+sizeof(blob_t)); if (!p) { return HAM_OUT_OF_MEMORY; } /* initialize the header */ hdr=(blob_t *)p; memset(hdr, 0, sizeof(*hdr)); blob_set_self(hdr, (ham_offset_t)PTR_TO_U64(p)); blob_set_alloc_size(hdr, record->size+sizeof(blob_t)); blob_set_size(hdr, record->size); /* do we have gaps? 
if yes, fill them with zeroes */ if (flags&HAM_PARTIAL) { ham_u8_t *s=p+sizeof(blob_t); if (record->partial_offset) memset(s, 0, record->partial_offset); memcpy(s+record->partial_offset, record->data, record->partial_size); if (record->partial_offset+record->partial_size<record->size) memset(s+record->partial_offset+record->partial_size, 0, record->size-(record->partial_offset+record->partial_size)); } else { memcpy(p+sizeof(blob_t), record->data, record->size); } *blobid=(ham_offset_t)PTR_TO_U64(p); return (0); } memset(&hdr, 0, sizeof(hdr)); /* * blobs are CHUNKSIZE-allocated */ alloc_size=sizeof(blob_t)+record->size; alloc_size += DB_CHUNKSIZE - 1; alloc_size -= alloc_size % DB_CHUNKSIZE; /* * check if we have space in the freelist */ st = freel_alloc_area(&addr, env, db, alloc_size); if (!addr) { if (st) return st; /* * if the blob is small AND if logging is disabled: load the page * through the cache */ if (__blob_from_cache(env, alloc_size)) { st = db_alloc_page(&page, db, PAGE_TYPE_BLOB, PAGE_IGNORE_FREELIST); ham_assert(st ? page == NULL : 1, (0)); ham_assert(!st ? 
page != NULL : 1, (0)); if (st) return st; /* blob pages don't have a page header */ page_set_npers_flags(page, page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER); addr=page_get_self(page); /* move the remaining space to the freelist */ (void)freel_mark_free(env, db, addr+alloc_size, env_get_pagesize(env)-alloc_size, HAM_FALSE); blob_set_alloc_size(&hdr, alloc_size); } else { /* * otherwise use direct IO to allocate the space */ ham_size_t aligned=alloc_size; aligned += env_get_pagesize(env) - 1; aligned -= aligned % env_get_pagesize(env); st=device->alloc(device, aligned, &addr); if (st) return (st); /* if aligned!=size, and the remaining chunk is large enough: * move it to the freelist */ { ham_size_t diff=aligned-alloc_size; if (diff > SMALLEST_CHUNK_SIZE) { (void)freel_mark_free(env, db, addr+alloc_size, diff, HAM_FALSE); blob_set_alloc_size(&hdr, aligned-diff); } else { blob_set_alloc_size(&hdr, aligned); } } freshly_created = HAM_TRUE; } ham_assert(HAM_SUCCESS == freel_check_area_is_allocated(env, db, addr, alloc_size), (0)); } else { ham_assert(!st, (0)); blob_set_alloc_size(&hdr, alloc_size); } blob_set_size(&hdr, record->size); blob_set_self(&hdr, addr); /* * PARTIAL WRITE * * are there gaps at the beginning? If yes, then we'll fill with zeros */ if ((flags&HAM_PARTIAL) && (record->partial_offset)) { ham_u8_t *ptr; ham_size_t gapsize=record->partial_offset; ptr=allocator_calloc(env_get_allocator(env), gapsize > env_get_pagesize(env) ? 
env_get_pagesize(env) : gapsize); if (!ptr) return (HAM_OUT_OF_MEMORY); /* * first: write the header */ chunk_data[0]=(ham_u8_t *)&hdr; chunk_size[0]=sizeof(hdr); st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); if (st) return (st); addr+=sizeof(hdr); /* now fill the gap; if the gap is bigger than a pagesize we'll * split the gap into smaller chunks */ while (gapsize>=env_get_pagesize(env)) { chunk_data[0]=ptr; chunk_size[0]=env_get_pagesize(env); st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); if (st) break; gapsize-=env_get_pagesize(env); addr+=env_get_pagesize(env); } /* fill the remaining gap */ if (gapsize) { chunk_data[0]=ptr; chunk_size[0]=gapsize; st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); if (st) return (st); addr+=gapsize; } allocator_free(env_get_allocator(env), ptr); /* now write the "real" data */ chunk_data[0]=(ham_u8_t *)record->data; chunk_size[0]=record->partial_size; st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); if (st) return (st); addr+=record->partial_size; } else { /* * not writing partially: write header and data, then we're done */ chunk_data[0]=(ham_u8_t *)&hdr; chunk_size[0]=sizeof(hdr); chunk_data[1]=(ham_u8_t *)record->data; chunk_size[1]=(flags&HAM_PARTIAL) ? record->partial_size : record->size; st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 2); if (st) return (st); addr+=sizeof(hdr)+ ((flags&HAM_PARTIAL) ? record->partial_size : record->size); } /* * store the blobid; it will be returned to the caller */ *blobid=blob_get_self(&hdr); /* * PARTIAL WRITES: * * if we have gaps at the end of the blob: just append more chunks to * fill these gaps. Since they can be pretty large we split them into * smaller chunks if necessary. 
*/ if (flags&HAM_PARTIAL) { if (record->partial_offset+record->partial_size < record->size) { ham_u8_t *ptr; ham_size_t gapsize=record->size - (record->partial_offset+record->partial_size); /* now fill the gap; if the gap is bigger than a pagesize we'll * split the gap into smaller chunks * * we split this loop in two - the outer loop will allocate the * memory buffer, thus saving some allocations */ while (gapsize>env_get_pagesize(env)) { ham_u8_t *ptr=allocator_calloc(env_get_allocator(env), env_get_pagesize(env)); if (!ptr) return (HAM_OUT_OF_MEMORY); while (gapsize>env_get_pagesize(env)) { chunk_data[0]=ptr; chunk_size[0]=env_get_pagesize(env); st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); if (st) break; gapsize-=env_get_pagesize(env); addr+=env_get_pagesize(env); } allocator_free(env_get_allocator(env), ptr); if (st) return (st); } /* now write the remainder, which is less than a pagesize */ ham_assert(gapsize<env_get_pagesize(env), ("")); chunk_size[0]=gapsize; ptr=chunk_data[0]=allocator_calloc(env_get_allocator(env), gapsize); if (!ptr) return (HAM_OUT_OF_MEMORY); st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, chunk_data, chunk_size, 1); allocator_free(env_get_allocator(env), ptr); if (st) return (st); } } return (0); }
static ham_status_t __read_chunk(ham_env_t *env, ham_page_t *page, ham_page_t **fpage, ham_offset_t addr, ham_u8_t *data, ham_size_t size) { ham_status_t st; ham_device_t *device=env_get_device(env); while (size) { /* * get the page-ID from this chunk */ ham_offset_t pageid; pageid = addr - (addr % env_get_pagesize(env)); if (page) { if (page_get_self(page)!=pageid) page=0; } /* * is it the current page? if not, try to fetch the page from * the cache - but only read the page from disk, if the * chunk is small */ if (!page) { st=env_fetch_page(&page, env, pageid, __blob_from_cache(env, size) ? 0 : DB_ONLY_FROM_CACHE); ham_assert(st ? !page : 1, (0)); /* blob pages don't have a page header */ if (page) page_set_npers_flags(page, page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER); else if (st) return st; } /* * if we have a page pointer: use it; otherwise read directly * from the device */ if (page) { ham_size_t readstart= (ham_size_t)(addr-page_get_self(page)); ham_size_t readsize = (ham_size_t)(env_get_pagesize(env)-readstart); if (readsize>size) readsize=size; memcpy(data, &page_get_raw_payload(page)[readstart], readsize); addr+=readsize; data+=readsize; size-=readsize; } else { ham_size_t s=(size<env_get_pagesize(env) ? size : env_get_pagesize(env)); /* limit to the next page boundary */ if (s>pageid+env_get_pagesize(env)-addr) s=(ham_size_t)(pageid+env_get_pagesize(env)-addr); st=device->read(device, addr, data, s); if (st) return st; addr+=s; data+=s; size-=s; } } if (fpage) *fpage=page; return (0); }
/*
 * Abort a transaction: append an abort record to the log (unless the
 * txn is read-only), detach the txn from the environment and undo all
 * page modifications tracked in the txn's page list.
 *
 * @param txn   the transaction to abort; must have no attached cursors
 * @param flags DO_NOT_NUKE_PAGE_STATS suppresses the statistics reset
 *              described below
 *
 * Returns 0 on success, HAM_CURSOR_STILL_OPEN if cursors are still
 * attached, or an error from the log subsystem.
 */
ham_status_t
txn_abort(ham_txn_t *txn, ham_u32_t flags)
{
    ham_status_t st;
    ham_env_t *env=txn_get_env(txn);

    /*
     * are cursors attached to this txn? if yes, fail
     */
    if (txn_get_cursor_refcount(txn)) {
        ham_trace(("transaction cannot be aborted till all attached "
                    "cursors are closed"));
        return HAM_CURSOR_STILL_OPEN;
    }

    /* write the abort record, but only when logging is active and the
     * txn could actually have modified something */
    if (env_get_log(env) && !(txn_get_flags(txn)&HAM_TXN_READ_ONLY)) {
        st=ham_log_append_txn_abort(env_get_log(env), txn);
        if (st)
            return st;
    }

    /* detach the txn from the environment */
    env_set_txn(env, 0);

    /*
     * undo all operations from this transaction
     *
     * this includes allocated pages (they're moved to the freelist),
     * deleted pages (they're un-deleted) and other modifications (will
     * re-create the original page from the logfile)
     *
     * keep txn_get_pagelist(txn) intact during every round, so no
     * local var for this one.
     */
    while (txn_get_pagelist(txn)) {
        ham_page_t *head = txn_get_pagelist(txn);

        if (!(flags & DO_NOT_NUKE_PAGE_STATS)) {
            /*
             * nuke critical statistics, such as tracked outer bounds; imagine,
             * for example, a failing erase transaction which, through erasing
             * the top-most key, lowers the actual upper bound, after which
             * the transaction fails at some later point in life. Now if we
             * wouldn't 'rewind' our bounds-statistics, we would have a
             * situation where a subsequent out-of-bounds insert (~ append)
             * would possibly FAIL due to the hinter using incorrect bounds
             * information then!
             *
             * Hence we 'reverse' our statistics here and the easiest route
             * is to just nuke the critical bits; subsequent find/insert/erase
             * operations will ensure that the stats will get updated again,
             * anyhow. All we loose then is a few subsequent operations, which
             * might have been hinted if we had played a smarter game of
             * statistics 'reversal'. Soit.
             */
            ham_db_t *db = page_get_owner(head);

            /*
             * only need to do this for index pages anyhow, and those are the
             * ones which have their 'ownership' set.
             */
            if (db) {
                stats_page_is_nuked(db, head, HAM_FALSE);
            }
        }

        /* unlink the head page from the txn's page list */
        ham_assert(page_is_in_list(txn_get_pagelist(txn), head,
                PAGE_LIST_TXN), (0));
        txn_get_pagelist(txn) = page_list_remove(head, PAGE_LIST_TXN, head);

        /* if this page was allocated by this transaction, then we can
         * move the whole page to the freelist */
        if (page_get_alloc_txn_id(head)==txn_get_id(txn)) {
            (void)freel_mark_free(env, 0, page_get_self(head),
                    env_get_pagesize(env), HAM_TRUE);
        }
        else {
            /* remove the 'delete pending' flag */
            page_set_npers_flags(head,
                    page_get_npers_flags(head)&~PAGE_NPERS_DELETE_PENDING);

            /* if the page is dirty, and RECOVERY is enabled: recreate
             * the original, unmodified page from the log */
            if (env_get_log(env) && page_is_dirty(head)) {
                st=ham_log_recreate(env_get_log(env), head);
                if (st)
                    return (st);
                /*page_set_undirty(head); */
            }
        }

        /* page is no longer in use */
        page_release_ref(head);
    }

    /* the page list must be fully drained at this point */
    ham_assert(txn_get_pagelist(txn)==0, (0));

    return (0);
}
/**
 * create and initialize a new backend
 *
 * @remark this function is called after the @a ham_db_t structure
 * and the file were created
 *
 * the @a flags are stored in the database; only transfer
 * the persistent flags!
 *
 * @note This is a B+-tree 'backend' method.
 */
static ham_status_t
my_fun_create(ham_btree_t *be, ham_u16_t keysize, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *root;
    ham_size_t maxkeys;
    ham_db_t *db=be_get_db(be);
    db_indexdata_t *indexdata=env_get_indexdata_ptr(db_get_env(db),
                        db_get_indexdata_offset(db));

    if (be_is_active(be)) {
        ham_trace(("backend has alread been initialized before!"));
        /* HAM_INTERNAL_ERROR -- not really, when keeping custom
         * backends in mind */
        return HAM_ALREADY_INITIALIZED;
    }

    /*
     * prevent overflow - maxkeys only has 16 bit!
     */
    maxkeys=btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (maxkeys>MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    else if (maxkeys==0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    /*
     * allocate a new root page
     */
    st=db_alloc_page(&root, db, PAGE_TYPE_B_ROOT, PAGE_IGNORE_FREELIST);
    ham_assert(st ? root == NULL : 1, (0));
    ham_assert(!st ? root != NULL : 1, (0));
    if (!root)
        return st ? st : HAM_INTERNAL_ERROR;

    /* zero out the root node's header area
     * NOTE(review): only node + union headers are cleared, not the whole
     * page payload - presumably the rest need not be zero; confirm */
    memset(page_get_raw_payload(root), 0,
            sizeof(btree_node_t)+sizeof(ham_perm_page_union_t));

    /*
     * calculate the maximum number of keys for this page,
     * and make sure that this number is even
     */
    btree_set_maxkeys(be, (ham_u16_t)maxkeys);
    be_set_dirty(be, HAM_TRUE);
    be_set_keysize(be, keysize);
    be_set_flags(be, flags);

    btree_set_rootpage(be, page_get_self(root));

    /* persist the index settings
     * BUGFIX: index_clear_reserved() was called twice (before and after
     * the setters); a single call is sufficient */
    index_clear_reserved(indexdata);
    index_set_max_keys(indexdata, (ham_u16_t)maxkeys);
    index_set_keysize(indexdata, keysize);
    index_set_self(indexdata, page_get_self(root));
    index_set_flags(indexdata, flags);
    index_set_recno(indexdata, 0);

    env_set_dirty(db_get_env(db));

    be_set_active(be, HAM_TRUE);

    return (0);
}