/**
 * retrieve the duplicate table stored in the blob @a table_id.
 *
 * On success @a *table_ref points to the table. Ownership depends on the
 * path taken:
 *  - in-memory databases: the pointer aims directly into the heap blob;
 *    @a *page stays 0 and the caller must NOT free the table
 *  - table fully contained in one page: the pointer aims into the page
 *    payload, @a *page receives that page; caller must NOT free the table
 *  - table split across pages: the table is copied into memory from the
 *    environment's allocator, @a *page stays 0 and the CALLER OWNS the
 *    returned table and must free it with allocator_free()
 */
static ham_status_t
__get_duplicate_table(dupe_table_t **table_ref, ham_page_t **page,
                ham_env_t *env, ham_u64_t table_id)
{
    ham_status_t st;
    blob_t hdr;
    ham_page_t *hdrpage=0;
    dupe_table_t *table;

    *page = 0;

    /* in-memory databases keep the blob in heap memory; the duplicate
     * table immediately follows the blob header */
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        ham_u8_t *p=(ham_u8_t *)U64_TO_PTR(table_id);
        *table_ref = (dupe_table_t *)(p+sizeof(hdr));
        return HAM_SUCCESS;
    }

    *table_ref = 0;

    /*
     * load the blob header
     */
    st=__read_chunk(env, 0, &hdrpage, table_id, (ham_u8_t *)&hdr,
            sizeof(hdr));
    if (st)
        return st;

    /*
     * if the whole table is in a page (and not split between several
     * pages), just return a pointer directly in the page
     */
    if (page_get_self(hdrpage)+env_get_usable_pagesize(env)
            >= table_id+blob_get_size(&hdr)) {
        ham_u8_t *p=page_get_raw_payload(hdrpage);
        /* yes, table is in the page */
        *page=hdrpage;
        *table_ref = (dupe_table_t *)
                &p[table_id-page_get_self(hdrpage)+sizeof(hdr)];
        return HAM_SUCCESS;
    }

    /*
     * otherwise allocate memory for the table
     */
    table=allocator_alloc(env_get_allocator(env),
            (ham_size_t)blob_get_size(&hdr));
    if (!table)
        return HAM_OUT_OF_MEMORY;

    /*
     * then read the rest of the blob
     */
    st=__read_chunk(env, hdrpage, 0, table_id+sizeof(hdr),
            (ham_u8_t *)table, (ham_size_t)blob_get_size(&hdr));
    if (st) {
        /* BUGFIX: the original code returned here without releasing
         * the freshly allocated table - a memory leak on the error path */
        allocator_free(env_get_allocator(env), table);
        return st;
    }

    *table_ref = table;
    return HAM_SUCCESS;
}
/**
 * write a series of data chunks to storage at file offset 'addr'.
 *
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip.
 *
 * Writing is performed on a per-page basis, where special conditions
 * will decide whether or not the write operation is performed
 * through the page cache or directly to device; such is determined
 * on a per-page basis.
 *
 * @param allocated        set when the caller freshly allocated the
 *                         target space for this blob
 * @param freshly_created  set when the pages were newly created by the
 *                         caller (implies @a allocated - see the assert)
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr,
        ham_bool_t allocated, ham_bool_t freshly_created,
        ham_u8_t **chunk_data, ham_size_t *chunk_size, ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
    ham_size_t pagesize = env_get_pagesize(env);

    /* freshly_created pages must always have been allocated by us */
    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * get the page-ID from this chunk (i.e. round the write
             * address down to the page boundary)
             */
            pageid = addr - (addr % pagesize);

            /*
             * is this the current page? if not, drop the stale pointer
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the
                 * 'edges' of the blob, as they MAY be accessed for
                 * different data. Of course, when a blob is small,
                 * there's only one (partial) page accessed anyhow, so
                 * that one should end up in cache then.
                 *
                 * When transaction logging is turned on, it's the same
                 * story, really. We _could_ keep all those pages in
                 * cache now, but this would be thrashing the cache with
                 * blob data that's accessed once only and for
                 * transaction abort (or commit) the amount of effort
                 * does not change.
                 *
                 * THOUGHT: do we actually care what was in a page which
                 * is going to be overwritten in its entirety BEFORE the
                 * transaction? Answer: NO (and YES in special
                 * circumstances).
                 *
                 * Elaboration: as this would have been free space
                 * before, the actual content does not matter, so it's
                 * not required to add the FULL pages written by the
                 * blob write action here to the transaction log: even
                 * on transaction abort, that lingering data is marked
                 * as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer running transaction,
                 * where this page was freed during a previous action
                 * WITHIN the transaction, the transaction log should
                 * already carry this page's previous content as
                 * instructed by the erase operation. HOWEVER, the erase
                 * operation would not have a particular NEED to edit
                 * this page, as an erase op is complete by just marking
                 * this space as free in the freelist, resulting in the
                 * freelist pages (and the btree pages) being the only
                 * ones being edited and ending up in the transaction
                 * log then.
                 *
                 * Which means we'll have to log the previous content of
                 * these pages to the transaction log anyhow. UNLESS,
                 * that is, when WE allocated these pages in the first
                 * place: then there cannot be any 'pre-transaction'
                 * state of these pages except that of 'not existing',
                 * i.e. 'free'. In which case, their actual content
                 * doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but
                 * it's not about an active transaction here? In that
                 * case, the recovery log would only log the OLD page
                 * content, which we've concluded is insignificant,
                 * ever. Of course, that's assuming (again!) that we're
                 * writing to freshly created pages, which no-one has
                 * seen before.
                 *
                 * Just as long as we can prevent this section from
                 * thrashing the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                        || (addr % pagesize) != 0
                        || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge
                        && (!env_get_log(env)
                            || freshly_created));
                //ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));

                st=env_fetch_page(&page, env, pageid,
                        cacheonly ? DB_ONLY_FROM_CACHE :
                        at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
                ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page) {
                    page_set_npers_flags(page,
                            page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly
                            && allocated
                            && addr==page_get_self(page)
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page,
                                txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
                /* copy at most up to the end of the page, and at most
                 * the remainder of this chunk */
                ham_size_t writestart=
                        (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                        (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
                /* log the page's pre-image before modifying it */
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart],
                        chunk_data[i], writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);

                /* a direct device write with logging enabled is only
                 * legal for freshly created pages (no pre-image) */
                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
static ham_status_t __read_chunk(ham_env_t *env, ham_page_t *page, ham_page_t **fpage, ham_offset_t addr, ham_u8_t *data, ham_size_t size) { ham_status_t st; ham_device_t *device=env_get_device(env); while (size) { /* * get the page-ID from this chunk */ ham_offset_t pageid; pageid = addr - (addr % env_get_pagesize(env)); if (page) { if (page_get_self(page)!=pageid) page=0; } /* * is it the current page? if not, try to fetch the page from * the cache - but only read the page from disk, if the * chunk is small */ if (!page) { st=env_fetch_page(&page, env, pageid, __blob_from_cache(env, size) ? 0 : DB_ONLY_FROM_CACHE); ham_assert(st ? !page : 1, (0)); /* blob pages don't have a page header */ if (page) page_set_npers_flags(page, page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER); else if (st) return st; } /* * if we have a page pointer: use it; otherwise read directly * from the device */ if (page) { ham_size_t readstart= (ham_size_t)(addr-page_get_self(page)); ham_size_t readsize = (ham_size_t)(env_get_pagesize(env)-readstart); if (readsize>size) readsize=size; memcpy(data, &page_get_raw_payload(page)[readstart], readsize); addr+=readsize; data+=readsize; size-=readsize; } else { ham_size_t s=(size<env_get_pagesize(env) ? size : env_get_pagesize(env)); /* limit to the next page boundary */ if (s>pageid+env_get_pagesize(env)-addr) s=(ham_size_t)(pageid+env_get_pagesize(env)-addr); st=device->read(device, addr, data, s); if (st) return st; addr+=s; data+=s; size-=s; } } if (fpage) *fpage=page; return (0); }
/**
 * create and initialize a new backend
 *
 * @remark this function is called after the @a ham_db_t structure
 * and the file were created
 *
 * the @a flags are stored in the database; only transfer
 * the persistent flags!
 *
 * @note This is a B+-tree 'backend' method.
 */
static ham_status_t
my_fun_create(ham_btree_t *be, ham_u16_t keysize, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *root;
    ham_size_t maxkeys;
    ham_db_t *db=be_get_db(be);
    db_indexdata_t *indexdata=env_get_indexdata_ptr(db_get_env(db),
                        db_get_indexdata_offset(db));

    if (be_is_active(be)) {
        /* BUGFIX: typo "alread" -> "already" in the trace message */
        ham_trace(("backend has already been initialized before!"));
        /* HAM_INTERNAL_ERROR -- not really, when keeping custom
         * backends in mind */
        return HAM_ALREADY_INITIALIZED;
    }

    /*
     * prevent overflow - maxkeys only has 16 bit!
     */
    maxkeys=btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (maxkeys>MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    else if (maxkeys==0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    /*
     * allocate a new root page
     */
    st=db_alloc_page(&root, db, PAGE_TYPE_B_ROOT, PAGE_IGNORE_FREELIST);
    ham_assert(st ? root == NULL : 1, (0));
    ham_assert(!st ? root != NULL : 1, (0));
    if (!root)
        return st ? st : HAM_INTERNAL_ERROR;

    /* zero out the root node header */
    memset(page_get_raw_payload(root), 0,
            sizeof(btree_node_t)+sizeof(ham_perm_page_union_t));

    /*
     * calculate the maximum number of keys for this page,
     * and make sure that this number is even
     */
    btree_set_maxkeys(be, (ham_u16_t)maxkeys);
    be_set_dirty(be, HAM_TRUE);
    be_set_keysize(be, keysize);
    be_set_flags(be, flags);

    btree_set_rootpage(be, page_get_self(root));

    /* persist the index settings
     * (BUGFIX: index_clear_reserved() was called twice here -
     * the redundant second call was removed) */
    index_clear_reserved(indexdata);
    index_set_max_keys(indexdata, (ham_u16_t)maxkeys);
    index_set_keysize(indexdata, keysize);
    index_set_self(indexdata, page_get_self(root));
    index_set_flags(indexdata, flags);
    index_set_recno(indexdata, 0);

    env_set_dirty(db_get_env(db));
    be_set_active(be, HAM_TRUE);

    return (0);
}