/*
 * __wt_ovfl_txnc_add --
 *	Add a new entry to the page's list of transaction-cached overflow
 *	records.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
	WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
	size_t size;
	u_int i, skipdepth;
	uint8_t *p;

	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	head = page->modify->ovfl_track->ovfl_txnc;

	/* Choose a skiplist depth for this insert. */
	skipdepth = __wt_skip_choose_depth(session);

	/*
	 * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
	 * list, room for the address and value, then copy everything into
	 * place.
	 *
	 * To minimize the WT_OVFL_TXNC structure size, the address offset
	 * and size are single bytes: that's safe because the address follows
	 * the structure (which can't be more than about 100B), and address
	 * cookies are limited to 255B.
	 */
	size = sizeof(WT_OVFL_TXNC) +
	    skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
	WT_RET(__wt_calloc(session, 1, size, &txnc));
	p = (uint8_t *)txnc +
	    sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
	txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
	txnc->addr_size = (uint8_t)addr_size;
	memcpy(p, addr, addr_size);
	p += addr_size;
	txnc->value_offset = WT_PTRDIFF32(p, txnc);
	txnc->value_size = WT_STORE_SIZE(value_size);
	memcpy(p, value, value_size);
	txnc->current = __wt_txn_new_id(session);

	__wt_cache_page_inmem_incr(
	    session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

	/* Insert the new entry into the skiplist. */
	__ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
	for (i = 0; i < skipdepth; ++i) {
		txnc->next[i] = *stack[i];
		*stack[i] = txnc;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));

	return (0);
}
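/*
 * Illustrative sketch, not part of the original source: given the packed
 * layout built above (the structure, then the skiplist pointers, then the
 * address and value bytes), a reader recovers the address and value purely
 * from the stored byte offsets, no extra pointers required.  The helper
 * names below are hypothetical.
 */
static inline const uint8_t *
__ovfl_txnc_addr_sketch(WT_OVFL_TXNC *txnc)
{
	/* The address bytes start addr_offset bytes into the allocation. */
	return ((const uint8_t *)txnc + txnc->addr_offset);
}

static inline const void *
__ovfl_txnc_value_sketch(WT_OVFL_TXNC *txnc)
{
	/* The value bytes follow the address, at value_offset. */
	return ((const uint8_t *)txnc + txnc->value_offset);
}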
/*
 * __wt_block_write --
 *	Write a buffer into a block, returning the block's address cookie.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_size, int data_cksum)
{
	off_t offset;
	uint32_t size, cksum;
	uint8_t *endp;

	WT_RET(__wt_block_write_off(
	    session, block, buf, &offset, &size, &cksum, data_cksum, 0));

	endp = addr;
	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
	*addr_size = WT_PTRDIFF32(endp, addr);

	return (0);
}
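/*
 * Hedged sketch, not part of the original source: the address cookie built
 * by __wt_block_addr_to_buffer is a variable-length encoding of the block's
 * offset, size and checksum.  The function below is a simplified
 * illustration of that kind of packing (the real encoding also scales the
 * offset and size by the allocation size to keep the varints small);
 * __sketch_addr_pack is hypothetical, and the __wt_vpack_uint signature is
 * assumed from the integer-packing code of this era.
 */
static int
__sketch_addr_pack(
    uint8_t **pp, off_t offset, uint32_t size, uint32_t cksum)
{
	/* Pack the triple as consecutive variable-length integers. */
	WT_RET(__wt_vpack_uint(pp, 0, (uint64_t)offset));
	WT_RET(__wt_vpack_uint(pp, 0, (uint64_t)size));
	WT_RET(__wt_vpack_uint(pp, 0, (uint64_t)cksum));
	return (0);
}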
/*
 * __wt_block_salvage_next --
 *	Return the next block from the file.
 */
int
__wt_block_salvage_next(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
{
	WT_BLOCK_HEADER *blk;
	WT_FH *fh;
	off_t max, offset;
	uint32_t allocsize, cksum, size;
	uint8_t *endp;

	*eofp = 0;

	offset = block->slvg_off;
	fh = block->fh;
	allocsize = block->allocsize;
	WT_RET(__wt_buf_initsize(session, buf, allocsize));

	/* Read through the file, looking for pages with valid checksums. */
	for (max = fh->file_size;;) {
		if (offset >= max) {			/* Check eof. */
			*eofp = 1;
			return (0);
		}

		/*
		 * Read the start of a possible page (an allocation-size
		 * block), and get a page length from it.
		 */
		WT_RET(__wt_read(session, fh, offset, allocsize, buf->mem));
		blk = WT_BLOCK_HEADER_REF(buf->mem);

		/*
		 * The page can't be more than the min/max page size, or past
		 * the end of the file.
		 */
		size = blk->disk_size;
		cksum = blk->cksum;
		if (size == 0 ||
		    size % allocsize != 0 ||
		    size > WT_BTREE_PAGE_SIZE_MAX ||
		    offset + (off_t)size > max)
			goto skip;

		/*
		 * The page size isn't insane, read the entire page: reading
		 * the page validates the checksum and then decompresses the
		 * page as needed.  If reading the page fails, it's probably
		 * corruption; ignore this block.
		 */
		if (__wt_block_read_off(
		    session, block, buf, offset, size, cksum)) {
skip:			WT_VERBOSE_RET(session, salvage,
			    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
			    allocsize, (uintmax_t)offset);

			/*
			 * Free the block and make sure we don't return it
			 * more than once.
			 */
			WT_RET(__wt_block_off_free(
			    session, block, offset, (off_t)allocsize));
			block->slvg_off = offset += allocsize;
			continue;
		}

		/*
		 * Valid block, return to our caller.
		 *
		 * The buffer may have grown: make sure we read from the full
		 * page image.
		 */
		blk = WT_BLOCK_HEADER_REF(buf->mem);
		break;
	}

	/*
	 * Track the largest write-generation we've seen in the file so future
	 * writes, done after salvage completes, are preferred to these
	 * blocks.
	 */
	*write_genp = blk->write_gen;
	if (block->live.write_gen < blk->write_gen)
		block->live.write_gen = blk->write_gen;

	/* Re-create the address cookie that should reference this block. */
	endp = addr;
	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
	*addr_sizep = WT_PTRDIFF32(endp, addr);

	/* We're successfully returning the page, move past it. */
	block->slvg_off = offset + size;

	return (0);
}
/*
 * __wt_block_salvage_next --
 *	Return the address for the next potential block from the file.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_FH *fh;
	off_t max, offset;
	uint32_t allocsize, cksum, size;
	uint8_t *endp;

	*eofp = 0;

	fh = block->fh;
	allocsize = block->allocsize;
	WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));

	/* Read through the file, looking for pages. */
	for (max = fh->size;;) {
		offset = block->slvg_off;
		if (offset >= max) {			/* Check eof. */
			*eofp = 1;
			goto done;
		}

		/*
		 * Read the start of a possible page (an allocation-size
		 * block), and get a page length from it.  Move to the next
		 * allocation-size boundary, we'll never consider this one
		 * again.
		 */
		WT_ERR(__wt_read(session, fh, offset, allocsize, tmp->mem));
		blk = WT_BLOCK_HEADER_REF(tmp->mem);
		block->slvg_off += allocsize;

		/*
		 * The page can't be more than the min/max page size, or past
		 * the end of the file.
		 */
		size = blk->disk_size;
		cksum = blk->cksum;
		if (size == 0 ||
		    size % allocsize != 0 ||
		    size > WT_BTREE_PAGE_SIZE_MAX ||
		    offset + (off_t)size > max)
			goto skip;

		/*
		 * The block size isn't insane, read the entire block.
		 * Reading the block validates the checksum; if reading the
		 * block fails, ignore it.  If reading the block succeeds,
		 * return its address as a possible page.
		 */
		if (__wt_block_read_off(
		    session, block, tmp, offset, size, cksum) == 0)
			break;

skip:		WT_VERBOSE_ERR(session, salvage,
		    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
		    allocsize, (uintmax_t)offset);

		/* Free the allocation-size block. */
		WT_ERR(__wt_block_off_free(
		    session, block, offset, (off_t)allocsize));
	}

	/* Re-create the address cookie that should reference this block. */
	endp = addr;
	WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
	*addr_sizep = WT_PTRDIFF32(endp, addr);

done:
err:	__wt_scr_free(&tmp);
	return (ret);
}
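/*
 * Hypothetical caller sketch, not part of the original source: salvage
 * drives the function above in a loop, handing each returned address
 * cookie to higher-level code until the scan reaches end-of-file.  The
 * function name is invented; WT_BTREE_MAX_ADDR_COOKIE bounds the cookie's
 * packed size.
 */
static int
__salvage_scan_sketch(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	uint32_t addr_size;
	uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
	int eof;

	for (eof = 0;;) {
		WT_RET(__wt_block_salvage_next(
		    session, block, addr, &addr_size, &eof));
		if (eof)
			return (0);

		/* Process the possible page referenced by addr here. */
	}
	/* NOTREACHED */
}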
/*
 * __ckpt_update --
 *	Update a checkpoint.
 */
static int
__ckpt_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt,
    WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live)
{
	WT_EXTLIST *alloc;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif

	/*
	 * Write the checkpoint's alloc and discard extent lists.  After each
	 * write, remove any allocated blocks from the system's allocation
	 * list; checkpoint extent blocks don't appear on any extent lists.
	 */
	alloc = &block->live.alloc;
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_block_off_remove_overlap(
		    session, alloc, ci->alloc.offset, ci->alloc.size));
	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
	if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_block_off_remove_overlap(
		    session, alloc, ci->discard.offset, ci->discard.size));

	/*
	 * We only write an avail list for the live system; other checkpoints'
	 * avail lists are static and never change.
	 *
	 * Write the avail list last so it reflects changes due to allocating
	 * blocks for the alloc and discard lists.  Additionally, when we
	 * write the live system's avail list, it's two lists: the current
	 * avail list plus the list of blocks to be made available when the
	 * new checkpoint completes.  We can't merge that second list into the
	 * real list yet; it's not truly available until the new checkpoint
	 * locations have been saved to the metadata.
	 */
	if (is_live) {
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));
		if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
			WT_RET(__wt_block_off_remove_overlap(session,
			    alloc, ci->avail.offset, ci->avail.size));
	}

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing checkpoints because we
	 * want to test the checkpoint's blocks against a reasonable maximum
	 * file size during verification.  This is bad: imagine a checkpoint
	 * appearing early in the file, re-written, and then the checkpoint
	 * requires blocks at the end of the file, blocks after the listed
	 * file size.  If the application opens that checkpoint for writing
	 * (discarding subsequent checkpoints), we would truncate the file to
	 * the early chunk, discarding the re-written checkpoint information.
	 * The alternative, updating the file size, has its own problems: in
	 * that case we'd work correctly, but we'd lose all of the blocks
	 * between the original checkpoint and the re-written checkpoint.
	 * Currently, there's no API to roll-forward intermediate checkpoints;
	 * if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &ci->file_size));

	/* Set the checkpoint size for the live system. */
	if (is_live)
		ci->ckpt_size = ckpt_size;

	/*
	 * Copy the checkpoint information into the checkpoint array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
	ckpt->raw.size = WT_PTRDIFF32(endp, ckpt->raw.mem);

	if (WT_VERBOSE_ISSET(session, ckpt)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
		WT_VERBOSE_ERR(session, ckpt,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
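/*
 * Hedged sketch, not part of the original source: once the new checkpoint
 * has been saved to the metadata, the deferred ckpt_avail list can finally
 * be folded into the live avail list, completing the two-phase scheme the
 * comment above describes.  This assumes an extent-list merge helper with
 * the shape shown; __ckpt_resolve_sketch is hypothetical.
 */
static int
__ckpt_resolve_sketch(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_BLOCK_CKPT *ci;

	ci = &block->live;

	/*
	 * The blocks on ckpt_avail are now truly available: merge them into
	 * the live avail list.
	 */
	WT_RET(__wt_block_extlist_merge(
	    session, &ci->ckpt_avail, &ci->avail));
	return (0);
}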
/*
 * __snapshot_update --
 *	Update a snapshot.
 */
static int
__snapshot_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
#endif

	/*
	 * Write the snapshot's extent lists; we only write an avail list for
	 * the live system, other snapshots' avail lists are static and never
	 * change.  When we do write the avail list for the live system it's
	 * two lists: the current avail list plus the list of blocks that are
	 * being made available as of the new snapshot.  We can't merge that
	 * second list into the real list yet; it's not truly available until
	 * the new snapshot location has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &si->avail, &si->snapshot_avail));
	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing snapshots because we
	 * want to test the snapshot's blocks against a reasonable maximum
	 * file size during verification.  This is not good: imagine a
	 * snapshot appearing early in the file, re-written, and then the
	 * snapshot requires blocks at the end of the file, blocks after the
	 * listed file size.  If the application opens that snapshot for
	 * writing (discarding subsequent snapshots), we would truncate the
	 * file to the early chunk, discarding the re-written snapshot
	 * information.  The alternative, updating the file size, has its own
	 * problems: in that case we'd work correctly, but we'd lose all of
	 * the blocks between the original snapshot and the re-written
	 * snapshot.  Currently, there's no API to roll-forward intermediate
	 * snapshots; if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &si->file_size));

	/* Set the snapshot size for the live system. */
	if (is_live)
		si->snapshot_size = snapshot_size;

	/*
	 * Copy the snapshot information into the snapshot array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = snap->raw.mem;
	WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si));
	snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem);

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp));
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: create-snapshot: %s: %s",
		    block->name, snap->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
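/*
 * Illustrative only, not part of the original source: the invariant the
 * HAVE_DIAGNOSTIC __wt_block_extlist_check calls above enforce is that no
 * extent appears on two lists at once.  Shown here on a simplified sorted
 * array of extents rather than WiredTiger's actual WT_EXTLIST skiplist;
 * all names below are hypothetical.
 */
#include <sys/types.h>
#include <stddef.h>

struct sketch_ext {
	off_t off;				/* extent start */
	off_t size;				/* extent length */
};

static int
__sketch_extlists_overlap(const struct sketch_ext *a, size_t a_cnt,
    const struct sketch_ext *b, size_t b_cnt)
{
	size_t i, j;

	/* Walk both sorted lists, advancing whichever ends first. */
	for (i = j = 0; i < a_cnt && j < b_cnt;)
		if (a[i].off + a[i].size <= b[j].off)
			++i;		/* a[i] entirely before b[j] */
		else if (b[j].off + b[j].size <= a[i].off)
			++j;		/* b[j] entirely before a[i] */
		else
			return (1);	/* the extents intersect */
	return (0);
}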