/*
 * __wt_block_read_off_blind --
 *     Debugging helper: read the allocation-size chunk found at a file
 * offset and hand back the disk size and checksum stored in its block
 * header. Both outputs are zero if the read fails.
 */
int
__wt_block_read_off_blind(WT_SESSION_IMPL *session, WT_BLOCK *block,
    wt_off_t offset, uint32_t *sizep, uint32_t *checksump)
{
    WT_DECL_RET;
    WT_DECL_ITEM(scratch);
    WT_BLOCK_HEADER *hdr;
    size_t hdr_len;

    /* Start from known values so a failed read never leaks garbage. */
    *checksump = 0;
    *sizep = 0;

    /*
     * A single allocation unit is enough to hold the block header, get
     * a scratch buffer of that size.
     */
    hdr_len = (size_t)block->allocsize;
    WT_RET(__wt_scr_alloc(session, block->allocsize, &scratch));

    ret = __wt_read(session, block->fh, offset, hdr_len, scratch->mem);
    if (ret == 0) {
        /* Convert the header in place, then copy out its fields. */
        hdr = WT_BLOCK_HEADER_REF(scratch->mem);
        __wt_block_header_byteswap(hdr);

        *sizep = hdr->disk_size;
        *checksump = hdr->checksum;
    }

    /* The scratch buffer is released on success and failure alike. */
    __wt_scr_free(session, &scratch);
    return (ret);
}
/*
 * __wt_block_read_off_blind --
 *     Debugging helper: read whatever block lives at a file offset, using
 * the size and checksum found in its own header.
 */
int
__wt_block_read_off_blind(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset)
{
    WT_BLOCK_HEADER *hdr;
    size_t hdr_len;
    uint32_t hdr_cksum, hdr_size;

    /*
     * Size the buffer for one allocation unit, which covers the block
     * header, and read that much from the file.
     */
    hdr_len = (size_t)block->allocsize;
    WT_RET(__wt_buf_init(session, buf, block->allocsize));
    WT_RET(__wt_read(session, block->fh, offset, hdr_len, buf->mem));

    /*
     * Save the header fields in locals: the buffer is handed to the
     * full-block read below and its contents will be replaced.
     */
    hdr = WT_BLOCK_HEADER_REF(buf->mem);
    hdr_size = hdr->disk_size;
    hdr_cksum = hdr->cksum;

    /* Refuse to read if the offset/size pair can't describe a block. */
    if (__wt_block_offset_invalid(block, offset, hdr_size))
        WT_RET_MSG(session, EINVAL,
            "block at offset %" PRIuMAX
            " cannot be a valid block, no "
            "read attempted",
            (uintmax_t)offset);

    return (__wt_block_read_off(
        session, block, buf, offset, hdr_size, hdr_cksum));
}
/*
 * __wt_block_read_off --
 *     Read the block described by an offset/size/checksum triple into a
 * buffer and verify its checksum. Returns 0 when the checksum matches.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *hdr;
    size_t cksum_len, need;
    uint32_t found;
    int quiet;

    WT_RET(__wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum));

    WT_STAT_FAST_CONN_INCR(session, block_read);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

    /*
     * Read buffers are expected to be aligned. A buffer that isn't
     * flagged as aligned gets the flag set and a request larger than its
     * current memory size, which guarantees it is reallocated.
     */
    need = size;
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        F_SET(buf, WT_ITEM_ALIGNED);
        need = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, need));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * Two-step check: the checksum stored in the header must match the
     * expected one, and then the checksum recomputed over the block
     * (with the header's checksum field zeroed) must match as well.
     */
    hdr = WT_BLOCK_HEADER_REF(buf->mem);
    found = hdr->cksum;
    if (found == cksum) {
        hdr->cksum = 0;
        cksum_len = F_ISSET(hdr, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP;
        found = __wt_cksum(buf->mem, cksum_len);
        if (found == cksum)
            return (0);
    }

    /* Checksum failure: complain unless the session asked for quiet. */
    quiet = F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? 1 : 0;
    if (!quiet)
        __wt_errx(session,
            "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
            PRIu32 " != %" PRIu32 "]",
            size, (uintmax_t)offset, cksum, found);

    /*
     * During verify, or when corruption is tolerated, report a plain
     * error; an ordinary read goes through the illegal-value path.
     */
    if (block->verify || quiet)
        return (WT_ERROR);
    return (__wt_illegal_value(session, block->name));
}
/*
 * copy --
 *     Copy the created page to the end of the salvage file.
 *
 * gen is written into the copied page's write generation; a value of 0 means
 * there is no data page to copy. recno is stored as the page's starting
 * record number unless the page type is WT_PAGE_ROW_LEAF.
 *
 * Every step is wrapped in CHECK (defined elsewhere; presumably it fails the
 * test when its condition is false).
 */
void
copy(u_int gen, u_int recno)
{
    FILE *ifp, *ofp;
    WT_PAGE_HEADER *dsk;
    WT_BLOCK_HEADER *blk;
    char buf[PSIZE];

    /*
     * The files hold raw page images, open them in binary mode so no
     * newline translation is applied to the data on platforms that
     * distinguish text from binary streams.
     */
    CHECK((ifp = fopen(LOAD, "rb")) != NULL);

    /*
     * If the salvage file doesn't exist, then we're creating it:
     * copy the first sector (the file description).
     * Otherwise, we are appending to an existing file.
     */
    if (file_exists(SLVG))
        CHECK((ofp = fopen(SLVG, "ab")) != NULL);
    else {
        CHECK((ofp = fopen(SLVG, "wb")) != NULL);
        CHECK(fread(buf, 1, PSIZE, ifp) == PSIZE);
        CHECK(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
    }

    /*
     * If there's data, copy/update the first formatted page.
     */
    if (gen != 0) {
        /* The formatted page is the second PSIZE chunk of the file. */
        CHECK(fseek(ifp, (long)PSIZE, SEEK_SET) == 0);
        CHECK(fread(buf, 1, PSIZE, ifp) == PSIZE);

        /* Patch the page header in the in-memory copy. */
        dsk = (void *)buf;
        if (page_type != WT_PAGE_ROW_LEAF)
            dsk->recno = recno;
        dsk->write_gen = gen;

        /*
         * The header changed, so the checksum must be recomputed; it
         * is calculated with the checksum field itself set to zero.
         */
        blk = WT_BLOCK_HEADER_REF(buf);
        blk->checksum = 0;
        blk->checksum = __wt_checksum(dsk, PSIZE);
        CHECK(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
    }

    CHECK(fclose(ifp) == 0);
    CHECK(fclose(ofp) == 0);
}
/*
 * __wt_block_salvage_next --
 *     Scan forward from the block's salvage offset and return the address
 * cookie of the next chunk that reads back as a valid block; *eofp is set
 * when the scan reaches the end of the file instead.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block,
    uint8_t *addr, size_t *addr_sizep, bool *eofp)
{
    WT_DECL_RET;
    WT_DECL_ITEM(scratch);
    WT_BLOCK_HEADER *hdr;
    WT_FH *fh;
    wt_off_t file_end, offset;
    uint32_t allocsize, checksum, size;
    uint8_t *endp;

    *eofp = false;

    allocsize = block->allocsize;
    fh = block->fh;
    WT_ERR(__wt_scr_alloc(session, allocsize, &scratch));

    /* Walk the file one allocation unit at a time, looking for pages. */
    file_end = block->size;
    for (;;) {
        offset = block->slvg_off;
        if (offset >= file_end) {
            /* Nothing left to examine. */
            *eofp = true;
            goto done;
        }

        /*
         * Read one allocation unit, treat it as a block header and
         * pull the candidate page size and checksum out of it.
         */
        WT_ERR(__wt_read(
            session, fh, offset, (size_t)allocsize, scratch->mem));
        hdr = WT_BLOCK_HEADER_REF(scratch->mem);
        __wt_block_header_byteswap(hdr);
        size = hdr->disk_size;
        checksum = hdr->checksum;

        /*
         * If the offset/size pair is plausible, try reading the whole
         * block; the read validates the checksum. A successful read
         * means we have a candidate page to return.
         */
        if (!__wt_block_offset_invalid(block, offset, size)) {
            if (__wt_block_read_off(session,
                block, scratch, offset, size, checksum) == 0)
                break;
        }

        /* Not a block: free this allocation unit and step past it. */
        __wt_verbose(session, WT_VERB_SALVAGE,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);
        WT_ERR(__wt_block_off_free(
            session, block, offset, (wt_off_t)allocsize));
        block->slvg_off += allocsize;
    }

    /* Build the address cookie that references the block we found. */
    endp = addr;
    WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

done:
err:
    __wt_scr_free(session, &scratch);
    return (ret);
}
/*
 * __wt_block_salvage_next --
 *     Scan forward from the block's salvage offset, read the next chunk
 * that checksums as a valid block into the caller's buffer and return its
 * address cookie and write generation; *eofp is set at end of file.
 */
int
__wt_block_salvage_next(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
{
    WT_BLOCK_HEADER *hdr;
    WT_FH *fh;
    off_t file_end, offset;
    uint32_t allocsize, cksum, size;
    uint8_t *endp;
    int sane;

    *eofp = 0;

    offset = block->slvg_off;
    fh = block->fh;
    allocsize = block->allocsize;
    WT_RET(__wt_buf_initsize(session, buf, allocsize));

    /* Walk the file looking for pages with valid checksums. */
    file_end = fh->file_size;
    for (;;) {
        if (offset >= file_end) {
            /* Ran off the end of the file. */
            *eofp = 1;
            return (0);
        }

        /*
         * Read one allocation unit as a possible block header and
         * take the page length and checksum from it.
         */
        WT_RET(__wt_read(session, fh, offset, allocsize, buf->mem));
        hdr = WT_BLOCK_HEADER_REF(buf->mem);
        size = hdr->disk_size;
        cksum = hdr->cksum;

        /*
         * A believable size is non-zero, a multiple of the allocation
         * size, no larger than the maximum page size and doesn't run
         * past the end of the file.
         */
        sane = size != 0 && size % allocsize == 0 &&
            size <= WT_BTREE_PAGE_SIZE_MAX &&
            offset + (off_t)size <= file_end;

        /*
         * With a believable size, read the whole block; the read
         * validates the checksum. On success the buffer may have
         * grown, so re-derive the header pointer from it.
         */
        if (sane && __wt_block_read_off(
            session, block, buf, offset, size, cksum) == 0) {
            hdr = WT_BLOCK_HEADER_REF(buf->mem);
            break;
        }

        WT_VERBOSE_RET(session, salvage,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);

        /*
         * Free this allocation unit and advance the salvage offset so
         * the same chunk is never returned twice.
         */
        WT_RET(__wt_block_off_free(
            session, block, offset, (off_t)allocsize));
        offset += allocsize;
        block->slvg_off = offset;
    }

    /*
     * Remember the largest write generation seen so that writes made
     * after salvage completes are preferred to these blocks.
     */
    *write_genp = hdr->write_gen;
    if (block->live.write_gen < hdr->write_gen)
        block->live.write_gen = hdr->write_gen;

    /* Build the address cookie that references the block we found. */
    endp = addr;
    WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF32(endp, addr);

    /* The page is being returned, the next scan starts after it. */
    block->slvg_off = offset + size;

    return (0);
}
/*
 * __wt_block_write_off --
 *     Write a buffer into a block, returning the block's addr/size and
 * checksum.
 *
 * buf must be flagged WT_ITEM_ALIGNED and have room for its size rounded up
 * to the block's allocation size. data_cksum selects whether the checksum
 * covers the whole block or only the first WT_BLOCK_COMPRESS_SKIP bytes.
 * When locked is zero, this function takes block->live_lock itself around
 * the allocation (and around the free on write failure).
 *
 * On success *offsetp, *sizep and *cksump describe the written block.
 * Returns 0 on success and a non-zero error code otherwise.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    off_t offset;
    uint32_t align_size;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    fh = block->fh;

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = align_size;

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes. The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly. However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption. If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     */
    blk->flags = 0;
    if (data_cksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    blk->cksum = 0;
    blk->cksum = __wt_cksum(
        buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

    /* Allocate file space for the block under the live lock. */
    if (!locked)
        __wt_spin_lock(session, &block->live_lock);
    ret = __wt_block_alloc(session, block, &offset, (off_t)align_size);
    if (!locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
    /*
     * Extend the file in chunks.  We aren't holding a lock and we'd prefer
     * to limit the number of threads extending the file at the same time,
     * so choose the one thread that's crossing the extended boundary.  We
     * don't extend newly created files, and it's theoretically possible we
     * might wait so long our extension of the file is passed by another
     * thread writing single blocks, that's why there's a check in case the
     * extended file size becomes too small: if the file size catches up,
     * every thread will try to extend it.
     */
    if (fh->extend_len != 0 &&
        (fh->extend_size <= fh->size ||
        (offset + fh->extend_len <= fh->extend_size &&
        offset + fh->extend_len + align_size >= fh->extend_size))) {
        fh->extend_size = offset + fh->extend_len * 2;
#if defined(HAVE_POSIX_FALLOCATE)
        /* posix_fallocate returns the error number directly. */
        if ((ret =
            posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fallocate", fh->name);
#elif defined(HAVE_FTRUNCATE)
        /*
         * ftruncate returns -1 and sets errno on failure, report the
         * errno value rather than the -1 return.
         */
        if (ftruncate(fh->fd, fh->extend_size) != 0) {
            ret = errno;
            WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
        }
#endif
    }
#endif

    /*
     * Write the block; if the write fails, give the allocated space back
     * (under the live lock) before returning the error.
     */
    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(
            __wt_block_off_free(session, block, offset, align_size));
        if (!locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
        block->os_cache_dirty = 0;
        /*
         * sync_file_range returns -1 and sets errno on failure, report
         * the errno value rather than the -1 return.
         */
        if (sync_file_range(fh->fd,
            (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE) != 0) {
            ret = errno;
            WT_RET_MSG(
                session, ret, "%s: sync_file_range", block->name);
        }
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        /* posix_fadvise returns the error number directly. */
        if ((ret = posix_fadvise(fh->fd,
            (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    WT_CSTAT_INCR(session, block_write);
    WT_CSTAT_INCRV(session, block_byte_write, align_size);

    WT_VERBOSE_RET(session, write,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, align_size, blk->cksum);

    *offsetp = offset;
    *sizep = align_size;
    *cksump = blk->cksum;

    return (ret);
}
/*
 * __wt_block_read_off --
 *     Read the block described by an offset/size/checksum triple into a
 * buffer and verify its checksum. Returns 0 when the checksum matches.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *hdr, native;
    size_t cksum_len, need;
    uint32_t computed;
    int quiet;

    __wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

    WT_STAT_FAST_CONN_INCR(session, block_read);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

    /*
     * Read buffers are expected to be aligned. A buffer that isn't
     * flagged as aligned gets the flag set and a request larger than its
     * current memory size, which guarantees it is reallocated.
     */
    need = size;
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        F_SET(buf, WT_ITEM_ALIGNED);
        need = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, need));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * Keep the on-disk header untouched for checksumming and work from a
     * byte-swapped copy when inspecting header fields.
     */
    hdr = WT_BLOCK_HEADER_REF(buf->mem);
    __wt_block_header_byteswap_copy(hdr, &native);

    quiet = F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ? 1 : 0;

    if (native.cksum != cksum) {
        /* The header doesn't even carry the checksum we expect. */
        if (!quiet)
            __wt_errx(session,
                "read checksum error for %" PRIu32 "B block at "
                "offset %" PRIuMAX ": block header checksum "
                "of %" PRIu32 " doesn't match expected checksum "
                "of %" PRIu32,
                size, (uintmax_t)offset, native.cksum, cksum);
    } else {
        /* Recompute with the header's checksum field zeroed. */
        hdr->cksum = 0;
        cksum_len = F_ISSET(&native, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP;
        computed = __wt_cksum(buf->mem, cksum_len);
        if (computed == cksum) {
            /*
             * Good block. The page-header swap is done here
             * because every reader passes through this function.
             */
            __wt_page_header_byteswap(buf->mem);
            return (0);
        }

        if (!quiet)
            __wt_errx(session,
                "read checksum error for %" PRIu32 "B block at "
                "offset %" PRIuMAX ": calculated block checksum "
                "of %" PRIu32 " doesn't match expected checksum "
                "of %" PRIu32,
                size, (uintmax_t)offset, computed, cksum);
    }

    /*
     * During verify, or when the session is quiet about corruption,
     * report a plain error; an ordinary read goes through the
     * illegal-value path.
     */
    if (block->verify || quiet)
        return (WT_ERROR);
    return (__wt_illegal_value(session, block->name));
}
/*
 * __wt_block_write_off --
 *     Write a buffer into a block, returning the block's offset, size and
 * checksum.
 *
 * buf must be flagged WT_ITEM_ALIGNED and have room for its size rounded up
 * to the block's allocation size; the rounded size must fit in 32 bits.
 * data_cksum selects whether the checksum covers the whole block or only the
 * first WT_BLOCK_COMPRESS_SKIP bytes. When caller_locked is zero, this
 * function acquires and releases block->live_lock itself.
 *
 * On success *offsetp, *sizep and *cksump describe the written block.
 * Returns 0 on success and a non-zero error code otherwise.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    size_t align_size;
    wt_off_t offset;
    int local_locked;    /* Non-zero while we hold live_lock ourselves. */

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    fh = block->fh;
    local_locked = 0;

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }
    /* The size is stored and returned as a 32-bit value. */
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes.  The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly.  However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption.   If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     */
    blk->flags = 0;
    if (data_cksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    /* The checksum is computed with the checksum field itself zeroed. */
    blk->cksum = 0;
    blk->cksum = __wt_cksum(
        buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

    /*
     * If the caller doesn't hold the live lock, pre-allocate extension
     * structures before taking it ourselves; local_locked records that
     * the lock is ours to release.
     */
    if (!caller_locked) {
        WT_RET(__wt_block_ext_prealloc(session, 5));
        __wt_spin_lock(session, &block->live_lock);
        local_locked = 1;
    }
    /* The result is checked below, after any local lock is released. */
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

    /*
     * Extend the file in chunks.  We want to limit the number of threads
     * extending the file at the same time, so choose the one thread that's
     * crossing the extended boundary.  We don't extend newly created files,
     * and it's theoretically possible we might wait so long our extension
     * of the file is passed by another thread writing single blocks, that's
     * why there's a check in case the extended file size becomes too small:
     * if the file size catches up, every thread tries to extend it.
     *
     * File extension may require locking: some variants of the system call
     * used to extend the file initialize the extended space.  If a writing
     * thread races with the extending thread, the extending thread might
     * overwrite already written data, and that would be very, very bad.
     *
     * Some variants of the system call to extend the file fail at run-time
     * based on the filesystem type, fall back to ftruncate in that case,
     * and remember that ftruncate requires locking.
     */
    if (ret == 0 &&
        fh->extend_len != 0 &&
        (fh->extend_size <= fh->size ||
        (offset + fh->extend_len <= fh->extend_size &&
        offset +
        fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
        fh->extend_size = offset + fh->extend_len * 2;
        if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
            /*
             * Release any locally acquired lock if it's not needed
             * to extend the file, extending the file might require
             * updating file metadata, which can be slow. (It may be
             * a bad idea to configure for file extension on systems
             * that require locking over the extend call.)
             */
            if (!fh->fallocate_requires_locking && local_locked) {
                __wt_spin_unlock(session, &block->live_lock);
                local_locked = 0;
            }

            /*
             * Extend the file. ENOTSUP is not treated as an error:
             * clear it and jump into the ftruncate path below.
             */
            if ((ret = __wt_fallocate(session,
                fh, offset, fh->extend_len * 2)) == ENOTSUP) {
                ret = 0;
                goto extend_truncate;
            }
        } else {
extend_truncate:
            /*
             * We may have a caller lock or a locally acquired lock,
             * but we need a lock to call ftruncate. (When arriving
             * here from the fallocate path, a local lock may have
             * been released above, so it's re-acquired.)
             */
            if (!caller_locked && local_locked == 0) {
                __wt_spin_lock(session, &block->live_lock);
                local_locked = 1;
            }
            /*
             * The truncate might fail if there's a file mapping
             * (if there's an open checkpoint on the file), that's
             * OK.
             */
            if ((ret = __wt_ftruncate(
                session, fh, offset + fh->extend_len * 2)) == EBUSY)
                ret = 0;
        }
    }
    /* Release any locally acquired lock. */
    if (local_locked) {
        __wt_spin_unlock(session, &block->live_lock);
        local_locked = 0;
    }
    /* Now report any allocation or extension failure. */
    WT_RET(ret);

    /*
     * Write the block. If the write fails, give the allocated space back
     * (taking the live lock unless the caller holds it) before returning
     * the error.
     */
    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(__wt_block_off_free(
            session, block, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache, but only if the current session can wait.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        block->os_cache_dirty = 0;
        WT_RET(__wt_fsync_async(session, fh));
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        if ((ret = posix_fadvise(fh->fd,
            (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    WT_STAT_FAST_CONN_INCR(session, block_write);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

    WT_RET(__wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
        (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

    /* Hand the block's location and checksum back to the caller. */
    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *cksump = blk->cksum;

    return (ret);
}
/*
 * __wt_block_write_off --
 *     Write a buffer into a block and return where it went: the block's
 * offset, its allocation-aligned size and its checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    bool data_cksum, bool caller_locked)
{
    WT_DECL_RET;
    WT_BLOCK_HEADER *hdr;
    WT_FH *fh;
    wt_off_t offset;
    size_t align_size, cksum_len;
    uint32_t cksum;
    bool local_locked;

    fh = block->fh;

    /*
     * Start from an all-zero block header so that even the fields we
     * never set are initialized.
     */
    hdr = WT_BLOCK_HEADER_REF(buf->mem);
    memset(hdr, 0, sizeof(*hdr));

    /*
     * The page-header swap is done here because every writer passes
     * through this function.
     */
    __wt_page_header_byteswap(buf->mem);

    /* Write buffers are required to be aligned. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Round the size up to an allocation unit. The buffer has to have
     * room for the padding, and the result has to fit in 32 bits.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Clear the padding between the data and the aligned size. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /* Record the on-disk size in the header (salvage relies on it). */
    hdr->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Checksum either the whole block or only the leading
     * WT_BLOCK_COMPRESS_SKIP bytes, as the caller requests, and note the
     * choice in the header flags. The header is swapped before the
     * checksum is calculated, with its checksum field set to zero; the
     * calculated value is then stored, and additionally swapped in place
     * on big-endian builds.
     */
    hdr->flags = 0;
    if (data_cksum)
        F_SET(hdr, WT_BLOCK_DATA_CKSUM);
    hdr->cksum = 0;
    __wt_block_header_byteswap(hdr);
    cksum_len = data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP;
    cksum = __wt_cksum(buf->mem, cksum_len);
    hdr->cksum = cksum;
#ifdef WORDS_BIGENDIAN
    hdr->cksum = __wt_bswap32(hdr->cksum);
#endif

    /* Pre-allocate some number of extension structures. */
    WT_RET(__wt_block_ext_prealloc(session, 5));

    /*
     * Take the live lock unless the caller already holds it, allocate
     * space for the write and optionally extend the file. The extend
     * call is given the lock flag and may release the lock itself, so
     * only unlock if the flag is still set afterwards.
     */
    local_locked = !caller_locked;
    if (local_locked)
        __wt_spin_lock(session, &block->live_lock);
    if ((ret = __wt_block_alloc(
        session, block, &offset, (wt_off_t)align_size)) == 0)
        ret = __wt_block_extend(
            session, block, fh, offset, align_size, &local_locked);
    if (local_locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /*
     * Write the block. On failure return the allocated space, under the
     * live lock, and report the error.
     */
    ret = __wt_write(session, fh, offset, align_size, buf->mem);
    if (ret != 0) {
        if (caller_locked) {
            WT_TRET(__wt_block_off_free(
                session, block, offset, (wt_off_t)align_size));
        } else {
            __wt_spin_lock(session, &block->live_lock);
            WT_TRET(__wt_block_off_free(
                session, block, offset, (wt_off_t)align_size));
            __wt_spin_unlock(session, &block->live_lock);
        }
        return (ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Optionally push dirty pages out of the system buffer cache, but
     * only when this session is allowed to wait.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        block->os_cache_dirty = 0;
        WT_RET(__wt_fsync_async(session, fh));
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /* Optionally drop the file's blocks from the system buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        if ((ret = posix_fadvise(fh->fd,
            (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    WT_STAT_FAST_CONN_INCR(session, block_write);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

    WT_RET(__wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
        (uintmax_t)offset, (uintmax_t)align_size, cksum));

    *cksump = cksum;
    *sizep = WT_STORE_SIZE(align_size);
    *offsetp = offset;

    return (0);
}
/*
 * __wt_block_salvage_next --
 *     Scan forward from the block's salvage offset and return the address
 * cookie of the next chunk that reads back as a valid block; *eofp is set
 * when the scan reaches the end of the file instead.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block,
    uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
    WT_DECL_RET;
    WT_DECL_ITEM(scratch);
    WT_BLOCK_HEADER *hdr;
    WT_FH *fh;
    off_t file_end, offset;
    uint32_t allocsize, cksum, size;
    uint8_t *endp;
    int sane;

    *eofp = 0;

    allocsize = block->allocsize;
    fh = block->fh;
    WT_ERR(__wt_scr_alloc(session, allocsize, &scratch));

    /* Walk the file one allocation unit at a time, looking for pages. */
    file_end = fh->size;
    for (;;) {
        offset = block->slvg_off;
        if (offset >= file_end) {
            /* Nothing left to examine. */
            *eofp = 1;
            goto done;
        }

        /*
         * Read one allocation unit as a possible block header, then
         * immediately advance the salvage offset: this chunk is never
         * looked at again, whatever it turns out to be.
         */
        WT_ERR(__wt_read(session, fh, offset, allocsize, scratch->mem));
        hdr = WT_BLOCK_HEADER_REF(scratch->mem);
        block->slvg_off += allocsize;

        /*
         * A believable size is non-zero, a multiple of the allocation
         * size, no larger than the maximum page size and doesn't run
         * past the end of the file.
         */
        size = hdr->disk_size;
        cksum = hdr->cksum;
        sane = size != 0 && size % allocsize == 0 &&
            size <= WT_BTREE_PAGE_SIZE_MAX &&
            offset + (off_t)size <= file_end;

        /*
         * With a believable size, read the whole block; the read
         * validates the checksum. Success means we have a candidate
         * page to return.
         */
        if (sane && __wt_block_read_off(
            session, block, scratch, offset, size, cksum) == 0)
            break;

        WT_VERBOSE_ERR(session, salvage,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);

        /* Not a block: free this allocation unit. */
        WT_ERR(__wt_block_off_free(
            session, block, offset, (off_t)allocsize));
    }

    /* Build the address cookie that references the block we found. */
    endp = addr;
    WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF32(endp, addr);

done:
err:
    __wt_scr_free(&scratch);
    return (ret);
}
/*
 * __wt_block_read_off --
 *     Read the block described by an offset/size/checksum triple into a
 * buffer and verify its checksum. Returns 0 when the checksum matches and
 * WT_ERROR when it doesn't.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *hdr;
    size_t cksum_len;
    uint32_t computed, want;

    WT_VERBOSE_RET(session, read,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
    /*
     * Diagnostic builds check that the block about to be read isn't
     * misplaced. The check is skipped for quiet salvage sessions, which
     * may legitimately read an overflow page that was already freed.
     */
    if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
        WT_RET(
            __wt_block_misplaced(session, block, "read", offset, size));
#endif

    /*
     * Read buffers are expected to be aligned. A buffer that isn't
     * flagged as aligned gets the flag set and a request larger than its
     * current memory size, which guarantees it is reallocated.
     */
    want = size;
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        F_SET(buf, WT_ITEM_ALIGNED);
        want = (uint32_t)WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, want));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /* Recompute the checksum with the header's checksum field zeroed. */
    hdr = WT_BLOCK_HEADER_REF(buf->mem);
    hdr->cksum = 0;
    cksum_len = F_ISSET(hdr, WT_BLOCK_DATA_CKSUM) ?
        size : WT_BLOCK_COMPRESS_SKIP;
    computed = __wt_cksum(buf->mem, cksum_len);

    if (cksum == computed) {
        /* Only successful reads are counted in the statistics. */
        WT_CSTAT_INCR(session, block_read);
        WT_CSTAT_INCRV(session, block_byte_read, size);
        return (0);
    }

    /* Checksum failure: complain unless salvage asked for quiet. */
    if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
        __wt_errx(session,
            "read checksum error [%"
            PRIu32 "B @ %" PRIuMAX ", %"
            PRIu32 " != %" PRIu32 "]",
            size, (uintmax_t)offset, cksum, computed);
    return (WT_ERROR);
}
/*
 * __wt_block_write_off --
 *	Write a buffer into the file backing the block, filling in the block
 *	header (size, flags, checksum) first.
 *
 *	On success the file offset, stored size and checksum of the written
 *	block are returned through offsetp, sizep and cksump.  When
 *	data_cksum is non-zero the checksum covers the whole aligned buffer,
 *	otherwise only its first WT_BLOCK_COMPRESS_SKIP bytes.  When
 *	caller_locked is non-zero this function does not take
 *	block->live_lock itself around the allocation.
 */
int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/*
	 * The buffer must be flagged as aligned; refuse to write it otherwise
	 * (the error message attributes this requirement to direct I/O).
	 */
	if(!F_ISSET(buf, WT_ITEM_ALIGNED)){
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the buffer size to the block allocation size.  The aligned
	 * size must fit in the buffer's memory, because the padding up to the
	 * aligned size is zeroed in place below.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}

	/* The aligned size must fit in 32 bits. */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero the padding between the end of the data and the aligned size. */
	memset((uint8_t*)buf->mem + buf->size, 0, align_size - buf->size);

	/* Fill in the block header: stored size and flags. */
	blk->disk_size = WT_STORE_SIZE(align_size);
	blk->flags = 0;
	if(data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);

	/*
	 * Compute the checksum over the whole aligned buffer or, when
	 * data_cksum is not set, over the first WT_BLOCK_COMPRESS_SKIP bytes.
	 */
	blk->cksum = __wt_cksum(buf->mem, data_cksum ?
	    align_size : WT_BLOCK_COMPRESS_SKIP);

	/*
	 * If the caller does not already hold the lock, take block->live_lock
	 * here and remember that it was acquired locally.
	 * NOTE(review): __wt_block_ext_prealloc is called before the lock is
	 * taken, presumably so extent structures need not be allocated while
	 * holding the spinlock -- confirm against its definition.
	 */
	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}

	/* Allocate file space for the aligned block. */
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Decide whether this write should extend the file.  Extension is
	 * attempted only if the allocation succeeded, an extension length is
	 * configured, and either the recorded extend size is no larger than
	 * the file size or this block is the one crossing the extend
	 * boundary.
	 */
	if(ret == 0 && fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len + (wt_off_t)align_size >=
	    fh->extend_size))){
		/* Advance the extend boundary to offset + 2 * extend_len. */
		fh->extend_size = offset + fh->extend_len * 2;
		if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
			/*
			 * If fallocate does not require locking, release a
			 * locally acquired lock before calling it.  (The
			 * original note explains that resizing the file can
			 * take a long time, so the spinlock is dropped first
			 * to avoid other threads spinning on it.)
			 */
			if (!fh->fallocate_requires_locking && local_locked) {
				__wt_spin_unlock(session, &block->live_lock);
				local_locked = 0;
			}

			/*
			 * Extend the file.  If fallocate reports ENOTSUP,
			 * clear the error and fall back to the ftruncate path
			 * below.
			 */
			if ((ret = __wt_fallocate(session,fh, offset,
			    fh->extend_len * 2)) == ENOTSUP) {
				ret = 0;
				goto extend_truncate;
			}
		}
		else{
			/*
			 * The label sits inside the else branch so the ENOTSUP
			 * fallback shares this path.  The lock is taken here
			 * when neither the caller nor this function currently
			 * holds it.
			 */
extend_truncate:
			if (!caller_locked && local_locked == 0) {
				__wt_spin_lock(session, &block->live_lock);
				local_locked = 1;
			}

			/*
			 * Resize the file directly (per the original note this
			 * is slower than __wt_fallocate).  An EBUSY result is
			 * ignored.
			 */
			if ((ret = __wt_ftruncate(session, fh,
			    offset + fh->extend_len * 2)) == EBUSY)
				ret = 0;
		}
	}

	/* Release the lock if it was acquired locally, then check ret. */
	if(local_locked){
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}
	WT_RET(ret);

	/* Write the block. */
	ret =__wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);

		/*
		 * The write failed: hand the range allocated above back via
		 * __wt_block_off_free (under the lock), then return the error.
		 */
		WT_TRET(__wt_block_off_free(session, block, offset,
		    (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Accumulate the bytes written in os_cache_dirty.  Once the total
	 * exceeds the configured os_cache_dirty_max, and only if the session
	 * can wait, reset the counter and issue an asynchronous fsync.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) >
	    block->os_cache_dirty_max && __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/*
	 * Accumulate the bytes written in os_cache.  Once the total exceeds
	 * the configured os_cache_max, reset the counter and advise the
	 * system (POSIX_FADV_DONTNEED over the whole file) that the file's
	 * cached data is no longer needed.
	 */
	if
	    (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0,
		    POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif

	/* Update write statistics and emit the verbose message. */
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	/* Hand the block's address information back to the caller. */
	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;
	return ret;
}
/*
 * __wt_block_read_off --
 *	Read the block named by an offset/size pair into the caller's buffer,
 *	validating its checksum and decompressing it when the block was
 *	stored compressed.
 *
 *	Returns 0 on success, WT_ERROR on a checksum mismatch, or the error
 *	from a failing helper call.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_DECL_RET;
	WT_DECL_ITEM(scratch);
	WT_BLOCK_HEADER *hdr;
	WT_PAGE_HEADER *page;
	uint8_t *dst, *src;
	size_t out_len;
	uint32_t computed;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Diagnostic builds check that the block about to be read is not on
	 * the available or discard lists.  The check is skipped during
	 * salvage, which may legitimately read an already freed overflow
	 * page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) {
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
	}
#endif

	/*
	 * Pick the buffer for the raw read.  With a compressor configured the
	 * image goes into a scratch buffer, since more memory will be needed
	 * for decompression; without one, the caller's buffer is sized and
	 * used directly, so there is only ever one buffer.
	 */
	if (block->compressor != NULL) {
		WT_RET(__wt_scr_alloc(session, size, &scratch));
		scratch->size = size;
		page = scratch->mem;
	} else {
		F_SET(buf, WT_ITEM_ALIGNED);
		WT_RET(__wt_buf_init(session, buf, size));
		buf->size = size;
		page = buf->mem;
	}

	/* Read the on-disk image and locate its block header. */
	WT_ERR(__wt_read(session, block->fh, offset, size, page));
	hdr = WT_BLOCK_HEADER_REF(page);

	/*
	 * Check the checksum, but only when checksums are enabled for the
	 * file and neither the expected nor the stored value is the
	 * "not set" marker.
	 */
	if (block->checksum &&
	    cksum != WT_BLOCK_CHECKSUM_NOT_SET &&
	    hdr->cksum != WT_BLOCK_CHECKSUM_NOT_SET) {
		/* The stored checksum field is cleared before computing. */
		hdr->cksum = 0;
		computed = __wt_cksum(page, size);

		/* A computed value equal to the marker is bumped by one. */
		if (computed == WT_BLOCK_CHECKSUM_NOT_SET) {
			computed++;
		}

		if (computed != cksum) {
			if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) {
				__wt_errx(session,
				    "read checksum error [%"
				    PRIu32 "B @ %" PRIuMAX ", %"
				    PRIu32 " != %" PRIu32 "]",
				    size, (uintmax_t)offset, cksum, computed);
			}
			WT_ERR(WT_ERROR);
		}
	}

	/*
	 * An on-disk size smaller than the in-memory size means the block is
	 * compressed; otherwise it was stored uncompressed.
	 */
	if (hdr->disk_size < page->size) {
		/* A compressed block with no compressor is illegal. */
		if (block->compressor == NULL) {
			WT_ERR(__wt_illegal_value(session, block->name));
		}

		/* Size the caller's buffer for the decompressed image. */
		WT_ERR(__wt_buf_init(session, buf, page->size));
		buf->size = page->size;

		/*
		 * The leading WT_BLOCK_COMPRESS_SKIP bytes are copied over
		 * as-is; everything after them is decompressed.
		 *
		 * The source length handed to the engine is the length of the
		 * block that was read minus the skipped bytes, NOT the number
		 * of compressed bytes, which is not stored.  Engines that have
		 * no end-of-stream marker must record the compressed length
		 * themselves; the snappy compression extension is an example.
		 */
		memcpy(buf->mem, scratch->mem, WT_BLOCK_COMPRESS_SKIP);
		src = (uint8_t *)scratch->mem + WT_BLOCK_COMPRESS_SKIP;
		dst = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(block->compressor->decompress(
		    block->compressor, &session->iface,
		    src, scratch->size - WT_BLOCK_COMPRESS_SKIP,
		    dst, page->size - WT_BLOCK_COMPRESS_SKIP,
		    &out_len));

		/* The engine must produce exactly the expected byte count. */
		if (out_len != page->size - WT_BLOCK_COMPRESS_SKIP) {
			WT_ERR(__wt_illegal_value(session, block->name));
		}
	} else if (block->compressor == NULL) {
		/* Uncompressed, and already in the caller's buffer. */
		buf->size = page->size;
	} else {
		/*
		 * A compressor is configured but this block was not
		 * compressed, so the page sits in the scratch buffer, which
		 * may also be the wrong size.  This should be rare.  Copy the
		 * data into the caller's buffer at the right size.
		 */
		WT_ERR(__wt_buf_set(session, buf, scratch->data, page->size));
	}

	WT_BSTAT_INCR(session, page_read);
	WT_CSTAT_INCR(session, block_read);

err:
	__wt_scr_free(&scratch);
	return (ret);
}