/*
 * __wt_block_read_off_blind --
 *	Debugging only: read whatever block starts at a file offset, taking
 * the size and checksum from the block's own on-disk header.
 */
int
__wt_block_read_off_blind(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset)
{
	WT_BLOCK_HEADER *hdr;
	uint32_t hdr_cksum, hdr_size;

	/*
	 * Size the buffer for a single allocation unit and read that much
	 * from the file; the block header is taken from it.
	 */
	WT_RET(__wt_buf_init(session, buf, block->allocsize));
	WT_RET(__wt_read(session,
	    block->fh, offset, (size_t)block->allocsize, buf->mem));

	/*
	 * Save the header's size and checksum in locals before the buffer is
	 * handed to the full read below, which re-uses it.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);
	hdr_size = hdr->disk_size;
	hdr_cksum = hdr->cksum;

	/* Don't attempt the full read if the offset/size pair is invalid. */
	if (__wt_block_offset_invalid(block, offset, hdr_size))
		WT_RET_MSG(session, EINVAL,
		    "block at offset %" PRIuMAX " cannot be a valid block, no "
		    "read attempted",
		    (uintmax_t)offset);

	return (__wt_block_read_off(
	    session, block, buf, offset, hdr_size, hdr_cksum));
}
/*
 * __wt_block_read_off_blind --
 *	Debugging only: read the block header found at a file offset and hand
 * back the size and checksum it records.
 */
int
__wt_block_read_off_blind(WT_SESSION_IMPL *session, WT_BLOCK *block,
    wt_off_t offset, uint32_t *sizep, uint32_t *checksump)
{
	WT_BLOCK_HEADER *hdr;
	WT_DECL_ITEM(scratch);
	WT_DECL_RET;

	/* The outputs are zero unless the read succeeds. */
	*sizep = 0;
	*checksump = 0;

	/* One allocation unit of scratch memory is enough for the header. */
	WT_RET(__wt_scr_alloc(session, block->allocsize, &scratch));

	/* Read the first allocation-size chunk and decode the header. */
	if ((ret = __wt_read(session, block->fh,
	    offset, (size_t)block->allocsize, scratch->mem)) == 0) {
		hdr = WT_BLOCK_HEADER_REF(scratch->mem);
		__wt_block_header_byteswap(hdr);

		*sizep = hdr->disk_size;
		*checksump = hdr->checksum;
	}

	/* The scratch buffer is released on success and failure alike. */
	__wt_scr_free(session, &scratch);
	return (ret);
}
/*
 * __wt_block_read_off --
 *	Read the block identified by an offset/size/checksum triplet into the
 * caller's buffer and validate its checksum.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *hdr;
	size_t need;
	uint32_t found_cksum;

	WT_RET(__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum));

	WT_STAT_FAST_CONN_INCR(session, block_read);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Read buffers are expected to be aligned, but not every buffer in
	 * the system is known to be.  When handed one that isn't flagged as
	 * aligned, flag it and ask for more memory than it currently holds so
	 * the initialization below is guaranteed to reallocate it.
	 */
	need = size;
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		F_SET(buf, WT_ITEM_ALIGNED);
		need = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, need));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * Two-step validation: the checksum stored in the header must match
	 * the caller's, and then the checksum calculated with the header
	 * field cleared must match as well.  The block's flags select between
	 * checksumming the whole block or only the leading bytes.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);
	found_cksum = hdr->cksum;
	if (found_cksum == cksum) {
		hdr->cksum = 0;
		if (F_ISSET(hdr, WT_BLOCK_DATA_CKSUM))
			found_cksum = __wt_cksum(buf->mem, size);
		else
			found_cksum =
			    __wt_cksum(buf->mem, WT_BLOCK_COMPRESS_SKIP);
		if (found_cksum == cksum)
			return (0);
	}

	/* Report the mismatch unless the session tolerates corruption. */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
		__wt_errx(session,
		    "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
		    PRIu32 " != %" PRIu32 "]",
		    size, (uintmax_t)offset, cksum, found_cksum);

	/*
	 * A failed checksum is a plain error while verifying or when the
	 * session tolerates corruption, any other read panics.
	 */
	if (block->verify || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
		return (WT_ERROR);
	return (__wt_illegal_value(session, block->name));
}
/*
 * __desc_read --
 *	Read the file's description block and check the magic number, the
 * checksum and the version numbers.
 */
static int
__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_BLOCK_DESC *desc;
	WT_DECL_ITEM(scratch);
	WT_DECL_RET;
	uint32_t saved_cksum;

	/* A scratch buffer gives us memory correctly aligned for direct I/O. */
	WT_RET(__wt_scr_alloc(session, block->allocsize, &scratch));

	/* The description lives in the first allocation-sized block. */
	WT_ERR(__wt_read(session,
	    block->fh, (off_t)0, (size_t)block->allocsize, scratch->mem));
	desc = scratch->mem;

	WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
	    "%s: magic %" PRIu32
	    ", major/minor: %" PRIu32 "/%" PRIu32
	    ", checksum %#" PRIx32,
	    block->name, desc->magic,
	    desc->majorv, desc->minorv,
	    desc->cksum));

	/*
	 * The open fails on a bad magic number, a bad checksum or an
	 * unsupported major/minor version.  The test applies to verify and
	 * salvage too: salvage should not overwrite a file without some
	 * evidence it is a WiredTiger file, the user may simply have typed
	 * the wrong name.
	 *
	 * The checksum is calculated with the on-disk field cleared, so save
	 * the stored value first.
	 */
	saved_cksum = desc->cksum;
	desc->cksum = 0;
	if (desc->magic != WT_BLOCK_MAGIC ||
	    saved_cksum != __wt_cksum(desc, block->allocsize))
		WT_ERR_MSG(session, WT_ERROR,
		    "%s does not appear to be a WiredTiger file", block->name);

	if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
	    (desc->majorv == WT_BLOCK_MAJOR_VERSION &&
	    desc->minorv > WT_BLOCK_MINOR_VERSION))
		WT_ERR_MSG(session, WT_ERROR,
		    "unsupported WiredTiger file version: this build only "
		    "supports major/minor versions up to %d/%d, and the file "
		    "is version %d/%d",
		    WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
		    desc->majorv, desc->minorv);

err:	__wt_scr_free(&scratch);
	return (ret);
}
/*
 * __fstream_getline --
 *	Return the next line of a stream in a buffer.
 *
 * A local stand-in for POSIX getline / BSD fgetln, which are awkward to find
 * portably.
 *
 * Differences from the standard calls: the trailing newline is not stored in
 * the returned buffer, and empty lines are skipped, so a returned length of 0
 * is the caller's end-of-file indication.
 */
static int
__fstream_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_ITEM *buf)
{
	const char *next;
	size_t chunk;
	char ch;

	/*
	 * The result is always NUL-terminated, even when empty: start with
	 * some memory so the terminating store is always legal.
	 */
	WT_RET(__wt_buf_init(session, buf, 100));

	for (;;) {
		/* Refill the stream buffer once it has been consumed. */
		if (WT_PTRDIFF(fstr->buf.data, fstr->buf.mem) >=
		    fstr->buf.size) {
			chunk = WT_MIN(WT_STREAM_BUFSIZE,
			    (size_t)(fstr->size - fstr->off));
			if (chunk == 0)
				break;			/* EOF */
			WT_RET(__wt_buf_initsize(session, &fstr->buf, chunk));
			WT_RET(__wt_read(session,
			    fstr->fh, fstr->off, chunk, fstr->buf.mem));
			fstr->off += (wt_off_t)chunk;
		}

		/* Consume one character from the stream buffer. */
		next = fstr->buf.data;
		ch = *next;
		fstr->buf.data = next + 1;

		/* Room for this character plus the trailing NUL. */
		WT_RET(__wt_buf_extend(session, buf, buf->size + 2));

		if (ch != '\n') {
			((char *)buf->mem)[buf->size++] = ch;
			continue;
		}

		/* A newline ends the line, unless the line is empty. */
		if (buf->size != 0)
			break;
	}

	((char *)buf->mem)[buf->size] = '\0';
	return (0);
}
/*
 * __log_filesize --
 *	Estimate where the real data in a log file ends.
 */
static int
__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint64_t first_word;
	uint32_t align;
	off_t file_size, pos;

	conn = S2C(session);
	log = conn->log;

	/* Nothing to do if the caller has nowhere to put the answer. */
	if (eof == NULL)
		return (0);
	*eof = 0;

	WT_ERR(__wt_filesize(session, fh, &file_size));

	/* Without a log structure, fall back to the LOG_ALIGN boundary. */
	align = log == NULL ? LOG_ALIGN : log->allocsize;

	/*
	 * Log records are aligned on allocation boundaries and begin with
	 * their length.  Walk backward from the end of the file, one
	 * allocation unit at a time, until a boundary holds something other
	 * than zero.  That may be the middle of a large record rather than
	 * the start of one, but no record can start after it, so the next
	 * boundary is a usable estimate of the end of the log.
	 */
	pos = file_size - (off_t)align;
	while (pos > 0) {
		WT_ERR(__wt_read(
		    session, fh, pos, sizeof(uint64_t), &first_word));
		if (first_word != 0)
			break;
		pos -= (off_t)align;
	}

	/* The estimate is the boundary following the position we stopped at. */
	*eof = pos + (off_t)align;

err:	return (ret);
}
/*
 * __wt_block_salvage_next --
 *	Find the next chunk of the file that reads as a block and return an
 * address cookie for it, or set the end-of-file flag.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *eofp)
{
	WT_BLOCK_HEADER *hdr;
	WT_DECL_ITEM(scratch);
	WT_DECL_RET;
	WT_FH *fh;
	wt_off_t file_end, offset;
	uint32_t allocsize, checksum, size;
	uint8_t *cookie_end;

	*eofp = 0;

	fh = block->fh;
	allocsize = block->allocsize;
	WT_ERR(__wt_scr_alloc(session, allocsize, &scratch));

	/* Step through the file one allocation unit at a time. */
	file_end = block->size;
	for (;;) {
		offset = block->slvg_off;
		if (offset >= file_end) {		/* Check eof. */
			*eofp = 1;
			goto done;
		}

		/*
		 * Read an allocation-size chunk as if it were the start of a
		 * block and pull a size and checksum out of its header.
		 */
		WT_ERR(__wt_read(
		    session, fh, offset, (size_t)allocsize, scratch->mem));
		hdr = WT_BLOCK_HEADER_REF(scratch->mem);
		__wt_block_header_byteswap(hdr);
		size = hdr->disk_size;
		checksum = hdr->checksum;

		/*
		 * If the offset/size pair passes validation, read the whole
		 * block, which also validates the checksum.  A successful
		 * read means this is a candidate page and we're done.
		 */
		if (!__wt_block_offset_invalid(block, offset, size) &&
		    __wt_block_read_off(session,
		    block, scratch, offset, size, checksum) == 0)
			break;

		/*
		 * Not a block: free this allocation-size chunk and advance
		 * the salvage offset past it.
		 */
		__wt_verbose(session, WT_VERB_SALVAGE,
		    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
		    allocsize, (uintmax_t)offset);
		WT_ERR(__wt_block_off_free(
		    session, block, offset, (wt_off_t)allocsize));
		block->slvg_off += allocsize;
	}

	/* Build the address cookie that references the block we found. */
	cookie_end = addr;
	WT_ERR(__wt_block_addr_to_buffer(
	    block, &cookie_end, offset, size, checksum));
	*addr_sizep = WT_PTRDIFF(cookie_end, addr);

done:
err:	__wt_scr_free(session, &scratch);
	return (ret);
}
/*
 * __wt_block_salvage_next --
 *	Read the next valid block from the file into the caller's buffer,
 * returning its address cookie and write generation, or set the end-of-file
 * flag.
 */
int
__wt_block_salvage_next(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
{
	WT_BLOCK_HEADER *hdr;
	WT_FH *fh;
	off_t file_end, offset;
	uint32_t allocsize, cksum, size;
	uint8_t *cookie_end;

	*eofp = 0;

	offset = block->slvg_off;
	fh = block->fh;
	allocsize = block->allocsize;
	WT_RET(__wt_buf_initsize(session, buf, allocsize));

	/* Walk the file looking for pages with valid checksums. */
	file_end = fh->file_size;
	for (;;) {
		if (offset >= file_end) {		/* Check eof. */
			*eofp = 1;
			return (0);
		}

		/*
		 * Read an allocation-size chunk as if it were the start of a
		 * page and pull the page length out of its header.
		 */
		WT_RET(__wt_read(session, fh, offset, allocsize, buf->mem));
		hdr = WT_BLOCK_HEADER_REF(buf->mem);
		size = hdr->disk_size;
		cksum = hdr->cksum;

		/*
		 * A plausible size is non-zero, a multiple of the allocation
		 * size, no larger than the maximum page size and doesn't run
		 * past the end of the file.  Only then is the entire page
		 * read; a failed read is treated as corruption and the chunk
		 * is ignored like any other implausible one.
		 */
		if (size != 0 && size % allocsize == 0 &&
		    size <= WT_BTREE_PAGE_SIZE_MAX &&
		    offset + (off_t)size <= file_end &&
		    __wt_block_read_off(
		    session, block, buf, offset, size, cksum) == 0)
			break;

		WT_VERBOSE_RET(session, salvage,
		    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
		    allocsize, (uintmax_t)offset);

		/*
		 * Free the chunk and step over it so it's never returned
		 * more than once.
		 */
		WT_RET(__wt_block_off_free(
		    session, block, offset, (off_t)allocsize));
		offset += allocsize;
		block->slvg_off = offset;
	}

	/*
	 * Valid block.  The buffer may have grown during the read, re-acquire
	 * the header from the full page image.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);

	/*
	 * Remember the largest write generation seen in the file so writes
	 * done after salvage completes are preferred to these blocks.
	 */
	*write_genp = hdr->write_gen;
	if (block->live.write_gen < hdr->write_gen)
		block->live.write_gen = hdr->write_gen;

	/* Build the address cookie that references this block. */
	cookie_end = addr;
	WT_RET(__wt_block_addr_to_buffer(
	    block, &cookie_end, offset, size, cksum));
	*addr_sizep = WT_PTRDIFF32(cookie_end, addr);

	/* The page is being returned, the next search starts after it. */
	block->slvg_off = offset + size;

	return (0);
}
/*
 * __conn_config_file --
 *	Read in any WiredTiger_config file in the home directory.
 *
 * If the file exists and is not empty, its lines are collapsed into a single
 * NUL-terminated configuration string, the string is checked, and it is
 * inserted into the cfg stack just below the final entry.  The scratch buffer
 * holding the string is returned in *cbufp (NULL if there was no usable file);
 * cfg then points into it.
 *
 * Returns 0 on success (including when there is no file or it is empty),
 * EFBIG if the file is larger than 100KB, or the error from a failing call.
 */
static int
__conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp)
{
	WT_DECL_ITEM(cbuf);
	WT_DECL_RET;
	WT_FH *fh;
	off_t size;
	uint32_t len;
	int exist, quoted;
	uint8_t *p, *t;

	*cbufp = NULL;				/* Returned buffer */

	fh = NULL;

	/* Check for an optional configuration file. */
#define	WT_CONFIGFILE	"WiredTiger.config"
	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
	if (!exist)
		return (0);

	/* Open the configuration file. */
	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
	WT_ERR(__wt_filesize(session, fh, &size));
	if (size == 0)
		goto err;		/* ret is 0: just close the file. */

	/*
	 * Sanity test: a 100KB configuration file would be insane.  (There's
	 * no practical reason to limit the file size, but I can either limit
	 * the file size to something rational, or I can add code to test if
	 * the off_t size is larger than a uint32_t, which is more complicated
	 * and a waste of time.)
	 */
	if (size > 100 * 1024)
		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
	len = (uint32_t)size;

	/*
	 * Copy the configuration file into memory, with a little slop, I'm not
	 * interested in debugging off-by-ones.
	 *
	 * The beginning of a file is the same as if we run into an unquoted
	 * newline character, simplify the parsing loop by pretending that's
	 * what we're doing: the file is read in after a synthetic leading
	 * newline.
	 */
	WT_ERR(__wt_scr_alloc(session, len + 10, &cbuf));
	WT_ERR(
	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
	((uint8_t *)cbuf->mem)[0] = '\n';
	cbuf->size = len + 1;

	/*
	 * The parsing loop starts at the synthetic newline and uses len as
	 * the count of bytes remaining from the current position, inclusive.
	 * Count the synthetic newline too, otherwise the loop stops one byte
	 * short and a file without a trailing newline loses its last
	 * character.  (The file is at most 100KB, this cannot overflow.)
	 */
	++len;

	/*
	 * Collapse the file's lines into a single string: newline characters
	 * are replaced with commas unless the newline is quoted or backslash
	 * escaped.  Comment lines (an unescaped newline where the next non-
	 * white-space character is a hash), are discarded.
	 *
	 * The collapse is done in place: p is the read position, t the write
	 * position, and t never gets ahead of p.
	 */
	for (quoted = 0, p = t = cbuf->mem; len > 0;) {
		/*
		 * Backslash pairs pass through untouched, unless immediately
		 * preceding a newline, in which case both the backslash and
		 * the newline are discarded.  Backslash characters escape
		 * quoted characters, too, that is, a backslash followed by a
		 * quote doesn't start or end a quoted string.
		 */
		if (*p == '\\' && len > 1) {
			if (p[1] != '\n') {
				*t++ = p[0];
				*t++ = p[1];
			}
			p += 2;
			len -= 2;
			continue;
		}

		/*
		 * If we're in a quoted string, or starting a quoted string,
		 * take all characters, including white-space and newlines.
		 */
		if (quoted || *p == '"') {
			if (*p == '"')
				quoted = !quoted;
			*t++ = *p++;
			--len;
			continue;
		}

		/* Everything else gets taken, except for newline characters. */
		if (*p != '\n') {
			*t++ = *p++;
			--len;
			continue;
		}

		/*
		 * Replace any newline characters with commas (and strings of
		 * commas are safe).
		 *
		 * After any newline, skip to a non-white-space character; if
		 * the next character is a hash mark, skip to the next newline.
		 */
		for (;;) {
			for (*t++ = ','; --len > 0 && isspace(*++p);)
				;
			if (len == 0)
				break;
			if (*p != '#')
				break;
			while (--len > 0 && *++p != '\n')
				;
			if (len == 0)
				break;
		}
	}
	*t = '\0';

#if 0
	fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data);
#endif

	/* Check the configuration string. */
	WT_ERR(__wt_config_check(
	    session, __wt_confchk_wiredtiger_open, cbuf->data, 0));

	/*
	 * The configuration file falls between the default configuration and
	 * the wiredtiger_open() configuration, overriding the defaults but not
	 * overriding the wiredtiger_open() configuration.
	 */
	while (cfg[1] != NULL)
		++cfg;
	cfg[1] = cfg[0];
	cfg[0] = cbuf->data;

	*cbufp = cbuf;

	/*
	 * The error label lives in a never-taken branch so the buffer is only
	 * released on failure; the file handle is closed on every path.
	 */
	if (0) {
err:		if (cbuf != NULL)
			__wt_buf_free(session, cbuf);
	}
	if (fh != NULL)
		WT_TRET(__wt_close(session, fh));
	return (ret);
}
/*
 * __wt_log_scan --
 *	Scan the logs, calling a function on each record found.
 *
 * The scan starts at *lsnp, or, when lsnp is NULL, at the position selected
 * by WT_LOGSCAN_FIRST or WT_LOGSCAN_FROM_CKP.  The callback is invoked for
 * every record with a matching checksum other than the record at offset 0 of
 * each file.  WT_LOGSCAN_ONE stops the scan after the first callback,
 * WT_LOGSCAN_RECOVER truncates log files as the scan proceeds.
 *
 * Returns 0 on success, WT_NOTFOUND for a starting LSN that is not on an
 * allocation boundary or is past the current log file (and for
 * WT_LOGSCAN_ONE when the end of a log file was reached), WT_ERROR for
 * illegal flag combinations, ENOTSUP if there is no log structure and no log
 * files are found, or an error from the callback or an underlying call.  An
 * ENOENT result is converted to success.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
	WT_CONNECTION_IMPL *conn;
	WT_ITEM buf;
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN end_lsn, rd_lsn, start_lsn;
	off_t log_size;
	uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
	u_int i, logcount;
	int eol;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;
	logcount = 0;
	logfiles = NULL;
	eol = 0;
	WT_CLEAR(buf);

	/*
	 * If the caller did not give us a callback function there is nothing
	 * to do.
	 */
	if (func == NULL)
		return (0);

	/*
	 * NOTE(review): log is dereferenced here before the log != NULL test
	 * below; presumably WT_LOGSCAN_RECOVER is only passed when logging is
	 * configured -- confirm against callers.
	 */
	if (LF_ISSET(WT_LOGSCAN_RECOVER))
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "__wt_log_scan truncating to %u/%" PRIuMAX,
		    log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

	if (log != NULL) {
		allocsize = log->allocsize;

		if (lsnp == NULL) {
			/* No starting LSN: a start flag is required. */
			if (LF_ISSET(WT_LOGSCAN_FIRST))
				start_lsn = log->first_lsn;
			else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
				start_lsn = log->ckpt_lsn;
			else
				return (WT_ERROR);	/* Illegal usage */
		} else {
			/* A starting LSN and a start flag are exclusive. */
			if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
				WT_RET_MSG(session, WT_ERROR,
			    "choose either a start LSN or a start flag");

			/* Offsets must be on allocation boundaries. */
			if (lsnp->offset % allocsize != 0 ||
			    lsnp->file > log->fileid)
				return (WT_NOTFOUND);

			/*
			 * Log cursors may not know the starting LSN.  If an
			 * LSN pointer is passed in, but it is the INIT_LSN,
			 * start from the first_lsn.
			 */
			start_lsn = *lsnp;
			if (IS_INIT_LSN(&start_lsn))
				start_lsn = log->first_lsn;
		}
		end_lsn = log->alloc_lsn;
	} else {
		/*
		 * If logging is not configured, we can still print out the log
		 * if log files exist.  We just need to set the LSNs from what
		 * is in the files versus what is in the live connection.
		 */
		/*
		 * Set allocsize to the minimum alignment it could be.  Larger
		 * records and larger allocation boundaries should always be
		 * a multiple of this.
		 */
		allocsize = LOG_ALIGN;
		lastlog = 0;
		firstlog = UINT32_MAX;
		WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
		if (logcount == 0)
			/*
			 * Return not-supported if no log files exist.
			 */
			return (ENOTSUP);

		/* The scan covers the lowest through highest log numbers. */
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
			    &lognum));
			lastlog = WT_MAX(lastlog, lognum);
			firstlog = WT_MIN(firstlog, lognum);
		}
		start_lsn.file = firstlog;
		end_lsn.file = lastlog;
		start_lsn.offset = end_lsn.offset = 0;

		/* Clear the pointer so the error path doesn't free it again. */
		__wt_log_files_free(session, logfiles, logcount);
		logfiles = NULL;
	}

	/* Open the first file and size the read buffer for one LOG_ALIGN. */
	WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
	WT_ERR(__log_filesize(session, log_fh, &log_size));
	rd_lsn = start_lsn;
	WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
	for (;;) {
		if (rd_lsn.offset + allocsize > log_size) {
			/*
			 * This label is also the target of a jump from further
			 * down the loop, taken when a record would run past
			 * the estimated end of the file.
			 */
advance:
			/*
			 * If we read the last record, go to the next file.
			 */
			WT_ERR(__wt_close(session, log_fh));
			log_fh = NULL;
			eol = 1;
			/*
			 * Truncate this log file before we move to the next.
			 */
			if (LF_ISSET(WT_LOGSCAN_RECOVER))
				WT_ERR(__log_truncate(session, &rd_lsn, 1));
			rd_lsn.file++;
			rd_lsn.offset = 0;
			/*
			 * Avoid an error message when we reach end of log
			 * by checking here.
			 */
			if (rd_lsn.file > end_lsn.file)
				break;
			WT_ERR(__log_openfile(
			    session, 0, &log_fh, rd_lsn.file));
			WT_ERR(__log_filesize(session, log_fh, &log_size));
			continue;
		}
		/*
		 * Read the minimum allocation size a record could be.
		 */
		WT_ASSERT(session, buf.memsize >= allocsize);
		WT_ERR(__wt_read(session,
		    log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
		/*
		 * The first 4 bytes, read as a uint32_t, are the real record
		 * length.  See if we need to read more than the allocation
		 * size.  We expect that we rarely will have to read more.
		 * Most log records will be fairly small.
		 */
		reclen = *(uint32_t *)buf.mem;
		/*
		 * Log files are pre-allocated.  We never expect a zero length
		 * unless we've reached the end of the log.  The log can be
		 * written out of order, so when recovery finds the end of
		 * the log, truncate the file and remove any later log files
		 * that may exist.
		 */
		if (reclen == 0) {
			/* This LSN is the end. */
			break;
		}

		/* Round the record length up to the allocation size. */
		rdup_len = __wt_rduppo2(reclen, allocsize);
		if (reclen > allocsize) {
			/*
			 * The log file end could be the middle of this
			 * log record.
			 */
			if (rd_lsn.offset + rdup_len > log_size)
				goto advance;
			/*
			 * We need to round up and read in the full padded
			 * record, especially for direct I/O.
			 */
			WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
			WT_ERR(__wt_read(session,
			    log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
			WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
		}
		/*
		 * We read in the record, verify checksum.  The checksum is
		 * calculated with the record's checksum field cleared, and the
		 * calculated value is stored back into the record.
		 */
		buf.size = reclen;
		logrec = (WT_LOG_RECORD *)buf.mem;
		cksum = logrec->checksum;
		logrec->checksum = 0;
		logrec->checksum = __wt_cksum(logrec, logrec->len);
		if (logrec->checksum != cksum) {
			/*
			 * A checksum mismatch means we have reached the end of
			 * the useful part of the log.  This should be found on
			 * the first pass through recovery.  In the second pass
			 * where we truncate the log, this is where it should
			 * end.
			 */
			if (log != NULL)
				log->trunc_lsn = rd_lsn;
			break;
		}

		/*
		 * We have a valid log record.  If it is not the log file
		 * header, invoke the callback.
		 */
		WT_STAT_FAST_CONN_INCR(session, log_scan_records);
		if (rd_lsn.offset != 0) {
			WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
			if (LF_ISSET(WT_LOGSCAN_ONE))
				break;
		}

		/* Step to the next record, past any padding. */
		rd_lsn.offset += (off_t)rdup_len;
	}

	/*
	 * Truncate if we're in recovery.
	 *
	 * NOTE(review): log is dereferenced here without a NULL test, see the
	 * note on the WT_LOGSCAN_RECOVER message at the top of the function.
	 */
	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
	    LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
		WT_ERR(__log_truncate(session, &rd_lsn, 0));

	/* Cleanup runs on success and failure alike. */
err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	__wt_buf_free(session, &buf);
	/*
	 * If the caller wants one record and it is at the end of log,
	 * return WT_NOTFOUND.
	 */
	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
		ret = WT_NOTFOUND;

	/* ENOENT is not reported to the caller as an error. */
	if (ret == ENOENT)
		ret = 0;
	if (log_fh != NULL)
		WT_TRET(__wt_close(session, log_fh));
	return (ret);
}
/*
 * __wt_log_read --
 *	Read the log record found at an LSN into the caller's WT_ITEM, log
 * header included.  The caller owns the item and must free it.
 */
int
__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	uint32_t padded_len, reclen, saved_cksum;

	WT_UNUSED(flags);

	/* Without an LSN to read or an item to fill, there's no work. */
	if (lsnp == NULL || record == NULL)
		return (0);

	log = S2C(session)->log;

	/*
	 * An offset that isn't on an allocation boundary, or a file past the
	 * current log file, can't reference a record.
	 */
	if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
		return (WT_NOTFOUND);

	WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));

	/* Start with the smallest size a record could have. */
	WT_ERR(__wt_buf_init(session, record, log->allocsize));
	WT_ERR(__wt_read(session,
	    log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));

	/*
	 * The record begins with its real length in the first 4 bytes; a zero
	 * length means there is no record here.
	 */
	reclen = *(uint32_t *)record->mem;
	if (reclen == 0)
		WT_ERR(WT_NOTFOUND);

	/*
	 * Most records are small, but one larger than the allocation size
	 * has to be read again in full, rounded up to the allocation size.
	 */
	if (reclen > log->allocsize) {
		padded_len = __wt_rduppo2(reclen, log->allocsize);
		WT_ERR(__wt_buf_grow(session, record, padded_len));
		WT_ERR(__wt_read(session,
		    log_fh, lsnp->offset, (size_t)padded_len, record->mem));
	}

	/*
	 * Verify the checksum: it is calculated with the record's checksum
	 * field cleared, and the calculated value is stored back into the
	 * record.
	 */
	logrec = (WT_LOG_RECORD *)record->mem;
	saved_cksum = logrec->checksum;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, logrec->len);
	if (logrec->checksum != saved_cksum)
		WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");

	record->size = logrec->len;
	WT_STAT_FAST_CONN_INCR(session, log_reads);

	/* The log file is closed on every path. */
err:	WT_TRET(__wt_close(session, log_fh));
	return (ret);
}
/*
 * __wt_block_read_off --
 *	Read the block identified by an offset/size/checksum triplet into the
 * caller's buffer, validate its checksum and byte-swap the page header.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *hdr, native;
	size_t need;
	uint32_t calc_cksum;

	__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

	WT_STAT_FAST_CONN_INCR(session, block_read);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Read buffers are expected to be aligned, but not every buffer in
	 * the system is known to be.  When handed one that isn't flagged as
	 * aligned, flag it and ask for more memory than it currently holds so
	 * the initialization below is guaranteed to reallocate it.
	 */
	need = size;
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		F_SET(buf, WT_ITEM_ALIGNED);
		need = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, need));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * The header is examined before the checksum is calculated, so take
	 * a byte-swapped copy to examine and leave the original image for
	 * the checksum calculation.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap_copy(hdr, &native);

	if (native.cksum != cksum) {
		/* The header doesn't even claim the expected checksum. */
		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": block header checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, native.cksum, cksum);
	} else {
		/*
		 * The checksum is calculated with the header's field cleared;
		 * the block's flags select between checksumming the whole
		 * block or only the leading bytes.
		 */
		hdr->cksum = 0;
		if (F_ISSET(&native, WT_BLOCK_DATA_CKSUM))
			calc_cksum = __wt_cksum(buf->mem, size);
		else
			calc_cksum =
			    __wt_cksum(buf->mem, WT_BLOCK_COMPRESS_SKIP);

		if (calc_cksum == cksum) {
			/*
			 * Swapping the page header isn't really this
			 * function's job, but doing it here catches all
			 * callers.
			 */
			__wt_page_header_byteswap(buf->mem);
			return (0);
		}

		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": calculated block checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, calc_cksum, cksum);
	}

	/*
	 * A failed checksum is a plain error while verifying or when the
	 * session is quiet about corrupt files, any other read panics.
	 */
	if (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
		return (WT_ERROR);
	return (__wt_illegal_value(session, block->name));
}
/*
 * __wt_block_salvage_next --
 *	Find the next chunk of the file that reads as a block and return an
 * address cookie for it, or set the end-of-file flag.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
	WT_BLOCK_HEADER *hdr;
	WT_DECL_ITEM(scratch);
	WT_DECL_RET;
	WT_FH *fh;
	off_t file_end, offset;
	uint32_t allocsize, cksum, size;
	uint8_t *cookie_end;

	*eofp = 0;

	fh = block->fh;
	allocsize = block->allocsize;
	WT_ERR(__wt_scr_alloc(session, allocsize, &scratch));

	/* Step through the file one allocation unit at a time. */
	file_end = fh->size;
	for (;;) {
		offset = block->slvg_off;
		if (offset >= file_end) {		/* Check eof. */
			*eofp = 1;
			goto done;
		}

		/*
		 * Read an allocation-size chunk as if it were the start of a
		 * page.  The salvage offset is advanced immediately: whatever
		 * happens next, this chunk is never considered again.
		 */
		WT_ERR(__wt_read(session, fh, offset, allocsize, scratch->mem));
		hdr = WT_BLOCK_HEADER_REF(scratch->mem);
		block->slvg_off += allocsize;

		size = hdr->disk_size;
		cksum = hdr->cksum;

		/*
		 * A plausible size is non-zero, a multiple of the allocation
		 * size, no larger than the maximum page size and doesn't run
		 * past the end of the file.  Only then is the entire block
		 * read, which validates the checksum; if that works, the
		 * block's address is returned as a possible page.
		 */
		if (size != 0 && size % allocsize == 0 &&
		    size <= WT_BTREE_PAGE_SIZE_MAX &&
		    offset + (off_t)size <= file_end &&
		    __wt_block_read_off(
		    session, block, scratch, offset, size, cksum) == 0)
			break;

		WT_VERBOSE_ERR(session, salvage,
		    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
		    allocsize, (uintmax_t)offset);

		/* Not a block: free the allocation-size chunk. */
		WT_ERR(__wt_block_off_free(
		    session, block, offset, (off_t)allocsize));
	}

	/* Build the address cookie that references the block we found. */
	cookie_end = addr;
	WT_ERR(__wt_block_addr_to_buffer(
	    block, &cookie_end, offset, size, cksum));
	*addr_sizep = WT_PTRDIFF32(cookie_end, addr);

done:
err:	__wt_scr_free(&scratch);
	return (ret);
}
/*
 * __wt_block_read_off --
 *	Read the block identified by an offset/size/checksum triplet into the
 * caller's buffer and validate its checksum.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *hdr;
	uint32_t calc_cksum, need;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Diagnostic builds check that the block being read isn't on the
	 * available or discard lists.
	 *
	 * The check is skipped during salvage, which may legitimately read
	 * an overflow page that has already been freed.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * Read buffers are expected to be aligned, but not every buffer in
	 * the system is known to be.  When handed one that isn't flagged as
	 * aligned, flag it and ask for more memory than it currently holds so
	 * the initialization below is guaranteed to reallocate it.
	 */
	need = size;
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		F_SET(buf, WT_ITEM_ALIGNED);
		need = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, need));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * The checksum is calculated with the header's field cleared; the
	 * block's flags select between checksumming the whole block or only
	 * the leading bytes.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);
	hdr->cksum = 0;
	if (F_ISSET(hdr, WT_BLOCK_DATA_CKSUM))
		calc_cksum = __wt_cksum(buf->mem, size);
	else
		calc_cksum = __wt_cksum(buf->mem, WT_BLOCK_COMPRESS_SKIP);

	/* Statistics are only updated for successful reads. */
	if (cksum == calc_cksum) {
		WT_CSTAT_INCR(session, block_read);
		WT_CSTAT_INCRV(session, block_byte_read, size);
		return (0);
	}

	/* Salvage expects bad blocks and doesn't want them reported. */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		__wt_errx(session,
		    "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
		    PRIu32 " != %" PRIu32 "]",
		    size, (uintmax_t)offset, cksum, calc_cksum);
	return (WT_ERROR);
}
/*
 * __wt_block_read_off --
 *	Read the block identified by an offset/size/checksum triplet into the
 * caller's buffer, validating the checksum and decompressing as needed.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *hdr;
	WT_DECL_ITEM(scratch);
	WT_DECL_RET;
	WT_PAGE_HEADER *page;
	size_t out_len;
	uint32_t calc_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Diagnostic builds check that the block being read isn't on the
	 * available or discard lists.
	 *
	 * The check is skipped during salvage, which may legitimately read
	 * an overflow page that has already been freed.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * With a compressor configured, the on-disk image goes into a scratch
	 * buffer, because decompression needs a second, larger buffer
	 * anyway.  Without one, the image is read directly into the caller's
	 * buffer, grown as necessary.
	 */
	if (block->compressor != NULL) {
		WT_RET(__wt_scr_alloc(session, size, &scratch));
		scratch->size = size;
		page = scratch->mem;
	} else {
		F_SET(buf, WT_ITEM_ALIGNED);
		WT_RET(__wt_buf_init(session, buf, size));
		buf->size = size;
		page = buf->mem;
	}

	/* Read. */
	WT_ERR(__wt_read(session, block->fh, offset, size, page));
	hdr = WT_BLOCK_HEADER_REF(page);

	/*
	 * Validate the checksum, if checksums are configured and both the
	 * caller's and the block's values are set.  The checksum is
	 * calculated with the header's field cleared, and a calculated value
	 * equal to the "not set" marker is bumped by one.
	 */
	if (block->checksum &&
	    cksum != WT_BLOCK_CHECKSUM_NOT_SET &&
	    hdr->cksum != WT_BLOCK_CHECKSUM_NOT_SET) {
		hdr->cksum = 0;
		calc_cksum = __wt_cksum(page, size);
		if (calc_cksum == WT_BLOCK_CHECKSUM_NOT_SET)
			++calc_cksum;
		if (cksum != calc_cksum) {
			if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
				__wt_errx(session,
				    "read checksum error [%"
				    PRIu32 "B @ %" PRIuMAX ", %"
				    PRIu32 " != %" PRIu32 "]",
				    size, (uintmax_t)offset, cksum, calc_cksum);
			WT_ERR(WT_ERROR);
		}
	}

	/*
	 * A block is compressed when its in-memory size is larger than its
	 * on-disk size; when the in-memory size is less than or equal to the
	 * on-disk size it is not.
	 */
	if (hdr->disk_size >= page->size) {
		if (block->compressor == NULL)
			/* The image is already in the caller's buffer. */
			buf->size = page->size;
		else
			/*
			 * Wrong guess: a compressor is configured but this
			 * block wasn't compressed, so the page sits in the
			 * scratch buffer, which may be large.  This should be
			 * rare.  Copy the data into the caller's buffer at
			 * the right size.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, scratch->data, page->size));
	} else {
		/* A compressed block with no compressor is illegal. */
		if (block->compressor == NULL)
			WT_ERR(__wt_illegal_value(session, block->name));

		/* Size the caller's buffer for the in-memory image. */
		WT_ERR(__wt_buf_init(session, buf, page->size));
		buf->size = page->size;

		/*
		 * The leading skipped bytes are copied into place, the rest
		 * is decompressed.
		 *
		 * The source length passed to the compressor is the length of
		 * the block just read minus the skipped bytes, NOT the number
		 * of compressed bytes, which isn't stored.  Engines without
		 * an end-of-stream marker must record the compressed length
		 * themselves, see the snappy compression extension for an
		 * example.
		 */
		memcpy(buf->mem, scratch->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(block->compressor->decompress(
		    block->compressor, &session->iface,
		    (uint8_t *)scratch->mem + WT_BLOCK_COMPRESS_SKIP,
		    scratch->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    page->size - WT_BLOCK_COMPRESS_SKIP,
		    &out_len));

		/* Decompression must produce exactly the expected length. */
		if (out_len != page->size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(__wt_illegal_value(session, block->name));
	}

	/* Statistics are only updated for successful reads. */
	WT_BSTAT_INCR(session, page_read);
	WT_CSTAT_INCR(session, block_read);

err:	__wt_scr_free(&scratch);
	return (ret);
}