/*
 * __wt_desc_init --
 *     Write a file's initial descriptor structure.
 */
int
__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
{
    WT_BLOCK_DESC *desc;
    WT_DECL_ITEM(buf);
    WT_DECL_RET;

    /* Use a scratch buffer to get correct alignment for direct I/O. */
    WT_RET(__wt_scr_alloc(session, allocsize, &buf));
    memset(buf->mem, 0, allocsize);

    desc = buf->mem;
    desc->magic = WT_BLOCK_MAGIC;
    desc->majorv = WT_BLOCK_MAJOR_VERSION;
    desc->minorv = WT_BLOCK_MINOR_VERSION;

    /* Update the checksum. */
    desc->cksum = 0;
    desc->cksum = __wt_cksum(desc, allocsize);

    ret = __wt_write(session, fh, (off_t)0, (size_t)allocsize, desc);

    __wt_scr_free(&buf);
    return (ret);
}
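The descriptor checksum above is self-referential: the cksum field sits inside the region being checksummed, so it is zeroed before __wt_cksum runs, and the reader repeats the same dance before comparing (see __desc_read below). A minimal sketch of that pattern, using a hypothetical struct and a placeholder checksum32 helper rather than the WiredTiger API:

#include <stddef.h>
#include <stdint.h>

/* Stand-in for a real checksum (e.g. CRC-32C); any 32-bit hash works here. */
static uint32_t
checksum32(const void *data, size_t len)
{
    const uint8_t *p = data;
    uint32_t h = 2166136261u;            /* FNV-1a, placeholder only */

    while (len-- > 0)
        h = (h ^ *p++) * 16777619u;
    return (h);
}

struct desc {
    uint32_t magic;
    uint32_t cksum;                      /* covered by the checksum itself */
};

/* Seal: zero the embedded field, then store the checksum of the whole struct. */
static void
desc_seal(struct desc *d)
{
    d->cksum = 0;
    d->cksum = checksum32(d, sizeof(*d));
}

/* Verify: save the stored value, zero the field, recompute and compare. */
static int
desc_verify(struct desc *d)
{
    uint32_t saved, computed;

    saved = d->cksum;
    d->cksum = 0;
    computed = checksum32(d, sizeof(*d));
    d->cksum = saved;                    /* put the stored value back */
    return (computed == saved);
}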
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk; size_t bufsize; uint32_t page_cksum; WT_RET(__wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum)); WT_STAT_FAST_CONN_INCR(session, block_read); WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size); /* * Grow the buffer as necessary and read the block. Buffers should be * aligned for reading, but there are lots of buffers (for example, file * cursors have two buffers each, key and value), and it's difficult to * be sure we've found all of them. If the buffer isn't aligned, it's * an easy fix: set the flag and guarantee we reallocate it. (Most of * the time on reads, the buffer memory has not yet been allocated, so * we're not adding any additional processing time.) */ if (F_ISSET(buf, WT_ITEM_ALIGNED)) bufsize = size; else { F_SET(buf, WT_ITEM_ALIGNED); bufsize = WT_MAX(size, buf->memsize + 10); } WT_RET(__wt_buf_init(session, buf, bufsize)); WT_RET(__wt_read(session, block->fh, offset, size, buf->mem)); buf->size = size; blk = WT_BLOCK_HEADER_REF(buf->mem); page_cksum = blk->cksum; if (page_cksum == cksum) { blk->cksum = 0; page_cksum = __wt_cksum(buf->mem, F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP); if (page_cksum == cksum) return (0); } if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) __wt_errx(session, "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" PRIu32 " != %" PRIu32 "]", size, (uintmax_t)offset, cksum, page_cksum); /* Panic if a checksum fails during an ordinary read. */ return (block->verify || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? WT_ERROR : __wt_illegal_value(session, block->name)); }
/* * __desc_read -- * Read and verify the file's metadata. */ static int __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_BLOCK_DESC *desc; WT_DECL_ITEM(buf); WT_DECL_RET; uint32_t cksum; /* Use a scratch buffer to get correct alignment for direct I/O. */ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf)); /* Read the first allocation-sized block and verify the file format. */ WT_ERR(__wt_read( session, block->fh, (off_t)0, (size_t)block->allocsize, buf->mem)); desc = buf->mem; WT_ERR(__wt_verbose(session, WT_VERB_BLOCK, "%s: magic %" PRIu32 ", major/minor: %" PRIu32 "/%" PRIu32 ", checksum %#" PRIx32, block->name, desc->magic, desc->majorv, desc->minorv, desc->cksum)); /* * We fail the open if the checksum fails, or the magic number is wrong * or the major/minor numbers are unsupported for this version. This * test is done even if the caller is verifying or salvaging the file: * it makes sense for verify, and for salvage we don't overwrite files * without some reason to believe they are WiredTiger files. The user * may have entered the wrong file name, and is now frantically pounding * their interrupt key. */ cksum = desc->cksum; desc->cksum = 0; if (desc->magic != WT_BLOCK_MAGIC || cksum != __wt_cksum(desc, block->allocsize)) WT_ERR_MSG(session, WT_ERROR, "%s does not appear to be a WiredTiger file", block->name); if (desc->majorv > WT_BLOCK_MAJOR_VERSION || (desc->majorv == WT_BLOCK_MAJOR_VERSION && desc->minorv > WT_BLOCK_MINOR_VERSION)) WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build only " "supports major/minor versions up to %d/%d, and the file " "is version %d/%d", WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, desc->majorv, desc->minorv); err: __wt_scr_free(&buf); return (ret); }
/* * copy -- * Copy the created page to the end of the salvage file. */ void copy(u_int gen, u_int recno) { FILE *ifp, *ofp; WT_PAGE_HEADER *dsk; WT_BLOCK_HEADER *blk; char buf[PSIZE]; assert((ifp = fopen(LOAD, "r")) != NULL); /* * If the salvage file doesn't exist, then we're creating it: * copy the first sector (the file description). * Otherwise, we are appending to an existing file. */ if (access(SLVG, F_OK)) { assert((ofp = fopen(SLVG, "w")) != NULL); assert(fread(buf, 1, 512, ifp) == 512); assert(fwrite(buf, 1, 512, ofp) == 512); } else assert((ofp = fopen(SLVG, "a")) != NULL); /* * If there's data, copy/update the first formatted page. */ if (gen != 0) { assert(fseek(ifp, (long)512, SEEK_SET) == 0); assert(fread(buf, 1, PSIZE, ifp) == PSIZE); dsk = (void *)buf; if (page_type != WT_PAGE_ROW_LEAF) dsk->recno = recno; blk = WT_BLOCK_HEADER_REF(buf); blk->write_gen = gen; blk->cksum = 0; blk->cksum = __wt_cksum(dsk, PSIZE); assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE); } assert(fclose(ifp) == 0); assert(fclose(ofp) == 0); }
/* * __wt_log_write -- * Write a record into the log. */ int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN lsn; WT_MYSLOT myslot; uint32_t rdup_len; int locked; conn = S2C(session); log = conn->log; locked = 0; INIT_LSN(&lsn); myslot.slot = NULL; /* * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has * a header at the beginning for us to fill in. * * If using direct_io, the caller should pass us an aligned record. * But we need to make sure it is big enough and zero-filled so * that we can write the full amount. Do this whether or not * direct_io is in use because it makes the reading code cleaner. */ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size); rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ASSERT(session, record->data == record->mem); /* * If the caller's record only partially fills the necessary * space, we need to zero-fill the remainder. */ if (record->size != rdup_len) { memset((uint8_t *)record->mem + record->size, 0, rdup_len - record->size); record->size = rdup_len; } logrec = (WT_LOG_RECORD *)record->mem; logrec->len = (uint32_t)record->size; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, record->size); WT_STAT_FAST_CONN_INCR(session, log_writes); if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { ret = __log_direct_write(session, record, lsnp, flags); if (ret == 0) return (0); if (ret != EAGAIN) WT_ERR(ret); /* * An EAGAIN return means we failed to get the try lock - * fall through to the consolidation code in that case. */ } /* * As soon as we see contention for the log slot, disable direct * log writes. We get better performance by forcing writes through * the consolidation code. This is because individual writes flood * the I/O system faster than they contend on the log slot lock. */ F_SET(log, WT_LOG_FORCE_CONSOLIDATE); if ((ret = __wt_log_slot_join( session, rdup_len, flags, &myslot)) == ENOMEM) { /* * If we couldn't find a consolidated slot for this record * write the record directly. */ while ((ret = __log_direct_write( session, record, lsnp, flags)) == EAGAIN) ; WT_ERR(ret); /* * Increase the buffer size of any slots we can get access * to, so future consolidations are likely to succeed. */ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); return (0); } WT_ERR(ret); if (myslot.offset == 0) { __wt_spin_lock(session, &log->log_slot_lock); locked = 1; WT_ERR(__wt_log_slot_close(session, myslot.slot)); WT_ERR(__log_acquire( session, myslot.slot->slot_group_size, myslot.slot)); __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; WT_ERR(__wt_log_slot_notify(session, myslot.slot)); } else WT_ERR(__wt_log_slot_wait(session, myslot.slot)); WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { WT_ERR(__log_release(session, myslot.slot)); WT_ERR(__wt_log_slot_free(myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_sync_cond, 10000); } err: if (locked) __wt_spin_unlock(session, &log->log_slot_lock); if (ret == 0 && lsnp != NULL) *lsnp = lsn; /* * If we're synchronous and some thread had an error, we don't know * if our write made it out to the file or not. The error could be * before or after us. 
So, if anyone got an error, we report it. * If we're not synchronous, only report if our own operation got * an error. */ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL) ret = myslot.slot->slot_error; return (ret); }
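__wt_log_write rounds the record size up to the log's allocation size with __wt_rduppo2 so that every record occupies whole allocation units and the remainder can be zero-filled. A plausible sketch of that rounding (an assumption about the helper, not its actual definition), valid when the allocation size is a power of two:

#include <stdint.h>

/*
 * rduppo2 --
 *     Round "len" up to the next multiple of "po2", where "po2" must be a
 *     power of two.  For example, with a 128-byte allocation size a 100-byte
 *     record rounds to 128 and a 300-byte record to 384.
 */
static inline uint32_t
rduppo2(uint32_t len, uint32_t po2)
{
    return ((len + (po2 - 1)) & ~(po2 - 1));
}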
/*
 * __wt_log_scan --
 *     Scan the logs, calling a function on each record found.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
    WT_CONNECTION_IMPL *conn;
    WT_ITEM buf;
    WT_DECL_RET;
    WT_FH *log_fh;
    WT_LOG *log;
    WT_LOG_RECORD *logrec;
    WT_LSN end_lsn, rd_lsn, start_lsn;
    off_t log_size;
    uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
    u_int i, logcount;
    int eol;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    log_fh = NULL;
    logcount = 0;
    logfiles = NULL;
    eol = 0;
    WT_CLEAR(buf);

    /*
     * If the caller did not give us a callback function there is nothing
     * to do.
     */
    if (func == NULL)
        return (0);

    if (LF_ISSET(WT_LOGSCAN_RECOVER))
        WT_RET(__wt_verbose(session, WT_VERB_LOG,
            "__wt_log_scan truncating to %u/%" PRIuMAX,
            log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

    if (log != NULL) {
        allocsize = log->allocsize;

        if (lsnp == NULL) {
            if (LF_ISSET(WT_LOGSCAN_FIRST))
                start_lsn = log->first_lsn;
            else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
                start_lsn = log->ckpt_lsn;
            else
                return (WT_ERROR);    /* Illegal usage */
        } else {
            if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
                WT_RET_MSG(session, WT_ERROR,
                    "choose either a start LSN or a start flag");

            /* Offsets must be on allocation boundaries. */
            if (lsnp->offset % allocsize != 0 ||
                lsnp->file > log->fileid)
                return (WT_NOTFOUND);

            /*
             * Log cursors may not know the starting LSN.  If an
             * LSN pointer is passed in, but it is the INIT_LSN,
             * start from the first_lsn.
             */
            start_lsn = *lsnp;
            if (IS_INIT_LSN(&start_lsn))
                start_lsn = log->first_lsn;
        }
        end_lsn = log->alloc_lsn;
    } else {
        /*
         * If logging is not configured, we can still print out the log
         * if log files exist.  We just need to set the LSNs from what
         * is in the files versus what is in the live connection.
         */
        /*
         * Set allocsize to the minimum alignment it could be.  Larger
         * records and larger allocation boundaries should always be
         * a multiple of this.
         */
        allocsize = LOG_ALIGN;
        lastlog = 0;
        firstlog = UINT32_MAX;
        WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
        if (logcount == 0)
            /* Return not-supported if no log files exist. */
            return (ENOTSUP);
        for (i = 0; i < logcount; i++) {
            WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
                &lognum));
            lastlog = WT_MAX(lastlog, lognum);
            firstlog = WT_MIN(firstlog, lognum);
        }
        start_lsn.file = firstlog;
        end_lsn.file = lastlog;
        start_lsn.offset = end_lsn.offset = 0;
        __wt_log_files_free(session, logfiles, logcount);
        logfiles = NULL;
    }
    WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
    WT_ERR(__log_filesize(session, log_fh, &log_size));
    rd_lsn = start_lsn;
    WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
    for (;;) {
        if (rd_lsn.offset + allocsize > log_size) {
advance:
            /*
             * If we read the last record, go to the next file.
             */
            WT_ERR(__wt_close(session, log_fh));
            log_fh = NULL;
            eol = 1;
            /*
             * Truncate this log file before we move to the next.
             */
            if (LF_ISSET(WT_LOGSCAN_RECOVER))
                WT_ERR(__log_truncate(session, &rd_lsn, 1));
            rd_lsn.file++;
            rd_lsn.offset = 0;
            /*
             * Avoid an error message when we reach end of log
             * by checking here.
             */
            if (rd_lsn.file > end_lsn.file)
                break;
            WT_ERR(__log_openfile(
                session, 0, &log_fh, rd_lsn.file));
            WT_ERR(__log_filesize(session, log_fh, &log_size));
            continue;
        }
        /*
         * Read the minimum allocation size a record could be.
         */
        WT_ASSERT(session, buf.memsize >= allocsize);
        WT_ERR(__wt_read(session,
            log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
        /*
         * The first 4 bytes are the real record length.  See if we
         * need to read more than the allocation size.  We expect
         * that we rarely will have to read more.  Most log records
         * will be fairly small.
         */
        reclen = *(uint32_t *)buf.mem;
        /*
         * Log files are pre-allocated.  We never expect a zero length
         * unless we've reached the end of the log.  The log can be
         * written out of order, so when recovery finds the end of
         * the log, truncate the file and remove any later log files
         * that may exist.
         */
        if (reclen == 0) {
            /* This LSN is the end. */
            break;
        }
        rdup_len = __wt_rduppo2(reclen, allocsize);
        if (reclen > allocsize) {
            /*
             * The log file end could be the middle of this
             * log record.
             */
            if (rd_lsn.offset + rdup_len > log_size)
                goto advance;
            /*
             * We need to round up and read in the full padded
             * record, especially for direct I/O.
             */
            WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
            WT_ERR(__wt_read(session, log_fh, rd_lsn.offset,
                (size_t)rdup_len, buf.mem));
            WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
        }
        /*
         * We read in the record, verify checksum.
         */
        buf.size = reclen;
        logrec = (WT_LOG_RECORD *)buf.mem;
        cksum = logrec->checksum;
        logrec->checksum = 0;
        logrec->checksum = __wt_cksum(logrec, logrec->len);
        if (logrec->checksum != cksum) {
            /*
             * A checksum mismatch means we have reached the end of
             * the useful part of the log.  This should be found on
             * the first pass through recovery.  In the second pass
             * where we truncate the log, this is where it should
             * end.
             */
            if (log != NULL)
                log->trunc_lsn = rd_lsn;
            break;
        }

        /*
         * We have a valid log record.  If it is not the log file
         * header, invoke the callback.
         */
        WT_STAT_FAST_CONN_INCR(session, log_scan_records);
        if (rd_lsn.offset != 0) {
            WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
            if (LF_ISSET(WT_LOGSCAN_ONE))
                break;
        }
        rd_lsn.offset += (off_t)rdup_len;
    }

    /* Truncate if we're in recovery. */
    if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
        LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
        WT_ERR(__log_truncate(session, &rd_lsn, 0));

err:    WT_STAT_FAST_CONN_INCR(session, log_scans);
    if (logfiles != NULL)
        __wt_log_files_free(session, logfiles, logcount);
    __wt_buf_free(session, &buf);
    /*
     * If the caller wants one record and it is at the end of log,
     * return WT_NOTFOUND.
     */
    if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
        ret = WT_NOTFOUND;
    if (ret == ENOENT)
        ret = 0;
    if (log_fh != NULL)
        WT_TRET(__wt_close(session, log_fh));
    return (ret);
}
/* * __wt_log_read -- * Read the log record at the given LSN. Return the record (including * the log header) in the WT_ITEM. Caller is responsible for freeing it. */ int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; WT_LOG_RECORD *logrec; uint32_t cksum, rdup_len, reclen; WT_UNUSED(flags); /* * If the caller didn't give us an LSN or something to return, * there's nothing to do. */ if (lsnp == NULL || record == NULL) return (0); conn = S2C(session); log = conn->log; /* * If the offset isn't on an allocation boundary it must be wrong. */ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid) return (WT_NOTFOUND); WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file)); /* * Read the minimum allocation size a record could be. */ WT_ERR(__wt_buf_init(session, record, log->allocsize)); WT_ERR(__wt_read(session, log_fh, lsnp->offset, (size_t)log->allocsize, record->mem)); /* * First 4 bytes is the real record length. See if we * need to read more than the allocation size. We expect * that we rarely will have to read more. Most log records * will be fairly small. */ reclen = *(uint32_t *)record->mem; if (reclen == 0) { ret = WT_NOTFOUND; goto err; } if (reclen > log->allocsize) { rdup_len = __wt_rduppo2(reclen, log->allocsize); WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ERR(__wt_read(session, log_fh, lsnp->offset, (size_t)rdup_len, record->mem)); } /* * We read in the record, verify checksum. */ logrec = (WT_LOG_RECORD *)record->mem; cksum = logrec->checksum; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); if (logrec->checksum != cksum) WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum"); record->size = logrec->len; WT_STAT_FAST_CONN_INCR(session, log_reads); err: WT_TRET(__wt_close(session, log_fh)); return (ret); }
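A minimal caller sketch for __wt_log_read (the read_one_record helper and its error handling are illustrative assumptions, not WiredTiger code): the caller supplies a cleared WT_ITEM, gets back the log header plus payload on success, and releases the buffer when done.

/*
 * read_one_record --
 *     Illustrative caller: read the record at "lsn" and free the buffer.
 */
static int
read_one_record(WT_SESSION_IMPL *session, WT_LSN *lsn)
{
    WT_DECL_RET;
    WT_ITEM record;

    WT_CLEAR(record);
    /* On success the WT_ITEM holds the log header followed by the payload. */
    WT_ERR(__wt_log_read(session, &record, lsn, 0));
    /* ... process ((WT_LOG_RECORD *)record.mem)->len bytes here ... */

err:    __wt_buf_free(session, &record);
    return (ret);
}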
/* * __wt_log_newfile -- * Create the next log file and write the file header record into it. */ int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LOG *log; WT_LOG_DESC *desc; WT_LOG_RECORD *logrec; WT_LOGSLOT tmp; WT_MYSLOT myslot; conn = S2C(session); log = conn->log; /* * Set aside the log file handle to be closed later. Other threads * may still be using it to write to the log. */ WT_ASSERT(session, log->log_close_fh == NULL); log->log_close_fh = log->log_fh; log->fileid++; WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid)); log->alloc_lsn.file = log->fileid; log->alloc_lsn.offset = log->log_fh->size; /* * Set up the log descriptor record. Use a scratch buffer to * get correct alignment for direct I/O. */ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize); WT_RET(__wt_scr_alloc(session, log->allocsize, &buf)); memset(buf->mem, 0, log->allocsize); logrec = (WT_LOG_RECORD *)buf->mem; desc = (WT_LOG_DESC *)logrec->record; desc->log_magic = WT_LOG_MAGIC; desc->majorv = WT_LOG_MAJOR_VERSION; desc->minorv = WT_LOG_MINOR_VERSION; desc->log_size = (uint64_t)conn->log_file_max; /* * Now that the record is set up, initialize the record header. */ logrec->len = log->allocsize; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, log->allocsize); WT_CLEAR(tmp); myslot.slot = &tmp; myslot.offset = 0; /* * Recursively call __log_acquire to allocate log space for the * log descriptor record. Call __log_fill to write it, but we * do not need to call __log_release because we're not waiting for * earlier operations to complete. */ WT_ERR(__log_acquire(session, logrec->len, &tmp)); WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); /* * If we're called from connection creation code, we need to update * the LSNs since we're the only write in progress. */ if (conn_create) { WT_ERR(__wt_fsync(session, log->log_fh)); log->sync_lsn = tmp.slot_end_lsn; log->write_lsn = tmp.slot_end_lsn; } err: __wt_scr_free(&buf); return (ret); }
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's addr/size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; off_t offset; uint32_t align_size; blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = align_size; /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; blk->cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); if (!locked) __wt_spin_lock(session, &block->live_lock); ret = __wt_block_alloc(session, block, &offset, (off_t)align_size); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); #if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE) /* * Extend the file in chunks. We aren't holding a lock and we'd prefer * to limit the number of threads extending the file at the same time, * so choose the one thread that's crossing the extended boundary. We * don't extend newly created files, and it's theoretically possible we * might wait so long our extension of the file is passed by another * thread writing single blocks, that's why there's a check in case the * extended file size becomes too small: if the file size catches up, * every thread will try to extend it. 
*/ if (fh->extend_len != 0 && (fh->extend_size <= fh->size || (offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + align_size >= fh->extend_size))) { fh->extend_size = offset + fh->extend_len * 2; #if defined(HAVE_POSIX_FALLOCATE) if ((ret = posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0) WT_RET_MSG( session, ret, "%s: posix_fallocate", fh->name); #elif defined(HAVE_FTRUNCATE) if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0) WT_RET_MSG(session, ret, "%s: ftruncate", fh->name); #endif } #endif if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!locked) __wt_spin_lock(session, &block->live_lock); WT_TRET( __wt_block_off_free(session, block, offset, align_size)); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { block->os_cache_dirty = 0; if ((ret = sync_file_range(fh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) WT_RET_MSG( session, ret, "%s: sync_file_range", block->name); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_CSTAT_INCR(session, block_write); WT_CSTAT_INCRV(session, block_byte_write, align_size); WT_VERBOSE_RET(session, write, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, align_size, blk->cksum); *offsetp = offset; *sizep = align_size; *cksump = blk->cksum; return (ret); }
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk, swap; size_t bufsize; uint32_t page_cksum; __wt_verbose(session, WT_VERB_READ, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum); WT_STAT_FAST_CONN_INCR(session, block_read); WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size); /* * Grow the buffer as necessary and read the block. Buffers should be * aligned for reading, but there are lots of buffers (for example, file * cursors have two buffers each, key and value), and it's difficult to * be sure we've found all of them. If the buffer isn't aligned, it's * an easy fix: set the flag and guarantee we reallocate it. (Most of * the time on reads, the buffer memory has not yet been allocated, so * we're not adding any additional processing time.) */ if (F_ISSET(buf, WT_ITEM_ALIGNED)) bufsize = size; else { F_SET(buf, WT_ITEM_ALIGNED); bufsize = WT_MAX(size, buf->memsize + 10); } WT_RET(__wt_buf_init(session, buf, bufsize)); WT_RET(__wt_read(session, block->fh, offset, size, buf->mem)); buf->size = size; /* * We incrementally read through the structure before doing a checksum, * do little- to big-endian handling early on, and then select from the * original or swapped structure as needed. */ blk = WT_BLOCK_HEADER_REF(buf->mem); __wt_block_header_byteswap_copy(blk, &swap); if (swap.cksum == cksum) { blk->cksum = 0; page_cksum = __wt_cksum(buf->mem, F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP); if (page_cksum == cksum) { /* * Swap the page-header as needed; this doesn't belong * here, but it's the best place to catch all callers. */ __wt_page_header_byteswap(buf->mem); return (0); } if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) __wt_errx(session, "read checksum error for %" PRIu32 "B block at " "offset %" PRIuMAX ": calculated block checksum " "of %" PRIu32 " doesn't match expected checksum " "of %" PRIu32, size, (uintmax_t)offset, page_cksum, cksum); } else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) __wt_errx(session, "read checksum error for %" PRIu32 "B block at " "offset %" PRIuMAX ": block header checksum " "of %" PRIu32 " doesn't match expected checksum " "of %" PRIu32, size, (uintmax_t)offset, swap.cksum, cksum); /* Panic if a checksum fails during an ordinary read. */ return (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ? WT_ERROR : __wt_illegal_value(session, block->name)); }
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's offset, size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; size_t align_size; wt_off_t offset; int local_locked; blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; local_locked = 0; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } if (align_size > UINT32_MAX) { WT_ASSERT(session, align_size <= UINT32_MAX); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = WT_STORE_SIZE(align_size); /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; blk->cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); if (!caller_locked) { WT_RET(__wt_block_ext_prealloc(session, 5)); __wt_spin_lock(session, &block->live_lock); local_locked = 1; } ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size); /* * Extend the file in chunks. We want to limit the number of threads * extending the file at the same time, so choose the one thread that's * crossing the extended boundary. We don't extend newly created files, * and it's theoretically possible we might wait so long our extension * of the file is passed by another thread writing single blocks, that's * why there's a check in case the extended file size becomes too small: * if the file size catches up, every thread tries to extend it. * * File extension may require locking: some variants of the system call * used to extend the file initialize the extended space. If a writing * thread races with the extending thread, the extending thread might * overwrite already written data, and that would be very, very bad. 
* * Some variants of the system call to extend the file fail at run-time * based on the filesystem type, fall back to ftruncate in that case, * and remember that ftruncate requires locking. */ if (ret == 0 && fh->extend_len != 0 && (fh->extend_size <= fh->size || (offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) { fh->extend_size = offset + fh->extend_len * 2; if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) { /* * Release any locally acquired lock if it's not needed * to extend the file, extending the file might require * updating file metadata, which can be slow. (It may be * a bad idea to configure for file extension on systems * that require locking over the extend call.) */ if (!fh->fallocate_requires_locking && local_locked) { __wt_spin_unlock(session, &block->live_lock); local_locked = 0; } /* Extend the file. */ if ((ret = __wt_fallocate(session, fh, offset, fh->extend_len * 2)) == ENOTSUP) { ret = 0; goto extend_truncate; } } else { extend_truncate: /* * We may have a caller lock or a locally acquired lock, * but we need a lock to call ftruncate. */ if (!caller_locked && local_locked == 0) { __wt_spin_lock(session, &block->live_lock); local_locked = 1; } /* * The truncate might fail if there's a file mapping * (if there's an open checkpoint on the file), that's * OK. */ if ((ret = __wt_ftruncate( session, fh, offset + fh->extend_len * 2)) == EBUSY) ret = 0; } } /* Release any locally acquired lock. */ if (local_locked) { __wt_spin_unlock(session, &block->live_lock); local_locked = 0; } WT_RET(ret); /* Write the block. */ if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!caller_locked) __wt_spin_lock(session, &block->live_lock); WT_TRET(__wt_block_off_free( session, block, offset, (wt_off_t)align_size)); if (!caller_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache, but only if the current session can wait. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) { block->os_cache_dirty = 0; WT_RET(__wt_fsync_async(session, fh)); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_STAT_FAST_CONN_INCR(session, block_write); WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size); WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, (uintmax_t)offset, (uintmax_t)align_size, blk->cksum)); *offsetp = offset; *sizep = WT_STORE_SIZE(align_size); *cksump = blk->cksum; return (ret); }
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's offset, size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, bool data_cksum, bool caller_locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; size_t align_size; wt_off_t offset; uint32_t cksum; bool local_locked; fh = block->fh; /* * Clear the block header to ensure all of it is initialized, even the * unused fields. */ blk = WT_BLOCK_HEADER_REF(buf->mem); memset(blk, 0, sizeof(*blk)); /* * Swap the page-header as needed; this doesn't belong here, but it's * the best place to catch all callers. */ __wt_page_header_byteswap(buf->mem); /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } if (align_size > UINT32_MAX) { WT_ASSERT(session, align_size <= UINT32_MAX); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = WT_STORE_SIZE(align_size); /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. * * Checksum a little-endian version of the header, and write everything * in little-endian format. The checksum is (potentially) returned in a * big-endian format, swap it into place in a separate step. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; __wt_block_header_byteswap(blk); blk->cksum = cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); #ifdef WORDS_BIGENDIAN blk->cksum = __wt_bswap32(blk->cksum); #endif /* Pre-allocate some number of extension structures. */ WT_RET(__wt_block_ext_prealloc(session, 5)); /* * Acquire a lock, if we don't already hold one. * Allocate space for the write, and optionally extend the file (note * the block-extend function may release the lock). * Release any locally acquired lock. 
*/ local_locked = false; if (!caller_locked) { __wt_spin_lock(session, &block->live_lock); local_locked = true; } ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size); if (ret == 0) ret = __wt_block_extend( session, block, fh, offset, align_size, &local_locked); if (local_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* Write the block. */ if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!caller_locked) __wt_spin_lock(session, &block->live_lock); WT_TRET(__wt_block_off_free( session, block, offset, (wt_off_t)align_size)); if (!caller_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache, but only if the current session can wait. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) { block->os_cache_dirty = 0; WT_RET(__wt_fsync_async(session, fh)); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_STAT_FAST_CONN_INCR(session, block_write); WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size); WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, (uintmax_t)offset, (uintmax_t)align_size, cksum)); *offsetp = offset; *sizep = WT_STORE_SIZE(align_size); *cksump = cksum; return (0); }
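This version checksums a little-endian copy of the header and, on big-endian hosts, swaps the returned checksum with __wt_bswap32 before handing it back to the caller. For reference, a portable 32-bit byte swap of that kind (a sketch; the real helper may simply map to a compiler builtin):

#include <stdint.h>

/*
 * bswap32 --
 *     Reverse the byte order of a 32-bit value: 0x11223344 -> 0x44332211.
 */
static inline uint32_t
bswap32(uint32_t v)
{
    return (((v << 24) & 0xff000000) |
        ((v << 8) & 0x00ff0000) |
        ((v >> 8) & 0x0000ff00) |
        ((v >> 24) & 0x000000ff));
}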
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk; uint32_t alloc_size, page_cksum; WT_VERBOSE_RET(session, read, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum); #ifdef HAVE_DIAGNOSTIC /* * In diagnostic mode, verify the block we're about to read isn't on * either the available or discard lists. * * Don't check during salvage, it's possible we're reading an already * freed overflow page. */ if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) WT_RET( __wt_block_misplaced(session, block, "read", offset, size)); #endif /* * Grow the buffer as necessary and read the block. Buffers should be * aligned for reading, but there are lots of buffers (for example, file * cursors have two buffers each, key and value), and it's difficult to * be sure we've found all of them. If the buffer isn't aligned, it's * an easy fix: set the flag and guarantee we reallocate it. (Most of * the time on reads, the buffer memory has not yet been allocated, so * we're not adding any additional processing time.) */ if (F_ISSET(buf, WT_ITEM_ALIGNED)) alloc_size = size; else { F_SET(buf, WT_ITEM_ALIGNED); alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10); } WT_RET(__wt_buf_init(session, buf, alloc_size)); WT_RET(__wt_read(session, block->fh, offset, size, buf->mem)); buf->size = size; blk = WT_BLOCK_HEADER_REF(buf->mem); blk->cksum = 0; page_cksum = __wt_cksum(buf->mem, F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP); if (cksum != page_cksum) { if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) __wt_errx(session, "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" PRIu32 " != %" PRIu32 "]", size, (uintmax_t)offset, cksum, page_cksum); return (WT_ERROR); } WT_CSTAT_INCR(session, block_read); WT_CSTAT_INCRV(session, block_byte_read, size); return (0); }
/*
 * __wt_block_write_off --
 *     Write a buffer's data into the file backing the block, computing the
 *     block's checksum and size.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    size_t align_size;
    wt_off_t offset;
    int local_locked;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    fh = block->fh;
    local_locked = 0;

    /*
     * The buffer must be aligned for writing: this is a disk-level write,
     * so an unaligned buffer can't be written.
     */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align buf->size to the block's allocation size.  The aligned size may
     * be larger than the current buf->memsize; if so we can't write, it
     * would overrun the buffer.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }

    /* The aligned size must fit in 32 bits (4GB). */
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Zero the padding bytes added by the alignment. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /* Set up the block header: record the on-disk size. */
    blk->disk_size = WT_STORE_SIZE(align_size);
    blk->flags = 0;
    if (data_cksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);

    /*
     * Compute the buffer's checksum.  The checksum field itself is inside
     * the checksummed prefix, so zero it before computing.
     */
    blk->cksum = 0;
    blk->cksum = __wt_cksum(
        buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

    if (!caller_locked) {
        WT_RET(__wt_block_ext_prealloc(session, 5));
        __wt_spin_lock(session, &block->live_lock);
        local_locked = 1;
    }
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

    /*
     * Check whether the file needs to be extended; if it isn't extended, the
     * block being written might not fit.
     */
    if (ret == 0 && fh->extend_len != 0 &&
        (fh->extend_size <= fh->size ||
        (offset + fh->extend_len <= fh->extend_size &&
        offset + fh->extend_len + (wt_off_t)align_size >=
        fh->extend_size))) {
        /* Bump extend_size to the offset plus twice extend_len. */
        fh->extend_size = offset + fh->extend_len * 2;
        if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
            /*
             * Release the block->live_lock spinlock: resizing the file can
             * take a relatively long time, so drop the spinlock first to
             * avoid burning CPU spinning on it.
             */
            if (!fh->fallocate_requires_locking && local_locked) {
                __wt_spin_unlock(session, &block->live_lock);
                local_locked = 0;
            }
            /* Extend the file's allocated space. */
            if ((ret = __wt_fallocate(session,
                fh, offset, fh->extend_len * 2)) == ENOTSUP) {
                ret = 0;
                goto extend_truncate;
            }
        } else {
extend_truncate:
            if (!caller_locked && local_locked == 0) {
                __wt_spin_lock(session, &block->live_lock);
                local_locked = 1;
            }
            /*
             * Resize the file directly; this is slower than __wt_fallocate.
             */
            if ((ret = __wt_ftruncate(session,
                fh, offset + fh->extend_len * 2)) == EBUSY)
                ret = 0;
        }
    }
    if (local_locked) {
        __wt_spin_unlock(session, &block->live_lock);
        local_locked = 0;
    }
    WT_RET(ret);

    /* Write the block's data. */
    ret = __wt_write(session, fh, offset, align_size, buf->mem);
    if (ret != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        /* The write failed: return the extent to the avail list. */
        WT_TRET(__wt_block_off_free(
            session, block, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Too many dirty pages have accumulated in the system buffer cache:
     * schedule an asynchronous flush.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        block->os_cache_dirty = 0;
        WT_RET(__wt_fsync_async(session, fh));
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /*
     * Discard fh->fd's data from the system page cache; this may itself do
     * I/O, much like a synchronous sync call.
     */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0,
            POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    WT_STAT_FAST_CONN_INCR(session, block_write);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

    WT_RET(__wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
        (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *cksump = blk->cksum;

    return (ret);
}
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE_HEADER *dsk; size_t result_len; uint32_t page_cksum; WT_VERBOSE_RET(session, read, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum); #ifdef HAVE_DIAGNOSTIC /* * In diagnostic mode, verify the block we're about to read isn't on * either the available or discard lists. * * Don't check during salvage, it's possible we're reading an already * freed overflow page. */ if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) WT_RET( __wt_block_misplaced(session, block, "read", offset, size)); #endif /* * If we're compressing the file blocks, place the initial read into a * scratch buffer, we're going to have to re-allocate more memory for * decompression. Else check the caller's buffer size and grow it as * necessary, there will only be one buffer. */ if (block->compressor == NULL) { F_SET(buf, WT_ITEM_ALIGNED); WT_RET(__wt_buf_init(session, buf, size)); buf->size = size; dsk = buf->mem; } else { WT_RET(__wt_scr_alloc(session, size, &tmp)); tmp->size = size; dsk = tmp->mem; } /* Read. */ WT_ERR(__wt_read(session, block->fh, offset, size, dsk)); blk = WT_BLOCK_HEADER_REF(dsk); /* Validate the checksum. */ if (block->checksum && cksum != WT_BLOCK_CHECKSUM_NOT_SET && blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) { blk->cksum = 0; page_cksum = __wt_cksum(dsk, size); if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET) ++page_cksum; if (cksum != page_cksum) { if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) __wt_errx(session, "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" PRIu32 " != %" PRIu32 "]", size, (uintmax_t)offset, cksum, page_cksum); WT_ERR(WT_ERROR); } } /* * If the in-memory block size is larger than the on-disk block size, * the block is compressed. Size the user's buffer, copy the skipped * bytes of the original image into place, then decompress. * * If the in-memory block size is less than or equal to the on-disk * block size, the block is not compressed. */ if (blk->disk_size < dsk->size) { if (block->compressor == NULL) WT_ERR(__wt_illegal_value(session, block->name)); WT_ERR(__wt_buf_init(session, buf, dsk->size)); buf->size = dsk->size; /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP); WT_ERR(block->compressor->decompress( block->compressor, &session->iface, (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->size - WT_BLOCK_COMPRESS_SKIP, &result_len)); if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP) WT_ERR(__wt_illegal_value(session, block->name)); } else if (block->compressor == NULL) buf->size = dsk->size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. 
* This should be rare, why configure a compressor that * doesn't work? Allocate a buffer of the right size * (we used a scratch buffer which might be large), and * copy the data into place. */ WT_ERR( __wt_buf_set(session, buf, tmp->data, dsk->size)); WT_BSTAT_INCR(session, page_read); WT_CSTAT_INCR(session, block_read); err: __wt_scr_free(&tmp); return (ret); }
int main(void) { WT_RAND_STATE rnd; size_t len; uint32_t hw, sw; u_int i, j; uint8_t *data; /* Allocate aligned memory for the data. */ data = dcalloc(DATASIZE, sizeof(uint8_t)); /* Initialize the RNG. */ testutil_check(__wt_random_init_seed(NULL, &rnd)); /* Initialize the WiredTiger library checksum functions. */ __wt_cksum_init(); /* * Some simple known checksums. */ len = 1; hw = __wt_cksum(data, len); check(hw, (uint32_t)0x527d5351, len, "nul x1: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0x527d5351, len, "nul x1: software"); len = 2; hw = __wt_cksum(data, len); check(hw, (uint32_t)0xf16177d2, len, "nul x2: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0xf16177d2, len, "nul x2: software"); len = 3; hw = __wt_cksum(data, len); check(hw, (uint32_t)0x6064a37a, len, "nul x3: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0x6064a37a, len, "nul x3: software"); len = 4; hw = __wt_cksum(data, len); check(hw, (uint32_t)0x48674bc7, len, "nul x4: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0x48674bc7, len, "nul x4: software"); len = strlen("123456789"); memcpy(data, "123456789", len); hw = __wt_cksum(data, len); check(hw, (uint32_t)0xe3069283, len, "known string #1: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0xe3069283, len, "known string #1: software"); len = strlen("The quick brown fox jumps over the lazy dog"); memcpy(data, "The quick brown fox jumps over the lazy dog", len); hw = __wt_cksum(data, len); check(hw, (uint32_t)0x22620404, len, "known string #2: hardware"); sw = cksum_sw(data, len); check(sw, (uint32_t)0x22620404, len, "known string #2: software"); /* * Checksums of power-of-two data chunks. */ for (i = 0, len = 512; i < 1000; ++i) { for (j = 0; j < len; ++j) data[j] = __wt_random(&rnd) & 0xff; hw = __wt_cksum(data, len); sw = cksum_sw(data, len); check(hw, sw, len, "random power-of-two"); len *= 2; if (len > DATASIZE) len = 512; } /* * Checksums of random data chunks. */ for (i = 0; i < 1000; ++i) { len = __wt_random(&rnd) % DATASIZE; for (j = 0; j < len; ++j) data[j] = __wt_random(&rnd) & 0xff; hw = __wt_cksum(data, len); sw = cksum_sw(data, len); check(hw, sw, len, "random"); } free(data); return (EXIT_SUCCESS); }
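The known-answer tests above are the standard CRC-32C (Castagnoli) check values, 0xe3069283 for "123456789" and 0x22620404 for the fox sentence, which is the checksum __wt_cksum computes (using hardware CRC instructions when available, a software path otherwise, as the hardware/software labels in the test suggest). A bit-at-a-time reference that reproduces those vectors (a sketch only; the in-tree software fallback is table-driven for speed):

#include <stddef.h>
#include <stdint.h>

/*
 * crc32c_ref --
 *     Bit-at-a-time CRC-32C (Castagnoli) reference: reflected polynomial
 *     0x82f63b78, initial value and final XOR of 0xffffffff.  It reproduces
 *     the vectors above, e.g. crc32c_ref("123456789", 9) == 0xe3069283.
 */
static uint32_t
crc32c_ref(const void *data, size_t len)
{
    const uint8_t *p;
    uint32_t crc;
    int i;

    crc = 0xffffffff;
    for (p = data; len > 0; ++p, --len) {
        crc ^= *p;
        for (i = 0; i < 8; ++i)
            crc = (crc & 1) ? (crc >> 1) ^ 0x82f63b78 : crc >> 1;
    }
    return (~crc);
}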