/*
 * __wt_log_system_record --
 *	Write a system log record for the previous LSN.
 *
 * Builds a WT_LOGREC_SYSTEM record carrying the previous LSN in a private
 * buffer and writes it through a temporary, locally-activated log slot so
 * the normal slot-join path is bypassed.  The record is always exactly one
 * log allocation unit (log->allocsize) in size.
 *
 * Parameters:
 *	session - running session
 *	log_fh  - file handle to write the record to (overrides the handle
 *		  the temporary slot would otherwise use)
 *	lsn     - the previous LSN to store in the record payload
 *
 * Returns 0 on success, a WiredTiger error code on failure; the record
 * buffer is freed on all paths.
 */
int
__wt_log_system_record(
    WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn)
{
	WT_DECL_ITEM(logrec_buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LOGSLOT tmp;
	WT_MYSLOT myslot;
	const char *fmt = WT_UNCHECKED_STRING(I);
	uint32_t rectype = WT_LOGREC_SYSTEM;
	size_t recsize;

	log = S2C(session)->log;

	/*
	 * Allocate a full allocation-unit buffer and zero it: the record is
	 * padded out to log->allocsize below, so the tail must be clean.
	 */
	WT_RET(__wt_logrec_alloc(session, log->allocsize, &logrec_buf));
	memset((uint8_t *)logrec_buf->mem, 0, log->allocsize);

	/* Pack the record-type header, then the prev-LSN operation. */
	WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype));
	WT_ERR(__wt_struct_pack(session,
	    (uint8_t *)logrec_buf->data + logrec_buf->size, recsize,
	    fmt, rectype));
	logrec_buf->size += recsize;
	WT_ERR(__wt_logop_prev_lsn_pack(session, logrec_buf, lsn));
	WT_ASSERT(session, logrec_buf->size <= log->allocsize);

	logrec = (WT_LOG_RECORD *)logrec_buf->mem;

	/*
	 * We know system records are this size.  And we have to adjust
	 * the size now because we're not going through the normal log
	 * write path and the packing functions needed the correct offset
	 * earlier.
	 */
	logrec_buf->size = logrec->len = log->allocsize;

	/* We do not compress nor encrypt this record. */
	logrec->checksum = 0;
	logrec->flags = 0;

	/*
	 * Checksum ordering is critical: byte-swap the header into its
	 * on-disk (little-endian) layout first, checksum the swapped image
	 * with the checksum field zeroed, then store the checksum — swapped
	 * back into on-disk order on big-endian hosts.
	 */
	__wt_log_record_byteswap(logrec);
	logrec->checksum = __wt_checksum(logrec, log->allocsize);
#ifdef WORDS_BIGENDIAN
	logrec->checksum = __wt_bswap32(logrec->checksum);
#endif

	/*
	 * Set up a throwaway slot on the stack and activate it so
	 * __wt_log_fill has the state it expects, without joining the
	 * shared slot machinery.
	 */
	WT_CLEAR(tmp);
	memset(&myslot, 0, sizeof(myslot));
	myslot.slot = &tmp;
	__wt_log_slot_activate(session, &tmp);
	/*
	 * Override the file handle to the one we're using.
	 */
	tmp.slot_fh = log_fh;
	WT_ERR(__wt_log_fill(session, &myslot, true, logrec_buf, NULL));

err:	__wt_logrec_free(session, &logrec_buf);
	return (ret);
}
/*
 * __wt_decrypt --
 *	Common code to decrypt and verify the encrypted data in a
 * WT_ITEM and return the decrypted buffer.
 *
 * Layout of the source item: "skip" clear header bytes, then a stored
 * 32-bit total length, then the ciphertext.  The output buffer receives
 * the clear header followed by the decrypted payload.
 */
int
__wt_decrypt(WT_SESSION_IMPL *session,
    WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out)
{
	size_t cipher_len, plain_len;
	uint32_t total_len;
	uint8_t *cipher_src, *plain_dst;

	/*
	 * The encryption side stored the total (padded) item length right
	 * after the skipped header bytes; read it back, byte-swapping on
	 * big-endian hosts.
	 */
	total_len = WT_STORE_SIZE(
	    *((uint32_t *)((uint8_t *)in->data + skip)));
#ifdef WORDS_BIGENDIAN
	total_len = __wt_bswap32(total_len);
#endif

	/* A stored length exceeding the source buffer means corruption. */
	if (total_len > in->size)
		WT_RET_MSG(session, WT_ERROR,
		    "corrupted encrypted item: padded size less than "
		    "actual size");

	/*
	 * Size the output buffer for the bytes we expect from decryption
	 * plus the unencrypted header.
	 */
	WT_RET(__wt_buf_initsize(session, out, total_len));

	cipher_src = (uint8_t *)in->data + skip + WT_ENCRYPT_LEN_SIZE;
	plain_dst = (uint8_t *)out->mem + skip;
	cipher_len = total_len - (skip + WT_ENCRYPT_LEN_SIZE);

	WT_RET(encryptor->decrypt(encryptor, &session->iface,
	    cipher_src, cipher_len, plain_dst, cipher_len, &plain_len));

	/*
	 * We require encryption to be byte for byte: decryption must never
	 * expand the data.
	 */
	WT_ASSERT(session, plain_len <= cipher_len);

	/* The header bytes travel in the clear; copy them across. */
	memcpy(out->mem, in->data, skip);

	/*
	 * The decryptor may remove its own padding, so the final size is
	 * the real decrypted length plus the skipped header.
	 */
	out->size = plain_len + skip;

	return (0);
}
/*
 * __wt_encrypt --
 *	Common code to encrypt a WT_ITEM and return the encrypted buffer.
 *
 * Output layout: "skip" clear header bytes copied from the source, a
 * stored 32-bit total length, then the ciphertext.  The stored length is
 * what __wt_decrypt uses to size its work on the way back.
 */
int
__wt_encrypt(WT_SESSION_IMPL *session,
    WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out)
{
	WT_ENCRYPTOR *e;
	size_t cipher_cap, clear_len, total_len;
	uint32_t *len_slot;
	uint8_t *cipher_dst, *clear_src;

	e = kencryptor->encryptor;

	/* The payload to encrypt starts past the clear header bytes. */
	clear_src = (uint8_t *)in->mem + skip;
	clear_len = in->size - skip;

	/*
	 * The destination skips the header bytes and the length word we
	 * store; allow the encryptor's constant expansion on top of the
	 * source length.
	 */
	len_slot = (uint32_t *)((uint8_t *)out->mem + skip);
	cipher_dst = (uint8_t *)out->mem + skip + WT_ENCRYPT_LEN_SIZE;
	cipher_cap = clear_len + kencryptor->size_const;

	WT_RET(e->encrypt(e, &session->iface,
	    clear_src, clear_len, cipher_dst, cipher_cap, &total_len));

	/*
	 * We require encryption to be byte for byte: it must never expand
	 * past the capacity we computed.
	 */
	WT_ASSERT(session, total_len <= cipher_cap);

	/* The final length includes the header and the stored length word. */
	total_len += skip + WT_ENCRYPT_LEN_SIZE;

	/*
	 * Record the total size so the decryption side knows how much
	 * space it needs, byte-swapping into on-disk order on big-endian
	 * hosts.
	 */
	*len_slot = WT_STORE_SIZE(total_len);
#ifdef WORDS_BIGENDIAN
	*len_slot = __wt_bswap32(*len_slot);
#endif

	/* Copy the clear header bytes across and set the final size. */
	memcpy(out->mem, in->mem, skip);
	out->size = total_len;

	return (0);
}
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 *
 * The buffer must be allocated aligned (WT_ITEM_ALIGNED) and large enough
 * to be padded out to the block's allocation size.  On success the caller
 * gets back the file offset, aligned size and checksum of the write; on a
 * failed write the allocated block is returned to the free list.
 *
 * Parameters:
 *	session       - running session
 *	block         - block manager handle
 *	buf           - page image to write (header is rewritten in place)
 *	offsetp/sizep/cksump - out: address cookie components
 *	data_cksum    - checksum the whole block, not just the header prefix
 *	caller_locked - caller already holds block->live_lock
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    bool data_cksum, bool caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	uint32_t cksum;
	bool local_locked;

	fh = block->fh;

	/*
	 * Clear the block header to ensure all of it is initialized, even the
	 * unused fields.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	memset(blk, 0, sizeof(*blk));

	/*
	 * Swap the page-header as needed; this doesn't belong here, but it's
	 * the best place to catch all callers.
	 */
	__wt_page_header_byteswap(buf->mem);

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	/* The on-disk size field is 32-bit; refuse anything larger. */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption.  If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 *
	 * Checksum a little-endian version of the header, and write everything
	 * in little-endian format.  The checksum is (potentially) returned in
	 * a big-endian format, swap it into place in a separate step.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	__wt_block_header_byteswap(blk);
	blk->cksum = cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
	blk->cksum = __wt_bswap32(blk->cksum);
#endif

	/* Pre-allocate some number of extension structures. */
	WT_RET(__wt_block_ext_prealloc(session, 5));

	/*
	 * Acquire a lock, if we don't already hold one.
	 * Allocate space for the write, and optionally extend the file (note
	 * the block-extend function may release the lock).
	 * Release any locally acquired lock.
	 */
	local_locked = false;
	if (!caller_locked) {
		__wt_spin_lock(session, &block->live_lock);
		local_locked = true;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	if (ret == 0)
		ret = __wt_block_extend(
		    session, block, fh, offset, align_size, &local_locked);
	/*
	 * local_locked may have been cleared by __wt_block_extend releasing
	 * the lock; only unlock if we still hold it.
	 */
	if (local_locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		/*
		 * The write failed: put the just-allocated space back on the
		 * free list (the free call needs the lock), then return the
		 * original write error.
		 */
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif

	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, cksum));

	/* Return the address cookie components. */
	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = cksum;

	return (0);
}