/* * __wt_log_slot_free -- * Free a slot back into the pool. */ int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_DECL_RET; ret = 0; /* * Grow the buffer if needed before returning it to the pool. */ if (F_ISSET(slot, WT_SLOT_BUF_GROW)) { WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, slot->slot_buf.memsize); WT_ERR(__wt_buf_grow(session, &slot->slot_buf, slot->slot_buf.memsize * 2)); } err: /* * No matter if there is an error, we always want to free * the slot back to the pool. */ /* * Make sure flags don't get retained between uses. * We have to reset them them here because multiple threads may * change the flags when joining the slot. */ slot->flags = WT_SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; return (ret); }
/* * __wt_buf_init -- * Initialize a buffer at a specific size. */ int __wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { buf->data = buf->mem; WT_RET(__wt_buf_grow(session, buf, size)); buf->size = 0; return (0); }
/* * __dmsg -- * Debug message. */ static void __dmsg(WT_DBG *ds, const char *fmt, ...) { va_list ap; WT_ITEM *msg; WT_SESSION_IMPL *session; size_t len, space; char *p; session = ds->session; /* * Debug output chunks are not necessarily terminated with a newline * character. It's easy if we're dumping to a stream, but if we're * dumping to an event handler, which is line-oriented, we must buffer * the output chunk, and pass it to the event handler once we see a * terminating newline. */ if (ds->fp == NULL) { msg = ds->msg; for (;;) { p = (char *)msg->mem + msg->size; space = msg->memsize - msg->size; va_start(ap, fmt); len = (size_t)vsnprintf(p, space, fmt, ap); va_end(ap); /* Check if there was enough space. */ if (len < space) { msg->size += len; break; } /* * There's not much to do on error without checking for * an error return on every single printf. Anyway, it's * pretty unlikely and this is debugging output, I'm not * going to worry about it. */ if (__wt_buf_grow( session, msg, msg->memsize + len + 128) != 0) return; } if (((uint8_t *)msg->mem)[msg->size - 1] == '\n') { ((uint8_t *)msg->mem)[msg->size - 1] = '\0'; (void)__wt_msg(session, "%s", (char *)msg->mem); msg->size = 0; } } else { va_start(ap, fmt); (void)__wt_vfprintf(ds->fp, fmt, ap); va_end(ap); } }
/* * __wt_log_slot_grow_buffers -- * Increase the buffer size of all available slots in the buffer pool. * Go to some lengths to include active (but unused) slots to handle * the case where all log write record sizes exceed the size of the * active buffer. */ int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; int64_t orig_state; uint64_t old_size, total_growth; int i; conn = S2C(session); log = conn->log; total_growth = 0; WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); /* * Take the log slot lock to prevent other threads growing buffers * at the same time. Could tighten the scope of this lock, or have * a separate lock if there is contention. */ __wt_spin_lock(session, &log->log_slot_lock); for (i = 0; i < SLOT_POOL; i++) { slot = &log->slot_pool[i]; /* Avoid atomic operations if they won't succeed. */ if (slot->slot_state != WT_LOG_SLOT_FREE && slot->slot_state != WT_LOG_SLOT_READY) continue; /* Don't keep growing unrelated buffers. */ if (slot->slot_buf.memsize > (10 * newsize) && !F_ISSET(slot, SLOT_BUF_GROW)) continue; orig_state = WT_ATOMIC_CAS_VAL8( slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING); if (orig_state != WT_LOG_SLOT_FREE) { orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING); if (orig_state != WT_LOG_SLOT_READY) continue; } /* We have a slot - now go ahead and grow the buffer. */ old_size = slot->slot_buf.memsize; F_CLR(slot, SLOT_BUF_GROW); WT_ERR(__wt_buf_grow(session, &slot->slot_buf, WT_MAX(slot->slot_buf.memsize * 2, newsize))); slot->slot_state = orig_state; total_growth += slot->slot_buf.memsize - old_size; } err: __wt_spin_unlock(session, &log->log_slot_lock); WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth); return (ret); }
/* * __curextract_insert -- * Handle a key produced by a custom extractor. */ static int __curextract_insert(WT_CURSOR *cursor) { WT_CURSOR_EXTRACTOR *cextract; WT_ITEM *key, ikey, pkey; WT_SESSION_IMPL *session; cextract = (WT_CURSOR_EXTRACTOR *)cursor; session = (WT_SESSION_IMPL *)cursor->session; WT_ITEM_SET(ikey, cursor->key); /* * We appended a padding byte to the key to avoid rewriting the last * column. Strip that away here. */ WT_ASSERT(session, ikey.size > 0); --ikey.size; WT_RET(__wt_cursor_get_raw_key(cextract->ctable->cg_cursors[0], &pkey)); /* * We have the index key in the format we need, and all of the primary * key columns are required: just append them. */ key = &cextract->idxc->key; WT_RET(__wt_buf_grow(session, key, ikey.size + pkey.size)); memcpy((uint8_t *)key->mem, ikey.data, ikey.size); memcpy((uint8_t *)key->mem + ikey.size, pkey.data, pkey.size); key->size = ikey.size + pkey.size; /* * The index key is now set and the value is empty (it starts clear and * is never set). */ F_SET(cextract->idxc, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); /* Call the underlying cursor function to update the index. */ return (cextract->f(cextract->idxc)); }
/* * __verify_dsk_row -- * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. */ static int __verify_dsk_row( WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(current); WT_DECL_ITEM(last_ovfl); WT_DECL_ITEM(last_pfx); WT_DECL_RET; WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; uint32_t cell_num, cell_type, i, key_cnt, prefix; uint8_t *end; int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, ¤t)); WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); last = last_ovfl; end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { ret = __err_cell_corrupted(session, cell_num, tag); goto err; } /* Check the raw and collapsed cell types. */ WT_ERR(__err_cell_type( session, cell_num, tag, unpack->raw, dsk->type)); WT_ERR(__err_cell_type( session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; /* * Check ordering relationships between the WT_CELL entries. * For row-store internal pages, check for: * two values in a row, * two keys in a row, * a value as the first cell on a page. * For row-store leaf pages, check for: * two values in a row, * a value as the first cell on a page. */ switch (cell_type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: ++key_cnt; switch (last_cell_type) { case FIRST: case WAS_VALUE: break; case WAS_KEY: if (dsk->type == WT_PAGE_ROW_LEAF) break; WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent keys", cell_num - 1, tag); } last_cell_type = WAS_KEY; break; case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_VALUE: case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", tag); case WAS_KEY: break; case WAS_VALUE: WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent values", cell_num - 1, tag); } last_cell_type = WAS_VALUE; break; } /* Check if any referenced item has a valid address. */ switch (cell_type) { case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) goto eof; break; } /* * Remaining checks are for key order and prefix compression. * If this cell isn't a key, we're done, move to the next cell. * If this cell is an overflow item, instantiate the key and * compare it with the last key. Otherwise, we have to deal with * prefix compression. */ switch (cell_type) { case WT_CELL_KEY: break; case WT_CELL_KEY_OVFL: WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); goto key_compare; default: /* Not a key -- continue with the next cell. */ continue; } /* * Prefix compression checks. * * Confirm the first non-overflow key on a page has a zero * prefix compression count. */ prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 " key on page at %s is the first " "non-overflow key on the page and has a non-zero " "prefix compression value", cell_num, tag); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " "compression count of %" PRIu32 ", larger than " "the length of the previous key, %" WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* * If Huffman decoding required, unpack the cell to build the * key, then resolve the prefix. Else, we can do it faster * internally because we don't have to shuffle memory around as * much. */ if (huffman != NULL) { WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); /* * If there's a prefix, make sure there's enough buffer * space, then shift the decoded data past the prefix * and copy the prefix into place. Take care with the * pointers: current->data may be pointing inside the * buffer. */ if (prefix != 0) { WT_ERR(__wt_buf_grow( session, current, prefix + current->size)); memmove((uint8_t *)current->mem + prefix, current->data, current->size); memcpy(current->mem, last->data, prefix); current->data = current->mem; current->size += prefix; } } else { /* * Get the cell's data/length and make sure we have * enough buffer space. */ WT_ERR(__wt_buf_init( session, current, prefix + unpack->size)); /* Copy the prefix then the data into place. */ if (prefix != 0) memcpy(current->mem, last->data, prefix); memcpy((uint8_t *)current->mem + prefix, unpack->data, unpack->size); current->size = prefix + unpack->size; } key_compare: /* * Compare the current key against the last key. * * Be careful about the 0th key on internal pages: we only store * the first byte and custom collators may not be able to handle * truncated keys. */ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { WT_ERR(__wt_compare( session, btree->collator, last, current, &cmp)); if (cmp >= 0) WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " "page at %s are incorrectly sorted", cell_num - 2, cell_num, tag); } /* * Swap the buffers: last always references the last key entry, * last_pfx and last_ovfl reference the last prefix-compressed * and last overflow key entries. Current gets pointed to the * buffer we're not using this time around, which is where the * next key goes. */ last = current; if (cell_type == WT_CELL_KEY) { current = last_pfx; last_pfx = last; } else { current = last_ovfl; last_ovfl = last; } WT_ASSERT(session, last != current); }
/* * __wt_log_write -- * Write a record into the log. */ int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN lsn; WT_MYSLOT myslot; uint32_t rdup_len; int locked; conn = S2C(session); log = conn->log; locked = 0; INIT_LSN(&lsn); myslot.slot = NULL; /* * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has * a header at the beginning for us to fill in. * * If using direct_io, the caller should pass us an aligned record. * But we need to make sure it is big enough and zero-filled so * that we can write the full amount. Do this whether or not * direct_io is in use because it makes the reading code cleaner. */ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size); rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ASSERT(session, record->data == record->mem); /* * If the caller's record only partially fills the necessary * space, we need to zero-fill the remainder. */ if (record->size != rdup_len) { memset((uint8_t *)record->mem + record->size, 0, rdup_len - record->size); record->size = rdup_len; } logrec = (WT_LOG_RECORD *)record->mem; logrec->len = (uint32_t)record->size; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, record->size); WT_STAT_FAST_CONN_INCR(session, log_writes); if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { ret = __log_direct_write(session, record, lsnp, flags); if (ret == 0) return (0); if (ret != EAGAIN) WT_ERR(ret); /* * An EAGAIN return means we failed to get the try lock - * fall through to the consolidation code in that case. */ } /* * As soon as we see contention for the log slot, disable direct * log writes. We get better performance by forcing writes through * the consolidation code. This is because individual writes flood * the I/O system faster than they contend on the log slot lock. */ F_SET(log, WT_LOG_FORCE_CONSOLIDATE); if ((ret = __wt_log_slot_join( session, rdup_len, flags, &myslot)) == ENOMEM) { /* * If we couldn't find a consolidated slot for this record * write the record directly. */ while ((ret = __log_direct_write( session, record, lsnp, flags)) == EAGAIN) ; WT_ERR(ret); /* * Increase the buffer size of any slots we can get access * to, so future consolidations are likely to succeed. */ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); return (0); } WT_ERR(ret); if (myslot.offset == 0) { __wt_spin_lock(session, &log->log_slot_lock); locked = 1; WT_ERR(__wt_log_slot_close(session, myslot.slot)); WT_ERR(__log_acquire( session, myslot.slot->slot_group_size, myslot.slot)); __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; WT_ERR(__wt_log_slot_notify(session, myslot.slot)); } else WT_ERR(__wt_log_slot_wait(session, myslot.slot)); WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { WT_ERR(__log_release(session, myslot.slot)); WT_ERR(__wt_log_slot_free(myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_sync_cond, 10000); } err: if (locked) __wt_spin_unlock(session, &log->log_slot_lock); if (ret == 0 && lsnp != NULL) *lsnp = lsn; /* * If we're synchronous and some thread had an error, we don't know * if our write made it out to the file or not. The error could be * before or after us. So, if anyone got an error, we report it. * If we're not synchronous, only report if our own operation got * an error. */ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL) ret = myslot.slot->slot_error; return (ret); }
/* * __wt_log_scan -- * Scan the logs, calling a function on each record found. */ int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie) { WT_CONNECTION_IMPL *conn; WT_ITEM buf; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN end_lsn, rd_lsn, start_lsn; off_t log_size; uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen; u_int i, logcount; int eol; char **logfiles; conn = S2C(session); log = conn->log; log_fh = NULL; logcount = 0; logfiles = NULL; eol = 0; WT_CLEAR(buf); /* * If the caller did not give us a callback function there is nothing * to do. */ if (func == NULL) return (0); if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_RET(__wt_verbose(session, WT_VERB_LOG, "__wt_log_scan truncating to %u/%" PRIuMAX, log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset)); if (log != NULL) { allocsize = log->allocsize; if (lsnp == NULL) { if (LF_ISSET(WT_LOGSCAN_FIRST)) start_lsn = log->first_lsn; else if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) start_lsn = log->ckpt_lsn; else return (WT_ERROR); /* Illegal usage */ } else { if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) WT_RET_MSG(session, WT_ERROR, "choose either a start LSN or a start flag"); /* Offsets must be on allocation boundaries. */ if (lsnp->offset % allocsize != 0 || lsnp->file > log->fileid) return (WT_NOTFOUND); /* * Log cursors may not know the starting LSN. If an * LSN pointer is passed in, but it is the INIT_LSN, * start from the first_lsn. */ start_lsn = *lsnp; if (IS_INIT_LSN(&start_lsn)) start_lsn = log->first_lsn; } end_lsn = log->alloc_lsn; } else { /* * If logging is not configured, we can still print out the log * if log files exist. We just need to set the LSNs from what * is in the files versus what is in the live connection. */ /* * Set allocsize to the minimum alignment it could be. Larger * records and larger allocation boundaries should always be * a multiple of this. */ allocsize = LOG_ALIGN; lastlog = 0; firstlog = UINT32_MAX; WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); if (logcount == 0) /* * Return it is not supported if none don't exist. */ return (ENOTSUP); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } start_lsn.file = firstlog; end_lsn.file = lastlog; start_lsn.offset = end_lsn.offset = 0; __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; } WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN)); for (;;) { if (rd_lsn.offset + allocsize > log_size) { advance: /* * If we read the last record, go to the next file. */ WT_ERR(__wt_close(session, log_fh)); log_fh = NULL; eol = 1; /* * Truncate this log file before we move to the next. */ if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_ERR(__log_truncate(session, &rd_lsn, 1)); rd_lsn.file++; rd_lsn.offset = 0; /* * Avoid an error message when we reach end of log * by checking here. */ if (rd_lsn.file > end_lsn.file) break; WT_ERR(__log_openfile( session, 0, &log_fh, rd_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); continue; } /* * Read the minimum allocation size a record could be. */ WT_ASSERT(session, buf.memsize >= allocsize); WT_ERR(__wt_read(session, log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem)); /* * First 8 bytes is the real record length. See if we * need to read more than the allocation size. We expect * that we rarely will have to read more. Most log records * will be fairly small. */ reclen = *(uint32_t *)buf.mem; /* * Log files are pre-allocated. We never expect a zero length * unless we've reached the end of the log. The log can be * written out of order, so when recovery finds the end of * the log, truncate the file and remove any later log files * that may exist. */ if (reclen == 0) { /* This LSN is the end. */ break; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { /* * The log file end could be the middle of this * log record. */ if (rd_lsn.offset + rdup_len > log_size) goto advance; /* * We need to round up and read in the full padded * record, especially for direct I/O. */ WT_ERR(__wt_buf_grow(session, &buf, rdup_len)); WT_ERR(__wt_read(session, log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem)); WT_STAT_FAST_CONN_INCR(session, log_scan_rereads); } /* * We read in the record, verify checksum. */ buf.size = reclen; logrec = (WT_LOG_RECORD *)buf.mem; cksum = logrec->checksum; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); if (logrec->checksum != cksum) { /* * A checksum mismatch means we have reached the end of * the useful part of the log. This should be found on * the first pass through recovery. In the second pass * where we truncate the log, this is where it should * end. */ if (log != NULL) log->trunc_lsn = rd_lsn; break; } /* * We have a valid log record. If it is not the log file * header, invoke the callback. */ WT_STAT_FAST_CONN_INCR(session, log_scan_records); if (rd_lsn.offset != 0) { WT_ERR((*func)(session, &buf, &rd_lsn, cookie)); if (LF_ISSET(WT_LOGSCAN_ONE)) break; } rd_lsn.offset += (off_t)rdup_len; } /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) WT_ERR(__log_truncate(session, &rd_lsn, 0)); err: WT_STAT_FAST_CONN_INCR(session, log_scans); if (logfiles != NULL) __wt_log_files_free(session, logfiles, logcount); __wt_buf_free(session, &buf); /* * If the caller wants one record and it is at the end of log, * return WT_NOTFOUND. */ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) ret = WT_NOTFOUND; if (ret == ENOENT) ret = 0; if (log_fh != NULL) WT_TRET(__wt_close(session, log_fh)); return (ret); }
/* * __wt_log_read -- * Read the log record at the given LSN. Return the record (including * the log header) in the WT_ITEM. Caller is responsible for freeing it. */ int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; WT_LOG_RECORD *logrec; uint32_t cksum, rdup_len, reclen; WT_UNUSED(flags); /* * If the caller didn't give us an LSN or something to return, * there's nothing to do. */ if (lsnp == NULL || record == NULL) return (0); conn = S2C(session); log = conn->log; /* * If the offset isn't on an allocation boundary it must be wrong. */ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid) return (WT_NOTFOUND); WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file)); /* * Read the minimum allocation size a record could be. */ WT_ERR(__wt_buf_init(session, record, log->allocsize)); WT_ERR(__wt_read(session, log_fh, lsnp->offset, (size_t)log->allocsize, record->mem)); /* * First 4 bytes is the real record length. See if we * need to read more than the allocation size. We expect * that we rarely will have to read more. Most log records * will be fairly small. */ reclen = *(uint32_t *)record->mem; if (reclen == 0) { ret = WT_NOTFOUND; goto err; } if (reclen > log->allocsize) { rdup_len = __wt_rduppo2(reclen, log->allocsize); WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ERR(__wt_read(session, log_fh, lsnp->offset, (size_t)rdup_len, record->mem)); } /* * We read in the record, verify checksum. */ logrec = (WT_LOG_RECORD *)record->mem; cksum = logrec->checksum; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); if (logrec->checksum != cksum) WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum"); record->size = logrec->len; WT_STAT_FAST_CONN_INCR(session, log_reads); err: WT_TRET(__wt_close(session, log_fh)); return (ret); }
/* * __log_release -- * Release a log slot. */ static int __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; WT_LSN sync_lsn; size_t write_size; WT_DECL_SPINLOCK_ID(id); /* Must appear last */ conn = S2C(session); log = conn->log; /* * If we're going to have to close our log file, make a local copy * of the file handle structure. */ close_fh = NULL; if (F_ISSET(slot, SLOT_CLOSEFH)) { close_fh = log->log_close_fh; log->log_close_fh = NULL; F_CLR(slot, SLOT_CLOSEFH); } /* Write the buffered records */ if (F_ISSET(slot, SLOT_BUFFERED)) { write_size = (size_t) (slot->slot_end_lsn.offset - slot->slot_start_offset); WT_ERR(__wt_write(session, slot->slot_fh, slot->slot_start_offset, write_size, slot->slot_buf.mem)); } /* * Wait for earlier groups to finish, otherwise there could be holes * in the log file. */ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) __wt_yield(); log->write_lsn = slot->slot_end_lsn; /* * Try to consolidate calls to fsync to wait less. Acquire a spin lock * so that threads finishing writing to the log will wait while the * current fsync completes and advance log->write_lsn. */ while (F_ISSET(slot, SLOT_SYNC) && LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) { (void)__wt_cond_wait( session, log->log_sync_cond, 10000); continue; } /* * Record the current end of log after we grabbed the lock. * That is how far our fsync call with guarantee. */ sync_lsn = log->write_lsn; if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_STAT_FAST_CONN_INCR(session, log_sync); ret = __wt_fsync(session, log->log_fh); if (ret == 0) { F_CLR(slot, SLOT_SYNC); log->sync_lsn = sync_lsn; ret = __wt_cond_signal( session, log->log_sync_cond); } } __wt_spin_unlock(session, &log->log_sync_lock); WT_ERR(ret); } if (F_ISSET(slot, SLOT_BUF_GROW)) { WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); F_CLR(slot, SLOT_BUF_GROW); WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, slot->slot_buf.memsize); WT_ERR(__wt_buf_grow(session, &slot->slot_buf, slot->slot_buf.memsize * 2)); } /* * If we have a file to close, close it now. */ if (close_fh) WT_ERR(__wt_close(session, close_fh)); err: if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; return (ret); }
/* * __wt_schema_project_merge -- * Given list of cursors and a projection, build a buffer containing the * column values read from the cursors. */ int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_DECL_PACK_VALUE(vpv); WT_ITEM *buf; WT_PACK vpack; u_long arg; char *proj; const uint8_t *p, *end; uint8_t *vp; size_t len; p = end = NULL; /* -Wuninitialized */ WT_RET(__wt_buf_init(session, value, 0)); WT_RET(__pack_init(session, &vpack, vformat)); for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = buf->data; end = p + buf->size; continue; } /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_NEXT: case WT_PROJ_SKIP: case WT_PROJ_REUSE: WT_RET(__pack_next(&pack, &pv)); WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); /* Only copy the value out once. */ if (*proj != WT_PROJ_NEXT) break; WT_RET(__pack_next(&vpack, &vpv)); /* Make sure the types are compatible. */ WT_ASSERT(session, __wt_tolower((u_char)pv.type) == __wt_tolower((u_char)vpv.type)); vpv.u = pv.u; WT_RET(__pack_size(session, &vpv, &len)); WT_RET(__wt_buf_grow(session, value, value->size + len)); vp = (uint8_t *)value->mem + value->size; WT_RET(__pack_write(session, &vpv, &vp, len)); value->size += len; break; } } } return (0); }
/* * __wt_schema_project_slice -- * Given list of cursors and a projection, read columns from the * a raw buffer. */ int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, bool key_only, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_DECL_ITEM(buf); WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_DECL_PACK_VALUE(vpv); WT_PACK vpack; u_long arg; char *proj; uint8_t *end, *p; const uint8_t *next, *vp, *vend; size_t len, offset, old_len; bool skip; p = end = NULL; /* -Wuninitialized */ WT_RET(__pack_init(session, &vpack, vformat)); vp = value->data; vend = vp + value->size; /* Reset any of the buffers we will be setting. */ for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); if (*proj == WT_PROJ_KEY) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->key, 0)); } else if (*proj == WT_PROJ_VALUE && !key_only) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->value, 0)); } } skip = key_only; for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: skip = false; c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: skip = key_only; if (skip) continue; c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* We have to get a key or value before any operations. */ WT_ASSERT(session, skip || buf != NULL); /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_SKIP: if (skip) break; WT_RET(__pack_next(&pack, &pv)); /* * A nasty case: if we are inserting * out-of-order, append a zero value to keep * the buffer in the correct format. */ if (p == end) { /* Set up an empty value. */ WT_CLEAR(pv.u); if (pv.type == 'S' || pv.type == 's') pv.u.s = ""; WT_RET(__pack_size(session, &pv, &len)); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->data + buf->size; WT_RET(__pack_write( session, &pv, &p, len)); end = p; buf->size += len; } else WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); break; case WT_PROJ_NEXT: WT_RET(__pack_next(&vpack, &vpv)); WT_RET(__unpack_read(session, &vpv, &vp, (size_t)(vend - vp))); /* FALLTHROUGH */ case WT_PROJ_REUSE: if (skip) break; /* * Read the item we're about to overwrite. * * There is subtlety here: the value format * may not exactly match the cursor's format. * In particular, we need lengths with raw * columns in the middle of a packed struct, * but not if they are at the end of a struct. */ WT_RET(__pack_next(&pack, &pv)); next = p; if (p < end) WT_RET(__unpack_read(session, &pv, &next, (size_t)(end - p))); old_len = (size_t)(next - p); /* Make sure the types are compatible. */ WT_ASSERT(session, __wt_tolower((u_char)pv.type) == __wt_tolower((u_char)vpv.type)); pv.u = vpv.u; WT_RET(__pack_size(session, &pv, &len)); offset = WT_PTRDIFF(p, buf->data); /* * Avoid growing the buffer if the value fits. * This is not just a performance issue: it * covers the case of record number keys, which * have to be written to cursor->recno. */ if (len > old_len) WT_RET(__wt_buf_grow(session, buf, buf->size + len - old_len)); p = (uint8_t *)buf->data + offset; /* Make room if we're inserting out-of-order. */ if (offset + old_len < buf->size) memmove(p + len, p + old_len, buf->size - (offset + old_len)); WT_RET(__pack_write(session, &pv, &p, len)); buf->size += len - old_len; end = (uint8_t *)buf->data + buf->size; break; default: WT_RET_MSG(session, EINVAL, "unexpected projection plan: %c", (int)*proj); } } } return (0); }
/* * __wt_schema_project_in -- * Given list of cursors and a projection, read columns from the * application into the dependent cursors. */ int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap) { WT_CURSOR *c; WT_DECL_ITEM(buf); WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_PACK_VALUE old_pv; size_t len, offset, old_len; u_long arg; char *proj; uint8_t *p, *end; const uint8_t *next; p = end = NULL; /* -Wuninitialized */ /* Reset any of the buffers we will be setting. */ for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); if (*proj == WT_PROJ_KEY) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->key, 0)); } else if (*proj == WT_PROJ_VALUE) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->value, 0)); } } for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* We have to get a key or value before any operations. */ WT_ASSERT(session, buf != NULL); /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_SKIP: WT_RET(__pack_next(&pack, &pv)); /* * A nasty case: if we are inserting * out-of-order, we may reach the end of the * data. That's okay: we want to append in * that case, and we're positioned to do that. */ if (p == end) { /* Set up an empty value. */ WT_CLEAR(pv.u); if (pv.type == 'S' || pv.type == 's') pv.u.s = ""; WT_RET(__pack_size(session, &pv, &len)); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->mem + buf->size; WT_RET(__pack_write( session, &pv, &p, len)); buf->size += len; end = (uint8_t *)buf->mem + buf->size; } else if (*proj == WT_PROJ_SKIP) WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); break; case WT_PROJ_NEXT: WT_RET(__pack_next(&pack, &pv)); WT_PACK_GET(session, pv, ap); /* FALLTHROUGH */ case WT_PROJ_REUSE: /* Read the item we're about to overwrite. */ next = p; if (p < end) { old_pv = pv; WT_RET(__unpack_read(session, &old_pv, &next, (size_t)(end - p))); } old_len = (size_t)(next - p); WT_RET(__pack_size(session, &pv, &len)); offset = WT_PTRDIFF(p, buf->mem); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->mem + offset; end = (uint8_t *)buf->mem + buf->size + len; /* Make room if we're inserting out-of-order. */ if (offset + old_len < buf->size) memmove(p + len, p + old_len, buf->size - (offset + old_len)); WT_RET(__pack_write(session, &pv, &p, len)); buf->size += len; break; default: WT_RET_MSG(session, EINVAL, "unexpected projection plan: %c", (int)*proj); } } } return (0); }
/* * __wt_schema_project_merge -- * Given list of cursors and a projection, build a buffer containing the * column values read from the cursors. */ int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_ITEM *buf; WT_PACK pack, vpack; WT_PACK_VALUE pv, vpv; char *proj; uint8_t *p, *end, *vp; size_t len; uint32_t arg; WT_CLEAR(pack); /* -Wuninitialized */ WT_CLEAR(pv); /* -Wuninitialized */ WT_CLEAR(vpv); /* -Wuninitialized */ p = end = NULL; /* -Wuninitialized */ WT_RET(__wt_buf_init(session, value, 0)); WT_RET(__pack_init(session, &vpack, vformat)); for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = (uint32_t)strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_NEXT: case WT_PROJ_SKIP: WT_RET(__pack_next(&pack, &pv)); WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); if (*proj == WT_PROJ_SKIP) break; WT_RET(__pack_next(&vpack, &vpv)); vpv.u = pv.u; len = __pack_size(session, &vpv); WT_RET(__wt_buf_grow(session, value, value->size + len)); vp = (uint8_t *)value->data + value->size; WT_RET(__pack_write(session, &vpv, &vp, len)); value->size += WT_STORE_SIZE(len); /* FALLTHROUGH */ case WT_PROJ_REUSE: /* Don't copy the same value twice. */ break; } } } return (0); }
/* * __wt_schema_project_slice -- * Given list of cursors and a projection, read columns from the * a raw buffer. */ int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_ITEM *buf; WT_PACK pack, vpack; WT_PACK_VALUE pv, vpv; char *proj; uint8_t *end, *p; const uint8_t *next, *vp, *vend; size_t len, offset, old_len; uint32_t arg; int skip; WT_CLEAR(pack); /* -Wuninitialized */ WT_CLEAR(vpv); /* -Wuninitialized */ buf = NULL; /* -Wuninitialized */ p = end = NULL; /* -Wuninitialized */ WT_RET(__pack_init(session, &vpack, vformat)); vp = (uint8_t *)value->data; vend = vp + value->size; /* Reset any of the buffers we will be setting. */ for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = (uint32_t)strtoul(proj, &proj, 10); if (*proj == WT_PROJ_KEY) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->key, 0)); } else if (*proj == WT_PROJ_VALUE && !key_only) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->value, 0)); } } skip = key_only; for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = (uint32_t)strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: skip = 0; c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: if ((skip = key_only) != 0) continue; c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_NEXT: case WT_PROJ_SKIP: if (!skip) { WT_RET(__pack_next(&pack, &pv)); /* * A nasty case: if we are inserting * out-of-order, append a zero value * to keep the buffer in the correct * format. */ if (*proj == WT_PROJ_SKIP && p == end) { /* Set up an empty value. */ WT_CLEAR(pv.u); if (pv.type == 'S' || pv.type == 's') pv.u.s = ""; len = __pack_size(session, &pv); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->data + buf->size; WT_RET(__pack_write( session, &pv, &p, len)); end = p; buf->size += WT_STORE_SIZE(len); } else if (*proj == WT_PROJ_SKIP) WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); } if (*proj == WT_PROJ_SKIP) break; WT_RET(__pack_next(&vpack, &vpv)); WT_RET(__unpack_read(session, &vpv, &vp, (size_t)(vend - vp))); /* FALLTHROUGH */ case WT_PROJ_REUSE: if (skip) break; /* Read the item we're about to overwrite. */ next = p; if (p < end) WT_RET(__unpack_read(session, &pv, &next, (size_t)(end - p))); old_len = (size_t)(next - p); /* * There is subtlety here: the value format * may not exactly match the cursor's format. * In particular, we need lengths with raw * columns in the middle of a packed struct, * but not if they are at the end of a column. */ pv.u = vpv.u; len = __pack_size(session, &pv); offset = WT_PTRDIFF(p, buf->data); WT_RET(__wt_buf_grow(session, buf, buf->size + len - old_len)); p = (uint8_t *)buf->data + offset; /* Make room if we're inserting out-of-order. */ if (offset + old_len < buf->size) memmove(p + len, p + old_len, buf->size - (offset + old_len)); WT_RET(__pack_write(session, &pv, &p, len)); buf->size += WT_STORE_SIZE(len - old_len); end = (uint8_t *)buf->data + buf->size; break; default: WT_RET_MSG(session, EINVAL, "unexpected projection plan: %c", (int)*proj); } } } return (0); }
/* * slow_apply_api -- * Apply a set of modification changes using a different algorithm. */ static void slow_apply_api(WT_ITEM *orig) { static WT_ITEM _tb; WT_ITEM *ta, *tb, *tmp, _tmp; size_t len, size; int i; ta = orig; tb = &_tb; /* Mess up anything not initialized in the buffers. */ memset((uint8_t *)ta->mem + ta->size, 0xff, ta->memsize - ta->size); memset((uint8_t *)tb->mem, 0xff, tb->memsize); /* * Process the entries to figure out how large a buffer we need. This is * a bit pessimistic because we're ignoring replacement bytes, but it's * a simpler calculation. */ for (size = ta->size, i = 0; i < nentries; ++i) { if (entries[i].offset >= size) size = entries[i].offset; size += entries[i].data.size; } testutil_check(__wt_buf_grow(NULL, ta, size)); testutil_check(__wt_buf_grow(NULL, tb, size)); #if DEBUG show(ta, "slow-apply start"); #endif /* * From the starting buffer, create a new buffer b based on changes * in the entries array. We're doing a brute force solution here to * test the faster solution implemented in the library. */ for (i = 0; i < nentries; ++i) { /* Take leading bytes from the original, plus any gap bytes. */ if (entries[i].offset >= ta->size) { memcpy(tb->mem, ta->mem, ta->size); if (entries[i].offset > ta->size) memset((uint8_t *)tb->mem + ta->size, '\0', entries[i].offset - ta->size); } else if (entries[i].offset > 0) memcpy(tb->mem, ta->mem, entries[i].offset); tb->size = entries[i].offset; /* Take replacement bytes. */ if (entries[i].data.size > 0) { memcpy((uint8_t *)tb->mem + tb->size, entries[i].data.data, entries[i].data.size); tb->size += entries[i].data.size; } /* Take trailing bytes from the original. */ len = entries[i].offset + entries[i].size; if (ta->size > len) { memcpy((uint8_t *)tb->mem + tb->size, (uint8_t *)ta->mem + len, ta->size - len); tb->size += ta->size - len; } testutil_assert(tb->size <= size); /* Swap the buffers and do it again. */ tmp = ta; ta = tb; tb = tmp; } ta->data = ta->mem; tb->data = tb->mem; /* * The final results may not be in the original buffer, in which case * we swap them back around. */ if (ta != orig) { _tmp = *ta; *ta = *tb; *tb = _tmp; } #if DEBUG show(ta, "slow-apply finish"); #endif }