/*
 * __log_filesize --
 *	Estimate where the log data in a log file really ends.
 *
 *	If eof is NULL nothing is done and 0 is returned. Otherwise *eof is
 *	first cleared, then set to one allocation unit past the highest
 *	allocation-aligned offset whose leading 8 bytes are non-zero. Returns 0
 *	on success, or the error from the size/read call (in which case *eof
 *	is left at 0).
 */
static int
__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
{
	WT_DECL_RET;
	WT_LOG *log;
	uint64_t first_word;
	uint32_t align;
	off_t file_size, pos, step;

	log = S2C(session)->log;

	/* Nothing to report if the caller didn't ask for a result. */
	if (eof == NULL)
		return (0);
	*eof = 0;

	WT_ERR(__wt_filesize(session, fh, &file_size));

	/* Without a log structure, fall back to the default alignment. */
	align = log == NULL ? LOG_ALIGN : log->allocsize;
	step = (off_t)align;

	/*
	 * Every log record starts on an allocation-size boundary and begins
	 * with its length, so walk backward from the end of the file one
	 * allocation unit at a time until a boundary with non-zero leading
	 * bytes is found. That spot may be the interior of a large record
	 * rather than a record start, but no record can begin after it.
	 * Offset zero is never examined.
	 */
	pos = file_size - step;
	while (pos > 0) {
		WT_ERR(__wt_read(
		    session, fh, pos, sizeof(uint64_t), &first_word));
		if (first_word != 0)
			break;
		pos -= step;
	}

	/* The estimate is the boundary just past where the walk stopped. */
	*eof = pos + step;

err:	return (ret);
}
/*
 * __wt_fopen --
 *	Open a stream handle on top of a regular file handle.
 *
 *	open_flags is passed through to __wt_open; flags selects the stream
 *	mode (WT_STREAM_APPEND, WT_STREAM_WRITE or WT_STREAM_READ). On success
 *	*fstrp references the new stream. On failure *fstrp is NULL, the file
 *	handle is closed and the stream memory is released.
 */
int
__wt_fopen(WT_SESSION_IMPL *session,
    const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp)
{
	WT_DECL_RET;
	WT_FH *handle;
	WT_FSTREAM *stream;

	stream = NULL;
	*fstrp = NULL;

	WT_RET(__wt_open(
	    session, name, WT_OPEN_FILE_TYPE_REGULAR, open_flags, &handle));

	WT_ERR(__wt_calloc_one(session, &stream));
	stream->close = __fstream_close;
	stream->flags = flags;
	stream->name = handle->name;
	stream->fh = handle;

	/* Appending streams start at the current end of the file. */
	WT_ERR(__wt_filesize(session, handle, &stream->size));
	if (LF_ISSET(WT_STREAM_APPEND))
		stream->off = stream->size;

	/*
	 * A stream is either readable or writable, never both: install the
	 * real methods for the chosen direction and "not supported" stubs for
	 * the other one.
	 */
	if (!LF_ISSET(WT_STREAM_APPEND | WT_STREAM_WRITE)) {
		WT_ASSERT(session, LF_ISSET(WT_STREAM_READ));
		fstr_readonly:
		stream->fstr_flush = __fstream_flush_notsup;
		stream->fstr_getline = __fstream_getline;
		stream->fstr_printf = __fstream_printf_notsup;
	} else {
		stream->fstr_flush = __fstream_flush;
		stream->fstr_getline = __fstream_getline_notsup;
		stream->fstr_printf = __fstream_printf;
	}

	*fstrp = stream;
	return (0);

err:	WT_TRET(__wt_close(session, &handle));
	__wt_free(session, stream);
	return (ret);
}
/*
 * __wt_optrack_record_funcid --
 *	Assign an operation-tracking ID to a function and log the mapping.
 *
 *	If *func_idp is still zero once the map lock is held, the next
 *	process-wide ID is stored in *func_idp and an "<id> <name>\n" line is
 *	written at the current end of the connection's optrack map file. Any
 *	failure along the way is reported through WT_PANIC_MSG.
 */
void
__wt_optrack_record_funcid(
    WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp)
{
	static uint16_t optrack_uid = 0; /* Unique for the process lifetime. */
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(line);
	WT_DECL_RET;
	wt_off_t map_end;
	bool have_lock;

	have_lock = false;
	conn = S2C(session);

	/* Room for the function name plus the ID and separators. */
	WT_ERR(__wt_scr_alloc(session, strlen(func) + 32, &line));

	__wt_spin_lock(session, &conn->optrack_map_spinlock);
	have_lock = true;

	/* Only the first caller for a given function records the mapping. */
	if (*func_idp == 0) {
		*func_idp = ++optrack_uid;

		WT_ERR(__wt_buf_fmt(
		    session, line, "%" PRIu16 " %s\n", *func_idp, func));

		/* Append: the write offset is the file's current size. */
		WT_ERR(__wt_filesize(session, conn->optrack_map_fh, &map_end));
		WT_ERR(__wt_write(session,
		    conn->optrack_map_fh, map_end, line->size, line->data));
	}

	if (0) {
err:		WT_PANIC_MSG(session, ret,
		    "operation tracking initialization failure");
	}

	if (have_lock)
		__wt_spin_unlock(session, &conn->optrack_map_spinlock);
	__wt_scr_free(session, &line);
}
/*
 * __wt_open --
 *	Open a file handle (Windows implementation).
 *
 *	name is the file to open; ok_create allows the file to be created;
 *	exclusive selects the CREATE_ALWAYS disposition when creating; dio_type
 *	is the WT_FILE_TYPE_XXX of the file and drives the direct I/O,
 *	write-through and random-access flags. On success *fhp references a
 *	handle, either a new one or an already-open one whose reference count
 *	was incremented. Returns 0 on success, an error code otherwise.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	DWORD dwCreationDisposition;
	HANDLE filehandle, filehandle_secondary;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	uint64_t bucket, hash;
	int direct_io, f, matched, share_mode;
	char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	filehandle = INVALID_HANDLE_VALUE;
	filehandle_secondary = INVALID_HANDLE_VALUE;
	direct_io = 0;
	hash = __wt_hash_city64(name, strlen(name));
	bucket = hash % WT_HASH_ARRAY_SIZE;

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	/* For directories, create empty file handles with invalid handles */
	if (dio_type == WT_FILE_TYPE_DIRECTORY) {
		goto setupfh;
	}

	WT_RET(__wt_filename(session, name, &path));

	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 *
	 * TODO: Set tighter file permissions but set bInheritHandle to false
	 * to prevent inheritance
	 */

	f = FILE_ATTRIBUTE_NORMAL;

	/*
	 * NOTE(review): "exclusive" maps to CREATE_ALWAYS, which replaces an
	 * existing file rather than failing -- confirm this is the intended
	 * meaning of the flag on this platform.
	 */
	dwCreationDisposition = 0;
	if (ok_create) {
		dwCreationDisposition = CREATE_NEW;
		if (exclusive)
			dwCreationDisposition = CREATE_ALWAYS;
	} else
		dwCreationDisposition = OPEN_EXISTING;

	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
		direct_io = 1;
	}

	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
		f |= FILE_FLAG_WRITE_THROUGH;
	}

	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		f |= FILE_FLAG_RANDOM_ACCESS;

	filehandle = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode, NULL, dwCreationDisposition, f, NULL);
	if (filehandle == INVALID_HANDLE_VALUE) {
		/*
		 * Creation was allowed but the file already exists: fall back
		 * to opening the existing file.
		 */
		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
			filehandle = CreateFileA(path,
			    (GENERIC_READ | GENERIC_WRITE),
			    share_mode, NULL, OPEN_EXISTING, f, NULL);

		if (filehandle == INVALID_HANDLE_VALUE)
			WT_ERR_MSG(session, __wt_errno(),
			    direct_io ?
			    "%s: open failed with direct I/O configured, some "
			    "filesystem types do not support direct I/O" :
			    "%s", path);
	}

	/*
	 * Open a second handle to file to support allocation/truncation
	 * concurrently with reads on the file. Writes would also move the file
	 * pointer.
	 */
	filehandle_secondary = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode, NULL, OPEN_EXISTING, f, NULL);

	/*
	 * Check the handle that was just opened: testing the primary handle
	 * here would let a failed secondary open go unnoticed and store an
	 * invalid handle in the file structure.
	 */
	if (filehandle_secondary == INVALID_HANDLE_VALUE)
		WT_ERR_MSG(session, __wt_errno(),
		    "open failed for secondary handle: %s", path);

setupfh:
	WT_ERR(__wt_calloc_one(session, &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->name_hash = hash;
	fh->filehandle = filehandle;
	fh->filehandle_secondary = filehandle_secondary;
	fh->ref = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	if (dio_type != WT_FILE_TYPE_DIRECTORY)
		WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		fh->extend_len = conn->data_extend_len;

	/* Configure fallocate/posix_fallocate calls. */
	__wt_fallocate_config(session, fh);

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		WT_CONN_FILE_INSERT(conn, fh, bucket);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);

	/*
	 * Another thread linked the same file while we were opening it, or an
	 * error occurred: discard everything this call acquired.
	 */
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (filehandle != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle);
		if (filehandle_secondary != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle_secondary);
	}

	__wt_free(session, path);
	return (ret);
}
/*
 * __conn_single --
 *	Confirm that no other thread of control is using this database.
 *
 *	Opens (optionally creating) the WT_SINGLETHREAD lock file, takes a
 *	byte lock on it, and checks the process' connection list for another
 *	connection with the same home. On success conn->is_new records whether
 *	the lock file was found empty. On failure after the lock file was
 *	opened, the lock file is closed and conn->lock_fh is cleared.
 */
static int
__conn_single(WT_SESSION_IMPL *session, const char **cfg)
{
	WT_CONFIG_ITEM create_cfg;
	WT_CONNECTION_IMPL *conn, *other;
	WT_DECL_RET;
	off_t lock_size;
	uint32_t nbytes;
	int in_use, was_empty;
	char text[256];

	conn = S2C(session);

	/*
	 * The lock file is created only if the "create" configuration asks
	 * for it. Whether this call creates it is irrelevant: only the "am I
	 * the sole locker" tests below decide the outcome.
	 */
	WT_RET(__wt_config_gets(session, cfg, "create", &create_cfg));
	WT_RET(__wt_open(session,
	    WT_SINGLETHREAD, create_cfg.val != 0, 0, 0, &conn->lock_fh));

	/*
	 * Try to lock one byte of the file; failure means another process
	 * owns the database. An empty file is fine, the underlying call can
	 * lock bytes past the end-of-file.
	 */
	if (__wt_bytelock(conn->lock_fh, (off_t)0, 1) != 0)
		WT_ERR_MSG(session, EBUSY, "%s",
		    "WiredTiger database is already being managed by another "
		    "process");

	/* Look for another connection in this process with the same home. */
	in_use = 0;
	__wt_spin_lock(session, &__wt_process.spinlock);
	TAILQ_FOREACH(other, &__wt_process.connqh, q) {
		if (other == conn || other->home == NULL)
			continue;
		if (strcmp(other->home, conn->home) == 0) {
			in_use = 1;
			break;
		}
	}
	__wt_spin_unlock(session, &__wt_process.spinlock);
	if (in_use)
		WT_ERR_MSG(session, EBUSY, "%s",
		    "WiredTiger database is already being managed by another "
		    "thread in this process");

	/*
	 * An empty lock file means we created it (or are racing with whoever
	 * did, which makes no difference): put a few identifying bytes in it
	 * so it isn't left zero-length.
	 */
	WT_ERR(__wt_filesize(session, conn->lock_fh, &lock_size));
	was_empty = lock_size == 0;
	if (was_empty) {
		nbytes = (uint32_t)snprintf(text, sizeof(text), "%s\n%s\n",
		    WT_SINGLETHREAD, wiredtiger_version(NULL, NULL, NULL));
		WT_ERR(__wt_write(
		    session, conn->lock_fh, (off_t)0, (uint32_t)nbytes, text));
	}

	/*
	 * Finding an empty lock file and ending up as the owner counts as
	 * having created the database. Another process could in theory have
	 * created the file while we won the race to register the connection;
	 * that's harmless, only one thread is ever told it created the
	 * database.
	 */
	conn->is_new = was_empty;
	return (0);

err:	if (conn->lock_fh != NULL) {
		WT_TRET(__wt_close(session, conn->lock_fh));
		conn->lock_fh = NULL;
	}
	return (ret);
}
/*
 * __conn_config_file --
 *	Read in any WiredTiger_config file in the home directory.
 *
 *	If the file exists and is non-empty, its lines are collapsed into a
 *	single comma-separated configuration string, the string is checked,
 *	and it is inserted into the cfg array immediately before the last
 *	entry. *cbufp is set to the buffer holding the string (NULL if there
 *	is no file, the file is empty, or an error occurs). Returns 0 on
 *	success, an error code otherwise.
 *
 *	NOTE(review): the insertion writes one slot past the array's current
 *	NULL terminator -- presumably the caller reserves a spare slot and a
 *	following terminator; confirm against the caller.
 */
static int
__conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp)
{
	WT_DECL_ITEM(cbuf);
	WT_DECL_RET;
	WT_FH *fh;
	off_t size;
	uint32_t len;
	int exist, quoted;
	uint8_t *p, *t;

	*cbufp = NULL;				/* Returned buffer */

	fh = NULL;

	/* Check for an optional configuration file. */
#define	WT_CONFIGFILE	"WiredTiger.config"
	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
	if (!exist)
		return (0);

	/* Open the configuration file. */
	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
	WT_ERR(__wt_filesize(session, fh, &size));

	/* An empty file is not an error: close the handle and return. */
	if (size == 0)
		goto err;

	/*
	 * Sanity test: a 100KB configuration file would be insane.  (There's
	 * no practical reason to limit the file size, but I can either limit
	 * the file size to something rational, or I can add code to test if
	 * the off_t size is larger than a uint32_t, which is more complicated
	 * and a waste of time.)
	 */
	if (size > 100 * 1024)
		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
	len = (uint32_t)size;

	/*
	 * Copy the configuration file into memory, with a little slop, I'm not
	 * interested in debugging off-by-ones.
	 *
	 * The beginning of a file is the same as if we run into an unquoted
	 * newline character, simplify the parsing loop by pretending that's
	 * what we're doing.
	 */
	WT_ERR(__wt_scr_alloc(session, len + 10, &cbuf));
	WT_ERR(
	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
	((uint8_t *)cbuf->mem)[0] = '\n';
	cbuf->size = len + 1;

	/*
	 * The loop below decrements len once for every byte it consumes, and
	 * the buffer now holds the synthetic leading newline in addition to
	 * the file's bytes. Count that extra byte, otherwise the last byte of
	 * the file is never processed and a file without a trailing newline
	 * loses the final character of its last setting.
	 */
	++len;

	/*
	 * Collapse the file's lines into a single string: newline characters
	 * are replaced with commas unless the newline is quoted or backslash
	 * escaped.  Comment lines (an unescaped newline where the next non-
	 * white-space character is a hash), are discarded.
	 *
	 * The collapse is done in place: t (the write position) never passes
	 * p (the read position).
	 */
	for (quoted = 0, p = t = cbuf->mem; len > 0;) {
		/*
		 * Backslash pairs pass through untouched, unless immediately
		 * preceding a newline, in which case both the backslash and
		 * the newline are discarded.  Backslash characters escape
		 * quoted characters, too, that is, a backslash followed by a
		 * quote doesn't start or end a quoted string.
		 */
		if (*p == '\\' && len > 1) {
			if (p[1] != '\n') {
				*t++ = p[0];
				*t++ = p[1];
			}
			p += 2;
			len -= 2;
			continue;
		}

		/*
		 * If we're in a quoted string, or starting a quoted string,
		 * take all characters, including white-space and newlines.
		 */
		if (quoted || *p == '"') {
			if (*p == '"')
				quoted = !quoted;
			*t++ = *p++;
			--len;
			continue;
		}

		/* Everything else gets taken, except for newline characters. */
		if (*p != '\n') {
			*t++ = *p++;
			--len;
			continue;
		}

		/*
		 * Replace any newline characters with commas (and strings of
		 * commas are safe).
		 *
		 * After any newline, skip to a non-white-space character; if
		 * the next character is a hash mark, skip to the next newline.
		 */
		for (;;) {
			for (*t++ = ','; --len > 0 && isspace(*++p);)
				;
			if (len == 0)
				break;
			if (*p != '#')
				break;
			while (--len > 0 && *++p != '\n')
				;
			if (len == 0)
				break;
		}
	}
	*t = '\0';

#if 0
	fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data);
#endif

	/* Check the configuration string. */
	WT_ERR(__wt_config_check(
	    session, __wt_confchk_wiredtiger_open, cbuf->data, 0));

	/*
	 * The configuration file falls between the default configuration and
	 * the wiredtiger_open() configuration, overriding the defaults but not
	 * overriding the wiredtiger_open() configuration.
	 */
	while (cfg[1] != NULL)
		++cfg;
	cfg[1] = cfg[0];
	cfg[0] = cbuf->data;

	*cbufp = cbuf;

	if (0) {
err:		if (cbuf != NULL)
			__wt_buf_free(session, cbuf);
	}
	if (fh != NULL)
		WT_TRET(__wt_close(session, fh));
	return (ret);
}
/*
 * __ckpt_update --
 *	Update a checkpoint.
 *
 *	Writes the checkpoint's extent lists, records the file and checkpoint
 *	sizes for the live system, and packs the checkpoint information into
 *	ckpt->raw. Returns 0 on success, an error code otherwise.
 */
static int
__ckpt_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt,
    WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live)
{
	WT_EXTLIST *live_alloc;
	WT_DECL_ITEM(desc);
	WT_DECL_RET;
	uint8_t *cookie_end;

#ifdef HAVE_DIAGNOSTIC
	/* No pair of the checkpoint's extent lists may overlap. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif

	live_alloc = &block->live.alloc;

	/*
	 * Write the alloc list, then the discard list. The blocks holding a
	 * written extent list must not appear on any extent list themselves,
	 * so after each write they are removed from the live system's
	 * allocation list.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_block_off_remove_overlap(
		    session, live_alloc, ci->alloc.offset, ci->alloc.size));

	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
	if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_block_off_remove_overlap(
		    session, live_alloc, ci->discard.offset, ci->discard.size));

	if (is_live) {
		/*
		 * Only the live system gets an avail list, the avail lists of
		 * other checkpoints are static.
		 *
		 * The avail list goes last so it includes the blocks just
		 * allocated for the alloc and discard lists. It is written as
		 * two lists: the current avail list plus the blocks that
		 * become available once the new checkpoint completes. The
		 * second list can't be merged into the first yet, those
		 * blocks aren't really available until the new checkpoint
		 * locations are saved to the metadata.
		 */
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));
		if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
			WT_RET(__wt_block_off_remove_overlap(
			    session, live_alloc,
			    ci->avail.offset, ci->avail.size));

		/*
		 * Record the file size, for the live system only.
		 *
		 * XXX
		 * The file size is deliberately NOT set when re-writing
		 * checkpoints, so verification can test a checkpoint's blocks
		 * against a reasonable maximum file size. This is bad: picture
		 * a checkpoint early in the file that is re-written and then
		 * needs blocks at the end of the file, past its listed file
		 * size. Opening that checkpoint for writing (discarding later
		 * checkpoints) would truncate the file to the early chunk and
		 * lose the re-written checkpoint information. Updating the
		 * file size instead would work correctly but lose every block
		 * between the original and the re-written checkpoint. There is
		 * currently no API to roll-forward intermediate checkpoints;
		 * if one appears, this needs fixing.
		 */
		WT_RET(__wt_filesize(session, block->fh, &ci->file_size));

		/* Record the checkpoint size, for the live system only. */
		ci->ckpt_size = ckpt_size;
	}

	/* Pack the checkpoint information into the address cookie. */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	cookie_end = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &cookie_end, ci));
	ckpt->raw.size = WT_PTRDIFF32(cookie_end, ckpt->raw.mem);

	/* Optionally log a printable form of the new checkpoint. */
	if (WT_VERBOSE_ISSET(session, ckpt)) {
		WT_RET(__wt_scr_alloc(session, 0, &desc));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, desc));
		WT_VERBOSE_ERR(session, ckpt,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (char *)desc->data);
	}

err:	__wt_scr_free(&desc);
	return (ret);
}
/*
 * __snapshot_update --
 *	Update a snapshot.
 *
 *	Writes the snapshot's extent lists, records the file and snapshot
 *	sizes for the live system, and packs the snapshot information into
 *	snap->raw. Returns 0 on success, an error code otherwise.
 */
static int
__snapshot_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
{
	WT_DECL_ITEM(desc);
	WT_DECL_RET;
	uint8_t *cookie_end;

#ifdef HAVE_DIAGNOSTIC
	/* No pair of the snapshot's extent lists may overlap. */
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
#endif

	/*
	 * Write the extent lists in the order alloc, avail, discard. An avail
	 * list is written only for the live system, the avail lists of other
	 * snapshots are static. The live avail list is written as two lists:
	 * the current avail list plus the blocks being made available as of
	 * the new snapshot. The second list can't be merged into the first
	 * yet, those blocks aren't really available until the new snapshot
	 * location is saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &si->avail, &si->snapshot_avail));
	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));

	if (is_live) {
		/*
		 * Record the file size, for the live system only.
		 *
		 * XXX
		 * The file size is deliberately NOT set when re-writing
		 * snapshots, so verification can test a snapshot's blocks
		 * against a reasonable maximum file size. This is not good:
		 * picture a snapshot early in the file that is re-written and
		 * then needs blocks at the end of the file, past its listed
		 * file size. Opening that snapshot for writing (discarding
		 * later snapshots) would truncate the file to the early chunk
		 * and lose the re-written snapshot information. Updating the
		 * file size instead would work correctly but lose every block
		 * between the original and the re-written snapshot. There is
		 * currently no API to roll-forward intermediate snapshots; if
		 * one appears, this needs fixing.
		 */
		WT_RET(__wt_filesize(session, block->fh, &si->file_size));

		/* Record the snapshot size, for the live system only. */
		si->snapshot_size = snapshot_size;
	}

	/* Pack the snapshot information into the address cookie. */
	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
	cookie_end = snap->raw.mem;
	WT_RET(__wt_block_snapshot_to_buffer(session, block, &cookie_end, si));
	snap->raw.size = WT_PTRDIFF32(cookie_end, snap->raw.mem);

	/* Optionally log a printable form of the new snapshot. */
	if (WT_VERBOSE_ISSET(session, snapshot)) {
		WT_RET(__wt_scr_alloc(session, 0, &desc));
		WT_ERR(__snapshot_string(session, block, snap->raw.data, desc));
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: create-snapshot: %s: %s",
		    block->name, snap->name, (char *)desc->data);
	}

err:	__wt_scr_free(&desc);
	return (ret);
}
WT_ERR(__wt_remove_if_exists(session, tmp->data, false)); /* Open the from and temporary file handles. */ WT_ERR(__wt_open(session, from, WT_FS_OPEN_FILE_TYPE_REGULAR, 0, &ffh)); WT_ERR(__wt_open(session, tmp->data, WT_FS_OPEN_FILE_TYPE_REGULAR, WT_FS_OPEN_CREATE | WT_FS_OPEN_EXCLUSIVE, &tfh)); /* * Allocate a copy buffer. Don't use a scratch buffer, this thing is * big, and we don't want it hanging around. */ #define WT_BACKUP_COPY_SIZE (128 * 1024) WT_ERR(__wt_malloc(session, WT_BACKUP_COPY_SIZE, &buf)); /* Get the file's size, then copy the bytes. */ WT_ERR(__wt_filesize(session, ffh, &size)); for (offset = 0; size > 0; size -= n, offset += n) { n = WT_MIN(size, WT_BACKUP_COPY_SIZE); WT_ERR(__wt_read(session, ffh, offset, (size_t)n, buf)); WT_ERR(__wt_write(session, tfh, offset, (size_t)n, buf)); } /* Close the from handle, then swap the temporary file into place. */ WT_ERR(__wt_close(session, &ffh)); WT_ERR(__wt_fsync(session, tfh, true)); WT_ERR(__wt_close(session, &tfh)); ret = __wt_fs_rename(session, tmp->data, to, true); err: WT_TRET(__wt_close(session, &ffh)); WT_TRET(__wt_close(session, &tfh));
/*
 * __wt_open --
 *	Open a file handle (POSIX implementation).
 *
 *	name is the file to open; ok_create allows the file to be created
 *	(exclusive adds O_EXCL); dio_type is the WT_FILE_TYPE_XXX of the file
 *	and drives the direct I/O, sync and read-ahead settings. On success
 *	*fhp references a handle, either a new one or an already-open one
 *	whose reference count was incremented. Returns 0 on success, an error
 *	code otherwise.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *cur;
	mode_t mode;
	int direct_io, fd, found, oflags;
	const char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	fd = -1;
	direct_io = 0;

	WT_VERBOSE_RET(session, fileops, "%s: open", name);

	/* If the file is already open, just take another reference. */
	found = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(cur, &conn->fhqh, q) {
		if (strcmp(name, cur->name) != 0)
			continue;
		++cur->refcnt;
		*fhp = cur;
		found = 1;
		break;
	}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (found)
		return (0);

	WT_RET(__wt_filename(session, name, &path));

	/* Build the open flags. */
	oflags = O_RDWR;
#ifdef O_BINARY
	/* Windows clones: we always want to treat the file as a binary. */
	oflags |= O_BINARY;
#endif
#ifdef O_CLOEXEC
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 */
	oflags |= O_CLOEXEC;
#endif
#ifdef O_NOATIME
	/* Avoid updating metadata for read-only workloads. */
	if (dio_type == WT_FILE_TYPE_DATA)
		oflags |= O_NOATIME;
#endif

	/* A creation mode is only supplied when creation is allowed. */
	mode = 0;
	if (ok_create) {
		mode = 0666;
		oflags |= O_CREAT;
		if (exclusive)
			oflags |= O_EXCL;
	}

#ifdef O_DIRECT
	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		oflags |= O_DIRECT;
		direct_io = 1;
	}
#endif

	/* Log files may be configured for synchronous writes. */
	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
#ifdef O_DSYNC
		oflags |= O_DSYNC;
#elif defined(O_SYNC)
		oflags |= O_SYNC;
#else
		WT_ERR_MSG(session, ENOTSUP,
		    "Unsupported log sync mode requested");
#endif
	}

	WT_SYSCALL_RETRY(((fd = open(path, oflags, mode)) == -1 ? 1 : 0), ret);
	if (ret != 0)
		WT_ERR_MSG(session, ret,
		    direct_io ?
		    "%s: open failed with direct I/O configured, some "
		    "filesystem types do not support direct I/O" : "%s", path);

#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.  There's an obvious
	 * race here, so we prefer the flag to open if available.
	 */
	if ((oflags = fcntl(fd, F_GETFD)) == -1 ||
	    fcntl(fd, F_SETFD, oflags | FD_CLOEXEC) == -1)
		WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
#endif

#if defined(HAVE_POSIX_FADVISE)
	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA)
		WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
#endif

	if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_ERR(__open_directory_sync(session));

	/* Build the new handle structure. */
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->fd = fd;
	fh->refcnt = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA)
		fh->extend_len = conn->data_extend_len;

	/*
	 * Another thread may have opened the same file in the meantime: check
	 * the list again and only link the new handle if there is still no
	 * match.
	 */
	found = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(cur, &conn->fhqh, q) {
		if (strcmp(name, cur->name) != 0)
			continue;
		++cur->refcnt;
		*fhp = cur;
		found = 1;
		break;
	}
	if (!found) {
		TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);

	/* The new handle was linked: keep it and the descriptor. */
	if (!found)
		goto done;

	/*
	 * Either an existing handle is being used instead, or an error
	 * occurred: discard everything this call acquired.
	 */
err:	if (fh != NULL) {
		__wt_free(session, fh->name);
		__wt_free(session, fh);
	}
	if (fd != -1)
		(void)close(fd);

done:	__wt_free(session, path);
	return (ret);
}