static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
	enum fio_ddir odir = ddir ^ 1;
	struct timeval t;
	long usec;

	assert(ddir_rw(ddir));

	if (td->rate_pending_usleep[ddir] <= 0)
		return ddir;

	/*
	 * We have too much pending sleep in this direction. See if we
	 * should switch.
	 */
	if (td_rw(td) && td->o.rwmix[odir]) {
		/*
		 * Other direction does not have too much pending, switch
		 */
		if (td->rate_pending_usleep[odir] < 100000)
			return odir;

		/*
		 * Both directions have pending sleep. Sleep the minimum time
		 * and deduct from both.
		 */
		if (td->rate_pending_usleep[ddir] <=
			td->rate_pending_usleep[odir]) {
			usec = td->rate_pending_usleep[ddir];
		} else {
			usec = td->rate_pending_usleep[odir];
			ddir = odir;
		}
	} else
		usec = td->rate_pending_usleep[ddir];

	io_u_quiesce(td);

	fio_gettime(&t, NULL);
	usec_sleep(td, usec);
	usec = utime_since_now(&t);

	td->rate_pending_usleep[ddir] -= usec;

	odir = ddir ^ 1;
	if (td_rw(td) && __should_check_rate(td, odir))
		td->rate_pending_usleep[odir] -= usec;

	return ddir;
}
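/*
 * Standalone sketch (not fio code): the bookkeeping rate_ddir() performs
 * for a mixed job when both directions have accumulated pending sleep.
 * The 100 ms switch threshold follows the function above; the plain-long
 * state and demo names are illustrative assumptions.
 */
#include <stdio.h>

enum { DEMO_READ = 0, DEMO_WRITE = 1 };

static int demo_rate_ddir(long pending[2], int ddir, long *slept)
{
	int odir = ddir ^ 1;

	if (pending[ddir] <= 0) {		/* no sleep owed, keep going */
		*slept = 0;
		return ddir;
	}
	if (pending[odir] < 100000) {		/* other side nearly clear: switch */
		*slept = 0;
		return odir;
	}
	/* both owe sleep: sleep the minimum and deduct it from both */
	*slept = pending[ddir] <= pending[odir] ? pending[ddir] : pending[odir];
	ddir = pending[ddir] <= pending[odir] ? ddir : odir;
	pending[DEMO_READ] -= *slept;
	pending[DEMO_WRITE] -= *slept;
	return ddir;
}

int main(void)
{
	long pending[2] = { 250000, 150000 };	/* usec owed per direction */
	long slept;
	int next = demo_rate_ddir(pending, DEMO_READ, &slept);

	printf("next=%d slept=%ldus pending=[%ld, %ld]\n",
	       next, slept, pending[0], pending[1]);
	return 0;
}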
static int verify_io_u_meta(struct verify_header *hdr, struct vcont *vc)
{
	struct thread_data *td = vc->td;
	struct vhdr_meta *vh = hdr_priv(hdr);
	struct io_u *io_u = vc->io_u;
	int ret = EILSEQ;

	dprint(FD_VERIFY, "meta verify io_u %p, len %u\n", io_u, hdr->len);

	if (vh->offset == io_u->offset + vc->hdr_num * td->o.verify_interval)
		ret = 0;

	if (td->o.verify_pattern_bytes)
		ret |= verify_io_u_pattern(hdr, vc);

	/*
	 * For read-only workloads, the program cannot be certain of the
	 * last numberio written to a block. Checking of numberio will be
	 * done only for workloads that write data.
	 * For verify_only, numberio will be checked in the last iteration
	 * when the correct state of numberio, that would have been written
	 * to each block in a previous run of fio, has been reached.
	 */
	if (td_write(td) || td_rw(td))
		if (!td->o.verify_only || td->o.loops == 0)
			if (vh->numberio != io_u->numberio)
				ret = EILSEQ;

	if (!ret)
		return 0;

	vc->name = "meta";
	log_verify_failure(hdr, vc);
	return ret;
}
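/*
 * Standalone sketch (not fio code): the offset check above expects the
 * n-th verify header of a buffer at io_u->offset + n * verify_interval.
 * The 1 MiB starting offset and 4 KiB interval are assumed values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t io_offset = 1 << 20;	/* io_u starts at 1 MiB (assumed) */
	uint32_t interval = 4096;	/* verify_interval (assumed) */

	for (unsigned int hdr_num = 0; hdr_num < 4; hdr_num++)
		printf("hdr %u expected at %llu\n", hdr_num,
		       (unsigned long long)(io_offset + hdr_num * interval));
	return 0;
}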
/*
 * Sets the status of the 'td' in the printed status map.
 */
static void check_str_update(struct thread_data *td)
{
	char c = run_str[td->thread_number - 1];

	switch (td->runstate) {
	case TD_REAPED:
		c = '_';
		break;
	case TD_EXITED:
		c = 'E';
		break;
	case TD_RAMP:
		c = '/';
		break;
	case TD_RUNNING:
		if (td_rw(td)) {
			if (td_random(td))
				c = 'm';
			else
				c = 'M';
		} else if (td_read(td)) {
			if (td_random(td))
				c = 'r';
			else
				c = 'R';
		} else {
			if (td_random(td))
				c = 'w';
			else
				c = 'W';
		}
		break;
	case TD_PRE_READING:
		c = 'p';
		break;
	case TD_VERIFYING:
		c = 'V';
		break;
	case TD_FSYNCING:
		c = 'F';
		break;
	case TD_CREATED:
		c = 'C';
		break;
	case TD_INITIALIZED:
		c = 'I';
		break;
	case TD_NOT_CREATED:
		c = 'P';
		break;
	default:
		log_err("state %d\n", td->runstate);
	}

	run_str[td->thread_number - 1] = c;
}
/*
 * Return the data direction for the next io_u. If the job is a
 * mixed read/write workload, check the rwmix cycle and switch if
 * necessary.
 */
static enum fio_ddir get_rw_ddir(struct thread_data *td)
{
	enum fio_ddir ddir;

	/*
	 * see if it's time to fsync
	 */
	if (td->o.fsync_blocks &&
	    !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
	    td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC;

	/*
	 * see if it's time to fdatasync
	 */
	if (td->o.fdatasync_blocks &&
	    !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
	    td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_DATASYNC;

	/*
	 * see if it's time to sync_file_range
	 */
	if (td->sync_file_range_nr &&
	    !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
	    td->io_issues[DDIR_WRITE] && should_fsync(td))
		return DDIR_SYNC_FILE_RANGE;

	if (td_rw(td)) {
		/*
		 * Check if it's time to seed a new data direction.
		 */
		if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
			/*
			 * Put a top limit on how many bytes we do for
			 * one data direction, to avoid overflowing the
			 * ranges too much
			 */
			ddir = get_rand_ddir(td);

			if (ddir != td->rwmix_ddir)
				set_rwmix_bytes(td);

			td->rwmix_ddir = ddir;
		}
		ddir = td->rwmix_ddir;
	} else if (td_read(td))
		ddir = DDIR_READ;
	else if (td_write(td))
		ddir = DDIR_WRITE;
	else
		ddir = DDIR_TRIM;

	td->rwmix_ddir = rate_ddir(td, ddir);
	return td->rwmix_ddir;
}
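/*
 * Standalone sketch (not fio code): the "every N writes, issue a sync"
 * gating used by get_rw_ddir() above. fsync_blocks = 3 is an assumed
 * value; the predicate mirrors the one in the function.
 */
#include <stdio.h>

int main(void)
{
	unsigned int fsync_blocks = 3, write_issues = 0;

	for (int i = 1; i <= 10; i++) {
		write_issues++;
		/* nonzero option, counter divisible by it, and at least
		 * one write already issued */
		if (fsync_blocks && !(write_issues % fsync_blocks) &&
		    write_issues)
			printf("after write %u: DDIR_SYNC\n", write_issues);
	}
	return 0;
}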
static int fio_mmapio_open(struct thread_data *td, struct fio_file *f)
{
	int ret, flags;

	ret = generic_open_file(td, f);
	if (ret)
		return ret;

	/*
	 * for size checkup, don't mmap anything.
	 */
	if (!f->io_size)
		return 0;

	if (td_rw(td))
		flags = PROT_READ | PROT_WRITE;
	else if (td_write(td)) {
		flags = PROT_WRITE;

		if (td->o.verify != VERIFY_NONE)
			flags |= PROT_READ;
	} else
		flags = PROT_READ;

	f->mmap = mmap(NULL, f->io_size, flags, MAP_SHARED, f->fd,
			f->file_offset);
	if (f->mmap == MAP_FAILED) {
		f->mmap = NULL;
		td_verror(td, errno, "mmap");
		goto err;
	}

	if (file_invalidate_cache(td, f))
		goto err;

	if (!td_random(td)) {
		if (madvise(f->mmap, f->io_size, MADV_SEQUENTIAL) < 0) {
			td_verror(td, errno, "madvise");
			goto err;
		}
	} else {
		if (madvise(f->mmap, f->io_size, MADV_RANDOM) < 0) {
			td_verror(td, errno, "madvise");
			goto err;
		}
	}

	return 0;

err:
	td->io_ops->close_file(td, f);
	return 1;
}
static int fio_mmap_file(struct thread_data *td, struct fio_file *f,
			 size_t length, off_t off)
{
	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
	int flags = 0;

	if (td_rw(td) && !td->o.verify_only)
		flags = PROT_READ | PROT_WRITE;
	else if (td_write(td) && !td->o.verify_only) {
		flags = PROT_WRITE;

		if (td->o.verify != VERIFY_NONE)
			flags |= PROT_READ;
	} else
		flags = PROT_READ;

	fmd->mmap_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off);
	if (fmd->mmap_ptr == MAP_FAILED) {
		fmd->mmap_ptr = NULL;
		td_verror(td, errno, "mmap");
		goto err;
	}

	if (!fio_madvise_file(td, f, length))
		goto err;

	if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_DONTNEED) < 0) {
		td_verror(td, errno, "madvise");
		goto err;
	}

#ifdef FIO_MADV_FREE
	if (f->filetype == FIO_TYPE_BLOCK)
		(void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE);
#endif

err:
	if (td->error && fmd->mmap_ptr)
		munmap(fmd->mmap_ptr, length);

	return td->error;
}
/*
 * This is the mmap execution function
 */
static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
			    size_t length, off_t off)
{
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int flags = 0;
	void *addr = NULL;

	dprint(FD_IO, "DEBUG fio_libpmem_file\n");

	if (td_rw(td))
		flags = PROT_READ | PROT_WRITE;
	else if (td_write(td)) {
		flags = PROT_WRITE;

		if (td->o.verify != VERIFY_NONE)
			flags |= PROT_READ;
	} else
		flags = PROT_READ;

	dprint(FD_IO, "f->file_name = %s td->o.verify = %d\n", f->file_name,
			td->o.verify);
	dprint(FD_IO, "length = %ld flags = %d f->fd = %d off = %ld\n",
			length, flags, f->fd, off);

	addr = util_map_hint(length, 0);

	fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
	if (fdd->libpmem_ptr == MAP_FAILED) {
		fdd->libpmem_ptr = NULL;
		td_verror(td, errno, "mmap");
	}

	if (td->error && fdd->libpmem_ptr)
		munmap(fdd->libpmem_ptr, length);

	return td->error;
}
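/*
 * Standalone sketch (not fio code): the mmap protection-flag selection
 * shared by the three engines above. A write-only job still needs
 * PROT_READ when verification is enabled, because verify reads the
 * mapping back. The plain booleans stand in for td_rw()/td_write() and
 * the verify option.
 */
#include <stdio.h>
#include <sys/mman.h>

static int mmap_prot(int is_rw, int is_write, int verify_enabled)
{
	if (is_rw)
		return PROT_READ | PROT_WRITE;
	if (is_write)
		return PROT_WRITE | (verify_enabled ? PROT_READ : 0);
	return PROT_READ;
}

int main(void)
{
	printf("write+verify -> %d (PROT_READ|PROT_WRITE = %d)\n",
	       mmap_prot(0, 1, 1), PROT_READ | PROT_WRITE);
	return 0;
}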
void rate_throttle(struct thread_data *td, unsigned long time_spent,
		   unsigned int bytes)
{
	unsigned long usec_cycle;
	unsigned int bs;

	if (!td->o.rate && !td->o.rate_iops)
		return;

	if (td_rw(td))
		bs = td->o.rw_min_bs;
	else if (td_read(td))
		bs = td->o.min_bs[DDIR_READ];
	else
		bs = td->o.min_bs[DDIR_WRITE];

	usec_cycle = td->rate_usec_cycle * (bytes / bs);

	if (time_spent < usec_cycle) {
		unsigned long s = usec_cycle - time_spent;

		td->rate_pending_usleep += s;

		if (td->rate_pending_usleep >= 100000) {
			struct timeval t;

			fio_gettime(&t, NULL);
			usec_sleep(td, td->rate_pending_usleep);
			td->rate_pending_usleep -= utime_since_now(&t);
		}
	} else {
		long overtime = time_spent - usec_cycle;

		td->rate_pending_usleep -= overtime;
	}
}
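/*
 * Standalone sketch (not fio code): the throttling arithmetic above with
 * assumed numbers. At 4 KiB blocks and a 100 usec budget per block, an IO
 * that finished early owes the difference as pending sleep; one that ran
 * long pays pending sleep down.
 */
#include <stdio.h>

int main(void)
{
	unsigned long rate_usec_cycle = 100;	/* usec budgeted per block */
	unsigned int bs = 4096, bytes = 8192;	/* one 8 KiB completion */
	long pending = 0;

	unsigned long usec_cycle = rate_usec_cycle * (bytes / bs);	/* 200 */
	unsigned long time_spent = 120;		/* finished 80 usec early */

	if (time_spent < usec_cycle)
		pending += usec_cycle - time_spent;
	else
		pending -= time_spent - usec_cycle;

	printf("pending sleep: %ld usec (slept once it reaches 100000)\n",
	       pending);
	return 0;
}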
/*
 * Best effort calculation of the estimated pending runtime of a job.
 */
static int thread_eta(struct thread_data *td)
{
	unsigned long long bytes_total, bytes_done;
	unsigned long eta_sec = 0;
	unsigned long elapsed;
	uint64_t timeout;

	elapsed = (mtime_since_now(&td->epoch) + 999) / 1000;
	timeout = td->o.timeout / 1000000UL;

	bytes_total = td->total_io_size;

	if (td->o.fill_device && td->o.size == -1ULL) {
		if (!td->fill_device_size || td->fill_device_size == -1ULL)
			return 0;

		bytes_total = td->fill_device_size;
	}

	if (td->o.zone_size && td->o.zone_skip && bytes_total) {
		unsigned int nr_zones;
		uint64_t zone_bytes;

		zone_bytes = bytes_total + td->o.zone_size + td->o.zone_skip;
		nr_zones = (zone_bytes - 1) / (td->o.zone_size + td->o.zone_skip);
		bytes_total -= nr_zones * td->o.zone_skip;
	}

	/*
	 * if writing and verifying afterwards, bytes_total will be twice the
	 * size. In a mixed workload, verify phase will be the size of the
	 * first stage writes.
	 */
	if (td->o.do_verify && td->o.verify && td_write(td)) {
		if (td_rw(td)) {
			unsigned int perc = 50;

			if (td->o.rwmix[DDIR_WRITE])
				perc = td->o.rwmix[DDIR_WRITE];

			bytes_total += (bytes_total * perc) / 100;
		} else
			bytes_total <<= 1;
	}

	if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) {
		double perc, perc_t;

		bytes_done = ddir_rw_sum(td->io_bytes);

		if (bytes_total) {
			perc = (double) bytes_done / (double) bytes_total;
			if (perc > 1.0)
				perc = 1.0;
		} else
			perc = 0.0;

		if (td->o.time_based) {
			if (timeout) {
				perc_t = (double) elapsed / (double) timeout;
				if (perc_t < perc)
					perc = perc_t;
			} else {
				/*
				 * Will never hit, we can't have time_based
				 * without a timeout set.
				 */
				perc = 0.0;
			}
		}

		eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed;

		if (td->o.timeout &&
		    eta_sec > (timeout + done_secs - elapsed))
			eta_sec = timeout + done_secs - elapsed;
	} else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED
			|| td->runstate == TD_INITIALIZED
			|| td->runstate == TD_SETTING_UP
			|| td->runstate == TD_RAMP
			|| td->runstate == TD_PRE_READING) {
		int t_eta = 0, r_eta = 0;
		unsigned long long rate_bytes;

		/*
		 * We can only guess - assume it'll run the full timeout
		 * if given, otherwise assume it'll run at the specified rate.
		 */
		if (td->o.timeout) {
			uint64_t timeout = td->o.timeout;
			uint64_t start_delay = td->o.start_delay;
			uint64_t ramp_time = td->o.ramp_time;

			t_eta = timeout + start_delay + ramp_time;
			t_eta /= 1000000ULL;

			if (in_ramp_time(td)) {
				unsigned long ramp_left;

				ramp_left = mtime_since_now(&td->epoch);
				ramp_left = (ramp_left + 999) / 1000;
				if (ramp_left <= t_eta)
					t_eta -= ramp_left;
			}
		}

		rate_bytes = ddir_rw_sum(td->o.rate);
		if (rate_bytes) {
			r_eta = (bytes_total / 1024) / rate_bytes;
			r_eta += (td->o.start_delay / 1000000ULL);
		}

		if (r_eta && t_eta)
			eta_sec = min(r_eta, t_eta);
		else if (r_eta)
			eta_sec = r_eta;
		else if (t_eta)
			eta_sec = t_eta;
		else
			eta_sec = 0;
	} else {
		/*
		 * thread is already done or waiting for fsync
		 */
		eta_sec = 0;
	}

	return eta_sec;
}
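/*
 * Standalone sketch (not fio code): the core ETA extrapolation above.
 * With perc = bytes_done / bytes_total, the remaining time is
 * elapsed / perc - elapsed. Numbers are assumed.
 */
#include <stdio.h>

int main(void)
{
	double bytes_done = 256.0, bytes_total = 1024.0;	/* 25% done */
	unsigned long elapsed = 30;				/* seconds */

	double perc = bytes_done / bytes_total;
	if (perc > 1.0)
		perc = 1.0;

	unsigned long eta_sec = (unsigned long)(elapsed * (1.0 / perc)) - elapsed;
	printf("eta: %lu sec\n", eta_sec);			/* prints 90 */
	return 0;
}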
/*
 * Best effort calculation of the estimated pending runtime of a job.
 */
static int thread_eta(struct thread_data *td)
{
	unsigned long long bytes_total, bytes_done;
	unsigned long eta_sec = 0;
	unsigned long elapsed;

	elapsed = (mtime_since_now(&td->epoch) + 999) / 1000;

	bytes_total = td->total_io_size;

	/*
	 * if writing, bytes_total will be twice the size. If mixing,
	 * assume a 50/50 split and thus bytes_total will be 50% larger.
	 */
	if (td->o.do_verify && td->o.verify && td_write(td)) {
		if (td_rw(td))
			bytes_total = bytes_total * 3 / 2;
		else
			bytes_total <<= 1;
	}

	if (td->o.zone_size && td->o.zone_skip)
		bytes_total /= (td->o.zone_skip / td->o.zone_size);

	if (td->o.fill_device && td->o.size == -1ULL)
		return 0;

	if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) {
		double perc, perc_t;

		bytes_done = td->io_bytes[DDIR_READ] + td->io_bytes[DDIR_WRITE];
		perc = (double) bytes_done / (double) bytes_total;
		if (perc > 1.0)
			perc = 1.0;

		if (td->o.time_based) {
			perc_t = (double) elapsed / (double) td->o.timeout;
			if (perc_t < perc)
				perc = perc_t;
		}

		eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed;

		if (td->o.timeout &&
		    eta_sec > (td->o.timeout + done_secs - elapsed))
			eta_sec = td->o.timeout + done_secs - elapsed;
	} else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED
			|| td->runstate == TD_INITIALIZED
			|| td->runstate == TD_RAMP
			|| td->runstate == TD_PRE_READING) {
		int t_eta = 0, r_eta = 0;

		/*
		 * We can only guess - assume it'll run the full timeout
		 * if given, otherwise assume it'll run at the specified rate.
		 */
		if (td->o.timeout) {
			t_eta = td->o.timeout + td->o.start_delay;

			if (in_ramp_time(td)) {
				unsigned long ramp_left;

				ramp_left = mtime_since_now(&td->start);
				ramp_left = (ramp_left + 999) / 1000;
				if (ramp_left <= t_eta)
					t_eta -= ramp_left;
			}
		}
		if (td->o.rate[0] || td->o.rate[1]) {
			r_eta = (bytes_total / 1024) /
					(td->o.rate[0] + td->o.rate[1]);
			r_eta += td->o.start_delay;
		}

		if (r_eta && t_eta)
			eta_sec = min(r_eta, t_eta);
		else if (r_eta)
			eta_sec = r_eta;
		else if (t_eta)
			eta_sec = t_eta;
		else
			eta_sec = 0;
	} else {
		/*
		 * thread is already done or waiting for fsync
		 */
		eta_sec = 0;
	}

	return eta_sec;
}
static int verify_header(struct io_u *io_u, struct thread_data *td,
			 struct verify_header *hdr, unsigned int hdr_num,
			 unsigned int hdr_len)
{
	void *p = hdr;
	uint32_t crc;

	if (hdr->magic != FIO_HDR_MAGIC) {
		log_err("verify: bad magic header %x, wanted %x",
			hdr->magic, FIO_HDR_MAGIC);
		goto err;
	}
	if (hdr->len != hdr_len) {
		log_err("verify: bad header length %u, wanted %u",
			hdr->len, hdr_len);
		goto err;
	}
	if (hdr->rand_seed != io_u->rand_seed) {
		log_err("verify: bad header rand_seed %"PRIu64
			", wanted %"PRIu64,
			hdr->rand_seed, io_u->rand_seed);
		goto err;
	}
	if (hdr->offset != io_u->offset + hdr_num * td->o.verify_interval) {
		log_err("verify: bad header offset %"PRIu64
			", wanted %llu",
			hdr->offset, io_u->offset);
		goto err;
	}

	/*
	 * For read-only workloads, the program cannot be certain of the
	 * last numberio written to a block. Checking of numberio will be
	 * done only for workloads that write data. For verify_only,
	 * numberio will be checked in the last iteration when the correct
	 * state of numberio, that would have been written to each block
	 * in a previous run of fio, has been reached.
	 */
	if ((td_write(td) || td_rw(td)) && (td_min_bs(td) == td_max_bs(td)) &&
	    !td->o.time_based)
		if (!td->o.verify_only || td->o.loops == 0)
			if (hdr->numberio != io_u->numberio) {
				log_err("verify: bad header numberio %"PRIu16
					", wanted %"PRIu16,
					hdr->numberio, io_u->numberio);
				goto err;
			}

	crc = fio_crc32c(p, offsetof(struct verify_header, crc32));
	if (crc != hdr->crc32) {
		log_err("verify: bad header crc %x, calculated %x",
			hdr->crc32, crc);
		goto err;
	}
	return 0;

err:
	log_err(" at file %s offset %llu, length %u\n",
		io_u->file->file_name,
		io_u->offset + hdr_num * hdr_len, hdr_len);

	if (td->o.verify_dump)
		dump_buf(p, hdr_len, io_u->offset + hdr_num * hdr_len,
				"hdr_fail", io_u->file);

	return EILSEQ;
}
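/*
 * Standalone sketch (not fio code): checksumming a header over every byte
 * up to, but not including, its own crc field, as verify_header() does
 * with offsetof(). The struct layout and the bitwise CRC-32C below are
 * illustrative assumptions; fio's fio_crc32c() is its own implementation.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_header {
	uint16_t magic;
	uint16_t len;
	uint64_t offset;
	uint32_t crc32;		/* covers everything above this field */
};

static uint32_t crc32c_sw(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~0u;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return ~crc;
}

int main(void)
{
	struct demo_header hdr;

	memset(&hdr, 0, sizeof(hdr));	/* zero padding bytes too */
	hdr.magic = 0xacca;
	hdr.len = sizeof(hdr);
	hdr.offset = 4096;
	hdr.crc32 = crc32c_sw(&hdr, offsetof(struct demo_header, crc32));

	printf("header crc: %x\n", hdr.crc32);
	return 0;
}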
static int fio_rdmaio_init(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdmaio_options *o = td->eo;
	unsigned int max_bs;
	int ret, i;

	if (td_rw(td)) {
		log_err("fio: rdma connections must be read OR write\n");
		return 1;
	}
	if (td_random(td)) {
		log_err("fio: RDMA network IO can't be random\n");
		return 1;
	}
	if (compat_options(td))
		return 1;

	if (!o->port) {
		log_err("fio: no port has been specified which is required "
			"for the rdma engine\n");
		return 1;
	}

	if (check_set_rlimits(td))
		return 1;

	rd->rdma_protocol = o->verb;
	rd->cq_event_num = 0;

	rd->cm_channel = rdma_create_event_channel();
	if (!rd->cm_channel) {
		log_err("fio: rdma_create_event_channel fail\n");
		return 1;
	}

	ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP);
	if (ret) {
		log_err("fio: rdma_create_id fail\n");
		return 1;
	}

	if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
	    (rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
		rd->rmt_us =
			malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
		memset(rd->rmt_us, 0,
			FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
		rd->rmt_nr = 0;
	}

	rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_queued_nr = 0;

	rd->io_us_flight = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_flight, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_flight_nr = 0;

	rd->io_us_completed = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_completed_nr = 0;

	if (td_read(td)) {
		/* READ as the server */
		rd->is_client = 0;
		td->flags |= TD_F_NO_PROGRESS;
		/* server rd->rdma_buf_len will be setup after got request */
		ret = fio_rdmaio_setup_listen(td, o->port);
	} else {
		/* WRITE as the client */
		rd->is_client = 1;
		ret = fio_rdmaio_setup_connect(td, td->o.filename, o->port);
	}

	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
	rd->send_buf.max_bs = htonl(max_bs);

	/* register each io_u in the free list */
	for (i = 0; i < td->io_u_freelist.nr; i++) {
		struct io_u *io_u = td->io_u_freelist.io_us[i];

		io_u->engine_data = malloc(sizeof(struct rdma_io_u_data));
		memset(io_u->engine_data, 0, sizeof(struct rdma_io_u_data));
		((struct rdma_io_u_data *)io_u->engine_data)->wr_id = i;

		io_u->mr = ibv_reg_mr(rd->pd, io_u->buf, max_bs,
				      IBV_ACCESS_LOCAL_WRITE |
				      IBV_ACCESS_REMOTE_READ |
				      IBV_ACCESS_REMOTE_WRITE);
		if (io_u->mr == NULL) {
			log_err("fio: ibv_reg_mr io_u failed\n");
			return 1;
		}

		rd->send_buf.rmt_us[i].buf =
			htonll((uint64_t) (unsigned long)io_u->buf);
		rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
		rd->send_buf.rmt_us[i].size = htonl(max_bs);

#if 0
		log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n",
			 io_u->mr->rkey, io_u->buf, max_bs);
#endif
	}

	rd->send_buf.nr = htonl(i);

	return ret;
}
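/*
 * Standalone sketch (not fio code): a 64-bit host-to-network conversion
 * like the htonll() used above to publish buffer addresses. htonll is not
 * in POSIX, so fio carries its own; this portable version built from
 * htonl() is an assumed equivalent.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t demo_htonll(uint64_t v)
{
	if (htonl(1) == 1)	/* big-endian host: already network order */
		return v;
	return ((uint64_t)htonl(v & 0xffffffffu) << 32) | htonl(v >> 32);
}

int main(void)
{
	uint64_t addr = 0x0102030405060708ULL;

	printf("%016llx -> %016llx\n",
	       (unsigned long long)addr,
	       (unsigned long long)demo_htonll(addr));
	return 0;
}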
static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
	enum fio_ddir odir = ddir ^ 1;
	struct timeval t;
	long usec;

	assert(ddir_rw(ddir));

	if (td->rate_pending_usleep[ddir] <= 0)
		return ddir;

	/*
	 * We have too much pending sleep in this direction. See if we
	 * should switch.
	 */
	if (td_rw(td)) {
		/*
		 * Other direction does not have too much pending, switch
		 */
		if (td->rate_pending_usleep[odir] < 100000)
			return odir;

		/*
		 * Both directions have pending sleep. Sleep the minimum time
		 * and deduct from both.
		 */
		if (td->rate_pending_usleep[ddir] <=
			td->rate_pending_usleep[odir]) {
			usec = td->rate_pending_usleep[ddir];
		} else {
			usec = td->rate_pending_usleep[odir];
			ddir = odir;
		}
	} else
		usec = td->rate_pending_usleep[ddir];

	/*
	 * We are going to sleep, ensure that we flush anything pending as
	 * not to skew our latency numbers.
	 *
	 * Changed to only monitor 'in flight' requests here instead of the
	 * td->cur_depth, b/c td->cur_depth does not accurately represent
	 * io's that have been actually submitted to an async engine,
	 * and cur_depth is meaningless for sync engines.
	 */
	if (td->io_u_in_flight) {
		int fio_unused ret;

		ret = io_u_queued_complete(td, td->io_u_in_flight, NULL);
	}

	fio_gettime(&t, NULL);
	usec_sleep(td, usec);
	usec = utime_since_now(&t);

	td->rate_pending_usleep[ddir] -= usec;

	odir = ddir ^ 1;
	if (td_rw(td) && __should_check_rate(td, odir))
		td->rate_pending_usleep[odir] -= usec;

	return ddir;
}
int generic_open_file(struct thread_data *td, struct fio_file *f)
{
	int is_std = 0;
	int flags = 0;
	int from_hash = 0;

	dprint(FD_FILE, "fd open %s\n", f->file_name);

	if (td_trim(td) && f->filetype != FIO_TYPE_BD) {
		log_err("fio: trim only applies to block device\n");
		return 1;
	}

	if (!strcmp(f->file_name, "-")) {
		if (td_rw(td)) {
			log_err("fio: can't read/write to stdin/out\n");
			return 1;
		}
		is_std = 1;

		/*
		 * move output logging to stderr, if we are writing to stdout
		 */
		if (td_write(td))
			f_out = stderr;
	}

	if (td_trim(td))
		goto skip_flags;
	if (td->o.odirect)
		flags |= OS_O_DIRECT;
	if (td->o.sync_io)
		flags |= O_SYNC;
	if (td->o.create_on_open)
		flags |= O_CREAT;
skip_flags:
	if (f->filetype != FIO_TYPE_FILE)
		flags |= FIO_O_NOATIME;

open_again:
	if (td_write(td)) {
		if (!read_only)
			flags |= O_RDWR;

		if (f->filetype == FIO_TYPE_FILE)
			flags |= O_CREAT;

		if (is_std)
			f->fd = dup(STDOUT_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	} else if (td_read(td)) {
		if (f->filetype == FIO_TYPE_CHAR && !read_only)
			flags |= O_RDWR;
		else
			flags |= O_RDONLY;

		if (is_std)
			f->fd = dup(STDIN_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	} else { /* td trim */
		flags |= O_RDWR;
		from_hash = file_lookup_open(f, flags);
	}

	if (f->fd == -1) {
		char buf[FIO_VERROR_SIZE];
		int __e = errno;

		if (__e == EPERM && (flags & FIO_O_NOATIME)) {
			flags &= ~FIO_O_NOATIME;
			goto open_again;
		}
		if (__e == EMFILE && file_close_shadow_fds(td))
			goto open_again;

		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);

		if (__e == EINVAL && (flags & OS_O_DIRECT)) {
			log_err("fio: looks like your file system does not "
				"support direct=1/buffered=0\n");
		}

		td_verror(td, __e, buf);
	}

	if (!from_hash && f->fd != -1) {
		if (add_file_hash(f)) {
			int fio_unused ret;

			/*
			 * Stash away descriptor for later close. This is to
			 * work-around a "feature" on Linux, where a close of
			 * an fd that has been opened for write will trigger
			 * udev to call blkid to check partitions, fs id, etc.
			 * That pollutes the device cache, which can slow down
			 * unbuffered accesses.
			 */
			if (f->shadow_fd == -1)
				f->shadow_fd = f->fd;
			else {
				/*
				 * OK to ignore, we haven't done anything
				 * with it
				 */
				ret = generic_close_file(td, f);
			}
			goto open_again;
		}
	}

	return 0;
}
int generic_open_file(struct thread_data *td, struct fio_file *f)
{
	int is_std = 0;
	int flags = 0;
	int from_hash = 0;

	dprint(FD_FILE, "fd open %s\n", f->file_name);

	if (!strcmp(f->file_name, "-")) {
		if (td_rw(td)) {
			log_err("fio: can't read/write to stdin/out\n");
			return 1;
		}
		is_std = 1;

		/*
		 * move output logging to stderr, if we are writing to stdout
		 */
		if (td_write(td))
			f_out = stderr;
	}

	if (td->o.odirect)
		flags |= OS_O_DIRECT;
	if (td->o.sync_io)
		flags |= O_SYNC;
	if (f->filetype != FIO_TYPE_FILE)
		flags |= FIO_O_NOATIME;
	if (td->o.create_on_open)
		flags |= O_CREAT;

open_again:
	if (td_write(td)) {
		if (!read_only)
			flags |= O_RDWR;

		if (f->filetype == FIO_TYPE_FILE)
			flags |= O_CREAT;

		if (is_std)
			f->fd = dup(STDOUT_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	} else {
		if (f->filetype == FIO_TYPE_CHAR && !read_only)
			flags |= O_RDWR;
		else
			flags |= O_RDONLY;

		if (is_std)
			f->fd = dup(STDIN_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	}

	if (f->fd == -1) {
		char buf[FIO_VERROR_SIZE];
		int __e = errno;

		if (__e == EPERM && (flags & FIO_O_NOATIME)) {
			flags &= ~FIO_O_NOATIME;
			goto open_again;
		}

		snprintf(buf, sizeof(buf) - 1, "open(%s)", f->file_name);

		td_verror(td, __e, buf);
	}

	if (!from_hash && f->fd != -1) {
		if (add_file_hash(f)) {
			int fio_unused ret;

			/*
			 * OK to ignore, we haven't done anything with it
			 */
			ret = generic_close_file(td, f);
			goto open_again;
		}
	}

	return 0;
}
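/*
 * Standalone sketch (not fio code): the open-flag selection both versions
 * of generic_open_file() share. read_only and the direction are plain
 * booleans here, and the fio wrappers (OS_O_DIRECT, FIO_O_NOATIME) are
 * assumed away.
 */
#include <fcntl.h>
#include <stdio.h>

static int open_flags(int is_write, int read_only, int is_regular_file)
{
	int flags = 0;

	if (is_write) {
		if (!read_only)
			flags |= O_RDWR;
		if (is_regular_file)
			flags |= O_CREAT;	/* create files on first write */
	} else
		flags |= O_RDONLY;

	return flags;
}

int main(void)
{
	printf("write job, regular file: %#x\n", open_flags(1, 0, 1));
	printf("read job: %#x\n", open_flags(0, 0, 1));
	return 0;
}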
/*
 * Sets the status of the 'td' in the printed status map.
 */
static void check_str_update(struct thread_data *td)
{
	char c = __run_str[td->thread_number - 1];

	switch (td->runstate) {
	case TD_REAPED:
		if (td->error)
			c = 'X';
		else if (td->sig)
			c = 'K';
		else
			c = '_';
		break;
	case TD_EXITED:
		c = 'E';
		break;
	case TD_RAMP:
		c = '/';
		break;
	case TD_RUNNING:
		if (td_rw(td)) {
			if (td_random(td)) {
				if (td->o.rwmix[DDIR_READ] == 100)
					c = 'r';
				else if (td->o.rwmix[DDIR_WRITE] == 100)
					c = 'w';
				else
					c = 'm';
			} else {
				if (td->o.rwmix[DDIR_READ] == 100)
					c = 'R';
				else if (td->o.rwmix[DDIR_WRITE] == 100)
					c = 'W';
				else
					c = 'M';
			}
		} else if (td_read(td)) {
			if (td_random(td))
				c = 'r';
			else
				c = 'R';
		} else if (td_write(td)) {
			if (td_random(td))
				c = 'w';
			else
				c = 'W';
		} else {
			if (td_random(td))
				c = 'd';
			else
				c = 'D';
		}
		break;
	case TD_PRE_READING:
		c = 'p';
		break;
	case TD_VERIFYING:
		c = 'V';
		break;
	case TD_FSYNCING:
		c = 'F';
		break;
	case TD_FINISHING:
		c = 'f';
		break;
	case TD_CREATED:
		c = 'C';
		break;
	case TD_INITIALIZED:
	case TD_SETTING_UP:
		c = 'I';
		break;
	case TD_NOT_CREATED:
		c = 'P';
		break;
	default:
		log_err("state %d\n", td->runstate);
	}

	__run_str[td->thread_number - 1] = c;
	update_condensed_str(__run_str, run_str);
}
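/*
 * Status-map legend, collected from the two check_str_update() versions
 * above (lowercase means random, uppercase sequential):
 *
 *   P  setup, not yet started        C  created
 *   I  initialized / setting up      p  pre-reading
 *   /  ramp period                   r/R, w/W, m/M  read, write, mixed
 *   d/D  trim                        V  verifying
 *   F  fsyncing                      f  finishing
 *   E  exited                        _  reaped
 *   X  exited with error             K  killed by a signal
 */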
/*
 * Lazy way of fixing up options that depend on each other. We could also
 * define option callback handlers, but this is easier.
 */
static int fixup_options(struct thread_data *td)
{
	struct thread_options *o = &td->o;
	int ret = 0;

#ifndef FIO_HAVE_PSHARED_MUTEX
	if (!o->use_thread) {
		log_info("fio: this platform does not support process shared"
			 " mutexes, forcing use of threads. Use the 'thread'"
			 " option to get rid of this warning.\n");
		o->use_thread = 1;
		ret = warnings_fatal;
	}
#endif

	if (o->write_iolog_file && o->read_iolog_file) {
		log_err("fio: read iolog overrides write_iolog\n");
		free(o->write_iolog_file);
		o->write_iolog_file = NULL;
		ret = warnings_fatal;
	}

	/*
	 * only really works with 1 file
	 */
	if (o->zone_size && o->open_files > 1)
		o->zone_size = 0;

	/*
	 * If zone_range isn't specified, backward compatibility dictates it
	 * should be made equal to zone_size.
	 */
	if (o->zone_size && !o->zone_range)
		o->zone_range = o->zone_size;

	/*
	 * Reads can do overwrites, we always need to pre-create the file
	 */
	if (td_read(td) || td_rw(td))
		o->overwrite = 1;

	if (!o->min_bs[DDIR_READ])
		o->min_bs[DDIR_READ] = o->bs[DDIR_READ];
	if (!o->max_bs[DDIR_READ])
		o->max_bs[DDIR_READ] = o->bs[DDIR_READ];
	if (!o->min_bs[DDIR_WRITE])
		o->min_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];
	if (!o->max_bs[DDIR_WRITE])
		o->max_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];

	o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]);

	/*
	 * For random IO, allow blockalign offset other than min_bs.
	 */
	if (!o->ba[DDIR_READ] || !td_random(td))
		o->ba[DDIR_READ] = o->min_bs[DDIR_READ];
	if (!o->ba[DDIR_WRITE] || !td_random(td))
		o->ba[DDIR_WRITE] = o->min_bs[DDIR_WRITE];

	if ((o->ba[DDIR_READ] != o->min_bs[DDIR_READ] ||
	     o->ba[DDIR_WRITE] != o->min_bs[DDIR_WRITE]) &&
	    !o->norandommap) {
		log_err("fio: Any use of blockalign= turns off randommap\n");
		o->norandommap = 1;
		ret = warnings_fatal;
	}

	if (!o->file_size_high)
		o->file_size_high = o->file_size_low;

	if (o->norandommap && o->verify != VERIFY_NONE &&
	    !fixed_block_size(o)) {
		log_err("fio: norandommap given for variable block sizes, "
			"verify disabled\n");
		o->verify = VERIFY_NONE;
		ret = warnings_fatal;
	}

	if (o->bs_unaligned && (o->odirect || td->io_ops->flags & FIO_RAWIO))
		log_err("fio: bs_unaligned may not work with raw io\n");

	/*
	 * thinktime_spin must be less than thinktime
	 */
	if (o->thinktime_spin > o->thinktime)
		o->thinktime_spin = o->thinktime;

	/*
	 * The low water mark cannot be bigger than the iodepth
	 */
	if (o->iodepth_low > o->iodepth || !o->iodepth_low) {
		/*
		 * syslet work around - if the workload is sequential,
		 * we want to let the queue drain all the way down to
		 * avoid seeking between async threads
		 */
		if (!strcmp(td->io_ops->name, "syslet-rw") && !td_random(td))
			o->iodepth_low = 1;
		else
			o->iodepth_low = o->iodepth;
	}

	/*
	 * If batch number isn't set, default to the same as iodepth
	 */
	if (o->iodepth_batch > o->iodepth || !o->iodepth_batch)
		o->iodepth_batch = o->iodepth;

	if (o->nr_files > td->files_index)
		o->nr_files = td->files_index;

	if (o->open_files > o->nr_files || !o->open_files)
		o->open_files = o->nr_files;

	if (((o->rate[0] + o->rate[1]) &&
	     (o->rate_iops[0] + o->rate_iops[1])) ||
	    ((o->ratemin[0] + o->ratemin[1]) &&
	     (o->rate_iops_min[0] + o->rate_iops_min[1]))) {
		log_err("fio: rate and rate_iops are mutually exclusive\n");
		ret = 1;
	}
	if ((o->rate[0] < o->ratemin[0]) || (o->rate[1] < o->ratemin[1]) ||
	    (o->rate_iops[0] < o->rate_iops_min[0]) ||
	    (o->rate_iops[1] < o->rate_iops_min[1])) {
		log_err("fio: minimum rate exceeds rate\n");
		ret = 1;
	}

	if (!o->timeout && o->time_based) {
		log_err("fio: time_based requires a runtime/timeout setting\n");
		o->time_based = 0;
		ret = warnings_fatal;
	}

	if (o->fill_device && !o->size)
		o->size = -1ULL;

	if (o->verify != VERIFY_NONE) {
		if (td_write(td) && o->do_verify && o->numjobs > 1) {
			log_info("Multiple writers may overwrite blocks that "
				"belong to other jobs. This can cause "
				"verification failures.\n");
			ret = warnings_fatal;
		}

		o->refill_buffers = 1;
		if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] &&
		    !o->verify_interval)
			o->verify_interval = o->min_bs[DDIR_WRITE];
	}

	if (o->pre_read) {
		o->invalidate_cache = 0;
		if (td->io_ops->flags & FIO_PIPEIO) {
			log_info("fio: cannot pre-read files with an IO engine"
				 " that isn't seekable. Pre-read disabled.\n");
			ret = warnings_fatal;
		}
	}

#ifndef FIO_HAVE_FDATASYNC
	if (o->fdatasync_blocks) {
		log_info("fio: this platform does not support fdatasync()"
			 " falling back to using fsync(). Use the 'fsync'"
			 " option instead of 'fdatasync' to get rid of"
			 " this warning\n");
		o->fsync_blocks = o->fdatasync_blocks;
		o->fdatasync_blocks = 0;
		ret = warnings_fatal;
	}
#endif

#ifdef WIN32
	/*
	 * Windows doesn't support O_DIRECT or O_SYNC with the _open
	 * interface, so fail if we're passed those flags
	 */
	if ((td->io_ops->flags & FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
		log_err("fio: Windows does not support direct or non-buffered io with"
				" the synchronous ioengines. Use the 'windowsaio' ioengine"
				" with 'direct=1' and 'iodepth=1' instead.\n");
		ret = 1;
	}
#endif

	/*
	 * For fully compressible data, just zero them at init time.
	 * It's faster than repeatedly filling it.
	 */
	if (td->o.compress_percentage == 100) {
		td->o.zero_buffers = 1;
		td->o.compress_percentage = 0;
	}

	return ret;
}
/*
 * Lazy way of fixing up options that depend on each other. We could also
 * define option callback handlers, but this is easier.
 */
static int fixup_options(struct thread_data *td)
{
	struct thread_options *o = &td->o;
	int ret = 0;

#ifndef FIO_HAVE_PSHARED_MUTEX
	if (!o->use_thread) {
		log_info("fio: this platform does not support process shared"
			 " mutexes, forcing use of threads. Use the 'thread'"
			 " option to get rid of this warning.\n");
		o->use_thread = 1;
		ret = warnings_fatal;
	}
#endif

	if (o->write_iolog_file && o->read_iolog_file) {
		log_err("fio: read iolog overrides write_iolog\n");
		free(o->write_iolog_file);
		o->write_iolog_file = NULL;
		ret = warnings_fatal;
	}

	/*
	 * only really works with 1 file
	 */
	if (o->zone_size && o->open_files > 1)
		o->zone_size = 0;

	/*
	 * If zone_range isn't specified, backward compatibility dictates it
	 * should be made equal to zone_size.
	 */
	if (o->zone_size && !o->zone_range)
		o->zone_range = o->zone_size;

	/*
	 * Reads can do overwrites, we always need to pre-create the file
	 */
	if (td_read(td) || td_rw(td))
		o->overwrite = 1;

	if (!o->min_bs[DDIR_READ])
		o->min_bs[DDIR_READ] = o->bs[DDIR_READ];
	if (!o->max_bs[DDIR_READ])
		o->max_bs[DDIR_READ] = o->bs[DDIR_READ];
	if (!o->min_bs[DDIR_WRITE])
		o->min_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];
	if (!o->max_bs[DDIR_WRITE])
		o->max_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];
	if (!o->min_bs[DDIR_TRIM])
		o->min_bs[DDIR_TRIM] = o->bs[DDIR_TRIM];
	if (!o->max_bs[DDIR_TRIM])
		o->max_bs[DDIR_TRIM] = o->bs[DDIR_TRIM];

	o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]);
	o->rw_min_bs = min(o->min_bs[DDIR_TRIM], o->rw_min_bs);

	/*
	 * For random IO, allow blockalign offset other than min_bs.
	 */
	if (!o->ba[DDIR_READ] || !td_random(td))
		o->ba[DDIR_READ] = o->min_bs[DDIR_READ];
	if (!o->ba[DDIR_WRITE] || !td_random(td))
		o->ba[DDIR_WRITE] = o->min_bs[DDIR_WRITE];
	if (!o->ba[DDIR_TRIM] || !td_random(td))
		o->ba[DDIR_TRIM] = o->min_bs[DDIR_TRIM];

	if ((o->ba[DDIR_READ] != o->min_bs[DDIR_READ] ||
	     o->ba[DDIR_WRITE] != o->min_bs[DDIR_WRITE] ||
	     o->ba[DDIR_TRIM] != o->min_bs[DDIR_TRIM]) &&
	    !o->norandommap) {
		log_err("fio: Any use of blockalign= turns off randommap\n");
		o->norandommap = 1;
		ret = warnings_fatal;
	}

	if (!o->file_size_high)
		o->file_size_high = o->file_size_low;

	if (o->start_delay_high)
		o->start_delay = get_rand_start_delay(td);

	if (o->norandommap && o->verify != VERIFY_NONE &&
	    !fixed_block_size(o)) {
		log_err("fio: norandommap given for variable block sizes, "
			"verify disabled\n");
		o->verify = VERIFY_NONE;
		ret = warnings_fatal;
	}

	if (o->bs_unaligned && (o->odirect || td->io_ops->flags & FIO_RAWIO))
		log_err("fio: bs_unaligned may not work with raw io\n");

	/*
	 * thinktime_spin must be less than thinktime
	 */
	if (o->thinktime_spin > o->thinktime)
		o->thinktime_spin = o->thinktime;

	/*
	 * The low water mark cannot be bigger than the iodepth
	 */
	if (o->iodepth_low > o->iodepth || !o->iodepth_low)
		o->iodepth_low = o->iodepth;

	/*
	 * If batch number isn't set, default to the same as iodepth
	 */
	if (o->iodepth_batch > o->iodepth || !o->iodepth_batch)
		o->iodepth_batch = o->iodepth;

	if (o->nr_files > td->files_index)
		o->nr_files = td->files_index;

	if (o->open_files > o->nr_files || !o->open_files)
		o->open_files = o->nr_files;

	if (((o->rate[DDIR_READ] + o->rate[DDIR_WRITE] + o->rate[DDIR_TRIM]) &&
	     (o->rate_iops[DDIR_READ] + o->rate_iops[DDIR_WRITE] +
	      o->rate_iops[DDIR_TRIM])) ||
	    ((o->ratemin[DDIR_READ] + o->ratemin[DDIR_WRITE] +
	      o->ratemin[DDIR_TRIM]) &&
	     (o->rate_iops_min[DDIR_READ] + o->rate_iops_min[DDIR_WRITE] +
	      o->rate_iops_min[DDIR_TRIM]))) {
		log_err("fio: rate and rate_iops are mutually exclusive\n");
		ret = 1;
	}
	if ((o->rate[DDIR_READ] < o->ratemin[DDIR_READ]) ||
	    (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE]) ||
	    (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM]) ||
	    (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ]) ||
	    (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE]) ||
	    (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM])) {
		log_err("fio: minimum rate exceeds rate\n");
		ret = 1;
	}

	if (!o->timeout && o->time_based) {
		log_err("fio: time_based requires a runtime/timeout setting\n");
		o->time_based = 0;
		ret = warnings_fatal;
	}

	if (o->fill_device && !o->size)
		o->size = -1ULL;

	if (o->verify != VERIFY_NONE) {
		if (td_write(td) && o->do_verify && o->numjobs > 1) {
			log_info("Multiple writers may overwrite blocks that "
				"belong to other jobs. This can cause "
				"verification failures.\n");
			ret = warnings_fatal;
		}

		o->refill_buffers = 1;
		if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] &&
		    !o->verify_interval)
			o->verify_interval = o->min_bs[DDIR_WRITE];

		/*
		 * Verify interval must be smaller or equal to the
		 * write size.
		 */
		if (o->verify_interval > o->min_bs[DDIR_WRITE])
			o->verify_interval = o->min_bs[DDIR_WRITE];
		else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ])
			o->verify_interval = o->min_bs[DDIR_READ];
	}

	if (o->pre_read) {
		o->invalidate_cache = 0;
		if (td->io_ops->flags & FIO_PIPEIO) {
			log_info("fio: cannot pre-read files with an IO engine"
				 " that isn't seekable. Pre-read disabled.\n");
			ret = warnings_fatal;
		}
	}

	if (!o->unit_base) {
		if (td->io_ops->flags & FIO_BIT_BASED)
			o->unit_base = 1;
		else
			o->unit_base = 8;
	}

#ifndef CONFIG_FDATASYNC
	if (o->fdatasync_blocks) {
		log_info("fio: this platform does not support fdatasync()"
			 " falling back to using fsync(). Use the 'fsync'"
			 " option instead of 'fdatasync' to get rid of"
			 " this warning\n");
		o->fsync_blocks = o->fdatasync_blocks;
		o->fdatasync_blocks = 0;
		ret = warnings_fatal;
	}
#endif

#ifdef WIN32
	/*
	 * Windows doesn't support O_DIRECT or O_SYNC with the _open
	 * interface, so fail if we're passed those flags
	 */
	if ((td->io_ops->flags & FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
		log_err("fio: Windows does not support direct or non-buffered io with"
				" the synchronous ioengines. Use the 'windowsaio' ioengine"
				" with 'direct=1' and 'iodepth=1' instead.\n");
		ret = 1;
	}
#endif

	/*
	 * For fully compressible data, just zero them at init time.
	 * It's faster than repeatedly filling it.
	 */
	if (td->o.compress_percentage == 100) {
		td->o.zero_buffers = 1;
		td->o.compress_percentage = 0;
	}

	/*
	 * Using a non-uniform random distribution excludes usage of
	 * a random map
	 */
	if (td->o.random_distribution != FIO_RAND_DIST_RANDOM)
		td->o.norandommap = 1;

	/*
	 * If size is set but less than the min block size, complain
	 */
	if (o->size && o->size < td_min_bs(td)) {
		log_err("fio: size too small, must be larger than the IO size: %llu\n",
			(unsigned long long) o->size);
		ret = 1;
	}

	/*
	 * O_ATOMIC implies O_DIRECT
	 */
	if (td->o.oatomic)
		td->o.odirect = 1;

	/*
	 * If randseed is set, that overrides randrepeat
	 */
	if (td->o.rand_seed)
		td->o.rand_repeatable = 0;

	if ((td->io_ops->flags & FIO_NOEXTEND) && td->o.file_append) {
		log_err("fio: can't append/extend with IO engine %s\n",
			td->io_ops->name);
		ret = 1;
	}

	return ret;
}
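/*
 * Standalone sketch (not fio code): the block-size defaulting pattern
 * fixup_options() applies per direction, with assumed inputs. Unset
 * min/max fall back to the main bs, and rw_min_bs is the smallest
 * minimum across directions.
 */
#include <stdio.h>

#define DEMO_DIRS 3	/* read, write, trim */

static unsigned int demo_min(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int bs[DEMO_DIRS]     = { 4096, 8192, 4096 };
	unsigned int min_bs[DEMO_DIRS] = { 0, 4096, 0 };	/* 0 = unset */
	unsigned int max_bs[DEMO_DIRS] = { 0, 0, 0 };
	unsigned int rw_min_bs = ~0u;

	for (int d = 0; d < DEMO_DIRS; d++) {
		if (!min_bs[d])
			min_bs[d] = bs[d];
		if (!max_bs[d])
			max_bs[d] = bs[d];
		rw_min_bs = demo_min(rw_min_bs, min_bs[d]);
	}

	printf("rw_min_bs = %u\n", rw_min_bs);	/* prints 4096 */
	return 0;
}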