static void update_accounting(struct submit_worker *sw)
{
	struct thread_data *src = &sw->td;
	struct thread_data *dst = sw->wq->td;

	if (td_read(src))
		sum_ddir(dst, src, DDIR_READ);
	if (td_write(src))
		sum_ddir(dst, src, DDIR_WRITE);
	if (td_trim(src))
		sum_ddir(dst, src, DDIR_TRIM);
}
static void io_workqueue_update_acct_fn(struct submit_worker *sw)
{
	struct thread_data *src = sw->priv;
	struct thread_data *dst = sw->wq->td;

	if (td_read(src))
		sum_ddir(dst, src, DDIR_READ);
	if (td_write(src))
		sum_ddir(dst, src, DDIR_WRITE);
	if (td_trim(src))
		sum_ddir(dst, src, DDIR_TRIM);
}
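/*
 * NOTE (editorial): sum_ddir() is not shown in this excerpt. Below is a
 * minimal sketch of what such a per-direction merge could look like,
 * assuming the worker and parent both carry the io_bytes[]/io_blocks[]
 * counters that calc_thread_status() reads further down. The name and the
 * lack of locking are illustrative; this is not fio's actual implementation.
 */
static void sum_ddir_sketch(struct thread_data *dst, struct thread_data *src,
			    enum fio_ddir ddir)
{
	/* Fold the worker's per-direction totals into the parent job. */
	dst->io_bytes[ddir] += src->io_bytes[ddir];
	dst->io_blocks[ddir] += src->io_blocks[ddir];
}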
/*
 * Print status of the jobs we know about. This includes rate estimates,
 * ETA, thread state, etc.
 */
int calc_thread_status(struct jobs_eta *je, int force)
{
	struct thread_data *td;
	int i, unified_rw_rep;
	unsigned long rate_time, disp_time, bw_avg_time, *eta_secs;
	unsigned long long io_bytes[DDIR_RWDIR_CNT];
	unsigned long long io_iops[DDIR_RWDIR_CNT];
	struct timeval now;

	static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT];
	static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT];
	static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
	static struct timeval rate_prev_time, disp_prev_time;

	if (!force) {
		if (output_format != FIO_OUTPUT_NORMAL && f_out == stdout)
			return 0;
		if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
			return 0;
		if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS))
			return 0;
	}

	if (!ddir_rw_sum(rate_io_bytes))
		fill_start_time(&rate_prev_time);
	if (!ddir_rw_sum(disp_io_bytes))
		fill_start_time(&disp_prev_time);

	eta_secs = malloc(thread_number * sizeof(unsigned long));
	memset(eta_secs, 0, thread_number * sizeof(unsigned long));

	je->elapsed_sec = (mtime_since_genesis() + 999) / 1000;

	io_bytes[DDIR_READ] = io_bytes[DDIR_WRITE] = io_bytes[DDIR_TRIM] = 0;
	io_iops[DDIR_READ] = io_iops[DDIR_WRITE] = io_iops[DDIR_TRIM] = 0;
	bw_avg_time = ULONG_MAX;
	unified_rw_rep = 0;
	for_each_td(td, i) {
		unified_rw_rep += td->o.unified_rw_rep;
		if (is_power_of_2(td->o.kb_base))
			je->is_pow2 = 1;
		je->unit_base = td->o.unit_base;
		if (td->o.bw_avg_time < bw_avg_time)
			bw_avg_time = td->o.bw_avg_time;
		if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING
		    || td->runstate == TD_FSYNCING
		    || td->runstate == TD_PRE_READING
		    || td->runstate == TD_FINISHING) {
			je->nr_running++;
			if (td_read(td)) {
				je->t_rate[0] += td->o.rate[DDIR_READ];
				je->t_iops[0] += td->o.rate_iops[DDIR_READ];
				je->m_rate[0] += td->o.ratemin[DDIR_READ];
				je->m_iops[0] += td->o.rate_iops_min[DDIR_READ];
			}
			if (td_write(td)) {
				je->t_rate[1] += td->o.rate[DDIR_WRITE];
				je->t_iops[1] += td->o.rate_iops[DDIR_WRITE];
				je->m_rate[1] += td->o.ratemin[DDIR_WRITE];
				je->m_iops[1] += td->o.rate_iops_min[DDIR_WRITE];
			}
			if (td_trim(td)) {
				je->t_rate[2] += td->o.rate[DDIR_TRIM];
				je->t_iops[2] += td->o.rate_iops[DDIR_TRIM];
				je->m_rate[2] += td->o.ratemin[DDIR_TRIM];
				je->m_iops[2] += td->o.rate_iops_min[DDIR_TRIM];
			}

			je->files_open += td->nr_open_files;
		} else if (td->runstate == TD_RAMP) {
			je->nr_running++;
			je->nr_ramp++;
		} else if (td->runstate == TD_SETTING_UP) {
			je->nr_running++;
			je->nr_setting_up++;
		} else if (td->runstate < TD_RUNNING)
			je->nr_pending++;

		if (je->elapsed_sec >= 3)
			eta_secs[i] = thread_eta(td);
		else
			eta_secs[i] = INT_MAX;

		check_str_update(td);

		if (td->runstate > TD_SETTING_UP) {
			int ddir;

			for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
				if (unified_rw_rep) {
					io_bytes[0] += td->io_bytes[ddir];
					io_iops[0] += td->io_blocks[ddir];
				} else {
					io_bytes[ddir] += td->io_bytes[ddir];
					io_iops[ddir] += td->io_blocks[ddir];
				}
			}
		}
	}
int generic_open_file(struct thread_data *td, struct fio_file *f)
{
	int is_std = 0;
	int flags = 0;
	int from_hash = 0;

	dprint(FD_FILE, "fd open %s\n", f->file_name);

	if (td_trim(td) && f->filetype != FIO_TYPE_BD) {
		log_err("fio: trim only applies to block device\n");
		return 1;
	}

	if (!strcmp(f->file_name, "-")) {
		if (td_rw(td)) {
			log_err("fio: can't read/write to stdin/out\n");
			return 1;
		}
		is_std = 1;

		/*
		 * move output logging to stderr, if we are writing to stdout
		 */
		if (td_write(td))
			f_out = stderr;
	}

	if (td_trim(td))
		goto skip_flags;
	if (td->o.odirect)
		flags |= OS_O_DIRECT;
	if (td->o.sync_io)
		flags |= O_SYNC;
	if (td->o.create_on_open)
		flags |= O_CREAT;
skip_flags:
	if (f->filetype != FIO_TYPE_FILE)
		flags |= FIO_O_NOATIME;

open_again:
	if (td_write(td)) {
		if (!read_only)
			flags |= O_RDWR;

		if (f->filetype == FIO_TYPE_FILE)
			flags |= O_CREAT;

		if (is_std)
			f->fd = dup(STDOUT_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	} else if (td_read(td)) {
		if (f->filetype == FIO_TYPE_CHAR && !read_only)
			flags |= O_RDWR;
		else
			flags |= O_RDONLY;

		if (is_std)
			f->fd = dup(STDIN_FILENO);
		else
			from_hash = file_lookup_open(f, flags);
	} else { /* td trim */
		flags |= O_RDWR;
		from_hash = file_lookup_open(f, flags);
	}

	if (f->fd == -1) {
		char buf[FIO_VERROR_SIZE];
		int __e = errno;

		if (__e == EPERM && (flags & FIO_O_NOATIME)) {
			flags &= ~FIO_O_NOATIME;
			goto open_again;
		}
		if (__e == EMFILE && file_close_shadow_fds(td))
			goto open_again;

		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);

		if (__e == EINVAL && (flags & OS_O_DIRECT)) {
			log_err("fio: looks like your file system does not " \
				"support direct=1/buffered=0\n");
		}

		td_verror(td, __e, buf);
	}

	if (!from_hash && f->fd != -1) {
		if (add_file_hash(f)) {
			int fio_unused ret;

			/*
			 * Stash away descriptor for later close. This is to
			 * work-around a "feature" on Linux, where a close of
			 * an fd that has been opened for write will trigger
			 * udev to call blkid to check partitions, fs id, etc.
			 * That pollutes the device cache, which can slow down
			 * unbuffered accesses.
			 */
			if (f->shadow_fd == -1)
				f->shadow_fd = f->fd;
			else {
				/*
				 * OK to ignore, we haven't done anything
				 * with it
				 */
				ret = generic_close_file(td, f);
			}
			goto open_again;
		}
	}

	return 0;
}
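/*
 * NOTE (editorial): the FIO_O_NOATIME handling above follows a pattern worth
 * calling out: the flag is requested opportunistically and dropped again if
 * the kernel refuses it with EPERM (on Linux, O_NOATIME is only honoured for
 * the file owner or a privileged caller). Below is a standalone sketch of
 * that retry pattern using plain POSIX open(); the function name and the
 * assumption that base_flags carries no O_CREAT are illustrative, and this
 * is not part of fio.
 */
#define _GNU_SOURCE		/* O_NOATIME is a Linux extension */
#include <errno.h>
#include <fcntl.h>

static int open_noatime_opportunistic(const char *path, int base_flags)
{
	int flags = base_flags | O_NOATIME;
	int fd = open(path, flags);

	if (fd == -1 && errno == EPERM) {
		/* Not the owner and not privileged: retry without the flag. */
		flags &= ~O_NOATIME;
		fd = open(path, flags);
	}
	return fd;
}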
/*
 * This function mimics generic_open_file(), but issues IME native calls
 * instead of POSIX calls.
 */
static int fio_ime_open_file(struct thread_data *td, struct fio_file *f)
{
	int flags = 0;
	int ret;
	uint64_t desired_fs;
	char *ime_filename;

	dprint(FD_FILE, "fd open %s\n", f->file_name);

	if (td_trim(td)) {
		td_verror(td, EINVAL, "IME does not support TRIM operation");
		return 1;
	}

	if (td->o.oatomic) {
		td_verror(td, EINVAL, "IME does not support atomic IO");
		return 1;
	}

	if (td->o.odirect)
		flags |= O_DIRECT;
	if (td->o.sync_io)
		flags |= O_SYNC;
	if (td->o.create_on_open && td->o.allow_create)
		flags |= O_CREAT;

	if (td_write(td)) {
		if (!read_only)
			flags |= O_RDWR;

		if (td->o.allow_create)
			flags |= O_CREAT;
	} else if (td_read(td)) {
		flags |= O_RDONLY;
	} else {
		/* We should never get here. */
		td_verror(td, EINVAL, "Unsupported open mode");
		return 1;
	}

	ime_filename = fio_set_ime_filename(f->file_name);
	if (ime_filename == NULL)
		return 1;

	f->fd = ime_native_open(ime_filename, flags, 0600);
	if (f->fd == -1) {
		char buf[FIO_VERROR_SIZE];
		int __e = errno;

		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
		td_verror(td, __e, buf);
		return 1;
	}

	/*
	 * Now we need to make sure the real file size is sufficient for FIO
	 * to do its things. This is normally done before the file open
	 * function is called, but because FIO would use POSIX calls, we need
	 * to do it ourselves.
	 */
	ret = fio_ime_get_file_size(td, f);
	if (ret < 0) {
		ime_native_close(f->fd);
		td_verror(td, errno, "ime_get_file_size");
		return 1;
	}

	desired_fs = f->io_size + f->file_offset;
	if (td_write(td)) {
		dprint(FD_FILE, "Laying out file %s%s\n",
			DEFAULT_IME_FILE_PREFIX, f->file_name);
		if (!td->o.create_on_open &&
				f->real_file_size < desired_fs &&
				ime_native_ftruncate(f->fd, desired_fs) < 0) {
			ime_native_close(f->fd);
			td_verror(td, errno, "ime_native_ftruncate");
			return 1;
		}
		if (f->real_file_size < desired_fs)
			f->real_file_size = desired_fs;
	} else if (td_read(td) && f->real_file_size < desired_fs) {
		ime_native_close(f->fd);
		log_err("error: can't read %lu bytes from file with "
			"%lu bytes\n", desired_fs, f->real_file_size);
		return 1;
	}

	return 0;
}
/*
 * Best effort calculation of the estimated pending runtime of a job.
 */
static unsigned long thread_eta(struct thread_data *td)
{
	unsigned long long bytes_total, bytes_done;
	unsigned long eta_sec = 0;
	unsigned long elapsed;
	uint64_t timeout;

	elapsed = (mtime_since_now(&td->epoch) + 999) / 1000;
	timeout = td->o.timeout / 1000000UL;

	bytes_total = td->total_io_size;

	if (td->flags & TD_F_NO_PROGRESS)
		return -1;

	if (td->o.fill_device && td->o.size == -1ULL) {
		if (!td->fill_device_size || td->fill_device_size == -1ULL)
			return 0;

		bytes_total = td->fill_device_size;
	}

	if (td->o.zone_size && td->o.zone_skip && bytes_total) {
		unsigned int nr_zones;
		uint64_t zone_bytes;

		zone_bytes = bytes_total + td->o.zone_size + td->o.zone_skip;
		nr_zones = (zone_bytes - 1) / (td->o.zone_size + td->o.zone_skip);
		bytes_total -= nr_zones * td->o.zone_skip;
	}

	/*
	 * if writing and verifying afterwards, bytes_total will be twice the
	 * size. In a mixed workload, verify phase will be the size of the
	 * first stage writes.
	 */
	if (td->o.do_verify && td->o.verify && td_write(td)) {
		if (td_rw(td)) {
			unsigned int perc = 50;

			if (td->o.rwmix[DDIR_WRITE])
				perc = td->o.rwmix[DDIR_WRITE];

			bytes_total += (bytes_total * perc) / 100;
		} else
			bytes_total <<= 1;
	}

	if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) {
		double perc, perc_t;

		bytes_done = ddir_rw_sum(td->io_bytes);

		if (bytes_total) {
			perc = (double) bytes_done / (double) bytes_total;
			if (perc > 1.0)
				perc = 1.0;
		} else
			perc = 0.0;

		if (td->o.time_based) {
			if (timeout) {
				perc_t = (double) elapsed / (double) timeout;
				if (perc_t < perc)
					perc = perc_t;
			} else {
				/*
				 * Will never hit, we can't have time_based
				 * without a timeout set.
				 */
				perc = 0.0;
			}
		}

		if (perc == 0.0) {
			eta_sec = timeout;
		} else {
			eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed;
		}

		if (td->o.timeout &&
		    eta_sec > (timeout + done_secs - elapsed))
			eta_sec = timeout + done_secs - elapsed;
	} else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED
			|| td->runstate == TD_INITIALIZED
			|| td->runstate == TD_SETTING_UP
			|| td->runstate == TD_RAMP
			|| td->runstate == TD_PRE_READING) {
		int64_t t_eta = 0, r_eta = 0;
		unsigned long long rate_bytes;

		/*
		 * We can only guess - assume it'll run the full timeout
		 * if given, otherwise assume it'll run at the specified rate.
		 */
		if (td->o.timeout) {
			uint64_t __timeout = td->o.timeout;
			uint64_t start_delay = td->o.start_delay;
			uint64_t ramp_time = td->o.ramp_time;

			t_eta = __timeout + start_delay;
			if (!td->ramp_time_over)
				t_eta += ramp_time;
			t_eta /= 1000000ULL;

			if ((td->runstate == TD_RAMP) && in_ramp_time(td)) {
				unsigned long ramp_left;

				ramp_left = mtime_since_now(&td->epoch);
				ramp_left = (ramp_left + 999) / 1000;
				if (ramp_left <= t_eta)
					t_eta -= ramp_left;
			}
		}
		rate_bytes = 0;
		if (td_read(td))
			rate_bytes = td->o.rate[DDIR_READ];
		if (td_write(td))
			rate_bytes += td->o.rate[DDIR_WRITE];
		if (td_trim(td))
			rate_bytes += td->o.rate[DDIR_TRIM];

		if (rate_bytes) {
			r_eta = bytes_total / rate_bytes;
			r_eta += (td->o.start_delay / 1000000ULL);
		}

		if (r_eta && t_eta)
			eta_sec = min(r_eta, t_eta);
		else if (r_eta)
			eta_sec = r_eta;
		else if (t_eta)
			eta_sec = t_eta;
		else
			eta_sec = 0;
	} else {
		/*
		 * thread is already done or waiting for fsync
		 */
		eta_sec = 0;
	}

	return eta_sec;
}
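/*
 * NOTE (editorial): a worked example of the progress-based estimate in
 * thread_eta() above, with illustrative numbers. With elapsed = 30 s,
 * bytes_done = 1 GiB and bytes_total = 4 GiB, perc = 0.25 and the remaining
 * time is elapsed * (1 / perc) - elapsed = 30 * 4 - 30 = 90 s. The helper
 * below is a self-contained sketch of just that formula, not fio code.
 */
static unsigned long eta_from_progress(unsigned long elapsed_sec,
				       unsigned long long bytes_done,
				       unsigned long long bytes_total)
{
	double perc;

	if (!bytes_total || !bytes_done)
		return 0;	/* no basis for an estimate yet */

	perc = (double) bytes_done / (double) bytes_total;
	if (perc > 1.0)
		perc = 1.0;

	return (unsigned long) (elapsed_sec * (1.0 / perc)) - elapsed_sec;
}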
/*
 * Print status of the jobs we know about. This includes rate estimates,
 * ETA, thread state, etc.
 */
int calc_thread_status(struct jobs_eta *je, int force)
{
	struct thread_data *td;
	int i;
	unsigned long rate_time, disp_time, bw_avg_time, *eta_secs;
	unsigned long long io_bytes[DDIR_RWDIR_CNT];
	unsigned long long io_iops[DDIR_RWDIR_CNT];
	struct timeval now;

	static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT];
	static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT];
	static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
	static struct timeval rate_prev_time, disp_prev_time;

	if (!force) {
		if (output_format != FIO_OUTPUT_NORMAL)
			return 0;
		if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
			return 0;
		if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS))
			return 0;
	}

	if (!ddir_rw_sum(rate_io_bytes))
		fill_start_time(&rate_prev_time);
	if (!ddir_rw_sum(disp_io_bytes))
		fill_start_time(&disp_prev_time);

	eta_secs = malloc(thread_number * sizeof(unsigned long));
	memset(eta_secs, 0, thread_number * sizeof(unsigned long));

	je->elapsed_sec = (mtime_since_genesis() + 999) / 1000;

	io_bytes[DDIR_READ] = io_bytes[DDIR_WRITE] = io_bytes[DDIR_TRIM] = 0;
	io_iops[DDIR_READ] = io_iops[DDIR_WRITE] = io_iops[DDIR_TRIM] = 0;
	bw_avg_time = ULONG_MAX;
	for_each_td(td, i) {
		if (is_power_of_2(td->o.kb_base))
			je->is_pow2 = 1;
		if (td->o.bw_avg_time < bw_avg_time)
			bw_avg_time = td->o.bw_avg_time;
		if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING
		    || td->runstate == TD_FSYNCING
		    || td->runstate == TD_PRE_READING) {
			je->nr_running++;
			if (td_read(td)) {
				je->t_rate += td->o.rate[DDIR_READ];
				je->t_iops += td->o.rate_iops[DDIR_READ];
				je->m_rate += td->o.ratemin[DDIR_READ];
				je->m_iops += td->o.rate_iops_min[DDIR_READ];
			}
			if (td_write(td)) {
				je->t_rate += td->o.rate[DDIR_WRITE];
				je->t_iops += td->o.rate_iops[DDIR_WRITE];
				je->m_rate += td->o.ratemin[DDIR_WRITE];
				je->m_iops += td->o.rate_iops_min[DDIR_WRITE];
			}
			if (td_trim(td)) {
				je->t_rate += td->o.rate[DDIR_TRIM];
				je->t_iops += td->o.rate_iops[DDIR_TRIM];
				je->m_rate += td->o.ratemin[DDIR_TRIM];
				je->m_iops += td->o.rate_iops_min[DDIR_TRIM];
			}

			je->files_open += td->nr_open_files;
		} else if (td->runstate == TD_RAMP) {
			je->nr_running++;
			je->nr_ramp++;
		} else if (td->runstate == TD_SETTING_UP)
			je->nr_running++;
		else if (td->runstate < TD_RUNNING)
			je->nr_pending++;

		if (je->elapsed_sec >= 3)
			eta_secs[i] = thread_eta(td);
		else
			eta_secs[i] = INT_MAX;

		check_str_update(td);

		if (td->runstate > TD_RAMP) {
			int ddir;

			for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
				io_bytes[ddir] += td->io_bytes[ddir];
				io_iops[ddir] += td->io_blocks[ddir];
			}
		}
	}

	if (exitall_on_terminate)
		je->eta_sec = INT_MAX;
	else
		je->eta_sec = 0;

	for_each_td(td, i) {
		if (exitall_on_terminate) {
			if (eta_secs[i] < je->eta_sec)
				je->eta_sec = eta_secs[i];
		} else {
			if (eta_secs[i] > je->eta_sec)
				je->eta_sec = eta_secs[i];
		}
	}

	free(eta_secs);

	fio_gettime(&now, NULL);
	rate_time = mtime_since(&rate_prev_time, &now);

	if (write_bw_log && rate_time > bw_avg_time && !in_ramp_time(td)) {
		calc_rate(rate_time, io_bytes, rate_io_bytes, je->rate);
		memcpy(&rate_prev_time, &now, sizeof(now));
		add_agg_sample(je->rate[DDIR_READ], DDIR_READ, 0);
		add_agg_sample(je->rate[DDIR_WRITE], DDIR_WRITE, 0);
		add_agg_sample(je->rate[DDIR_TRIM], DDIR_TRIM, 0);
	}

	disp_time = mtime_since(&disp_prev_time, &now);

	/*
	 * Allow a little slack, the target is to print it every 1000 msecs
	 */
	if (!force && disp_time < 900)
		return 0;

	calc_rate(disp_time, io_bytes, disp_io_bytes, je->rate);
	calc_iops(disp_time, io_iops, disp_io_iops, je->iops);
	memcpy(&disp_prev_time, &now, sizeof(now));

	if (!force && !je->nr_running && !je->nr_pending)
		return 0;

	je->nr_threads = thread_number;
	memcpy(je->run_str, run_str, thread_number * sizeof(char));

	return 1;
}
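/*
 * NOTE (editorial): calc_rate() and calc_iops() are not shown in this
 * excerpt. Below is a minimal sketch of the interval-based rate calculation
 * such a helper would perform, assuming a millisecond interval length (as
 * passed by the caller above) and KB/s output units. The name and details
 * are illustrative, not fio's actual implementation.
 */
static void calc_rate_sketch(unsigned long mtime_ms,
			     unsigned long long *io_bytes,
			     unsigned long long *prev_io_bytes,
			     unsigned int *rate)
{
	int i;

	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
		unsigned long long diff;

		/* Bytes moved since the previous sample... */
		diff = io_bytes[i] - prev_io_bytes[i];
		/* ...scaled to bytes per second, then reported as KB/s. */
		rate[i] = ((1000 * diff) / mtime_ms) / 1024;

		prev_io_bytes[i] = io_bytes[i];
	}
}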