static void _kill_socket(struct client_output *output) { output->brick->connection_state = 1; if (mars_socket_is_alive(&output->socket)) { MARS_DBG("shutdown socket\n"); mars_shutdown_socket(&output->socket); } _kill_thread(&output->receiver, "receiver"); output->recv_error = 0; MARS_DBG("close socket\n"); mars_put_socket(&output->socket); }
static int _request_info(struct client_output *output) { struct mars_cmd cmd = { .cmd_code = CMD_GETINFO, }; int status; MARS_DBG("\n"); status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta); if (unlikely(status < 0)) { MARS_DBG("send of getinfo failed, status = %d\n", status); } return status; }
/*
 * sysctl handler reporting CURRENT_TIME and the lamport clock.
 *
 * Read-only: any write attempt is rejected with -EINVAL.  A read at a
 * non-zero position, or with a zero-length buffer, yields 0 bytes.
 *
 * @table:  unused here
 * @write:  non-zero when userspace writes to the entry
 * @buffer: userspace buffer receiving the text
 * @length: in: size of @buffer, out: number of bytes produced
 *          (or the negative status, as stored by the code below)
 * @ppos:   file position, advanced by the number of bytes produced
 *
 * Returns 0 on success, -EINVAL on write, -ENOMEM when the temporary
 * string cannot be allocated, -EFAULT when copying to userspace fails.
 */
static int lamport_sysctl_handler(
	struct ctl_table *table,
	int write,
	void __user *buffer,
	size_t *length,
	loff_t *ppos)
{
	ssize_t res = 0;
	size_t len = *length;

	MARS_DBG("write = %d len = %ld pos = %lld\n", write, len, *ppos);

	if (!len || *ppos > 0) {
		goto done;
	}

	if (write) {
		return -EINVAL;
	} else {
		int my_len = 128;
		char *tmp = brick_string_alloc(my_len);
		struct timespec know = CURRENT_TIME;
		struct timespec lnow;

		/* Do not format into / copy from a failed allocation. */
		if (!tmp) {
			res = -ENOMEM;
			goto done;
		}

		get_lamport(&lnow);

		res = scnprintf(tmp, my_len,
				"CURRENT_TIME=%ld.%09ld\n"
				"lamport_now=%ld.%09ld\n",
				know.tv_sec, know.tv_nsec,
				lnow.tv_sec, lnow.tv_nsec
			);

		/* Never write beyond the buffer size announced by the caller. */
		if ((size_t)res > len) {
			res = len;
		}

		if (copy_to_user(buffer, tmp, res)) {
			MARS_ERR("write %ld bytes at %p failed\n", res, buffer);
			res = -EFAULT;
		}
		brick_string_free(tmp);
	}

done:
	MARS_DBG("res = %ld\n", res);
	*length = res;
	if (res >= 0) {
		*ppos += res;
		return 0;
	}
	return res;
}
/*
 * Stop the thread recorded in @ti, if there is one, and clear the
 * thread pointer afterwards.  @name is only used for the debug message.
 */
static void _kill_thread(struct client_threadinfo *ti, const char *name)
{
	if (!ti->thread) {
		return;
	}

	MARS_DBG("stopping %s thread\n", name);
	brick_thread_stop(ti->thread);
	ti->thread = NULL;
}
/* Workaround for non-implemented aio_fsync() */ static int aio_sync_thread(void *data) { struct aio_threadinfo *tinfo = data; struct aio_output *output = tinfo->output; MARS_DBG("sync thread has started on '%s'.\n", output->brick->brick_path); //set_user_nice(current, -20); while (!brick_thread_should_stop() || atomic_read(&tinfo->queued_sum) > 0) { LIST_HEAD(tmp_list); unsigned long flags; int i; output->fdsync_active = false; wake_up_interruptible_all(&output->fdsync_event); wait_event_interruptible_timeout( tinfo->event, atomic_read(&tinfo->queued_sum) > 0, HZ / 4); traced_lock(&tinfo->lock, flags); for (i = 0; i < MARS_PRIO_NR; i++) { struct list_head *start = &tinfo->mref_list[i]; if (!list_empty(start)) { // move over the whole list list_replace_init(start, &tmp_list); atomic_sub(tinfo->queued[i], &tinfo->queued_sum); tinfo->queued[i] = 0; break; } } traced_unlock(&tinfo->lock, flags); if (!list_empty(&tmp_list)) { aio_sync_all(output, &tmp_list); } } MARS_DBG("sync thread has stopped.\n"); tinfo->terminated = true; wake_up_interruptible_all(&tinfo->terminate_event); return 0; }
/*
 * Translate a host name via the symlink "/mars/ips/ip-<host>".
 *
 * @name is copied; everything from the first ':' on is cut off the
 * copy.  When a dent for "/mars/ips/ip-<host>" exists and carries a
 * new_link, a copy of that link value is returned instead.
 *
 * Returns a string obtained from brick_strdup() (presumably to be
 * released by the caller with brick_string_free() -- confirm against
 * callers), or NULL when the copy could not be allocated.  When
 * mars_global is not set, the unmodified copy of @name is returned.
 */
static char *_mars_translate_hostname(const char *name)
{
	struct mars_global *global = mars_global;
	char *res = brick_strdup(name);
	struct mars_dent *test;
	char *tmp;

	/* A failed copy must not be scanned below. */
	if (unlikely(!res)) {
		goto done;
	}

	if (unlikely(!global)) {
		goto done;
	}

	/* Strip anything following the host part. */
	for (tmp = res; *tmp; tmp++) {
		if (*tmp == ':') {
			*tmp = '\0';
			break;
		}
	}

	tmp = path_make("/mars/ips/ip-%s", res);
	if (unlikely(!tmp)) {
		goto done;
	}

	test = mars_find_dent(global, tmp);
	if (test && test->new_link) {
		MARS_DBG("'%s' => '%s'\n", tmp, test->new_link);
		brick_string_free(res);
		res = brick_strdup(test->new_link);
	} else {
		MARS_DBG("no translation for '%s'\n", tmp);
	}
	brick_string_free(tmp);

done:
	return res;
}
static int aio_event_thread(void *data) { struct aio_threadinfo *tinfo = data; struct aio_output *output = tinfo->output; struct aio_threadinfo *other = &output->tinfo[2]; int err = -ENOMEM; MARS_DBG("event thread has started.\n"); //set_user_nice(current, -20); use_fake_mm(); if (!current->mm) goto err; err = aio_start_thread(output, &output->tinfo[2], aio_sync_thread, 'y'); if (unlikely(err < 0)) goto err; while (!brick_thread_should_stop() || atomic_read(&tinfo->queued_sum) > 0) { mm_segment_t oldfs; int count; int i; struct timespec timeout = { .tv_sec = 1, }; struct io_event events[MARS_MAX_AIO_READ]; oldfs = get_fs(); set_fs(get_ds()); /* TODO: don't timeout upon termination. * Probably we should submit a dummy request. */ count = sys_io_getevents(output->ctxp, 1, MARS_MAX_AIO_READ, events, &timeout); set_fs(oldfs); if (likely(count > 0)) { atomic_sub(count, &output->submit_count); } for (i = 0; i < count; i++) { struct aio_mref_aspect *mref_a = (void*)events[i].data; struct mref_object *mref; int err = events[i].res; if (!mref_a) { continue; // this was a dummy request } mref = mref_a->object; MARS_IO("AIO done %p pos = %lld len = %d rw = %d\n", mref, mref->ref_pos, mref->ref_len, mref->ref_rw); mapfree_set(output->mf, mref->ref_pos, mref->ref_pos + mref->ref_len); if (output->brick->o_fdsync && err >= 0 && mref->ref_rw != READ && !mref->ref_skip_sync && !mref_a->resubmit++) { // workaround for non-implemented AIO FSYNC operation if (output->mf && output->mf->mf_filp && output->mf->mf_filp->f_op && !output->mf->mf_filp->f_op->aio_fsync) { mars_trace(mref, "aio_fsync"); _enqueue(other, mref_a, mref->ref_prio, true); continue; } err = aio_submit(output, mref_a, true); if (likely(err >= 0)) continue; } _complete(output, mref_a, err); } } err = 0; err: MARS_DBG("event thread has stopped, err = %d\n", err); aio_stop_thread(output, 2, false); unuse_fake_mm(); tinfo->terminated = true; wake_up_interruptible_all(&tinfo->terminate_event); return err; } #if 1 /* This 
should go to fs/open.c (as long as vfs_submit() is not implemented) */ #include <linux/fdtable.h> void fd_uninstall(unsigned int fd) { struct files_struct *files = current->files; struct fdtable *fdt; MARS_DBG("fd = %d\n", fd); if (unlikely(fd < 0)) { MARS_ERR("bad fd = %d\n", fd); return; } spin_lock(&files->file_lock); fdt = files_fdtable(files); rcu_assign_pointer(fdt->fd[fd], NULL); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(fd_uninstall); #endif static atomic_t ioctx_count = ATOMIC_INIT(0); static void _destroy_ioctx(struct aio_output *output) { if (unlikely(!output)) goto done; aio_stop_thread(output, 1, true); use_fake_mm(); if (likely(output->ctxp)) { mm_segment_t oldfs; int err; MARS_DBG("ioctx count = %d destroying %p\n", atomic_read(&ioctx_count), (void*)output->ctxp); oldfs = get_fs(); set_fs(get_ds()); err = sys_io_destroy(output->ctxp); set_fs(oldfs); atomic_dec(&ioctx_count); MARS_DBG("ioctx count = %d status = %d\n", atomic_read(&ioctx_count), err); output->ctxp = 0; } if (likely(output->fd >= 0)) { MARS_DBG("destroying fd %d\n", output->fd); fd_uninstall(output->fd); put_unused_fd(output->fd); output->fd = -1; } done: if (likely(current->mm)) { unuse_fake_mm(); } } static int _create_ioctx(struct aio_output *output) { struct file *file; mm_segment_t oldfs; int err = -EINVAL; CHECK_PTR_NULL(output, done); CHECK_PTR_NULL(output->mf, done); file = output->mf->mf_filp; CHECK_PTR_NULL(file, done); /* TODO: this is provisionary. We only need it for sys_io_submit() * which uses userspace concepts like file handles. 
* This should be accompanied by a future kernelsapce vfs_submit() or * do_submit() which currently does not exist :( */ err = get_unused_fd(); MARS_DBG("file %p '%s' new fd = %d\n", file, output->mf->mf_name, err); if (unlikely(err < 0)) { MARS_ERR("cannot get fd, err=%d\n", err); goto done; } output->fd = err; fd_install(err, file); use_fake_mm(); err = -ENOMEM; if (unlikely(!current->mm)) { MARS_ERR("cannot fake mm\n"); goto done; } MARS_DBG("ioctx count = %d old = %p\n", atomic_read(&ioctx_count), (void*)output->ctxp); output->ctxp = 0; oldfs = get_fs(); set_fs(get_ds()); err = sys_io_setup(MARS_MAX_AIO, &output->ctxp); set_fs(oldfs); if (likely(output->ctxp)) atomic_inc(&ioctx_count); MARS_DBG("ioctx count = %d new = %p status = %d\n", atomic_read(&ioctx_count), (void*)output->ctxp, err); if (unlikely(err < 0)) { MARS_ERR("io_setup failed, err=%d\n", err); goto done; } err = aio_start_thread(output, &output->tinfo[1], aio_event_thread, 'e'); if (unlikely(err < 0)) { MARS_ERR("could not start event thread\n"); goto done; } done: if (likely(current->mm)) { unuse_fake_mm(); } return err; } static int aio_submit_thread(void *data) { struct aio_threadinfo *tinfo = data; struct aio_output *output = tinfo->output; struct file *file; int err = -EINVAL; MARS_DBG("submit thread has started.\n"); file = output->mf->mf_filp; use_fake_mm(); while (!brick_thread_should_stop() || atomic_read(&output->read_count) + atomic_read(&output->write_count) + atomic_read(&tinfo->queued_sum) > 0) { struct aio_mref_aspect *mref_a; struct mref_object *mref; int sleeptime; int status; wait_event_interruptible_timeout( tinfo->event, atomic_read(&tinfo->queued_sum) > 0, HZ / 4); mref_a = _dequeue(tinfo); if (!mref_a) { continue; } mref = mref_a->object; status = -EINVAL; CHECK_PTR(mref, error); mapfree_set(output->mf, mref->ref_pos, -1); if (mref->ref_rw) { insert_dirty(output, mref_a); } // check for reads exactly at EOF (special case) if (mref->ref_pos == mref->ref_total_size && 
!mref->ref_rw && mref->ref_timeout > 0) { loff_t total_size = i_size_read(file->f_mapping->host); loff_t len = total_size - mref->ref_pos; if (len > 0) { mref->ref_total_size = total_size; mref->ref_len = len; } else { if (!mref_a->start_jiffies) { mref_a->start_jiffies = jiffies; } if ((long long)jiffies - mref_a->start_jiffies <= mref->ref_timeout) { if (atomic_read(&tinfo->queued_sum) <= 0) { atomic_inc(&output->total_msleep_count); brick_msleep(1000 * 4 / HZ); } _enqueue(tinfo, mref_a, MARS_PRIO_LOW, true); continue; } MARS_DBG("ENODATA %lld\n", len); _complete(output, mref_a, -ENODATA); continue; } } sleeptime = 1; for (;;) { status = aio_submit(output, mref_a, false); if (likely(status != -EAGAIN)) { break; } atomic_inc(&output->total_delay_count); brick_msleep(sleeptime); if (sleeptime < 100) { sleeptime++; } } error: if (unlikely(status < 0)) { MARS_IO("submit_count = %d status = %d\n", atomic_read(&output->submit_count), status); _complete_mref(output, mref, status); } } MARS_DBG("submit thread has stopped, status = %d.\n", err); if (likely(current->mm)) { unuse_fake_mm(); } tinfo->terminated = true; wake_up_interruptible_all(&tinfo->terminate_event); return err; } static int aio_get_info(struct aio_output *output, struct mars_info *info) { struct file *file; loff_t min; loff_t max; if (unlikely(!output || !output->mf || !(file = output->mf->mf_filp) || !file->f_mapping || !file->f_mapping->host)) return -EINVAL; info->tf_align = 1; info->tf_min_size = 1; /* Workaround for races in the page cache. * * It appears that concurrent reads and writes seem to * result in inconsistent reads in some very rare cases, due to * races. Sometimes, the inode claims that the file has been already * appended by a write operation, but the data has not actually hit * the page cache, such that a concurrent read gets NULL blocks. 
*/ min = i_size_read(file->f_mapping->host); max = 0; if (!output->brick->is_static_device) { get_dirty(output, &min, &max); } info->current_size = min; MARS_DBG("determined file size = %lld\n", info->current_size); return 0; } //////////////// informational / statistics /////////////// static noinline char *aio_statistics(struct aio_brick *brick, int verbose) { struct aio_output *output = brick->outputs[0]; char *res = brick_string_alloc(4096); char *sync = NULL; int pos = 0; if (!res) return NULL; pos += report_timing(&timings[0], res + pos, 4096 - pos); pos += report_timing(&timings[1], res + pos, 4096 - pos); pos += report_timing(&timings[2], res + pos, 4096 - pos); snprintf(res + pos, 4096 - pos, "total " "reads = %d " "writes = %d " "allocs = %d " "submits = %d " "again = %d " "delays = %d " "msleeps = %d " "fdsyncs = %d " "fdsync_waits = %d " "map_free = %d | " "flying reads = %d " "writes = %d " "allocs = %d " "submits = %d " "q0 = %d " "q1 = %d " "q2 = %d " "| total " "q0 = %d " "q1 = %d " "q2 = %d " "%s\n", atomic_read(&output->total_read_count), atomic_read(&output->total_write_count), atomic_read(&output->total_alloc_count), atomic_read(&output->total_submit_count), atomic_read(&output->total_again_count), atomic_read(&output->total_delay_count), atomic_read(&output->total_msleep_count), atomic_read(&output->total_fdsync_count), atomic_read(&output->total_fdsync_wait_count), atomic_read(&output->total_mapfree_count), atomic_read(&output->read_count), atomic_read(&output->write_count), atomic_read(&output->alloc_count), atomic_read(&output->submit_count), atomic_read(&output->tinfo[0].queued_sum), atomic_read(&output->tinfo[1].queued_sum), atomic_read(&output->tinfo[2].queued_sum), atomic_read(&output->tinfo[0].total_enqueue_count), atomic_read(&output->tinfo[1].total_enqueue_count), atomic_read(&output->tinfo[2].total_enqueue_count), sync ? 
sync : ""); if (sync) brick_string_free(sync); return res; } static noinline void aio_reset_statistics(struct aio_brick *brick) { struct aio_output *output = brick->outputs[0]; int i; atomic_set(&output->total_read_count, 0); atomic_set(&output->total_write_count, 0); atomic_set(&output->total_alloc_count, 0); atomic_set(&output->total_submit_count, 0); atomic_set(&output->total_again_count, 0); atomic_set(&output->total_delay_count, 0); atomic_set(&output->total_msleep_count, 0); atomic_set(&output->total_fdsync_count, 0); atomic_set(&output->total_fdsync_wait_count, 0); atomic_set(&output->total_mapfree_count, 0); for (i = 0; i < 3; i++) { struct aio_threadinfo *tinfo = &output->tinfo[i]; atomic_set(&tinfo->total_enqueue_count, 0); } } //////////////// object / aspect constructors / destructors /////////////// static int aio_mref_aspect_init_fn(struct generic_aspect *_ini) { struct aio_mref_aspect *ini = (void*)_ini; INIT_LIST_HEAD(&ini->io_head); INIT_LIST_HEAD(&ini->dirty_head); return 0; } static void aio_mref_aspect_exit_fn(struct generic_aspect *_ini) { struct aio_mref_aspect *ini = (void*)_ini; CHECK_HEAD_EMPTY(&ini->dirty_head); CHECK_HEAD_EMPTY(&ini->io_head); } MARS_MAKE_STATICS(aio); ////////////////////// brick constructors / destructors //////////////////// static int aio_brick_construct(struct aio_brick *brick) { return 0; } static int aio_switch(struct aio_brick *brick) { static int index; struct aio_output *output = brick->outputs[0]; const char *path = output->brick->brick_path; int flags = O_RDWR | O_LARGEFILE; int status = 0; MARS_DBG("power.button = %d\n", brick->power.button); if (!brick->power.button) goto cleanup; if (brick->power.led_on || output->mf) goto done; mars_power_led_off((void*)brick, false); if (brick->o_creat) { flags |= O_CREAT; MARS_DBG("using O_CREAT on %s\n", path); } if (brick->o_direct) { flags |= O_DIRECT; MARS_DBG("using O_DIRECT on %s\n", path); } output->mf = mapfree_get(path, flags); if (unlikely(!output->mf)) { 
MARS_ERR("could not open file = '%s' flags = %d\n", path, flags); status = -ENOENT; goto err; } output->index = ++index; status = _create_ioctx(output); if (unlikely(status < 0)) { MARS_ERR("could not create ioctx, status = %d\n", status); goto err; } status = aio_start_thread(output, &output->tinfo[0], aio_submit_thread, 's'); if (unlikely(status < 0)) { MARS_ERR("could not start theads, status = %d\n", status); goto err; } MARS_DBG("opened file '%s'\n", path); mars_power_led_on((void*)brick, true); done: return 0; err: MARS_ERR("status = %d\n", status); cleanup: if (brick->power.led_off) { goto done; } mars_power_led_on((void*)brick, false); aio_stop_thread(output, 0, false); _destroy_ioctx(output); mars_power_led_off((void*)brick, (output->tinfo[0].thread == NULL && output->tinfo[1].thread == NULL && output->tinfo[2].thread == NULL)); MARS_DBG("switch off led_off = %d status = %d\n", brick->power.led_off, status); if (brick->power.led_off) { if (output->mf) { MARS_DBG("closing file = '%s'\n", output->mf->mf_name); mapfree_put(output->mf); output->mf = NULL; } } return status; } static int aio_output_construct(struct aio_output *output) { INIT_LIST_HEAD(&output->dirty_anchor); spin_lock_init(&output->dirty_lock); init_waitqueue_head(&output->fdsync_event); output->fd = -1; return 0; }
static int aio_submit(struct aio_output *output, struct aio_mref_aspect *mref_a, bool use_fdsync) { struct mref_object *mref = mref_a->object; mm_segment_t oldfs; int res; struct iocb iocb = { .aio_data = (__u64)mref_a, .aio_lio_opcode = use_fdsync ? IOCB_CMD_FDSYNC : (mref->ref_rw != 0 ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD), .aio_fildes = output->fd, .aio_buf = (unsigned long)mref->ref_data, .aio_nbytes = mref->ref_len, .aio_offset = mref->ref_pos, // .aio_reqprio = something(mref->ref_prio) field exists, but not yet implemented in kernelspace :( }; struct iocb *iocbp = &iocb; unsigned long long latency; mars_trace(mref, "aio_submit"); if (unlikely(output->fd < 0)) { MARS_ERR("bad fd = %d\n", output->fd); res = -EBADF; goto done; } oldfs = get_fs(); set_fs(get_ds()); latency = TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &iocbp)); set_fs(oldfs); threshold_check(&aio_submit_threshold, latency); atomic_inc(&output->total_submit_count); if (likely(res >= 0)) { atomic_inc(&output->submit_count); } else if (likely(res == -EAGAIN)) { atomic_inc(&output->total_again_count); } else { MARS_ERR("error = %d\n", res); } done: return res; } static int aio_submit_dummy(struct aio_output *output) { mm_segment_t oldfs; int res; int dummy; struct iocb iocb = { .aio_buf = (__u64)&dummy, }; struct iocb *iocbp = &iocb; oldfs = get_fs(); set_fs(get_ds()); res = sys_io_submit(output->ctxp, 1, &iocbp); set_fs(oldfs); if (likely(res >= 0)) { atomic_inc(&output->submit_count); } return res; } static int aio_start_thread( struct aio_output *output, struct aio_threadinfo *tinfo, int(*fn)(void*), char class) { int j; for (j = 0; j < MARS_PRIO_NR; j++) { INIT_LIST_HEAD(&tinfo->mref_list[j]); } tinfo->output = output; spin_lock_init(&tinfo->lock); init_waitqueue_head(&tinfo->event); init_waitqueue_head(&tinfo->terminate_event); tinfo->terminated = false; tinfo->thread = brick_thread_create(fn, tinfo, "mars_aio_%c%d", class, output->index); if 
(unlikely(!tinfo->thread)) { MARS_ERR("cannot create thread\n"); return -ENOENT; } return 0; } static void aio_stop_thread(struct aio_output *output, int i, bool do_submit_dummy) { struct aio_threadinfo *tinfo = &output->tinfo[i]; if (tinfo->thread) { MARS_DBG("stopping thread %d ...\n", i); brick_thread_stop_nowait(tinfo->thread); // workaround for waking up the receiver thread. TODO: check whether signal handlong could do better. if (do_submit_dummy) { MARS_DBG("submitting dummy for wakeup %d...\n", i); use_fake_mm(); aio_submit_dummy(output); if (likely(current->mm)) { unuse_fake_mm(); } } // wait for termination MARS_DBG("waiting for thread %d ...\n", i); wait_event_interruptible_timeout( tinfo->terminate_event, tinfo->terminated, (60 - i * 2) * HZ); if (likely(tinfo->terminated)) { brick_thread_stop(tinfo->thread); } else { MARS_ERR("thread %d did not terminate - leaving a zombie\n", i); } } } static int aio_sync(struct file *file) { int err; switch (aio_sync_mode) { case 1: #if defined(S_BIAS) || (defined(RHEL_MAJOR) && (RHEL_MAJOR < 7)) err = vfs_fsync_range(file, file->f_path.dentry, 0, LLONG_MAX, 1); #else err = vfs_fsync_range(file, 0, LLONG_MAX, 1); #endif break; case 2: #if defined(S_BIAS) || (defined(RHEL_MAJOR) && (RHEL_MAJOR < 7)) err = vfs_fsync_range(file, file->f_path.dentry, 0, LLONG_MAX, 0); #else err = vfs_fsync_range(file, 0, LLONG_MAX, 0); #endif break; default: err = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); } return err; }
static int aio_ref_get(struct aio_output *output, struct mref_object *mref) { struct file *file; struct inode *inode; loff_t total_size; if (unlikely(!output->mf)) { MARS_ERR("brick is not switched on\n"); return -EILSEQ; } if (unlikely(mref->ref_len <= 0)) { MARS_ERR("bad ref_len=%d\n", mref->ref_len); return -EILSEQ; } if (mref->ref_initialized) { _mref_get(mref); return mref->ref_len; } file = output->mf->mf_filp; if (unlikely(!file)) { MARS_ERR("file is not open\n"); return -EILSEQ; } if (unlikely(!file->f_mapping)) { MARS_ERR("file %p has no mapping\n", file); return -EILSEQ; } inode = file->f_mapping->host; if (unlikely(!inode)) { MARS_ERR("file %p has no inode\n", file); return -EILSEQ; } total_size = i_size_read(inode); mref->ref_total_size = total_size; /* Only check reads. * Writes behind EOF are always allowed (sparse files) */ if (!mref->ref_may_write) { loff_t len = total_size - mref->ref_pos; if (unlikely(len <= 0)) { /* Special case: allow reads starting _exactly_ at EOF when a timeout is specified. */ if (len < 0 || mref->ref_timeout <= 0) { MARS_DBG("ENODATA %lld\n", len); return -ENODATA; } } // Shorten below EOF, but allow special case if (mref->ref_len > len && len > 0) { mref->ref_len = len; } } /* Buffered IO. */ if (!mref->ref_data) { struct aio_mref_aspect *mref_a = aio_mref_get_aspect(output->brick, mref); if (unlikely(!mref_a)) { MARS_ERR("bad mref_a\n"); return -EILSEQ; } if (unlikely(mref->ref_len <= 0)) { MARS_ERR("bad ref_len = %d\n", mref->ref_len); return -ENOMEM; } mref->ref_data = brick_block_alloc(mref->ref_pos, (mref_a->alloc_len = mref->ref_len)); if (unlikely(!mref->ref_data)) { MARS_ERR("ENOMEM %d bytes\n", mref->ref_len); return -ENOMEM; } #if 0 // ??? mref->ref_flags = 0; #endif mref_a->do_dealloc = true; atomic_inc(&output->total_alloc_count); atomic_inc(&output->alloc_count); } _mref_get_first(mref); return mref->ref_len; }
/* Module exit hook of the aio brick: unregisters the brick type. */
void __exit exit_mars_aio(void)
{
	MARS_DBG("exit_aio()\n");

	aio_unregister_brick_type();
}
/*
 * Module init hook of the aio brick: publishes the brick type via
 * _aio_brick_type and registers it.
 *
 * Returns the status of aio_register_brick_type().
 */
int __init init_mars_aio(void)
{
	int status;

	MARS_DBG("init_aio()\n");

	_aio_brick_type = (void*)&aio_brick_type;
	status = aio_register_brick_type();

	return status;
}
static int _connect(struct client_output *output, const char *str) { struct sockaddr_storage sockaddr = {}; int status; if (unlikely(!output->path)) { output->path = brick_strdup(str); status = -ENOMEM; if (!output->path) { MARS_DBG("no mem\n"); goto done; } status = -EINVAL; output->host = strchr(output->path, '@'); if (!output->host) { brick_string_free(output->path); output->path = NULL; MARS_ERR("parameter string '%s' contains no remote specifier with '@'-syntax\n", str); goto done; } *output->host++ = '\0'; } if (unlikely(output->receiver.thread)) { MARS_WRN("receiver thread unexpectedly not dead\n"); _kill_thread(&output->receiver, "receiver"); } status = mars_create_sockaddr(&sockaddr, output->host); if (unlikely(status < 0)) { MARS_DBG("no sockaddr, status = %d\n", status); goto done; } status = mars_create_socket(&output->socket, &sockaddr, false); if (unlikely(status < 0)) { MARS_DBG("no socket, status = %d\n", status); goto really_done; } output->socket.s_shutdown_on_err = true; output->receiver.thread = brick_thread_create(receiver_thread, output, "mars_receiver%d", thread_count++); if (unlikely(!output->receiver.thread)) { MARS_ERR("cannot start receiver thread, status = %d\n", status); status = -ENOENT; goto done; } { struct mars_cmd cmd = { .cmd_code = CMD_CONNECT, .cmd_str1 = output->path, }; status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta); if (unlikely(status < 0)) { MARS_DBG("send of connect failed, status = %d\n", status); goto done; } } if (status >= 0) { status = _request_info(output); } done: if (status < 0) { MARS_INF("cannot connect to remote host '%s' (status = %d) -- retrying\n", output->host ? 
output->host : "NULL", status); _kill_socket(output); } really_done: return status; } ////////////////// own brick / input / output operations ////////////////// static int client_get_info(struct client_output *output, struct mars_info *info) { int status; output->got_info = false; output->get_info = true; wake_up_interruptible(&output->event); wait_event_interruptible_timeout(output->info_event, output->got_info, 60 * HZ); status = -EIO; if (output->got_info && info) { memcpy(info, &output->info, sizeof(*info)); status = 0; } //done: return status; }
static int sender_thread(void *data) { struct client_output *output = data; struct client_brick *brick = output->brick; unsigned long flags; bool do_kill = false; int status = 0; output->receiver.restart_count = 0; while (!brick_thread_should_stop()) { struct list_head *tmp = NULL; struct client_mref_aspect *mref_a; struct mref_object *mref; if (unlikely(output->recv_error != 0 || !mars_socket_is_alive(&output->socket))) { MARS_DBG("recv_error = %d do_kill = %d\n", output->recv_error, do_kill); if (do_kill) { do_kill = false; _kill_socket(output); brick_msleep(3000); } status = _connect(output, brick->brick_name); MARS_IO("connect status = %d\n", status); if (unlikely(status < 0)) { brick_msleep(3000); _do_timeout(output, &output->wait_list, false); _do_timeout(output, &output->mref_list, false); continue; } brick->connection_state = 2; do_kill = true; /* Re-Submit any waiting requests */ MARS_IO("re-submit\n"); _do_resubmit(output); } wait_event_interruptible_timeout(output->event, !list_empty(&output->mref_list) || output->get_info || output->recv_error != 0 || brick_thread_should_stop(), 1 * HZ); if (unlikely(output->recv_error != 0)) { MARS_DBG("recv_error = %d\n", output->recv_error); brick_msleep(1000); continue; } if (output->get_info) { status = _request_info(output); if (status >= 0) { output->get_info = false; } else { MARS_WRN("cannot get info, status = %d\n", status); brick_msleep(1000); } } /* Grab the next mref from the queue */ traced_lock(&output->lock, flags); if (list_empty(&output->mref_list)) { traced_unlock(&output->lock, flags); continue; } tmp = output->mref_list.next; list_del(tmp); list_add(tmp, &output->wait_list); mref_a = container_of(tmp, struct client_mref_aspect, io_head); traced_unlock(&output->lock, flags); mref = mref_a->object; if (brick->limit_mode) { int amount = 0; if (mref->ref_cs_mode < 2) amount = (mref->ref_len - 1) / 1024 + 1; mars_limit_sleep(&client_limiter, amount); } MARS_IO("sending mref, id = %d pos = %lld len = %d 
rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw); status = mars_send_mref(&output->socket, mref); MARS_IO("status = %d\n", status); if (unlikely(status < 0)) { // retry submission on next occasion.. MARS_WRN("sending failed, status = %d\n", status); if (do_kill) { do_kill = false; _kill_socket(output); } _hash_insert(output, mref_a); brick_msleep(1000); continue; } } //done: if (status < 0) { MARS_WRN("sender thread terminated with status = %d\n", status); } if (do_kill) { _kill_socket(output); } /* Signal error on all pending IO requests. * We have no other chance (except probably delaying * this until destruction which is probably not what * we want). */ _do_timeout(output, &output->wait_list, true); _do_timeout(output, &output->mref_list, true); wake_up_interruptible(&output->sender.run_event); MARS_DBG("sender terminated\n"); return status; }
static int receiver_thread(void *data) { struct client_output *output = data; int status = 0; while (!brick_thread_should_stop()) { struct mars_cmd cmd = {}; struct list_head *tmp; struct client_mref_aspect *mref_a = NULL; struct mref_object *mref = NULL; unsigned long flags; status = mars_recv_struct(&output->socket, &cmd, mars_cmd_meta); MARS_IO("got cmd = %d status = %d\n", cmd.cmd_code, status); if (status < 0) goto done; switch (cmd.cmd_code & CMD_FLAG_MASK) { case CMD_NOTIFY: mars_trigger(); break; case CMD_CONNECT: if (cmd.cmd_int1 < 0) { status = cmd.cmd_int1; MARS_ERR("at remote side: brick connect failed, remote status = %d\n", status); goto done; } break; case CMD_CB: { int hash_index = cmd.cmd_int1 % CLIENT_HASH_MAX; traced_lock(&output->lock, flags); for (tmp = output->hash_table[hash_index].next; tmp != &output->hash_table[hash_index]; tmp = tmp->next) { struct mref_object *tmp_mref; mref_a = container_of(tmp, struct client_mref_aspect, hash_head); tmp_mref = mref_a->object; if (unlikely(!tmp_mref)) { traced_unlock(&output->lock, flags); MARS_ERR("bad internal mref pointer\n"); status = -EBADR; goto done; } if (tmp_mref->ref_id == cmd.cmd_int1) { mref = tmp_mref; list_del_init(&mref_a->hash_head); list_del_init(&mref_a->io_head); break; } } traced_unlock(&output->lock, flags); if (unlikely(!mref)) { MARS_WRN("got unknown id = %d for callback\n", cmd.cmd_int1); status = -EBADR; goto done; } MARS_IO("got callback id = %d, old pos = %lld len = %d rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw); status = mars_recv_cb(&output->socket, mref, &cmd); MARS_IO("new status = %d, pos = %lld len = %d rw = %d\n", status, mref->ref_pos, mref->ref_len, mref->ref_rw); if (unlikely(status < 0)) { MARS_WRN("interrupted data transfer during callback, status = %d\n", status); _hash_insert(output, mref_a); goto done; } SIMPLE_CALLBACK(mref, 0); client_ref_put(output, mref); atomic_dec(&output->fly_count); atomic_dec(&mars_global_io_flying); break; } 
case CMD_GETINFO: status = mars_recv_struct(&output->socket, &output->info, mars_info_meta); if (status < 0) { MARS_WRN("got bad info from remote side, status = %d\n", status); goto done; } output->got_info = true; wake_up_interruptible(&output->info_event); break; default: MARS_ERR("got bad command %d from remote side, terminating.\n", cmd.cmd_code); status = -EBADR; goto done; } done: brick_string_free(cmd.cmd_str1); if (unlikely(status < 0)) { if (!output->recv_error) { MARS_DBG("signalling status = %d\n", status); output->recv_error = status; } wake_up_interruptible(&output->event); brick_msleep(100); } } if (status < 0) { MARS_WRN("receiver thread terminated with status = %d, recv_error = %d\n", status, output->recv_error); } mars_shutdown_socket(&output->socket); wake_up_interruptible(&output->receiver.run_event); return status; }
/*
 * sysctl handler for triggering MARS and reading its info text.
 *
 * Write: up to 7 bytes are parsed as a decimal code.  A code > 0 calls
 * mars_trigger(), a code > 1 additionally mars_remote_trigger().  The
 * whole input is always reported as consumed.
 *
 * Read: returns the string delivered by mars_info(), or a fixed
 * message when mars_info is not set or delivers nothing, truncated to
 * the size of the user buffer.
 *
 * Nothing is done for a zero length or a non-zero position.
 * Returns 0 on success or -EFAULT when copying to userspace fails.
 */
static int trigger_sysctl_handler(
	struct ctl_table *table,
	int write,
	void __user *buffer,
	size_t *length,
	loff_t *ppos)
{
	ssize_t res = 0;
	size_t len = *length;

	MARS_DBG("write = %d len = %ld pos = %lld\n", write, len, *ppos);

	if (!len || *ppos > 0) {
		goto done;
	}

	if (write) {
		char input[8] = {};
		int code = 0;

		res = len; // fake consumption of all data

		if (len > 7) {
			len = 7;
		}
		if (copy_from_user(input, buffer, len)) {
			goto done;
		}

		sscanf(input, "%d", &code);
		if (code > 0) {
			mars_trigger();
		}
		if (code > 1) {
			mars_remote_trigger();
		}
	} else {
		char *answer;
		char *info = NULL;
		int mylen;

		if (!mars_info) {
			answer = "MARS module not operational\n";
		} else {
			info = mars_info();
			answer = info ? info : "internal error while determining mars_info\n";
		}

		mylen = strlen(answer);
		if (len > mylen) {
			len = mylen;
		}
		res = len;

		if (copy_to_user(buffer, answer, len)) {
			MARS_ERR("write %ld bytes at %p failed\n", len, buffer);
			res = -EFAULT;
		}
		brick_string_free(info);
	}

done:
	MARS_DBG("res = %ld\n", res);
	*length = res;
	if (res >= 0) {
		*ppos += res;
		return 0;
	}
	return res;
}