Beispiel #1
0
/* Tear down the network connection of a client output.
 *
 * Sets the brick's connection_state to 1, shuts the socket down if it is
 * still alive, stops the receiver thread, clears the recorded receive
 * error and finally releases the socket via mars_put_socket().
 */
static void _kill_socket(struct client_output *output)
{
	output->brick->connection_state = 1;
	if (mars_socket_is_alive(&output->socket)) {
		MARS_DBG("shutdown socket\n");
		mars_shutdown_socket(&output->socket);
	}
	// NOTE(review): the shutdown presumably happens first so that a receiver blocked on the socket can terminate -- confirm
	_kill_thread(&output->receiver, "receiver");
	output->recv_error = 0;
	MARS_DBG("close socket\n");
	mars_put_socket(&output->socket);
}
Beispiel #2
0
static int _request_info(struct client_output *output)
{
	struct mars_cmd cmd = {
		.cmd_code = CMD_GETINFO,
	};
	int status;
	
	MARS_DBG("\n");
	status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta);
	if (unlikely(status < 0)) {
		MARS_DBG("send of getinfo failed, status = %d\n", status);
	}
	return status;
}
Beispiel #3
0
/* sysctl handler which reports CURRENT_TIME next to the Lamport clock.
 *
 * Reads at position 0 produce two text lines ("CURRENT_TIME=..." and
 * "lamport_now=..."); reads at a later position or with a zero-length
 * buffer produce nothing. Writes are rejected with -EINVAL.
 *
 * On return, *length holds the number of bytes produced and *ppos is
 * advanced by that amount.
 *
 * Returns 0 on success, -EINVAL on write attempts, -ENOMEM when the
 * temporary buffer cannot be allocated, -EFAULT when copying to userspace
 * fails.
 */
static
int lamport_sysctl_handler(
	struct ctl_table *table,
	int write,
	void __user *buffer,
	size_t *length,
	loff_t *ppos)
{
	ssize_t res = 0;
	size_t len = *length;

	MARS_DBG("write = %d len = %ld pos = %lld\n", write, len, *ppos);

	if (!len || *ppos > 0) {
		goto done;
	}

	if (write) {
		return -EINVAL;
	} else {
		int my_len = 128;
		char *tmp = brick_string_alloc(my_len);
		struct timespec know = CURRENT_TIME;
		struct timespec lnow;

		// the allocation result was formerly used unchecked
		if (unlikely(!tmp)) {
			res = -ENOMEM;
			goto done;
		}

		get_lamport(&lnow);

		res = scnprintf(tmp, my_len,
			       "CURRENT_TIME=%ld.%09ld\n"
			       "lamport_now=%ld.%09ld\n",
			       know.tv_sec, know.tv_nsec,
			       lnow.tv_sec, lnow.tv_nsec
			);

		// never write more than the caller's buffer can hold
		if ((size_t)res > len) {
			res = len;
		}

		if (copy_to_user(buffer, tmp, res)) {
			MARS_ERR("write %ld bytes at %p failed\n", res, buffer);
			res = -EFAULT;
		}
		brick_string_free(tmp);
	}

done:
	MARS_DBG("res = %ld\n", res);
	*length = res;
	if (res >= 0) {
		*ppos += res;
		return 0;
	}
	return res;
}
Beispiel #4
0
/* Stop the thread recorded in @ti, if any, and forget its handle.
 * @name only appears in the debug message.
 */
static void _kill_thread(struct client_threadinfo *ti, const char *name)
{
	if (!ti->thread) {
		return;
	}

	MARS_DBG("stopping %s thread\n", name);
	brick_thread_stop(ti->thread);
	ti->thread = NULL;
}
Beispiel #5
0
/* Workaround for non-implemented aio_fsync()
 *
 * Thread function (data is a struct aio_threadinfo). In each round it
 * takes over one complete priority list of queued requests and hands it
 * to aio_sync_all(). The loop only ends once a stop has been requested
 * AND queued_sum has dropped to zero. Always returns 0.
 */
static
int aio_sync_thread(void *data)
{
	struct aio_threadinfo *tinfo = data;
	struct aio_output *output = tinfo->output;
	
	MARS_DBG("sync thread has started on '%s'.\n", output->brick->brick_path);
	//set_user_nice(current, -20);

	while (!brick_thread_should_stop() || atomic_read(&tinfo->queued_sum) > 0) {
		LIST_HEAD(tmp_list);
		unsigned long flags;
		int i;

		// clear the fdsync_active flag and wake everybody waiting on fdsync_event
		output->fdsync_active = false;
		wake_up_interruptible_all(&output->fdsync_event);

		// sleep until something is queued, but at most HZ / 4 jiffies
		wait_event_interruptible_timeout(
			tinfo->event,
			atomic_read(&tinfo->queued_sum) > 0,
			HZ / 4);

		// grab the first non-empty list (lowest index first); only one list per round
		traced_lock(&tinfo->lock, flags);
		for (i = 0; i < MARS_PRIO_NR; i++) {
			struct list_head *start = &tinfo->mref_list[i];
			if (!list_empty(start)) {
				// move over the whole list
				list_replace_init(start, &tmp_list);
				// keep the per-list counter and the overall sum consistent
				atomic_sub(tinfo->queued[i], &tinfo->queued_sum);
				tinfo->queued[i] = 0;
				break;
			}
		}
		traced_unlock(&tinfo->lock, flags);

		// the actual work is done outside of the lock
		if (!list_empty(&tmp_list)) {
			aio_sync_all(output, &tmp_list);
		}
	}

	MARS_DBG("sync thread has stopped.\n");
	// tell aio_stop_thread() (which waits on terminate_event) that we are done
	tinfo->terminated = true;
	wake_up_interruptible_all(&tinfo->terminate_event);
	return 0;
}
Beispiel #6
0
/* Translate a host name via the symlink /mars/ips/ip-<host>.
 *
 * Works on a private copy of @name. Anything from the first ':' onwards
 * is cut off before the lookup. When a dent for "/mars/ips/ip-<host>"
 * exists and has a new_link, a copy of that link value is returned;
 * otherwise the (cut) copy of @name is returned. When mars_global is not
 * set, the unmodified copy of @name is returned.
 *
 * Returns a string obtained from brick_strdup() (the caller has to release
 * it), or NULL when duplicating a string failed.
 */
static
char *_mars_translate_hostname(const char *name)
{
	struct mars_global *global = mars_global;
	char *res = brick_strdup(name);
	struct mars_dent *test;
	char *tmp;

	// the copy was formerly dereferenced without checking for allocation failure
	if (unlikely(!res)) {
		goto done;
	}

	if (unlikely(!global)) {
		goto done;
	}

	// cut off everything starting at the first ':'
	for (tmp = res; *tmp; tmp++) {
		if (*tmp == ':') {
			*tmp = '\0';
			break;
		}
	}

	tmp = path_make("/mars/ips/ip-%s", res);
	if (unlikely(!tmp)) {
		goto done;
	}

	test = mars_find_dent(global, tmp);
	if (test && test->new_link) {
		MARS_DBG("'%s' => '%s'\n", tmp, test->new_link);
		brick_string_free(res);
		res = brick_strdup(test->new_link);
	} else {
		MARS_DBG("no translation for '%s'\n", tmp);
	}
	brick_string_free(tmp);

done:
	return res;
}
Beispiel #7
0
/* Thread function of the event thread (data is a struct aio_threadinfo).
 *
 * Starts the sync thread on output->tinfo[2], then repeatedly collects
 * completed AIO events via sys_io_getevents() and either completes the
 * corresponding request, or first pushes it through an additional
 * sync step when the brick's o_fdsync option asks for that.
 * The loop only ends once a stop has been requested AND queued_sum
 * has dropped to zero.
 *
 * Returns 0 after regular termination, -ENOMEM when no fake mm could be
 * set up, or the error from starting the sync thread.
 */
static int aio_event_thread(void *data)
{
	struct aio_threadinfo *tinfo = data;
	struct aio_output *output = tinfo->output;
	// the threadinfo of the sync thread started below
	struct aio_threadinfo *other = &output->tinfo[2];
	int err = -ENOMEM;
	
	MARS_DBG("event thread has started.\n");
	//set_user_nice(current, -20);

	use_fake_mm();
	if (!current->mm)
		goto err;

	err = aio_start_thread(output, &output->tinfo[2], aio_sync_thread, 'y');
	if (unlikely(err < 0))
		goto err;

	while (!brick_thread_should_stop() || atomic_read(&tinfo->queued_sum) > 0) {
		mm_segment_t oldfs;
		int count;
		int i;
		struct timespec timeout = {
			.tv_sec = 1,
		};
		struct io_event events[MARS_MAX_AIO_READ];

		// the syscall expects userspace pointers, so widen the address limit around it
		oldfs = get_fs();
		set_fs(get_ds());
		/* TODO: don't timeout upon termination.
		 * Probably we should submit a dummy request.
		 */
		count = sys_io_getevents(output->ctxp, 1, MARS_MAX_AIO_READ, events, &timeout);
		set_fs(oldfs);

		// account for the requests which are no longer in flight
		if (likely(count > 0)) {
			atomic_sub(count, &output->submit_count);
		}

		// a non-positive count (timeout or error) simply skips this loop
		for (i = 0; i < count; i++) {
			struct aio_mref_aspect *mref_a = (void*)events[i].data;
			struct mref_object *mref;
			// result of this single request (shadows the function-level err)
			int err = events[i].res;

			if (!mref_a) {
				continue; // this was a dummy request
			}
			mref = mref_a->object;

			MARS_IO("AIO done %p pos = %lld len = %d rw = %d\n", mref, mref->ref_pos, mref->ref_len, mref->ref_rw);

			mapfree_set(output->mf, mref->ref_pos, mref->ref_pos + mref->ref_len);

			// successful non-read requests get exactly one extra sync round (resubmit counts them)
			if (output->brick->o_fdsync
			   && err >= 0 
			   && mref->ref_rw != READ
			   && !mref->ref_skip_sync
			   && !mref_a->resubmit++) {
				// workaround for non-implemented AIO FSYNC operation
				if (output->mf &&
				    output->mf->mf_filp &&
				    output->mf->mf_filp->f_op &&
				    !output->mf->mf_filp->f_op->aio_fsync) {
					mars_trace(mref, "aio_fsync");
					// hand over to the sync thread instead
					_enqueue(other, mref_a, mref->ref_prio, true);
					continue;
				}
				// resubmit as an fdsync request; completion is deferred until that one returns
				err = aio_submit(output, mref_a, true);
				if (likely(err >= 0))
					continue;
			}

			_complete(output, mref_a, err);

		}
	}
	err = 0;

 err:
	MARS_DBG("event thread has stopped, err = %d\n", err);

	// stop the sync thread (a no-op inside aio_stop_thread() when it was never started)
	aio_stop_thread(output, 2, false);

	unuse_fake_mm();

	// tell aio_stop_thread() (which waits on terminate_event) that we are done
	tinfo->terminated = true;
	wake_up_interruptible_all(&tinfo->terminate_event);
	return err;
}

#if 1
/* This should go to fs/open.c (as long as vfs_submit() is not implemented)
 */
#include <linux/fdtable.h>
/* Counterpart to fd_install(): clear the slot of @fd in the fd table of
 * the current task. Only the table entry is reset to NULL; no file
 * reference is dropped here.
 *
 * Since @fd is unsigned, a negative descriptor passed by a caller shows up
 * as a huge value. Therefore the descriptor is validated against the size
 * of the fd table (the former "fd < 0" test could never trigger), and
 * invalid values are reported and ignored.
 */
void fd_uninstall(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	MARS_DBG("fd = %d\n", (int)fd);

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (unlikely(fd >= fdt->max_fds)) {
		spin_unlock(&files->file_lock);
		MARS_ERR("bad fd = %d\n", (int)fd);
		return;
	}
	rcu_assign_pointer(fdt->fd[fd], NULL);
	spin_unlock(&files->file_lock);
}
EXPORT_SYMBOL(fd_uninstall);
#endif

/* Number of currently existing AIO contexts: incremented in _create_ioctx()
 * when sys_io_setup() has produced a context, decremented in
 * _destroy_ioctx(). In the code visible here it is only reported in
 * debug messages.
 */
static
atomic_t ioctx_count = ATOMIC_INIT(0);

/* Undo _create_ioctx(): stop the event thread, destroy the AIO context
 * and release the file descriptor of @output.
 *
 * A NULL @output is tolerated. Each step is skipped when the corresponding
 * resource (ctxp / fd) is not present.
 */
static
void _destroy_ioctx(struct aio_output *output)
{
	if (unlikely(!output))
		goto done;

	// stop the event thread (index 1); a dummy request is submitted for waking it up
	aio_stop_thread(output, 1, true);

	use_fake_mm();

	if (likely(output->ctxp)) {
		mm_segment_t oldfs;
		int err;

		MARS_DBG("ioctx count = %d destroying %p\n", atomic_read(&ioctx_count), (void*)output->ctxp);
		// widen the address limit around the syscall
		oldfs = get_fs();
		set_fs(get_ds());
		err = sys_io_destroy(output->ctxp);
		set_fs(oldfs);
		// the counter is decremented regardless of the syscall result
		atomic_dec(&ioctx_count);
		MARS_DBG("ioctx count = %d status = %d\n", atomic_read(&ioctx_count), err);
		output->ctxp = 0;
	}

	if (likely(output->fd >= 0)) {
		MARS_DBG("destroying fd %d\n", output->fd);
		// first clear the table slot, then give the descriptor number back
		fd_uninstall(output->fd);
		put_unused_fd(output->fd);
		output->fd = -1;
	}

 done:
	// also reached with a NULL output
	if (likely(current->mm)) {
		unuse_fake_mm();
	}
}

/* Prepare @output for asynchronous IO: install a file descriptor for the
 * already opened file, create an AIO context and start the event thread.
 *
 * Returns a value >= 0 on success (the result of the last step), or a
 * negative error code: -EINVAL for missing pointers, the error from
 * get_unused_fd(), -ENOMEM when no fake mm is available, or the errors
 * from sys_io_setup() / aio_start_thread().
 *
 * Nothing is rolled back here on failure; aio_switch() calls
 * _destroy_ioctx() on its error path.
 */
static
int _create_ioctx(struct aio_output *output)
{
	struct file *file;
	mm_segment_t oldfs;
	int err = -EINVAL;

	CHECK_PTR_NULL(output, done);
	CHECK_PTR_NULL(output->mf, done);
	file = output->mf->mf_filp;
	CHECK_PTR_NULL(file, done);

	/* TODO: this is provisional. We only need it for sys_io_submit()
	 * which uses userspace concepts like file handles.
	 * This should be accompanied by a future kernelspace vfs_submit() or
	 * do_submit() which currently does not exist :(
	 */
	err = get_unused_fd();
	MARS_DBG("file %p '%s' new fd = %d\n", file, output->mf->mf_name, err);
	if (unlikely(err < 0)) {
		MARS_ERR("cannot get fd, err=%d\n", err);
		goto done;
	}
	// remember the descriptor; aio_submit() passes it as aio_fildes
	output->fd = err;
	fd_install(err, file);

	use_fake_mm();

	err = -ENOMEM;
	if (unlikely(!current->mm)) {
		MARS_ERR("cannot fake mm\n");
		goto done;
	}

	MARS_DBG("ioctx count = %d old = %p\n", atomic_read(&ioctx_count), (void*)output->ctxp);
	output->ctxp = 0;

	// widen the address limit around the syscall, since &output->ctxp is a kernel address
	oldfs = get_fs();
	set_fs(get_ds());
	err = sys_io_setup(MARS_MAX_AIO, &output->ctxp);
	set_fs(oldfs);
	// count the context whenever one was produced, independently of err
	if (likely(output->ctxp))
		atomic_inc(&ioctx_count);
	MARS_DBG("ioctx count = %d new = %p status = %d\n", atomic_read(&ioctx_count), (void*)output->ctxp, err);
	if (unlikely(err < 0)) {
		MARS_ERR("io_setup failed, err=%d\n", err);
		goto done;
	}
	
	err = aio_start_thread(output, &output->tinfo[1], aio_event_thread, 'e');
	if (unlikely(err < 0)) {
		MARS_ERR("could not start event thread\n");
		goto done;
	}

 done:
	if (likely(current->mm)) {
		unuse_fake_mm();
	}
	return err;
}

/* Thread function of the submit thread (data is a struct aio_threadinfo).
 *
 * Dequeues requests from its threadinfo and submits them via aio_submit().
 * Reads starting exactly at EOF which carry a timeout are polled: they are
 * re-enqueued with low priority until the file has grown or the timeout
 * has elapsed (then they complete with -ENODATA).
 * The loop only ends once a stop has been requested AND no reads, writes
 * or queued requests are left.
 *
 * Returns 0. Failures of individual requests are reported via their
 * completion, not via the return value. (Formerly the thread always
 * returned -EINVAL, even after clean termination.)
 */
static int aio_submit_thread(void *data)
{
	struct aio_threadinfo *tinfo = data;
	struct aio_output *output = tinfo->output;
	struct file *file;
	int err = 0;

	MARS_DBG("submit thread has started.\n");

	file = output->mf->mf_filp;

	use_fake_mm();

	while (!brick_thread_should_stop() || atomic_read(&output->read_count) + atomic_read(&output->write_count) + atomic_read(&tinfo->queued_sum) > 0) {
		struct aio_mref_aspect *mref_a;
		struct mref_object *mref;
		int sleeptime;
		int status;

		// sleep until something is queued, but at most HZ / 4 jiffies
		wait_event_interruptible_timeout(
			tinfo->event,
			atomic_read(&tinfo->queued_sum) > 0,
			HZ / 4);

		mref_a = _dequeue(tinfo);
		if (!mref_a) {
			continue;
		}

		mref = mref_a->object;
		status = -EINVAL;
		CHECK_PTR(mref, error);

		mapfree_set(output->mf, mref->ref_pos, -1);

		// writes are tracked in the dirty list
		if (mref->ref_rw) {
			insert_dirty(output, mref_a);
		}

		// check for reads exactly at EOF (special case)
		if (mref->ref_pos == mref->ref_total_size &&
		   !mref->ref_rw &&
		   mref->ref_timeout > 0) {
			loff_t total_size = i_size_read(file->f_mapping->host);
			loff_t len = total_size - mref->ref_pos;
			if (len > 0) {
				// the file has grown in the meantime: read what is there now
				mref->ref_total_size = total_size;
				mref->ref_len = len;
			} else {
				// remember when polling of this request has started
				if (!mref_a->start_jiffies) {
					mref_a->start_jiffies = jiffies;
				}
				if ((long long)jiffies - mref_a->start_jiffies <= mref->ref_timeout) {
					// avoid busy looping when nothing else is queued
					if (atomic_read(&tinfo->queued_sum) <= 0) {
						atomic_inc(&output->total_msleep_count);
						brick_msleep(1000 * 4 / HZ);
					}
					_enqueue(tinfo, mref_a, MARS_PRIO_LOW, true);
					continue;
				}
				MARS_DBG("ENODATA %lld\n", len);
				_complete(output, mref_a, -ENODATA);
				continue;
			}
		}

		// retry on -EAGAIN, sleeping 1ms at first and up to 100ms later
		sleeptime = 1;
		for (;;) {
			status = aio_submit(output, mref_a, false);

			if (likely(status != -EAGAIN)) {
				break;
			}
			atomic_inc(&output->total_delay_count);
			brick_msleep(sleeptime);
			if (sleeptime < 100) {
				sleeptime++;
			}
		}
	error:
		if (unlikely(status < 0)) {
			MARS_IO("submit_count = %d status = %d\n", atomic_read(&output->submit_count), status);
			_complete_mref(output, mref, status);
		}
	}

	MARS_DBG("submit thread has stopped, status = %d.\n", err);

	if (likely(current->mm)) {
		unuse_fake_mm();
	}

	// tell aio_stop_thread() (which waits on terminate_event) that we are done
	tinfo->terminated = true;
	wake_up_interruptible_all(&tinfo->terminate_event);
	return err;
}

/* Fill @info with the properties of the file behind @output.
 *
 * tf_align and tf_min_size are set to 1. current_size is the inode size,
 * possibly adjusted by get_dirty() unless the brick is a static device.
 *
 * Returns 0 on success, -EINVAL when the output has no usable file.
 */
static int aio_get_info(struct aio_output *output, struct mars_info *info)
{
	struct file *file;
	loff_t min;
	loff_t max = 0;

	if (unlikely(!output || !output->mf))
		return -EINVAL;

	file = output->mf->mf_filp;
	if (unlikely(!file || !file->f_mapping || !file->f_mapping->host))
		return -EINVAL;

	info->tf_align = 1;
	info->tf_min_size = 1;

	/* Workaround for races in the page cache.
	 *
	 * Concurrent reads and writes seem to result in inconsistent reads
	 * in some very rare cases: sometimes the inode claims that the file
	 * has already been appended by a write operation, while the data has
	 * not actually hit the page cache, such that a concurrent read gets
	 * NULL blocks.
	 */
	min = i_size_read(file->f_mapping->host);
	if (!output->brick->is_static_device) {
		get_dirty(output, &min, &max);
	}

	info->current_size = min;
	MARS_DBG("determined file size = %lld\n", info->current_size);

	return 0;
}

//////////////// informational / statistics ///////////////

/* Produce a textual statistics report for the first output of @brick.
 *
 * The report consists of the three timing reports followed by one line of
 * counters. @verbose is not evaluated.
 *
 * Returns a string of at most 4096 bytes obtained from brick_string_alloc()
 * (to be released by the caller), or NULL when the allocation failed.
 */
static noinline
char *aio_statistics(struct aio_brick *brick, int verbose)
{
	const int max_len = 4096;
	struct aio_output *output = brick->outputs[0];
	char *res = brick_string_alloc(max_len);
	int pos = 0;
	int nr;

	if (!res)
		return NULL;

	for (nr = 0; nr < 3; nr++) {
		pos += report_timing(&timings[nr], res + pos, max_len - pos);
	}

	snprintf(res + pos, max_len - pos,
		 "total "
		 "reads = %d "
		 "writes = %d "
		 "allocs = %d "
		 "submits = %d "
		 "again = %d "
		 "delays = %d "
		 "msleeps = %d "
		 "fdsyncs = %d "
		 "fdsync_waits = %d "
		 "map_free = %d | "
		 "flying reads = %d "
		 "writes = %d "
		 "allocs = %d "
		 "submits = %d "
		 "q0 = %d "
		 "q1 = %d "
		 "q2 = %d "
		 "| total "
		 "q0 = %d "
		 "q1 = %d "
		 "q2 = %d "
		 "%s\n",
		 /* totals */
		 atomic_read(&output->total_read_count),
		 atomic_read(&output->total_write_count),
		 atomic_read(&output->total_alloc_count),
		 atomic_read(&output->total_submit_count),
		 atomic_read(&output->total_again_count),
		 atomic_read(&output->total_delay_count),
		 atomic_read(&output->total_msleep_count),
		 atomic_read(&output->total_fdsync_count),
		 atomic_read(&output->total_fdsync_wait_count),
		 atomic_read(&output->total_mapfree_count),
		 /* currently flying */
		 atomic_read(&output->read_count),
		 atomic_read(&output->write_count),
		 atomic_read(&output->alloc_count),
		 atomic_read(&output->submit_count),
		 atomic_read(&output->tinfo[0].queued_sum),
		 atomic_read(&output->tinfo[1].queued_sum),
		 atomic_read(&output->tinfo[2].queued_sum),
		 /* total enqueues per thread */
		 atomic_read(&output->tinfo[0].total_enqueue_count),
		 atomic_read(&output->tinfo[1].total_enqueue_count),
		 atomic_read(&output->tinfo[2].total_enqueue_count),
		 /* no additional sync information is produced */
		 "");

	return res;
}

/* Reset all "total_*" counters of the first output of @brick, including
 * the enqueue counters of its three threadinfos. The counters of
 * currently flying requests are left alone.
 */
static noinline
void aio_reset_statistics(struct aio_brick *brick)
{
	struct aio_output *out = brick->outputs[0];
	int nr = 0;

	atomic_set(&out->total_read_count, 0);
	atomic_set(&out->total_write_count, 0);
	atomic_set(&out->total_alloc_count, 0);
	atomic_set(&out->total_submit_count, 0);
	atomic_set(&out->total_again_count, 0);
	atomic_set(&out->total_delay_count, 0);
	atomic_set(&out->total_msleep_count, 0);
	atomic_set(&out->total_fdsync_count, 0);
	atomic_set(&out->total_fdsync_wait_count, 0);
	atomic_set(&out->total_mapfree_count, 0);

	while (nr < 3) {
		atomic_set(&out->tinfo[nr].total_enqueue_count, 0);
		nr++;
	}
}


//////////////// object / aspect constructors / destructors ///////////////

/* Aspect constructor: initialize both list heads of the aio aspect as
 * empty lists. Always returns 0.
 */
static int aio_mref_aspect_init_fn(struct generic_aspect *_ini)
{
	struct aio_mref_aspect *aspect = (struct aio_mref_aspect *)_ini;

	INIT_LIST_HEAD(&aspect->io_head);
	INIT_LIST_HEAD(&aspect->dirty_head);

	return 0;
}

/* Aspect destructor: check that the aspect is no longer a member of the
 * dirty list or of an IO list.
 */
static void aio_mref_aspect_exit_fn(struct generic_aspect *_ini)
{
	struct aio_mref_aspect *aspect = (struct aio_mref_aspect *)_ini;

	CHECK_HEAD_EMPTY(&aspect->dirty_head);
	CHECK_HEAD_EMPTY(&aspect->io_head);
}

/* NOTE(review): presumably this instantiates the static helper functions
 * for the "aio" brick type (such as aio_mref_get_aspect()) -- confirm
 * against the definition of MARS_MAKE_STATICS.
 */
MARS_MAKE_STATICS(aio);

////////////////////// brick constructors / destructors ////////////////////

/* Brick constructor: nothing to initialize at brick level, always
 * returns 0.
 */
static int aio_brick_construct(struct aio_brick *brick)
{
	return 0;
}

/* Switch the brick on or off, depending on brick->power.button.
 *
 * Switching on: open the file via mapfree_get(), create the IO context
 * (which also starts the event thread) and start the submit thread.
 * Switching off (also used as error path of switching on): stop the
 * submit thread, destroy the IO context, and close the file once
 * led_off is set.
 *
 * Returns 0 when nothing had to be done or switching on succeeded;
 * otherwise the status of the failed step (0 after a regular switch-off).
 */
static int aio_switch(struct aio_brick *brick)
{
	// running number, used in the thread names via output->index
	static int index;
	struct aio_output *output = brick->outputs[0];
	const char *path = output->brick->brick_path;
	int flags = O_RDWR | O_LARGEFILE;
	int status = 0;

	MARS_DBG("power.button = %d\n", brick->power.button);
	if (!brick->power.button)
		goto cleanup;

	// already switched on, or the file is still open: nothing to do
	if (brick->power.led_on || output->mf)
		goto done;

	mars_power_led_off((void*)brick, false);

	if (brick->o_creat) {
		flags |= O_CREAT;
		MARS_DBG("using O_CREAT on %s\n", path);
	}
	if (brick->o_direct) {
		flags |= O_DIRECT;
		MARS_DBG("using O_DIRECT on %s\n", path);
	}

	output->mf = mapfree_get(path, flags);
	if (unlikely(!output->mf)) {
		MARS_ERR("could not open file = '%s' flags = %d\n", path, flags);
		status = -ENOENT;
		goto err;
	} 

	output->index = ++index;

	status = _create_ioctx(output);
	if (unlikely(status < 0)) {
		MARS_ERR("could not create ioctx, status = %d\n", status);
		goto err;
	}

	status = aio_start_thread(output, &output->tinfo[0], aio_submit_thread, 's');
	if (unlikely(status < 0)) {
		MARS_ERR("could not start theads, status = %d\n", status);
		goto err;
	}

	MARS_DBG("opened file '%s'\n", path);
	mars_power_led_on((void*)brick, true);

done:
	return 0;

err:
	MARS_ERR("status = %d\n", status);
cleanup:
	// already switched off completely: nothing to do
	if (brick->power.led_off) {
		goto done;
	}

	mars_power_led_on((void*)brick, false);

	// stop the submit thread first; the event thread is stopped inside _destroy_ioctx()
	aio_stop_thread(output, 0, false);

	_destroy_ioctx(output);

	// only report "off" when none of the three thread handles is left
	mars_power_led_off((void*)brick,
			  (output->tinfo[0].thread == NULL &&
			   output->tinfo[1].thread == NULL &&
			   output->tinfo[2].thread == NULL));

	MARS_DBG("switch off led_off = %d status = %d\n", brick->power.led_off, status);
	// the file is only closed once the brick is really off
	if (brick->power.led_off) {
		if (output->mf) {
			MARS_DBG("closing file = '%s'\n", output->mf->mf_name);
			mapfree_put(output->mf);
			output->mf = NULL;
		}
	}
	return status;
}

/* Output constructor: mark the file descriptor as absent (-1) and
 * initialize the dirty list, its lock, and the fdsync waitqueue.
 * Always returns 0.
 */
static int aio_output_construct(struct aio_output *output)
{
	// no fd installed yet
	output->fd = -1;

	// bookkeeping of dirty requests
	spin_lock_init(&output->dirty_lock);
	INIT_LIST_HEAD(&output->dirty_anchor);

	init_waitqueue_head(&output->fdsync_event);

	return 0;
}
Beispiel #8
0
/* Submit one request to the AIO context of @output.
 *
 * With @use_fdsync an IOCB_CMD_FDSYNC is submitted; otherwise a write or
 * a read, depending on mref->ref_rw. The aspect pointer travels in
 * aio_data, where aio_event_thread() picks it up again.
 *
 * Returns the result of sys_io_submit(), or -EBADF when @output has no
 * valid file descriptor.
 */
static int aio_submit(struct aio_output *output, struct aio_mref_aspect *mref_a, bool use_fdsync)
{
	struct mref_object *mref = mref_a->object;
	struct iocb request = {
		.aio_data = (__u64)mref_a,
		.aio_lio_opcode = use_fdsync ? IOCB_CMD_FDSYNC : (mref->ref_rw != 0 ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD),
		.aio_fildes = output->fd,
		.aio_buf = (unsigned long)mref->ref_data,
		.aio_nbytes = mref->ref_len,
		.aio_offset = mref->ref_pos,
		// .aio_reqprio = something(mref->ref_prio) field exists, but not yet implemented in kernelspace :(
	};
	struct iocb *request_ptr = &request;
	unsigned long long latency;
	mm_segment_t saved_fs;
	int res;

	mars_trace(mref, "aio_submit");

	if (unlikely(output->fd < 0)) {
		MARS_ERR("bad fd = %d\n", output->fd);
		return -EBADF;
	}

	// the syscall expects userspace pointers, so widen the address limit around it
	saved_fs = get_fs();
	set_fs(get_ds());
	latency = TIME_STATS(&timings[mref->ref_rw & 1], res = sys_io_submit(output->ctxp, 1, &request_ptr));
	set_fs(saved_fs);

	threshold_check(&aio_submit_threshold, latency);

	// every attempt is counted, whatever its outcome
	atomic_inc(&output->total_submit_count);

	if (unlikely(res < 0)) {
		if (likely(res == -EAGAIN)) {
			atomic_inc(&output->total_again_count);
		} else {
			MARS_ERR("error = %d\n", res);
		}
		return res;
	}

	// now in flight until aio_event_thread() collects it
	atomic_inc(&output->submit_count);
	return res;
}

static int aio_submit_dummy(struct aio_output *output)
{
	mm_segment_t oldfs;
	int res;
	int dummy;
	struct iocb iocb = {
		.aio_buf = (__u64)&dummy,
	};
	struct iocb *iocbp = &iocb;

	oldfs = get_fs();
	set_fs(get_ds());
	res = sys_io_submit(output->ctxp, 1, &iocbp);
	set_fs(oldfs);

	if (likely(res >= 0)) {
		atomic_inc(&output->submit_count);
	}
	return res;
}

/* Initialize @tinfo and start a thread running @fn on it.
 *
 * The thread is named "mars_aio_<class><index>", where <index> is
 * output->index.
 *
 * Returns 0 on success, -ENOENT when the thread could not be created.
 */
static
int aio_start_thread(
	struct aio_output *output,
	struct aio_threadinfo *tinfo,
	int(*fn)(void*),
	char class)
{
	int prio = 0;

	tinfo->output = output;
	tinfo->terminated = false;

	spin_lock_init(&tinfo->lock);
	init_waitqueue_head(&tinfo->event);
	init_waitqueue_head(&tinfo->terminate_event);

	// one request queue per priority
	while (prio < MARS_PRIO_NR) {
		INIT_LIST_HEAD(&tinfo->mref_list[prio]);
		prio++;
	}

	// everything has to be set up before the thread may run
	tinfo->thread = brick_thread_create(fn, tinfo, "mars_aio_%c%d", class, output->index);
	if (unlikely(!tinfo->thread)) {
		MARS_ERR("cannot create thread\n");
		return -ENOENT;
	}

	return 0;
}

/* Stop thread number @i of @output, if it exists.
 *
 * The thread is asked to stop without waiting; optionally a dummy request
 * is submitted for waking it up. Then its termination is awaited for at
 * most (60 - i * 2) seconds. Only a thread which has flagged its
 * termination is finally stopped via brick_thread_stop(); otherwise an
 * error is reported and the thread is left alone.
 */
static
void aio_stop_thread(struct aio_output *output, int i, bool do_submit_dummy)
{
	struct aio_threadinfo *tinfo = &output->tinfo[i];

	if (!tinfo->thread) {
		return;
	}

	MARS_DBG("stopping thread %d ...\n", i);
	brick_thread_stop_nowait(tinfo->thread);

	// workaround for waking up the receiver thread. TODO: check whether signal handling could do better.
	if (do_submit_dummy) {
		MARS_DBG("submitting dummy for wakeup %d...\n", i);
		use_fake_mm();
		aio_submit_dummy(output);
		if (likely(current->mm)) {
			unuse_fake_mm();
		}
	}

	// wait for termination
	MARS_DBG("waiting for thread %d ...\n", i);
	wait_event_interruptible_timeout(
		tinfo->terminate_event,
		tinfo->terminated,
		(60 - i * 2) * HZ);

	if (likely(tinfo->terminated)) {
		brick_thread_stop(tinfo->thread);
		return;
	}

	MARS_ERR("thread %d did not terminate - leaving a zombie\n", i);
}

/* Write back the whole range of @file, as selected by aio_sync_mode:
 *   1      vfs_fsync_range() with datasync = 1
 *   2      vfs_fsync_range() with datasync = 0
 *   other  filemap_write_and_wait_range()
 *
 * Returns the result of the called function.
 */
static
int aio_sync(struct file *file)
{
	const int mode = aio_sync_mode;
	int datasync;

	if (mode != 1 && mode != 2) {
		return filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
	}

	datasync = (mode == 1) ? 1 : 0;

	// some kernels need the dentry as an additional argument
#if defined(S_BIAS) || (defined(RHEL_MAJOR) && (RHEL_MAJOR < 7))
	return vfs_fsync_range(file, file->f_path.dentry, 0, LLONG_MAX, datasync);
#else
	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
#endif
}
Beispiel #9
0
/* Take a reference on @mref and prepare it for IO on @output.
 *
 * An already initialized mref only gets its reference count increased.
 * Otherwise the current file size is recorded in ref_total_size, reads are
 * checked against EOF and shortened when necessary, and a data buffer is
 * allocated when the caller has not supplied one (buffered IO).
 *
 * Returns the (possibly shortened) ref_len on success, or a negative
 * error code: -EILSEQ when the brick / file is not usable or ref_len is
 * not positive, -ENODATA for reads behind EOF, -ENOMEM when no buffer
 * could be allocated.
 */
static int aio_ref_get(struct aio_output *output, struct mref_object *mref)
{
	struct file *file;
	struct inode *inode;
	loff_t total_size;

	if (unlikely(!output->mf)) {
		MARS_ERR("brick is not switched on\n");
		return -EILSEQ;
	}

	if (unlikely(mref->ref_len <= 0)) {
		MARS_ERR("bad ref_len=%d\n", mref->ref_len);
		return -EILSEQ;
	}

	// repeated get on the same mref: just count the reference
	if (mref->ref_initialized) {
		_mref_get(mref);
		return mref->ref_len;
	}

	file = output->mf->mf_filp;
	if (unlikely(!file)) {
		MARS_ERR("file is not open\n");
		return -EILSEQ;
	}
	if (unlikely(!file->f_mapping)) {
		MARS_ERR("file %p has no mapping\n", file);
		return -EILSEQ;
	}
	inode = file->f_mapping->host;
	if (unlikely(!inode)) {
		MARS_ERR("file %p has no inode\n", file);
		return -EILSEQ;
	}
	
	total_size = i_size_read(inode);
	mref->ref_total_size = total_size;
	/* Only check reads.
	 * Writes behind EOF are always allowed (sparse files)
	 */
	if (!mref->ref_may_write) {
		// number of bytes between the start position and EOF
		loff_t len = total_size - mref->ref_pos;
		if (unlikely(len <= 0)) {
			/* Special case: allow reads starting _exactly_ at EOF when a timeout is specified.
			 */
			if (len < 0 || mref->ref_timeout <= 0) {
				MARS_DBG("ENODATA %lld\n", len);
				return -ENODATA;
			}
		}
		// Shorten below EOF, but allow special case
		if (mref->ref_len > len && len > 0) {
			mref->ref_len = len;
		}
	}

	/* Buffered IO.
	 */
	if (!mref->ref_data) {
		struct aio_mref_aspect *mref_a = aio_mref_get_aspect(output->brick, mref);
		if (unlikely(!mref_a)) {
			MARS_ERR("bad mref_a\n");
			return -EILSEQ;
		}
		// NOTE(review): ref_len was already checked above and is only shortened to positive values, so this looks unreachable -- confirm
		if (unlikely(mref->ref_len <= 0)) {
			MARS_ERR("bad ref_len = %d\n", mref->ref_len);
			return -ENOMEM;
		}
		// the allocated length is remembered in the aspect
		mref->ref_data = brick_block_alloc(mref->ref_pos, (mref_a->alloc_len = mref->ref_len));
		if (unlikely(!mref->ref_data)) {
			MARS_ERR("ENOMEM %d bytes\n", mref->ref_len);
			return -ENOMEM;
		}
#if 0 // ???
		mref->ref_flags = 0;
#endif
		// mark the buffer as allocated by this brick
		mref_a->do_dealloc = true;
		atomic_inc(&output->total_alloc_count);
		atomic_inc(&output->alloc_count);
	}

	_mref_get_first(mref);
	return mref->ref_len;
}
Beispiel #10
0
/* Module exit hook of the aio brick: unregister the aio brick type.
 */
void __exit exit_mars_aio(void)
{
	MARS_DBG("exit_aio()\n");
	aio_unregister_brick_type();
}
Beispiel #11
0
/* Module init hook of the aio brick: publish the brick type via the
 * pointer _aio_brick_type and register it.
 *
 * Returns the result of aio_register_brick_type().
 */
int __init init_mars_aio(void)
{
	MARS_DBG("init_aio()\n");
	_aio_brick_type = (void*)&aio_brick_type;
	return aio_register_brick_type();
}
Beispiel #12
0
static int _connect(struct client_output *output, const char *str)
{
	struct sockaddr_storage sockaddr = {};
	int status;

	if (unlikely(!output->path)) {
		output->path = brick_strdup(str);
		status = -ENOMEM;
		if (!output->path) {
			MARS_DBG("no mem\n");
			goto done;
		}
		status = -EINVAL;
		output->host = strchr(output->path, '@');
		if (!output->host) {
			brick_string_free(output->path);
			output->path = NULL;
			MARS_ERR("parameter string '%s' contains no remote specifier with '@'-syntax\n", str);
			goto done;
		}
		*output->host++ = '\0';
	}

	if (unlikely(output->receiver.thread)) {
		MARS_WRN("receiver thread unexpectedly not dead\n");
		_kill_thread(&output->receiver, "receiver");
	}

	status = mars_create_sockaddr(&sockaddr, output->host);
	if (unlikely(status < 0)) {
		MARS_DBG("no sockaddr, status = %d\n", status);
		goto done;
	}
	
	status = mars_create_socket(&output->socket, &sockaddr, false);
	if (unlikely(status < 0)) {
		MARS_DBG("no socket, status = %d\n", status);
		goto really_done;
	}
	output->socket.s_shutdown_on_err = true;

	output->receiver.thread = brick_thread_create(receiver_thread, output, "mars_receiver%d", thread_count++);
	if (unlikely(!output->receiver.thread)) {
		MARS_ERR("cannot start receiver thread, status = %d\n", status);
		status = -ENOENT;
		goto done;
	}


	{
		struct mars_cmd cmd = {
			.cmd_code = CMD_CONNECT,
			.cmd_str1 = output->path,
		};

		status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta);
		if (unlikely(status < 0)) {
			MARS_DBG("send of connect failed, status = %d\n", status);
			goto done;
		}
	}
	if (status >= 0) {
		status = _request_info(output);
	}

done:
	if (status < 0) {
		MARS_INF("cannot connect to remote host '%s' (status = %d) -- retrying\n", output->host ? output->host : "NULL", status);
		_kill_socket(output);
	}
really_done:
	return status;
}

////////////////// own brick / input / output operations //////////////////

/* Obtain the info of the remote side.
 *
 * Sets the get_info flag and wakes up whoever waits on output->event
 * (sender_thread() reacts on that flag), then waits up to 60 seconds for
 * got_info to be set and copies output->info to @info.
 *
 * Returns 0 on success, -EIO when no info has arrived or @info is NULL.
 */
static int client_get_info(struct client_output *output, struct mars_info *info)
{
	output->got_info = false;
	output->get_info = true;
	wake_up_interruptible(&output->event);

	wait_event_interruptible_timeout(output->info_event, output->got_info, 60 * HZ);

	if (!output->got_info || !info) {
		return -EIO;
	}

	memcpy(info, &output->info, sizeof(*info));
	return 0;
}
Beispiel #13
0
/* Thread function of the sender (data is a struct client_output).
 *
 * Keeps the connection to the remote side alive (reconnecting after
 * receive errors or when the socket has died), sends info requests when
 * asked to via the get_info flag, and sends the queued mrefs, moving each
 * of them from mref_list to wait_list.
 *
 * On termination all requests still on wait_list and mref_list are handed
 * to _do_timeout() with its last argument set to true.
 *
 * Returns the last status.
 */
static int sender_thread(void *data)
{
	struct client_output *output = data;
	struct client_brick *brick = output->brick;
	unsigned long flags;
	// true while there is a connection made by us which has to be killed before reconnecting
	bool do_kill = false;
	int status = 0;

	output->receiver.restart_count = 0;

        while (!brick_thread_should_stop()) {
		struct list_head *tmp = NULL;
		struct client_mref_aspect *mref_a;
		struct mref_object *mref;

		// (re)connect whenever the receiver has signalled an error or the socket is dead
		if (unlikely(output->recv_error != 0 || !mars_socket_is_alive(&output->socket))) {
			MARS_DBG("recv_error = %d do_kill = %d\n", output->recv_error, do_kill);
			if (do_kill) {
				do_kill = false;
				_kill_socket(output);
				brick_msleep(3000);
			}

			status = _connect(output, brick->brick_name);
			MARS_IO("connect status = %d\n", status);
			if (unlikely(status < 0)) {
				brick_msleep(3000);
				_do_timeout(output, &output->wait_list, false);
				_do_timeout(output, &output->mref_list, false);
				continue;
			}
			brick->connection_state = 2;
			do_kill = true;
			/* Re-Submit any waiting requests
			 */
			MARS_IO("re-submit\n");
			_do_resubmit(output);
		}
		
		// sleep until there is something to do, but at most one second
		wait_event_interruptible_timeout(output->event,
						 !list_empty(&output->mref_list) ||
						 output->get_info ||
						 output->recv_error != 0 ||
						 brick_thread_should_stop(),
						 1 * HZ);

		// let the next round handle the reconnect
		if (unlikely(output->recv_error != 0)) {
			MARS_DBG("recv_error = %d\n", output->recv_error);
			brick_msleep(1000);
			continue;
		}
		
		// the flag stays set when sending has failed, so the request is retried in the next round
		if (output->get_info) {
			status = _request_info(output);
			if (status >= 0) {
				output->get_info = false;
			} else {
				MARS_WRN("cannot get info, status = %d\n", status);
				brick_msleep(1000);
			}
		}

		/* Grab the next mref from the queue
		 */
		traced_lock(&output->lock, flags);
		if (list_empty(&output->mref_list)) {
			traced_unlock(&output->lock, flags);
			continue;
		}
		tmp = output->mref_list.next;
		list_del(tmp);
		// from now on the request waits for its answer
		list_add(tmp, &output->wait_list);
		mref_a = container_of(tmp, struct client_mref_aspect, io_head);
		traced_unlock(&output->lock, flags);

		mref = mref_a->object;

		if (brick->limit_mode) {
			int amount = 0;
			// amount is ref_len in units of 1024 bytes, rounded up
			if (mref->ref_cs_mode < 2)
				amount = (mref->ref_len - 1) / 1024 + 1;
			mars_limit_sleep(&client_limiter, amount);
		}

		MARS_IO("sending mref, id = %d pos = %lld len = %d rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw);

		status = mars_send_mref(&output->socket, mref);
		MARS_IO("status = %d\n", status);
		if (unlikely(status < 0)) {
			// retry submission on next occasion..
			MARS_WRN("sending failed, status = %d\n", status);

			if (do_kill) {
				do_kill = false;
				_kill_socket(output);
			}
			_hash_insert(output, mref_a);
			brick_msleep(1000);
			continue;
		}
	}
//done:
	if (status < 0) {
		MARS_WRN("sender thread terminated with status = %d\n", status);
	}

	if (do_kill) {
		_kill_socket(output);
	}

	/* Signal error on all pending IO requests.
	 * We have no other chance (except probably delaying
	 * this until destruction which is probably not what
	 * we want).
	 */
	_do_timeout(output, &output->wait_list, true);
	_do_timeout(output, &output->mref_list, true);

	wake_up_interruptible(&output->sender.run_event);
	MARS_DBG("sender terminated\n");
	return status;
}
Beispiel #14
0
/* Thread function of the receiver (data is a struct client_output).
 *
 * Receives commands from the remote side and dispatches them:
 *   CMD_NOTIFY   triggers the local side via mars_trigger()
 *   CMD_CONNECT  carries the remote connect status in cmd_int1
 *   CMD_CB       answer to a request: the request is looked up by its id
 *                in the hash table, its data is received, and its
 *                callback is executed
 *   CMD_GETINFO  answer to an info request, stored in output->info
 *
 * Any failure is recorded in output->recv_error (unless an earlier one is
 * still recorded) and output->event is woken up, where sender_thread()
 * waits. The loop itself only ends when the thread is asked to stop.
 *
 * Returns the last status.
 */
static
int receiver_thread(void *data)
{
	struct client_output *output = data;
	int status = 0;

	while (!brick_thread_should_stop()) {
		struct mars_cmd cmd = {};
		struct list_head *tmp;
		struct client_mref_aspect *mref_a = NULL;
		struct mref_object *mref = NULL;
		unsigned long flags;

		status = mars_recv_struct(&output->socket, &cmd, mars_cmd_meta);
		MARS_IO("got cmd = %d status = %d\n", cmd.cmd_code, status);
		if (status < 0)
			goto done;

		switch (cmd.cmd_code & CMD_FLAG_MASK) {
		case CMD_NOTIFY:
			mars_trigger();
			break;
		case CMD_CONNECT:
			if (cmd.cmd_int1 < 0) {
				status = cmd.cmd_int1;
				MARS_ERR("at remote side: brick connect failed, remote status = %d\n", status);
				goto done;
			}
			break;
		case CMD_CB:
		{
			int hash_index;

			/* The id has been sent by the remote side. A negative
			 * value would lead to a negative index into the hash
			 * table, so treat it like any other unknown id.
			 */
			if (unlikely(cmd.cmd_int1 < 0)) {
				MARS_WRN("got unknown id = %d for callback\n", cmd.cmd_int1);
				status = -EBADR;
				goto done;
			}
			hash_index = cmd.cmd_int1 % CLIENT_HASH_MAX;

			// search the request belonging to the id, and take it out of its lists
			traced_lock(&output->lock, flags);
			for (tmp = output->hash_table[hash_index].next; tmp != &output->hash_table[hash_index]; tmp = tmp->next) {
				struct mref_object *tmp_mref;
				mref_a = container_of(tmp, struct client_mref_aspect, hash_head);
				tmp_mref = mref_a->object;
				if (unlikely(!tmp_mref)) {
					traced_unlock(&output->lock, flags);
					MARS_ERR("bad internal mref pointer\n");
					status = -EBADR;
					goto done;
				}
				if (tmp_mref->ref_id == cmd.cmd_int1) {
					mref = tmp_mref;
					list_del_init(&mref_a->hash_head);
					list_del_init(&mref_a->io_head);
					break;
				}
			}
			traced_unlock(&output->lock, flags);

			if (unlikely(!mref)) {
				MARS_WRN("got unknown id = %d for callback\n", cmd.cmd_int1);
				status = -EBADR;
				goto done;
			}

			MARS_IO("got callback id = %d, old pos = %lld len = %d rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw);

			status = mars_recv_cb(&output->socket, mref, &cmd);
			MARS_IO("new status = %d, pos = %lld len = %d rw = %d\n", status, mref->ref_pos, mref->ref_len, mref->ref_rw);
			if (unlikely(status < 0)) {
				MARS_WRN("interrupted data transfer during callback, status = %d\n", status);
				// put the request back, since it has not been answered completely
				_hash_insert(output, mref_a);
				goto done;
			}

			SIMPLE_CALLBACK(mref, 0);

			client_ref_put(output, mref);

			atomic_dec(&output->fly_count);
			atomic_dec(&mars_global_io_flying);
			break;
		}
		case CMD_GETINFO:
			status = mars_recv_struct(&output->socket, &output->info, mars_info_meta);
			if (status < 0) {
				MARS_WRN("got bad info from remote side, status = %d\n", status);
				goto done;
			}
			// client_get_info() waits for this
			output->got_info = true;
			wake_up_interruptible(&output->info_event);
			break;
		default:
			MARS_ERR("got bad command %d from remote side, terminating.\n", cmd.cmd_code);
			status = -EBADR;
			goto done;
		}
	done:
		brick_string_free(cmd.cmd_str1);
		if (unlikely(status < 0)) {
			// only the first error is recorded
			if (!output->recv_error) {
				MARS_DBG("signalling status = %d\n", status);
				output->recv_error = status;
			}
			wake_up_interruptible(&output->event);
			brick_msleep(100);
		}
	}

	if (status < 0) {
		MARS_WRN("receiver thread terminated with status = %d, recv_error = %d\n", status, output->recv_error);
	}

	mars_shutdown_socket(&output->socket);
	wake_up_interruptible(&output->receiver.run_event);
	return status;
}
Beispiel #15
0
/* sysctl handler for triggering MARS and for reading its status text.
 *
 * Write: the first (at most 7) bytes are parsed as a decimal number.
 * A value > 0 calls mars_trigger(), a value > 1 additionally calls
 * mars_remote_trigger(). All written data is reported as consumed.
 *
 * Read: delivers the string returned by mars_info(), or a fixed message
 * when mars_info is not set or has returned NULL, truncated to the size
 * of the caller's buffer.
 *
 * Nothing happens when the buffer length is 0 or *ppos is already
 * positive. On return, *length holds the number of bytes processed and
 * *ppos is advanced by that amount.
 *
 * Returns 0 on success, -EFAULT when copying to userspace has failed.
 */
static
int trigger_sysctl_handler(
	struct ctl_table *table,
	int write,
	void __user *buffer,
	size_t *length,
	loff_t *ppos)
{
	ssize_t res = 0;
	size_t len = *length;

	MARS_DBG("write = %d len = %ld pos = %lld\n", write, len, *ppos);

	if (len && *ppos <= 0) {
		if (!write) {
			char *answer = "MARS module not operational\n";
			char *tmp = NULL;
			int mylen;

			if (mars_info) {
				tmp = mars_info();
				answer = tmp ? tmp : "internal error while determining mars_info\n";
			}

			// never deliver more than the text is long
			mylen = strlen(answer);
			if (len > mylen)
				len = mylen;
			res = len;
			if (copy_to_user(buffer, answer, len)) {
				MARS_ERR("write %ld bytes at %p failed\n", len, buffer);
				res = -EFAULT;
			}
			brick_string_free(tmp);
		} else {
			char tmp[8] = {};
			int code = 0;

			res = len; // fake consumption of all data

			// leave room for the terminating zero
			if (len > 7)
				len = 7;
			if (!copy_from_user(tmp, buffer, len)) {
				sscanf(tmp, "%d", &code);
				if (code > 0) {
					mars_trigger();
				}
				if (code > 1) {
					mars_remote_trigger();
				}
			}
		}
	}

	MARS_DBG("res = %ld\n", res);
	*length = res;
	if (res < 0) {
		return res;
	}
	*ppos += res;
	return 0;
}