/*
 * Queue an aio request on one of the per-priority lists of a worker thread
 * and wake up everybody sleeping on that thread's event queue.
 *
 * @tinfo:  thread info owning the queues, their lock and the wait queue
 * @mref_a: request aspect, linked in via its io_head
 * @prio:   requested priority; shifted up by one and then clamped to the
 *          valid index range 0 .. MARS_PRIO_NR - 1
 * @at_end: true appends at the tail of the queue, false inserts at the head
 */
static inline void _enqueue(struct aio_threadinfo *tinfo, struct aio_mref_aspect *mref_a, int prio, bool at_end)
{
	unsigned long irq_flags;
	int slot;

#if 1
	slot = prio + 1;
	if (unlikely(slot < 0))
		slot = 0;
	else if (unlikely(slot >= MARS_PRIO_NR))
		slot = MARS_PRIO_NR - 1;
#else
	slot = 0;
#endif

	/* timestamp is taken before the queue lock is acquired */
	mref_a->enqueue_stamp = cpu_clock(raw_smp_processor_id());

	traced_lock(&tinfo->lock, irq_flags);

	if (!at_end)
		list_add(&mref_a->io_head, &tinfo->mref_list[slot]);
	else
		list_add_tail(&mref_a->io_head, &tinfo->mref_list[slot]);

	/* per-priority counter and overall sum are updated together under the lock */
	tinfo->queued[slot]++;
	atomic_inc(&tinfo->queued_sum);

	traced_unlock(&tinfo->lock, irq_flags);

	atomic_inc(&tinfo->total_enqueue_count);
	wake_up_interruptible_all(&tinfo->event);
}
/*
 * Put a request at the head of the output's dirty list.
 *
 * The entry is unlinked from wherever its dirty_head currently sits and
 * then re-added directly behind output->dirty_anchor, all under dirty_lock.
 * NOTE(review): list_del() requires dirty_head to be either linked or
 * self-initialized on entry -- confirm against the aspect init code.
 */
static inline void insert_dirty(struct aio_output *output, struct aio_mref_aspect *mref_a)
{
	struct list_head *entry = &mref_a->dirty_head;
	unsigned long irq_flags = 0;

	traced_lock(&output->dirty_lock, irq_flags);

	list_del(entry);
	list_add(entry, &output->dirty_anchor);

	traced_unlock(&output->dirty_lock, irq_flags);
}
/*
 * Take a request off the output's dirty list, if it is on one.
 *
 * The list_empty() test is done without holding dirty_lock; only when the
 * entry looks linked is the lock taken and the entry unlinked and
 * re-initialized.
 */
static inline void remove_dirty(struct aio_output *output, struct aio_mref_aspect *mref_a)
{
	unsigned long irq_flags = 0;

	/* unlocked peek: not linked, nothing to do */
	if (list_empty(&mref_a->dirty_head))
		return;

	traced_lock(&output->dirty_lock, irq_flags);
	list_del_init(&mref_a->dirty_head);
	traced_unlock(&output->dirty_lock, irq_flags);
}
static void _hash_insert(struct client_output *output, struct client_mref_aspect *mref_a) { struct mref_object *mref = mref_a->object; unsigned long flags; int hash_index; traced_lock(&output->lock, flags); list_del(&mref_a->io_head); list_add_tail(&mref_a->io_head, &output->mref_list); list_del(&mref_a->hash_head); mref->ref_id = ++output->last_id; hash_index = mref->ref_id % CLIENT_HASH_MAX; list_add_tail(&mref_a->hash_head, &output->hash_table[hash_index]); traced_unlock(&output->lock, flags); }
/* Workaround for non-implemented aio_fsync() */ static int aio_sync_thread(void *data) { struct aio_threadinfo *tinfo = data; struct aio_output *output = tinfo->output; MARS_DBG("sync thread has started on '%s'.\n", output->brick->brick_path); //set_user_nice(current, -20); while (!brick_thread_should_stop() || atomic_read(&tinfo->queued_sum) > 0) { LIST_HEAD(tmp_list); unsigned long flags; int i; output->fdsync_active = false; wake_up_interruptible_all(&output->fdsync_event); wait_event_interruptible_timeout( tinfo->event, atomic_read(&tinfo->queued_sum) > 0, HZ / 4); traced_lock(&tinfo->lock, flags); for (i = 0; i < MARS_PRIO_NR; i++) { struct list_head *start = &tinfo->mref_list[i]; if (!list_empty(start)) { // move over the whole list list_replace_init(start, &tmp_list); atomic_sub(tinfo->queued[i], &tinfo->queued_sum); tinfo->queued[i] = 0; break; } } traced_unlock(&tinfo->lock, flags); if (!list_empty(&tmp_list)) { aio_sync_all(output, &tmp_list); } } MARS_DBG("sync thread has stopped.\n"); tinfo->terminated = true; wake_up_interruptible_all(&tinfo->terminate_event); return 0; }
/*
 * Re-submit all requests that were already sent but not yet answered.
 *
 * Under output->lock, every entry of output->wait_list is moved to the
 * FRONT of output->mref_list (keeping the entries' relative order), and
 * wait_list is left empty and re-initialized.
 *
 * This uses the regular list_splice_init() helper. It performs exactly the
 * pointer updates the former hand-rolled version did via an alias of the
 * list-internal __list_del(), but without leaking a helper macro into the
 * rest of the translation unit.
 */
static void _do_resubmit(struct client_output *output)
{
	unsigned long flags;

	traced_lock(&output->lock, flags);
	if (!list_empty(&output->wait_list)) {
		/*
		 * Report the first and last moved entry. They have to be read
		 * before the splice, because that re-initializes wait_list.
		 */
		MARS_IO("done re-submit %p %p\n", output->wait_list.next, output->wait_list.prev);
		list_splice_init(&output->wait_list, &output->mref_list);
	}
	traced_unlock(&output->lock, flags);
}
/*
 * Widen the range [*min, *max] so that it covers all requests currently
 * on the output's dirty list.
 *
 * @output: output whose dirty list is scanned under dirty_lock
 * @min:    in/out; lowered to the smallest ref_pos found, if smaller
 * @max:    in/out; raised to the largest ref_pos + ref_len found, if larger
 *
 * Both values are only ever adjusted, never reset, so the caller has to
 * pre-initialize them.
 */
static inline void get_dirty(struct aio_output *output, loff_t *min, loff_t *max)
{
	struct list_head *pos;
	unsigned long irq_flags = 0;

	traced_lock(&output->dirty_lock, irq_flags);

	pos = output->dirty_anchor.next;
	while (pos != &output->dirty_anchor) {
		struct aio_mref_aspect *mref_a = container_of(pos, struct aio_mref_aspect, dirty_head);
		struct mref_object *mref = mref_a->object;

		if (mref->ref_pos < *min)
			*min = mref->ref_pos;
		if (mref->ref_pos + mref->ref_len > *max)
			*max = mref->ref_pos + mref->ref_len;

		pos = pos->next;
	}

	traced_unlock(&output->dirty_lock, irq_flags);
}
static void *_get_free(int order, int cline) { void *data; unsigned long flags; traced_lock(&freelist_lock[order], flags); data = brick_freelist[order]; if (likely(data)) { void *next = *(void**)data; #ifdef BRICK_DEBUG_MEM // check for corruptions long pattern = *(((long*)data)+1); void *copy = *(((void**)data)+2); if (unlikely(pattern != 0xf0f0f0f0f0f0f0f0 || next != copy)) { // found a corruption // prevent further trouble by leaving a memleak brick_freelist[order] = NULL; traced_unlock(&freelist_lock[order], flags); BRICK_ERR("line %d:freelist corruption at %p (pattern = %lx next %p != %p, murdered = %d), order = %d\n", cline, data, pattern, next, copy, atomic_read(&freelist_count[order]), order); return NULL; } #endif brick_freelist[order] = next; atomic_dec(&freelist_count[order]); } traced_unlock(&freelist_lock[order], flags); #ifdef CONFIG_MARS_DEBUG_MEM_STRONG if (data) { struct mem_block_info *inf = _find_block_info(data, false); if (likely(inf)) { if (unlikely(inf->inf_len != (PAGE_SIZE << order))) { BRICK_ERR("line %d: address %p: bad freelist size %d (correct should be %d, previous line = %d)\n", cline, data, (int)(PAGE_SIZE << order), inf->inf_len, inf->inf_line); } inf->inf_line = cline; inf->inf_used = true; } else { BRICK_ERR("line %d: freelist address %p is invalid (order = %d)\n", cline, data, order); } } #endif return data; }
static inline struct aio_mref_aspect *_dequeue(struct aio_threadinfo *tinfo) { struct aio_mref_aspect *mref_a = NULL; int prio; unsigned long flags = 0; traced_lock(&tinfo->lock, flags); for (prio = 0; prio < MARS_PRIO_NR; prio++) { struct list_head *start = &tinfo->mref_list[prio]; struct list_head *tmp = start->next; if (tmp != start) { list_del_init(tmp); tinfo->queued[prio]--; atomic_dec(&tinfo->queued_sum); mref_a = container_of(tmp, struct aio_mref_aspect, io_head); goto done; } }
static void _put_free(void *data, int order) { void *next; unsigned long flags; #ifdef BRICK_DEBUG_MEM // fill with pattern memset(data, 0xf0, PAGE_SIZE << order); #endif traced_lock(&freelist_lock[order], flags); next = brick_freelist[order]; *(void**)data = next; #ifdef BRICK_DEBUG_MEM // insert redundant copy for checking *(((void**)data)+2) = next; #endif brick_freelist[order] = data; traced_unlock(&freelist_lock[order], flags); atomic_inc(&freelist_count[order]); }
static int sender_thread(void *data) { struct client_output *output = data; struct client_brick *brick = output->brick; unsigned long flags; bool do_kill = false; int status = 0; output->receiver.restart_count = 0; while (!brick_thread_should_stop()) { struct list_head *tmp = NULL; struct client_mref_aspect *mref_a; struct mref_object *mref; if (unlikely(output->recv_error != 0 || !mars_socket_is_alive(&output->socket))) { MARS_DBG("recv_error = %d do_kill = %d\n", output->recv_error, do_kill); if (do_kill) { do_kill = false; _kill_socket(output); brick_msleep(3000); } status = _connect(output, brick->brick_name); MARS_IO("connect status = %d\n", status); if (unlikely(status < 0)) { brick_msleep(3000); _do_timeout(output, &output->wait_list, false); _do_timeout(output, &output->mref_list, false); continue; } brick->connection_state = 2; do_kill = true; /* Re-Submit any waiting requests */ MARS_IO("re-submit\n"); _do_resubmit(output); } wait_event_interruptible_timeout(output->event, !list_empty(&output->mref_list) || output->get_info || output->recv_error != 0 || brick_thread_should_stop(), 1 * HZ); if (unlikely(output->recv_error != 0)) { MARS_DBG("recv_error = %d\n", output->recv_error); brick_msleep(1000); continue; } if (output->get_info) { status = _request_info(output); if (status >= 0) { output->get_info = false; } else { MARS_WRN("cannot get info, status = %d\n", status); brick_msleep(1000); } } /* Grab the next mref from the queue */ traced_lock(&output->lock, flags); if (list_empty(&output->mref_list)) { traced_unlock(&output->lock, flags); continue; } tmp = output->mref_list.next; list_del(tmp); list_add(tmp, &output->wait_list); mref_a = container_of(tmp, struct client_mref_aspect, io_head); traced_unlock(&output->lock, flags); mref = mref_a->object; if (brick->limit_mode) { int amount = 0; if (mref->ref_cs_mode < 2) amount = (mref->ref_len - 1) / 1024 + 1; mars_limit_sleep(&client_limiter, amount); } MARS_IO("sending mref, id = %d pos = %lld len = %d 
rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw); status = mars_send_mref(&output->socket, mref); MARS_IO("status = %d\n", status); if (unlikely(status < 0)) { // retry submission on next occasion.. MARS_WRN("sending failed, status = %d\n", status); if (do_kill) { do_kill = false; _kill_socket(output); } _hash_insert(output, mref_a); brick_msleep(1000); continue; } } //done: if (status < 0) { MARS_WRN("sender thread terminated with status = %d\n", status); } if (do_kill) { _kill_socket(output); } /* Signal error on all pending IO requests. * We have no other chance (except probably delaying * this until destruction which is probably not what * we want). */ _do_timeout(output, &output->wait_list, true); _do_timeout(output, &output->mref_list, true); wake_up_interruptible(&output->sender.run_event); MARS_DBG("sender terminated\n"); return status; }
/*
 * Complete timed-out (or, when forced, all) requests of one list with
 * -ENOTCONN.
 *
 * @output: client output owning the list and the lock
 * @anchor: list to scan; entries are linked via their io_head
 * @force:  true completes every entry regardless of its age; it is also
 *          switched on here when mars_net_is_alive is false
 *
 * The timeout is brick->io_timeout, falling back to global_net_io_timeout
 * when that is <= 0; it is multiplied by HZ before use. Without force and
 * with no positive timeout, nothing is done.
 *
 * Phase 1 (under output->lock) unlinks the affected entries from both the
 * list and the hash table and collects them on a private list. Phase 2
 * (lock released) runs the callback, drops the reference and decrements
 * the in-flight counters for each collected entry; a warning is emitted
 * for the first of them only.
 */
static void _do_timeout(struct client_output *output, struct list_head *anchor, bool force)
{
	struct client_brick *brick = output->brick;
	struct list_head *pos;
	struct list_head *following;
	LIST_HEAD(expired);
	bool warned = false;
	long io_timeout = brick->io_timeout;
	unsigned long irq_flags;

	if (io_timeout <= 0)
		io_timeout = global_net_io_timeout;

	if (!mars_net_is_alive)
		force = true;

	if (!force && io_timeout <= 0)
		return;

	io_timeout *= HZ;

	traced_lock(&output->lock, irq_flags);
	pos = anchor->next;
	while (pos != anchor) {
		struct client_mref_aspect *mref_a = container_of(pos, struct client_mref_aspect, io_head);

		/* remember the successor first: the entry may get unlinked below */
		following = pos->next;

		if (force || time_is_before_jiffies(mref_a->submit_jiffies + io_timeout)) {
			list_del_init(&mref_a->hash_head);
			list_del_init(&mref_a->io_head);
			list_add_tail(&mref_a->tmp_head, &expired);
		}

		pos = following;
	}
	traced_unlock(&output->lock, irq_flags);

	while (!list_empty(&expired)) {
		struct list_head *head = expired.next;
		struct client_mref_aspect *mref_a = container_of(head, struct client_mref_aspect, tmp_head);
		struct mref_object *mref;

		list_del_init(head);
		mref = mref_a->object;

		if (!warned) {
			warned = true;
			MARS_WRN("timeout after %ld: signalling IO error at pos = %lld len = %d\n", io_timeout, mref->ref_pos, mref->ref_len);
		}

		atomic_inc(&output->timeout_count);

		SIMPLE_CALLBACK(mref, -ENOTCONN);

		client_ref_put(output, mref);

		atomic_dec(&output->fly_count);
		atomic_dec(&mars_global_io_flying);
	}
}
static int receiver_thread(void *data) { struct client_output *output = data; int status = 0; while (!brick_thread_should_stop()) { struct mars_cmd cmd = {}; struct list_head *tmp; struct client_mref_aspect *mref_a = NULL; struct mref_object *mref = NULL; unsigned long flags; status = mars_recv_struct(&output->socket, &cmd, mars_cmd_meta); MARS_IO("got cmd = %d status = %d\n", cmd.cmd_code, status); if (status < 0) goto done; switch (cmd.cmd_code & CMD_FLAG_MASK) { case CMD_NOTIFY: mars_trigger(); break; case CMD_CONNECT: if (cmd.cmd_int1 < 0) { status = cmd.cmd_int1; MARS_ERR("at remote side: brick connect failed, remote status = %d\n", status); goto done; } break; case CMD_CB: { int hash_index = cmd.cmd_int1 % CLIENT_HASH_MAX; traced_lock(&output->lock, flags); for (tmp = output->hash_table[hash_index].next; tmp != &output->hash_table[hash_index]; tmp = tmp->next) { struct mref_object *tmp_mref; mref_a = container_of(tmp, struct client_mref_aspect, hash_head); tmp_mref = mref_a->object; if (unlikely(!tmp_mref)) { traced_unlock(&output->lock, flags); MARS_ERR("bad internal mref pointer\n"); status = -EBADR; goto done; } if (tmp_mref->ref_id == cmd.cmd_int1) { mref = tmp_mref; list_del_init(&mref_a->hash_head); list_del_init(&mref_a->io_head); break; } } traced_unlock(&output->lock, flags); if (unlikely(!mref)) { MARS_WRN("got unknown id = %d for callback\n", cmd.cmd_int1); status = -EBADR; goto done; } MARS_IO("got callback id = %d, old pos = %lld len = %d rw = %d\n", mref->ref_id, mref->ref_pos, mref->ref_len, mref->ref_rw); status = mars_recv_cb(&output->socket, mref, &cmd); MARS_IO("new status = %d, pos = %lld len = %d rw = %d\n", status, mref->ref_pos, mref->ref_len, mref->ref_rw); if (unlikely(status < 0)) { MARS_WRN("interrupted data transfer during callback, status = %d\n", status); _hash_insert(output, mref_a); goto done; } SIMPLE_CALLBACK(mref, 0); client_ref_put(output, mref); atomic_dec(&output->fly_count); atomic_dec(&mars_global_io_flying); break; } 
case CMD_GETINFO: status = mars_recv_struct(&output->socket, &output->info, mars_info_meta); if (status < 0) { MARS_WRN("got bad info from remote side, status = %d\n", status); goto done; } output->got_info = true; wake_up_interruptible(&output->info_event); break; default: MARS_ERR("got bad command %d from remote side, terminating.\n", cmd.cmd_code); status = -EBADR; goto done; } done: brick_string_free(cmd.cmd_str1); if (unlikely(status < 0)) { if (!output->recv_error) { MARS_DBG("signalling status = %d\n", status); output->recv_error = status; } wake_up_interruptible(&output->event); brick_msleep(100); } } if (status < 0) { MARS_WRN("receiver thread terminated with status = %d, recv_error = %d\n", status, output->recv_error); } mars_shutdown_socket(&output->socket); wake_up_interruptible(&output->receiver.run_event); return status; }