rstatus_t core_core(void *arg, uint32_t events) { rstatus_t status; struct conn *conn = arg; struct context *ctx = conn_to_ctx(conn); log_debug(LOG_VVERB, "event %04"PRIX32" on %s %d", events, conn_get_type_string(conn), conn->sd); conn->events = events; /* error takes precedence over read | write */ if (events & EVENT_ERR) { if (conn->err && conn->dyn_mode) { loga("conn err on dnode EVENT_ERR: %d", conn->err); } core_error(ctx, conn); return DN_ERROR; } /* read takes precedence over write */ if (events & EVENT_READ) { status = core_recv(ctx, conn); if (status != DN_OK || conn->done || conn->err) { if (conn->dyn_mode) { if (conn->err) { loga("conn err on dnode EVENT_READ: %d", conn->err); core_close(ctx, conn); return DN_ERROR; } return DN_OK; } core_close(ctx, conn); return DN_ERROR; } } if (events & EVENT_WRITE) { status = core_send(ctx, conn); if (status != DN_OK || conn->done || conn->err) { if (conn->dyn_mode) { if (conn->err) { loga("conn err on dnode EVENT_WRITE: %d", conn->err); core_close(ctx, conn); return DN_ERROR; } return DN_OK; } core_close(ctx, conn); return DN_ERROR; } } return DN_OK; }
static void dnode_rsp_send_done(struct context *ctx, struct conn *conn, struct msg *rsp) { if (log_loggable(LOG_VVERB)) { log_debug(LOG_VVERB, "dnode_rsp_send_done entering"); } struct msg *req; /* peer message (request) */ ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); ASSERT(conn->smsg == NULL); log_debug(LOG_VERB, "dyn: send done rsp %"PRIu64" on c %d", rsp->id, conn->sd); req = rsp->peer; ASSERT(!rsp->request && req->request); ASSERT(req->selected_rsp == rsp); ASSERT(req->done && !req->swallow); log_debug(LOG_DEBUG, "DNODE RSP SENT %s %d dmsg->id %u", conn_get_type_string(conn), conn->sd, req->dmsg->id); /* dequeue request from client outq */ conn_dequeue_outq(ctx, conn, req); req_put(req); }
static void core_timeout(struct context *ctx) { for (;;) { struct msg *msg; struct conn *conn; int64_t now, then; msg = msg_tmo_min(); if (msg == NULL) { ctx->timeout = ctx->max_timeout; return; } /* skip over req that are in-error or done */ if (msg->error || msg->done) { msg_tmo_delete(msg); continue; } /* * timeout expired req and all the outstanding req on the timing * out server */ conn = msg->tmo_rbe.data; then = msg->tmo_rbe.key; now = dn_msec_now(); if (now < then) { int delta = (int)(then - now); ctx->timeout = MIN(delta, ctx->max_timeout); return; } log_warn("req %"PRIu64" on %s %d timedout, timeout was %d", msg->id, conn_get_type_string(conn), conn->sd, msg->tmo_rbe.timeout); msg_tmo_delete(msg); if (conn->dyn_mode) { if (conn->type == CONN_DNODE_PEER_SERVER) { //outgoing peer requests struct server *server = conn->owner; if (conn->same_dc) stats_pool_incr(ctx, server->owner, peer_timedout_requests); else stats_pool_incr(ctx, server->owner, remote_peer_timedout_requests); } } else { if (conn->type == CONN_SERVER) { //storage server requests stats_server_incr(ctx, conn->owner, server_dropped_requests); } } conn->err = ETIMEDOUT; core_close(ctx, conn); } }
/*
 * Send pending data on a connection via conn_send(), logging a failure.
 *
 * Returns whatever conn_send() returned.
 */
static rstatus_t
core_send(struct context *ctx, struct conn *conn)
{
    rstatus_t rc = conn_send(ctx, conn);

    if (rc == DN_OK) {
        return rc;
    }

    log_info("send on %s %d failed: %s", conn_get_type_string(conn),
             conn->sd, strerror(errno));
    return rc;
}
/*
 * Handle an error event on a connection: query the socket's pending error,
 * record it in conn->err and close the connection.
 *
 * ctx  - owning context
 * conn - connection the error event was raised on
 */
static void
core_error(struct context *ctx, struct conn *conn)
{
    rstatus_t status;
    int saved_errno;

    status = dn_get_soerror(conn->sd);
    /*
     * Capture errno before anything else runs: the logging call below (and
     * strerror) may overwrite it, and conn->err must reflect the value left
     * behind by dn_get_soerror().
     * NOTE(review): this relies on dn_get_soerror() leaving the socket error
     * in errno on success as well - confirm against its definition.
     */
    saved_errno = errno;

    if (status < 0) {
        log_warn("get soerr on %s client %d failed, ignored: %s",
                 conn_get_type_string(conn), conn->sd, strerror(saved_errno));
    }

    conn->err = saved_errno;

    core_close(ctx, conn);
}
static void server_ack_err(struct context *ctx, struct conn *conn, struct msg *req) { // I want to make sure we do not have swallow here. //ASSERT_LOG(!req->swallow, "req %d:%d has swallow set??", req->id, req->parent_id); if ((req->swallow && req->noreply) || (req->swallow && (req->consistency == DC_ONE)) || (req->swallow && (req->consistency == DC_QUORUM) && (!conn->same_dc))) { log_debug(LOG_INFO, "dyn: close s %d swallow req %"PRIu64" len %"PRIu32 " type %d", conn->sd, req->id, req->mlen, req->type); req_put(req); return; } struct conn *c_conn = req->owner; // At other connections, these responses would be swallowed. ASSERT_LOG((c_conn->type == CONN_CLIENT) || (c_conn->type == CONN_DNODE_PEER_CLIENT), "c_conn type %s", conn_get_type_string(c_conn)); // Create an appropriate response for the request so its propagated up; // This response gets dropped in rsp_make_error anyways. But since this is // an error path its ok with the overhead. struct msg *rsp = msg_get(conn, false, conn->data_store); if (rsp == NULL) { log_warn("Could not allocate msg."); return; } req->done = 1; req->peer = rsp; rsp->peer = req; rsp->error = req->error = 1; rsp->err = req->err = conn->err; rsp->dyn_error = req->dyn_error = STORAGE_CONNECTION_REFUSE; rsp->dmsg = NULL; log_warn("%d:%d <-> %d:%d", req->id, req->parent_id, rsp->id, rsp->parent_id); log_warn("dyn: close s %d schedule error for req %u:%u " "len %"PRIu32" type %d from c %d%c %s", conn->sd, req->id, req->parent_id, req->mlen, req->type, c_conn->sd, conn->err ? ':' : ' ', conn->err ? strerror(conn->err): " "); rstatus_t status = conn_handle_response(c_conn, req->parent_id ? req->parent_id : req->id, rsp); IGNORE_RET_VAL(status); if (req->swallow) req_put(req); }
static void core_close_log(struct conn *conn) { char *addrstr; if ((conn->type == CONN_CLIENT) || (conn->type == CONN_DNODE_PEER_CLIENT)) { addrstr = dn_unresolve_peer_desc(conn->sd); } else { addrstr = dn_unresolve_addr(conn->addr, conn->addrlen); } log_debug(LOG_NOTICE, "close %s %d '%s' on event %04"PRIX32" eof %d done " "%d rb %zu sb %zu%c %s", conn_get_type_string(conn), conn->sd, addrstr, conn->events, conn->eof, conn->done, conn->recv_bytes, conn->send_bytes, conn->err ? ':' : ' ', conn->err ? strerror(conn->err) : ""); }
static rstatus_t proxy_accept(struct context *ctx, struct conn *p) { rstatus_t status; struct conn *c; int sd; ASSERT(p->type == CONN_PROXY); ASSERT(p->sd > 0); ASSERT(p->recv_active && p->recv_ready); for (;;) { sd = accept(p->sd, NULL, NULL); if (sd < 0) { if (errno == EINTR) { log_warn("accept on %s %d not ready - eintr", conn_get_type_string(p), p->sd); continue; } if (errno == EAGAIN || errno == EWOULDBLOCK) { p->recv_ready = 0; return DN_OK; } /* * FIXME: On EMFILE or ENFILE mask out IN event on the proxy; mask * it back in when some existing connection gets closed */ log_error("accept on %s %d failed: %s", conn_get_type_string(p), p->sd, strerror(errno)); return DN_ERROR; } break; } c = conn_get(p->owner, true, p->data_store); if (c == NULL) { log_error("get conn for CLIENT %d from %s %d failed: %s", sd, conn_get_type_string(p), p->sd, strerror(errno)); status = close(sd); if (status < 0) { log_error("close c %d failed, ignored: %s", sd, strerror(errno)); } return DN_ENOMEM; } c->sd = sd; stats_pool_incr(ctx, c->owner, client_connections); status = dn_set_nonblocking(c->sd); if (status < 0) { log_error("set nonblock on %s %d from p %d failed: %s", conn_get_type_string(c), c->sd, p->sd, strerror(errno)); conn_close(ctx, c); return status; } if (p->family == AF_INET || p->family == AF_INET6) { status = dn_set_tcpnodelay(c->sd); if (status < 0) { log_warn("set tcpnodelay on %s %d from %s %d failed, ignored: %s", conn_get_type_string(c), c->sd, conn_get_type_string(p), p->sd, strerror(errno)); } } status = event_add_conn(ctx->evb, c); if (status < 0) { log_error("event add conn from %s %d failed: %s",conn_get_type_string(p), p->sd, strerror(errno)); conn_close(ctx, c); return status; } log_notice("accepted %s %d on %s %d from '%s'", conn_get_type_string(c), c->sd, conn_get_type_string(p), p->sd, dn_unresolve_peer_desc(c->sd)); return DN_OK; }
struct msg * rsp_send_next(struct context *ctx, struct conn *conn) { rstatus_t status; struct msg *rsp, *req; /* response and it's peer request */ ASSERT_LOG((conn->type == CONN_DNODE_PEER_CLIENT) || (conn->type = CONN_CLIENT), "conn %s", conn_get_type_string(conn)); req = TAILQ_FIRST(&conn->omsg_q); if (req == NULL || (!req->selected_rsp && !req_done(conn, req))) { /* nothing is outstanding, initiate close? */ if (req == NULL && conn->eof) { conn->done = 1; log_debug(LOG_INFO, "c %d is done", conn->sd); } status = event_del_out(ctx->evb, conn); if (status != DN_OK) { conn->err = errno; } return NULL; } rsp = conn->smsg; if (rsp != NULL) { ASSERT(!rsp->request); ASSERT(rsp->peer != NULL); req = TAILQ_NEXT(rsp->peer, c_tqe); } if (req == NULL || !req_done(conn, req)) { conn->smsg = NULL; return NULL; } ASSERT(req->request && !req->swallow); if (req_error(conn, req)) { rsp = rsp_make_error(ctx, conn, req); if (rsp == NULL) { conn->err = errno; return NULL; } rsp->peer = req; req->selected_rsp = rsp; log_debug(LOG_VERB, "creating new error rsp %p", rsp); if (conn->dyn_mode) { stats_pool_incr(ctx, peer_forward_error); } else { stats_pool_incr(ctx, forward_error); } } else { rsp = req->selected_rsp; } ASSERT(!rsp->request); conn->smsg = rsp; if (log_loggable(LOG_VVERB)) { log_debug(LOG_VVERB, "send next rsp %"PRIu64" on c %d", rsp->id, conn->sd); } return rsp; }
static void dnode_req_forward(struct context *ctx, struct conn *conn, struct msg *msg) { struct server_pool *pool; uint8_t *key; uint32_t keylen; if (log_loggable(LOG_DEBUG)) { log_debug(LOG_DEBUG, "dnode_req_forward entering "); } log_debug(LOG_DEBUG, "DNODE REQ RECEIVED %s %d dmsg->id %u", conn_get_type_string(conn), conn->sd, msg->dmsg->id); ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); pool = conn->owner; key = NULL; keylen = 0; log_debug(LOG_DEBUG, "conn %p adding message %d:%d", conn, msg->id, msg->parent_id); dictAdd(conn->outstanding_msgs_dict, &msg->id, msg); if (!string_empty(&pool->hash_tag)) { struct string *tag = &pool->hash_tag; uint8_t *tag_start, *tag_end; tag_start = dn_strchr(msg->key_start, msg->key_end, tag->data[0]); if (tag_start != NULL) { tag_end = dn_strchr(tag_start + 1, msg->key_end, tag->data[1]); if (tag_end != NULL) { key = tag_start + 1; keylen = (uint32_t)(tag_end - key); } } } if (keylen == 0) { key = msg->key_start; keylen = (uint32_t)(msg->key_end - msg->key_start); } ASSERT(msg->dmsg != NULL); if (msg->dmsg->type == DMSG_REQ) { local_req_forward(ctx, conn, msg, key, keylen); } else if (msg->dmsg->type == DMSG_REQ_FORWARD) { struct mbuf *orig_mbuf = STAILQ_FIRST(&msg->mhdr); struct datacenter *dc = server_get_dc(pool, &pool->dc); uint32_t rack_cnt = array_n(&dc->racks); uint32_t rack_index; for(rack_index = 0; rack_index < rack_cnt; rack_index++) { struct rack *rack = array_get(&dc->racks, rack_index); //log_debug(LOG_DEBUG, "forwarding to rack '%.*s'", // rack->name->len, rack->name->data); struct msg *rack_msg; if (string_compare(rack->name, &pool->rack) == 0 ) { rack_msg = msg; } else { rack_msg = msg_get(conn, msg->request, __FUNCTION__); if (rack_msg == NULL) { log_debug(LOG_VERB, "whelp, looks like yer screwed now, buddy. 
no inter-rack messages for you!"); continue; } if (msg_clone(msg, orig_mbuf, rack_msg) != DN_OK) { msg_put(rack_msg); continue; } rack_msg->swallow = true; } if (log_loggable(LOG_DEBUG)) { log_debug(LOG_DEBUG, "forwarding request from conn '%s' to rack '%.*s' dc '%.*s' ", dn_unresolve_peer_desc(conn->sd), rack->name->len, rack->name->data, rack->dc->len, rack->dc->data); } remote_req_forward(ctx, conn, rack_msg, rack, key, keylen); } } }
/* Description: link data from a peer connection to a client-facing connection * peer_conn: a peer connection * msg : msg with data from the peer connection after parsing */ static void dnode_rsp_forward_match(struct context *ctx, struct conn *peer_conn, struct msg *rsp) { rstatus_t status; struct msg *req; struct conn *c_conn; req = TAILQ_FIRST(&peer_conn->omsg_q); c_conn = req->owner; /* if client consistency is dc_one forward the response from only the local node. Since dyn_dnode_peer is always a remote node, drop the rsp */ if (req->consistency == DC_ONE) { if (req->swallow) { dnode_rsp_swallow(ctx, peer_conn, req, rsp); return; } log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); } /* if client consistency is dc_quorum, forward the response from only the local region/DC. */ if ((req->consistency == DC_QUORUM) && !peer_conn->same_dc) { if (req->swallow) { dnode_rsp_swallow(ctx, peer_conn, req, rsp); return; } log_warn("req %d:%d with DC_QUORUM consistency is not being swallowed"); } log_debug(LOG_DEBUG, "DNODE RSP RECEIVED %s %d dmsg->id %u req %u:%u rsp %u:%u, ", conn_get_type_string(peer_conn), peer_conn->sd, rsp->dmsg->id, req->id, req->parent_id, rsp->id, rsp->parent_id); ASSERT(req != NULL && req->peer == NULL); ASSERT(req->request && !req->done); if (log_loggable(LOG_VVERB)) { loga("Dumping content for response: "); msg_dump(rsp); loga("rsp id %d", rsp->id); loga("Dumping content for request:"); msg_dump(req); loga("req id %d", req->id); } conn_dequeue_outq(ctx, peer_conn, req); req->done = 1; log_info("c_conn:%p %d:%d <-> %d:%d", c_conn, req->id, req->parent_id, rsp->id, rsp->parent_id); /* establish rsp <-> req (response <-> request) link */ req->peer = rsp; rsp->peer = req; rsp->pre_coalesce(rsp); ASSERT_LOG((c_conn->type == CONN_CLIENT) || (c_conn->type == CONN_DNODE_PEER_CLIENT), "c_conn type %s", conn_get_type_string(c_conn)); dnode_rsp_forward_stats(ctx, peer_conn->owner, rsp); // c_conn owns respnse now status = 
conn_handle_response(c_conn, req->parent_id ? req->parent_id : req->id, rsp); IGNORE_RET_VAL(status); if (req->swallow) { log_info("swallow request %d:%d", req->id, req->parent_id); req_put(req); } }
/* There are chances that the request to the remote peer or its response got dropped. * Hence we may not always receive a response to the request at the head of the FIFO. * Hence what we do is we mark that request as errored and move on the next one * in the outgoing queue. This works since we always have message ids in monotonically * increasing order. */ static void dnode_rsp_forward(struct context *ctx, struct conn *peer_conn, struct msg *rsp) { rstatus_t status; struct msg *req; struct conn *c_conn; ASSERT(peer_conn->type == CONN_DNODE_PEER_SERVER); /* response from a peer implies that peer is ok and heartbeating */ dnode_peer_ok(ctx, peer_conn); /* dequeue peer message (request) from peer conn */ while (true) { req = TAILQ_FIRST(&peer_conn->omsg_q); log_debug(LOG_VERB, "dnode_rsp_forward entering req %p rsp %p...", req, rsp); c_conn = req->owner; if (!peer_conn->same_dc && req->remote_region_send_time) { struct stats *st = ctx->stats; uint64_t delay = dn_usec_now() - req->remote_region_send_time; histo_add(&st->cross_region_histo, delay); } if (req->id == rsp->dmsg->id) { dnode_rsp_forward_match(ctx, peer_conn, rsp); return; } // Report a mismatch and try to rectify log_error("MISMATCH: dnode %s %d rsp_dmsg_id %u req %u:%u dnode rsp %u:%u", conn_get_type_string(peer_conn), peer_conn->sd, rsp->dmsg->id, req->id, req->parent_id, rsp->id, rsp->parent_id); if (c_conn && conn_to_ctx(c_conn)) stats_pool_incr(conn_to_ctx(c_conn), c_conn->owner, peer_mismatch_requests); // TODO : should you be worried about message id getting wrapped around to 0? if (rsp->dmsg->id < req->id) { // We received a response from the past. This indeed proves out of order // responses. A blunder to the architecture. Log it and drop the response. log_error("MISMATCH: received response from the past. 
Dropping it"); rsp_put(rsp); return; } if (req->consistency == DC_ONE) { if (req->swallow) { // swallow the request and move on the next one dnode_rsp_swallow(ctx, peer_conn, req, NULL); continue; } log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); } if ((req->consistency == DC_QUORUM) && !peer_conn->same_dc) { if (req->swallow) { // swallow the request and move on the next one dnode_rsp_swallow(ctx, peer_conn, req, NULL); continue; } log_warn("req %d:%d with DC_QUORUM consistency is not being swallowed"); } log_error("MISMATCHED DNODE RSP RECEIVED %s %d dmsg->id %u req %u:%u rsp %u:%u, skipping....", conn_get_type_string(peer_conn), peer_conn->sd, rsp->dmsg->id, req->id, req->parent_id, rsp->id, rsp->parent_id); ASSERT(req != NULL && req->peer == NULL); ASSERT(req->request && !req->done); if (log_loggable(LOG_VVERB)) { loga("skipping req: "); msg_dump(req); } conn_dequeue_outq(ctx, peer_conn, req); req->done = 1; // Create an appropriate response for the request so its propagated up; struct msg *err_rsp = msg_get(peer_conn, false, peer_conn->data_store, __FUNCTION__); err_rsp->error = req->error = 1; err_rsp->err = req->err = BAD_FORMAT; err_rsp->dyn_error = req->dyn_error = BAD_FORMAT; err_rsp->dmsg = dmsg_get(); err_rsp->dmsg->id = req->id; log_debug(LOG_VERB, "%p <-> %p", req, err_rsp); /* establish err_rsp <-> req (response <-> request) link */ req->peer = err_rsp; err_rsp->peer = req; log_error("Peer connection s %d skipping request %u:%u, dummy err_rsp %u:%u", peer_conn->sd, req->id, req->parent_id, err_rsp->id, err_rsp->parent_id); rstatus_t status = conn_handle_response(c_conn, req->parent_id ? req->parent_id : req->id, err_rsp); IGNORE_RET_VAL(status); if (req->swallow) { log_debug(LOG_INFO, "swallow request %d:%d", req->id, req->parent_id); req_put(req); } } }