/**
 * Fire-and-forget optimization for ascii SETs.
 *
 * When the optimize_set behavior string is non-empty and the key matches
 * the proxy's optimize_set_matcher, "STORED" is queued to the upstream
 * client right away and the downstream drops its reference to the
 * upstream conn (and clears its suffix/retry bookkeeping).
 *
 * @param d        downstream that is handling the SET
 * @param uc       the upstream conn; must be the only one (uc->next == NULL)
 * @param key      key of the SET
 * @param key_len  length of key in bytes
 * @return true when the early-reply path was taken -- including the case
 *         where registering the write event failed and uc was closed;
 *         false when the optimization is off or the key did not match.
 */
bool cproxy_optimize_set_ascii(downstream *d, conn *uc,
                               char *key, int key_len) {
    assert(d);
    assert(d->ptd);
    assert(d->ptd->proxy);
    assert(uc);
    assert(uc->next == NULL);

    /* Feature is switched off when the behavior string is empty. */
    if (d->ptd->behavior_pool.base.optimize_set[0] == '\0') {
        return false;
    }

    if (!matcher_check(&d->ptd->proxy->optimize_set_matcher,
                       key, key_len, false)) {
        return false;
    }

    /* Forget the upstream: the reply is sent from here. */
    d->upstream_conn       = NULL;
    d->upstream_suffix     = NULL;
    d->upstream_suffix_len = 0;
    d->upstream_retry      = 0;

    out_string(uc, "STORED");

    if (update_event(uc, EV_WRITE | EV_PERSIST)) {
        return true;
    }

    /* Could not register the write event: count it and drop the client. */
    if (settings.verbose > 1) {
        moxi_log_write("ERROR: Can't update upstream write event\n");
    }

    d->ptd->stats.stats.err_oom++;
    cproxy_close_conn(uc);

    return true;
}
/* Queue the binary request held in item `it` for writing on the
 * downstream conn `c`.
 *
 * uc       upstream conn; only used here for assertions and logging
 * d        downstream; its err_oom stat is bumped on failure
 * it       item whose data starts with a binary request header
 * c        downstream conn to write to
 * self     unused
 * vbucket  when >= 0, stored (network byte order) in the request
 *          header's reserved field before forwarding
 *
 * Returns true once the item is queued and the write event registered.
 * On any failure, c is closed and false is returned.
 */
bool b2b_forward_item_vbucket(conn *uc, downstream *d, item *it,
                              conn *c, bool self, int vbucket) {
    (void)self;

    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);
    assert(uc->noreply == false);
    assert(c != NULL);

    // Assuming we're already connected to downstream.
    //
    // TODO: Optimize to self codepath.
    //
    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b_forward_item_vbucket %x to %d, vbucket %d\n",
                       uc->sfd, uc->cmd, c->sfd, vbucket);
    }

    protocol_binary_request_header *hdr =
        (protocol_binary_request_header *) ITEM_data(it);

    if (vbucket >= 0) {
        hdr->request.reserved = htons(vbucket);
    }

    if (add_conn_item(c, it) != true) {
        goto fail;
    }

    // The caller keeps its refcount, and we need our own.
    //
    it->refcount++;

    if (add_iov(c, ITEM_data(it), it->nbytes) != 0) {
        goto fail;
    }

    conn_set_state(c, conn_mwrite);
    c->write_and_go = conn_new_cmd;

    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
        goto fail;
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b_forward %x to %d success\n",
                       uc->sfd, uc->cmd, c->sfd);
    }

    return true;

fail:
    d->ptd->stats.stats.err_oom++;
    cproxy_close_conn(c);

    return false;
}
/* Forward an upstream command that came with item data,
 * like set/add/replace/etc.
 *
 * The ascii request line is rebuilt from the item as
 *   <verb><key><flags><exptime><length>[<cas>][ noreply]\r\n<data>\r\n
 * and queued on the downstream conn chosen for the item's key.
 *
 * d    downstream assigned to the upstream conn
 * cmd  NREAD_* command code; NREAD_CAS additionally emits the cas value
 * it   item carrying key, suffix (flags + length) and data
 * uc   the single upstream conn (uc->next must be NULL)
 *
 * Returns true when the request was handed off (either answered via
 * the "self" shortcut or queued for writing downstream), false
 * otherwise.
 */
bool cproxy_forward_a2a_item_downstream(downstream *d, short cmd,
                                        item *it, conn *uc) {
    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);
    assert(d->downstream_conns != NULL);
    assert(it != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);

    // Assuming we're already connected to downstream.
    //
    bool self = false;

    conn *c = cproxy_find_downstream_conn(d, ITEM_key(it), it->nkey,
                                          &self);
    if (c != NULL) {
        if (self) {
            cproxy_optimize_to_self(d, uc, uc->cmd_start);
            complete_nread_ascii(uc);
            return true;
        }

        if (cproxy_prep_conn_for_write(c)) {
            assert(c->state == conn_pause);

            char *verb = nread_text(cmd);
            assert(verb != NULL);

            // The item suffix holds " <flags> <length>\r\n".  Only do
            // pointer arithmetic once both pointers are known to be
            // non-NULL; otherwise the lengths stay 0 and the request
            // is rejected by the validity check below.
            //
            char *str_flags  = ITEM_suffix(it);
            char *str_length = NULL;
            int   len_flags  = 0;
            int   len_length = 0;

            if (str_flags != NULL) {
                str_length = strchr(str_flags + 1, ' ');
                if (str_length != NULL) {
                    len_flags  = (int) (str_length - str_flags);
                    len_length = it->nsuffix - len_flags - 2;
                }
            }

            char *str_exptime = add_conn_suffix(c);
            char *str_cas     = (cmd == NREAD_CAS ? add_conn_suffix(c) : NULL);

            if (str_flags != NULL &&
                str_length != NULL &&
                len_flags > 1 &&
                len_length > 1 &&
                str_exptime != NULL &&
                (cmd != NREAD_CAS ||
                 str_cas != NULL)) {
                sprintf(str_exptime, " %u", it->exptime);

                if (str_cas != NULL) {
                    sprintf(str_cas, " %llu",
                            (unsigned long long) ITEM_get_cas(it));
                }

                if (add_iov(c, verb, strlen(verb)) == 0 &&
                    add_iov(c, ITEM_key(it), it->nkey) == 0 &&
                    add_iov(c, str_flags, len_flags) == 0 &&
                    add_iov(c, str_exptime, strlen(str_exptime)) == 0 &&
                    add_iov(c, str_length, len_length) == 0 &&
                    (str_cas == NULL ||
                     add_iov(c, str_cas, strlen(str_cas)) == 0) &&
                    (uc->noreply == false ||
                     add_iov(c, " noreply", 8) == 0) &&
                    add_iov(c, ITEM_data(it) - 2, it->nbytes + 2) == 0) {
                    conn_set_state(c, conn_mwrite);
                    c->write_and_go = conn_new_cmd;

                    if (update_event(c, EV_WRITE | EV_PERSIST)) {
                        d->downstream_used_start = 1;
                        d->downstream_used       = 1;

                        if (cproxy_dettach_if_noreply(d, uc) == false) {
                            cproxy_start_downstream_timeout(d, c);

                            // During a synchronous (with-reply) SET,
                            // handle fire-&-forget SET optimization.
                            //
                            if (cmd == NREAD_SET &&
                                cproxy_optimize_set_ascii(d, uc,
                                                          ITEM_key(it),
                                                          it->nkey)) {
                                d->ptd->stats.stats.tot_optimize_sets++;
                            }
                        } else {
                            c->write_and_go = conn_pause;

                            mcache_delete(&d->ptd->proxy->front_cache,
                                          ITEM_key(it), it->nkey);
                        }

                        return true;
                    }
                }

                d->ptd->stats.stats.err_oom++;
                cproxy_close_conn(c);
            } else {
                // TODO: Handle this weird error case.
            }
        } else {
            d->ptd->stats.stats.err_downstream_write_prep++;
            cproxy_close_conn(c);
        }

        // Reached on every failure path above once a downstream conn
        // was found.
        //
        if (settings.verbose > 1) {
            fprintf(stderr, "Proxy item write out of memory\n");
        }
    }

    return false;
}
/* Used for broadcast commands, like flush_all or stats.
 *
 * Writes `command` to every downstream conn that can be prepared for
 * writing.  Conns that cannot be prepared, or whose write event cannot
 * be registered, are closed and counted in the stats.
 *
 * d        downstream whose conns receive the command
 * command  ascii command line to broadcast
 * uc       the single upstream conn (no item attached)
 * suffix   text recorded as d->upstream_suffix for with-reply requests
 *
 * Returns true when the command was queued on at least one downstream
 * conn.  When nothing was queued, no downstream bookkeeping (used
 * counters, suffix, timeout, noreply detach) is touched and false is
 * returned.
 */
bool cproxy_broadcast_a2a_downstream(downstream *d,
                                     char *command,
                                     conn *uc,
                                     char *suffix) {
    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);
    assert(d->downstream_conns != NULL);
    assert(command != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);
    assert(uc->item == NULL);

    int nwrite = 0;
    int nconns = memcached_server_count(&d->mst);

    for (int i = 0; i < nconns; i++) {
        conn *c = d->downstream_conns[i];
        if (c != NULL) {
            if (cproxy_prep_conn_for_write(c)) {
                assert(c->state == conn_pause);

                out_string(c, command);

                if (update_event(c, EV_WRITE | EV_PERSIST)) {
                    nwrite++;

                    if (uc->noreply) {
                        c->write_and_go = conn_pause;
                    }
                } else {
                    if (settings.verbose > 1) {
                        fprintf(stderr,
                                "Update cproxy write event failed\n");
                    }

                    d->ptd->stats.stats.err_oom++;
                    cproxy_close_conn(c);
                }
            } else {
                d->ptd->stats.stats.err_downstream_write_prep++;
                cproxy_close_conn(c);
            }
        }
    }

    if (settings.verbose > 1) {
        fprintf(stderr, "a2a broadcast nwrite %d out of %d\n",
                nwrite, nconns);
    }

    // Nothing was written, so there is no reply to wait for: do not
    // arm the downstream timeout or record a suffix.
    //
    if (nwrite <= 0) {
        return false;
    }

    d->downstream_used_start = nwrite;
    d->downstream_used       = nwrite;

    if (cproxy_dettach_if_noreply(d, uc) == false) {
        d->upstream_suffix = suffix;

        cproxy_start_downstream_timeout(d, NULL);
    } else {
        // TODO: Handle flush_all's expiration parameter against
        // the front_cache.
        //
        if (strncmp(command, "flush_all", 9) == 0) {
            mcache_flush_all(&d->ptd->proxy->front_cache, 0);
        }
    }

    return true;
}
/* Handle one ascii response line read from a downstream conn.
 *
 * c     downstream conn; c->extra is its downstream
 * line  the response line (must equal c->rcurr)
 *
 * Dispatch by line prefix:
 *  - "VALUE "  : allocate an item and switch to conn_nread for the data;
 *                if the item cannot be set up, swallow the data bytes;
 *                if the header cannot be parsed, close the conn.
 *  - "END"     : pause the downstream conn.
 *  - "OK"      : pause; flush the front cache if the upstream command
 *                was a flush.
 *  - "STAT "/"ITEM "/"PREFIX " : merge into d->merger, or forward the
 *                line unchanged to the upstream if merging fails.
 *  - otherwise : pause and relay the line to the upstream conn.
 */
void cproxy_process_a2a_downstream(conn *c, char *line) {
    assert(c != NULL);
    assert(c->next == NULL);
    assert(c->extra != NULL);
    assert(c->cmd == -1);
    assert(c->item == NULL);
    assert(line != NULL);
    assert(line == c->rcurr);
    assert(IS_ASCII(c->protocol));
    assert(IS_PROXY(c->protocol));

    if (settings.verbose > 1) {
        fprintf(stderr, "<%d cproxy_process_a2a_downstream %s\n",
                c->sfd, line);
    }

    downstream *d = c->extra;

    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);

    if (strncmp(line, "VALUE ", 6) == 0) {
        token_t      tokens[MAX_TOKENS];
        size_t       ntokens;
        unsigned int flags;
        int          clen = 0;
        int          vlen;
        uint64_t     cas = CPROXY_NOT_CAS;

        ntokens = scan_tokens(line, tokens, MAX_TOKENS, &clen);
        if (ntokens >= 5 && // Accounts for extra termimation token.
            ntokens <= 6 &&
            tokens[KEY_TOKEN].length <= KEY_MAX_LENGTH &&
            safe_strtoul(tokens[2].value, (uint32_t *) &flags) &&
            safe_strtoul(tokens[3].value, (uint32_t *) &vlen) &&
            vlen >= 0) { // Parsed as unsigned; reject values that wrap negative.
            char  *key  = tokens[KEY_TOKEN].value;
            size_t nkey = tokens[KEY_TOKEN].length;

            item *it = item_alloc(key, nkey, flags, 0, vlen + 2);
            if (it != NULL) {
                if (ntokens == 5 ||
                    safe_strtoull(tokens[4].value, &cas)) {
                    ITEM_set_cas(it, cas);

                    c->item    = it;
                    c->ritem   = ITEM_data(it);
                    c->rlbytes = it->nbytes;
                    c->cmd     = -1;

                    conn_set_state(c, conn_nread);

                    return; // Success.
                } else {
                    if (settings.verbose > 1) {
                        fprintf(stderr, "cproxy could not parse cas\n");
                    }
                }
            } else {
                if (settings.verbose > 1) {
                    fprintf(stderr,
                            "cproxy could not item_alloc size %u\n",
                            vlen + 2);
                }
            }

            if (it != NULL) {
                item_remove(it);
            }

            it = NULL;

            c->sbytes = vlen + 2; // Number of bytes to swallow.

            conn_set_state(c, conn_swallow);

            // Note, eventually, we'll see an END later.
        } else {
            // We don't know how much to swallow, so close the downstream.
            // The conn_closing should release the downstream,
            // which should write a suffix/error to the upstream.
            //
            conn_set_state(c, conn_closing);
        }
    } else if (strncmp(line, "END", 3) == 0) {
        conn_set_state(c, conn_pause);
    } else if (strncmp(line, "OK", 2) == 0) {
        conn_set_state(c, conn_pause);

        // TODO: Handle flush_all's expiration parameter against
        // the front_cache.
        //
        // TODO: We flush the front_cache too often, inefficiently
        // on every downstream flush_all OK response, rather than
        // on just the last flush_all OK response.
        //
        conn *uc = d->upstream_conn;
        if (uc != NULL &&
            uc->cmd_curr == PROTOCOL_BINARY_CMD_FLUSH) {
            mcache_flush_all(&d->ptd->proxy->front_cache, 0);
        }
    } else if (strncmp(line, "STAT ", 5) == 0 ||
               strncmp(line, "ITEM ", 5) == 0 ||
               strncmp(line, "PREFIX ", 7) == 0) {
        assert(d->merger != NULL);

        conn *uc = d->upstream_conn;
        if (uc != NULL) {
            assert(uc->next == NULL);

            if (protocol_stats_merge_line(d->merger, line) == false) {
                // Forward the line as-is if we couldn't merge it.
                //
                int nline = strlen(line);

                item *it = item_alloc("s", 1, 0, 0, nline + 2);
                if (it != NULL) {
                    // Raw byte copies: the item data is not treated
                    // as a NUL-terminated string.
                    //
                    memcpy(ITEM_data(it), line, nline);
                    memcpy(ITEM_data(it) + nline, "\r\n", 2);

                    if (add_conn_item(uc, it)) {
                        add_iov(uc, ITEM_data(it), nline + 2);

                        it = NULL;
                    }

                    if (it != NULL) {
                        item_remove(it);
                    }
                }
            }
        }

        conn_set_state(c, conn_new_cmd);
    } else {
        conn_set_state(c, conn_pause);

        // The upstream conn might be NULL when closed already
        // or while handling a noreply.
        //
        conn *uc = d->upstream_conn;
        if (uc != NULL) {
            assert(uc->next == NULL);

            out_string(uc, line);

            bool write_ok = update_event(uc, EV_WRITE | EV_PERSIST);

            // Uses uc->cmd_start, so this must run before uc can be
            // closed below.
            //
            cproxy_del_front_cache_key_ascii_response(d, line,
                                                      uc->cmd_start);

            if (!write_ok) {
                if (settings.verbose > 1) {
                    fprintf(stderr,
                            "Can't update upstream write event\n");
                }

                d->ptd->stats.stats.err_oom++;
                cproxy_close_conn(uc);
            }
        }
    }
}
/* Forward a simple one-liner command downstream.
 * For example, get, incr/decr, delete, etc.
 * The response, though, might be a simple line or
 * multiple VALUE+END lines.
 *
 * d        downstream assigned to the upstream conn
 * command  the ascii command line
 * uc       upstream conn (no item attached, cmd_curr already set)
 *
 * get/gets go through multiget_ascii_downstream(); flush and stats are
 * broadcast to all downstream conns; anything else is written to the
 * single downstream conn selected by the command's key.
 *
 * Returns true when the command was handed off, false otherwise.
 */
bool cproxy_forward_a2a_simple_downstream(downstream *d,
                                          char *command, conn *uc) {
    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);
    assert(d->downstream_conns != NULL);
    assert(command != NULL);
    assert(uc != NULL);
    assert(uc->item == NULL);
    assert(uc->cmd_curr != -1);
    assert(d->multiget == NULL);
    assert(d->merger == NULL);

    // Handles get and gets.
    //
    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_GET) {
        // Only use front_cache for 'get', not for 'gets'.
        //
        mcache *front_cache =
            (command[3] == ' ') ? &d->ptd->proxy->front_cache : NULL;

        return multiget_ascii_downstream(d, uc,
                                         a2a_multiget_start,
                                         a2a_multiget_skey,
                                         a2a_multiget_end,
                                         front_cache);
    }

    assert(uc->next == NULL);

    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_FLUSH) {
        return cproxy_broadcast_a2a_downstream(d, command, uc,
                                               "OK\r\n");
    }

    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_STAT) {
        if (strncmp(command + 5, " reset", 6) == 0) {
            return cproxy_broadcast_a2a_downstream(d, command, uc,
                                                   "RESET\r\n");
        }

        if (cproxy_broadcast_a2a_downstream(d, command, uc,
                                            "END\r\n")) {
            d->merger = genhash_init(512, skeyhash_ops);
            return true;
        } else {
            return false;
        }
    }

    // TODO: Inefficient repeated scan_tokens.
    //
    int     cmd_len = 0;
    token_t tokens[MAX_TOKENS];
    size_t  ntokens = scan_tokens(command, tokens, MAX_TOKENS, &cmd_len);

    if (ntokens <= 1) { // This was checked long ago, while parsing
        assert(false);  // the upstream conn.
        return false;
    }

    // Only look at the key token once we know it was produced.
    //
    char *key     = tokens[KEY_TOKEN].value;
    int   key_len = tokens[KEY_TOKEN].length;

    // Assuming we're already connected to downstream.
    //
    bool self = false;

    conn *c = cproxy_find_downstream_conn(d, key, key_len, &self);
    if (c != NULL) {
        if (self) {
            // TODO: This optimization could be done much earlier,
            // even before the upstream conn was assigned
            // to a downstream.
            //
            cproxy_optimize_to_self(d, uc, command);
            process_command(uc, command);
            return true;
        }

        if (cproxy_prep_conn_for_write(c)) {
            assert(c->state == conn_pause);

            out_string(c, command);

            if (settings.verbose > 1) {
                fprintf(stderr, "forwarding to %d, noreply %d\n",
                        c->sfd, uc->noreply);
            }

            if (update_event(c, EV_WRITE | EV_PERSIST)) {
                d->downstream_used_start = 1;
                d->downstream_used       = 1;

                if (cproxy_dettach_if_noreply(d, uc) == false) {
                    cproxy_start_downstream_timeout(d, c);
                } else {
                    c->write_and_go = conn_pause;

                    // Do mcache_delete() here only during a noreply,
                    // otherwise for with-reply requests, we could
                    // be in a race with other clients repopulating
                    // the front_cache.  For with-reply requests, we
                    // clear the front_cache when we get a success reply.
                    //
                    mcache_delete(&d->ptd->proxy->front_cache,
                                  key, key_len);
                }

                return true;
            }

            if (settings.verbose > 1) {
                fprintf(stderr, "Couldn't update cproxy write event\n");
            }

            d->ptd->stats.stats.err_oom++;
            cproxy_close_conn(c);
        } else {
            d->ptd->stats.stats.err_downstream_write_prep++;
            cproxy_close_conn(c);
        }
    }

    return false;
}
/* Entry point for a binary request header received from an upstream
 * client.
 *
 * Resets the per-command fields on c, then either dispatches the
 * command directly (VERSION, QUIT) or allocates an item big enough for
 * header + body, copies the header into it and:
 *   - with a body: switches to conn_nread to read the rest;
 *   - without a body: answers SASL_LIST_MECHS with "PLAIN", otherwise
 *     pauses the upstream until a downstream is available.
 *
 * If the conn cannot be prepared for writing it is moved to
 * conn_closing; if the item cannot be allocated the conn is closed.
 */
void cproxy_process_upstream_binary(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->item == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));

    proxy_td *ptd = c->extra;
    cb_assert(ptd != NULL);

    if (!cproxy_prep_conn_for_write(c)) {
        ptd->stats.stats.err_upstream_write_prep++;
        conn_set_state(c, conn_closing);
        return;
    }

    /* Start of a new command: reset the per-command bookkeeping. */
    c->cmd_curr       = -1;
    c->cmd_start      = NULL;
    c->cmd_start_time = msec_current_time;
    c->cmd_retries    = 0;

    int      extlen  = c->binary_header.request.extlen;
    int      keylen  = c->binary_header.request.keylen;
    uint32_t bodylen = c->binary_header.request.bodylen;

    cb_assert(bodylen >= (uint32_t) keylen + extlen);

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary %x %d %d %u\n",
                       c->sfd, c->cmd, extlen, keylen, bodylen);
    }

    process_bin_noreply(c); /* Map quiet c->cmd values into non-quiet. */

    if (c->cmd == PROTOCOL_BINARY_CMD_VERSION ||
        c->cmd == PROTOCOL_BINARY_CMD_QUIT) {
        dispatch_bin_command(c);
        return;
    }

    /* The item holds the whole request message (header + body); */
    /* the body, if any, is filled in by a follow-up nread.        */
    char *item_key     = "u";
    int   item_key_len = 1;

    c->item = item_alloc(item_key, item_key_len, 0, 0,
                         sizeof(c->binary_header) + bodylen);
    if (c->item == NULL) {
        if (settings.verbose > 2) {
            moxi_log_write("<%d cproxy_process_upstream_binary OOM\n",
                           c->sfd);
        }
        ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    item *req_item = c->item;
    void *raw_hdr  = c->rcurr;

    cb_assert(req_item->refcount == 1);

    memcpy(ITEM_data(req_item), raw_hdr, sizeof(c->binary_header));

    if (bodylen > 0) {
        c->ritem    = ITEM_data(req_item) + sizeof(c->binary_header);
        c->rlbytes  = bodylen;
        c->substate = bin_read_set_value;

        conn_set_state(c, conn_nread);
        return;
    }

    /* No body bytes: go straight to the nread-completed handling. */
    if (c->binary_header.request.opcode ==
        PROTOCOL_BINARY_CMD_SASL_LIST_MECHS) {
        /* TODO: One day handle more than just PLAIN sasl auth. */
        write_bin_response(c, "PLAIN", 0, 0, strlen("PLAIN"));
        return;
    }

    cproxy_pause_upstream_for_downstream(ptd, c);
}
/* We get here after reading the header+body into an item.
 *
 * Handles, in order:
 *   - SASL_AUTH: passed to cproxy_sasl_plain_auth();
 *   - SASL_STEP: answered with an auth error;
 *   - STAT with the 13-byte key "proxy buckets": answered locally;
 *   - quiet (noreply) commands: FLUSHQ closes the conn, anything else
 *     is corked until a later non-quiet command;
 *   - everything else: the upstream is paused until a downstream is
 *     available.
 */
void cproxy_process_upstream_binary_nread(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->cmd_start == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));

    protocol_binary_request_header *header =
        (protocol_binary_request_header *) &c->binary_header;

    int      extlen  = header->request.extlen;
    int      keylen  = header->request.keylen;
    uint32_t bodylen = header->request.bodylen;
    int      opcode  = header->request.opcode;

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary_nread %x %d %d %u\n",
                       c->sfd, c->cmd, extlen, keylen, bodylen);
    }

    /* pthread_mutex_lock(&c->thread->stats.mutex); */
    /* c->thread->stats.slab_stats[it->slabs_clsid].set_cmds++; */
    /* pthread_mutex_unlock(&c->thread->stats.mutex); */

    proxy_td *ptd = c->extra;
    cb_assert(ptd != NULL);

    if (opcode == PROTOCOL_BINARY_CMD_SASL_AUTH) {
        item *auth_item = c->item;
        cb_assert(auth_item);

        cproxy_sasl_plain_auth(c, (char *) ITEM_data(auth_item));
        return;
    }

    if (opcode == PROTOCOL_BINARY_CMD_SASL_STEP) {
        write_bin_error(c, PROTOCOL_BINARY_RESPONSE_AUTH_ERROR, 0);
        return;
    }

    if (opcode == PROTOCOL_BINARY_CMD_STAT) {
        char  *subcommand = binary_get_key(c);
        size_t nkey       = c->binary_header.request.keylen;

        if (nkey == 13 && memcmp(subcommand, "proxy buckets", 13) == 0) {
            process_bin_proxy_stats(c);
            return;
        }
    }

    if (!c->noreply) {
        cb_assert(c->item == NULL || ((item *) c->item)->refcount == 1);

        cproxy_pause_upstream_for_downstream(ptd, c);
        return;
    }

    /* Quiet command from here on. */
    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary_nread "
                       "corking quiet command %x %d\n",
                       c->sfd, c->cmd, (c->corked != NULL));
    }

    /* TODO: We currently don't support binary FLUSHQ. */
    /* Rather than having the downstream connections get */
    /* into a wonky state, prevent it. */
    if (opcode == PROTOCOL_BINARY_CMD_FLUSHQ) {
        /* Note: don't use cproxy_close_conn(c), as it goes */
        /* through the drive_machine() loop again. */
        /* cproxy_close_conn(c); */
        conn_set_state(c, conn_closing);
        return;
    }

    /* Hold onto or 'cork' all the binary quiet commands */
    /* until there's a later non-quiet command. */
    if (!cproxy_binary_cork_cmd(c)) {
        ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    conn_set_state(c, conn_new_cmd);
}
/* Used for broadcast commands, like flush_all or stats.
 *
 * Queues `command` on every usable downstream conn.  Conns that cannot
 * be prepared for writing, or whose write event cannot be registered,
 * are closed and counted in the stats.
 *
 * d        downstream (must not have any conns marked used yet)
 * command  ascii command line to broadcast
 * uc       the single upstream conn (no item attached)
 * suffix   recorded as d->upstream_suffix for with-reply requests
 *
 * Returns true when at least one downstream conn accepted the command;
 * otherwise false, with no downstream bookkeeping updated.
 */
bool cproxy_broadcast_a2a_downstream(downstream *d,
                                     char *command,
                                     conn *uc,
                                     char *suffix) {
    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);
    assert(d->downstream_conns != NULL);
    assert(d->downstream_used_start == 0);
    assert(d->downstream_used == 0);
    assert(command != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);
    assert(uc->item == NULL);

    int queued = 0;
    int total  = mcs_server_count(&d->mst);

    for (int idx = 0; idx < total; idx++) {
        conn *dc = d->downstream_conns[idx];

        if (dc == NULL || dc == NULL_CONN) {
            continue;
        }

        if (!cproxy_prep_conn_for_write(dc)) {
            d->ptd->stats.stats.err_downstream_write_prep++;
            cproxy_close_conn(dc);
            continue;
        }

        assert(dc->state == conn_pause);

        out_string(dc, command);

        if (!update_event(dc, EV_WRITE | EV_PERSIST)) {
            if (settings.verbose > 1) {
                moxi_log_write("Update cproxy write event failed\n");
            }

            d->ptd->stats.stats.err_oom++;
            cproxy_close_conn(dc);
            continue;
        }

        queued++;

        if (uc->noreply) {
            dc->write_and_go = conn_pause;
        }
    }

    if (settings.verbose > 1) {
        moxi_log_write("%d: a2a broadcast nwrite %d out of %d\n",
                       uc->sfd, queued, total);
    }

    if (queued <= 0) {
        return false;
    }

    d->downstream_used_start = queued;
    d->downstream_used       = queued;

    if (cproxy_dettach_if_noreply(d, uc) == false) {
        /* With-reply: remember what to send upstream and start timing. */
        d->upstream_suffix     = suffix;
        d->upstream_suffix_len = 0;
        d->upstream_status     = PROTOCOL_BINARY_RESPONSE_SUCCESS;
        d->upstream_retry      = 0;
        d->target_host_ident   = NULL;

        cproxy_start_downstream_timeout(d, NULL);
    } else if (strncmp(command, "flush_all", 9) == 0) {
        // TODO: Handle flush_all's expiration parameter against
        // the front_cache.
        //
        mcache_flush_all(&d->ptd->proxy->front_cache, 0);
    }

    return true;
}
/* We reach here after nread'ing a header+body into an item.
 *
 * The item is taken off the downstream conn `c` and then either
 * dropped or queued for writing on the upstream conn:
 *   - replies accepted by cproxy_binary_ignore_reply() end processing;
 *   - NOOP and FLUSH replies are dropped;
 *   - STAT replies are merged into d->merger (when present) and never
 *     forwarded from here;
 *   - a NOT_MY_VBUCKET status marks the vbucket invalid for this
 *     server and, while retries remain, schedules a retry instead of
 *     forwarding;
 *   - anything else is written to the upstream conn, if there is one.
 */
void cproxy_process_b2b_downstream_nread(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->cmd_start == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));

    protocol_binary_response_header *header =
        (protocol_binary_response_header *) &c->binary_header;

    int      extlen  = header->response.extlen;
    int      keylen  = header->response.keylen;
    uint32_t bodylen = header->response.bodylen;
    int      status  = ntohs(header->response.status);
    int      opcode  = header->response.opcode;

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_b2b_downstream_nread %x %x %d %d %u %d %x\n",
                       c->sfd, c->cmd, opcode, extlen, keylen, bodylen,
                       c->noreply, status);
    }

    downstream *d = c->extra;
    cb_assert(d != NULL);
    cb_assert(d->ptd != NULL);
    cb_assert(d->ptd->proxy != NULL);

    /* TODO: Need to handle quiet binary command error response, */
    /* in the right order. */
    /* TODO: Need to handle not-my-vbucket error during a quiet cmd. */

    conn *uc = d->upstream_conn;
    item *it = c->item;

    /* Clear c->item because we either move it to the upstream or */
    /* item_remove() it on error. */
    c->item = NULL;

    cb_assert(it != NULL);
    cb_assert(it->refcount == 1);

    if (cproxy_binary_ignore_reply(c, header, it)) {
        return;
    }

    if (c->noreply) {
        conn_set_state(c, conn_new_cmd);
    } else {
        conn_set_state(c, conn_pause);

        switch (opcode) {
        case PROTOCOL_BINARY_CMD_NOOP:
        case PROTOCOL_BINARY_CMD_FLUSH:
            goto done;

        case PROTOCOL_BINARY_CMD_STAT:
            if (status == PROTOCOL_BINARY_RESPONSE_SUCCESS && keylen > 0) {
                if (d->merger != NULL) {
                    char *stat_key =
                        (ITEM_data(it)) + sizeof(*header) + extlen;
                    char *stat_val = stat_key + keylen;

                    protocol_stats_merge_name_val(d->merger, "STAT", 4,
                                                  stat_key, keylen,
                                                  stat_val,
                                                  bodylen - keylen - extlen);
                }

                /* Get next STATS response. */
                conn_set_state(c, conn_new_cmd);
            }
            goto done;

        default:
            break;
        }

        /* If the client is still there, we should handle */
        /* a not-my-vbucket error with a possible retry. */
        if (uc != NULL &&
            status == PROTOCOL_BINARY_RESPONSE_NOT_MY_VBUCKET) {
            if (settings.verbose > 2) {
                moxi_log_write("<%d cproxy_process_b2b_downstream_nread not-my-vbucket, "
                               "cmd: %x %d\n",
                               c->sfd, header->response.opcode,
                               uc->item != NULL);
            }

            cb_assert(uc->item != NULL);

            protocol_binary_request_header *req =
                (protocol_binary_request_header *)ITEM_data((item*)uc->item);

            int vbucket = ntohs(req->request.reserved);
            int sindex  = downstream_conn_index(d, c);

            if (settings.verbose > 2) {
                moxi_log_write("<%d cproxy_process_b2b_downstream_nread not-my-vbucket, "
                               "cmd: %x not multi-key get, sindex %d, vbucket %d, retries %d\n",
                               c->sfd, header->response.opcode,
                               sindex, vbucket, uc->cmd_retries);
            }

            mcs_server_invalid_vbucket(&d->mst, sindex, vbucket);

            /* As long as the upstream is still open and we haven't */
            /* retried too many times already. */
            int max_retries = cproxy_max_retries(d);

            if (uc->cmd_retries < max_retries) {
                uc->cmd_retries++;
                d->upstream_retry++;
                d->ptd->stats.stats.tot_retry_vbucket++;
                goto done;
            }

            if (settings.verbose > 2) {
                moxi_log_write("%d: cproxy_process_b2b_downstream_nread not-my-vbucket, "
                               "cmd: %x skipping retry %d >= %d\n",
                               c->sfd, header->response.opcode,
                               uc->cmd_retries, max_retries);
            }
        }
    }

    /* Write the response to the upstream connection. */
    if (uc == NULL) {
        goto done;
    }

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_b2b_downstream_nread got %u\n",
                       c->sfd, it->nbytes);

        cproxy_dump_header(c->sfd, ITEM_data(it));
    }

    if (add_conn_item(uc, it) == true) {
        it->refcount++;

        if (add_iov(uc, ITEM_data(it), it->nbytes) == 0) {
            /* If we got a quiet response, however, don't change the */
            /* upstream connection's state (should be in paused state), */
            /* as we expect the downstream server to provide a */
            /* verbal/non-quiet response that moves the downstream */
            /* conn through the conn_pause countdown codepath. */
            if (c->noreply == false) {
                cproxy_update_event_write(d, uc);

                conn_set_state(uc, conn_mwrite);
            }

            goto done;
        }
    }

    d->ptd->stats.stats.err_oom++;
    cproxy_close_conn(uc);

done:
    /* Drop this function's reference to the item. */
    if (it != NULL) {
        item_remove(it);
    }
}
/* Called when we receive a binary response header from
 * a downstream server, via try_read_command()/drive_machine().
 *
 * Resets the per-command fields on c, allocates an item large enough
 * for header + body and copies the header into it.  With a body the
 * conn switches to conn_nread to read the rest; without one the
 * response is processed immediately.  If the item cannot be allocated
 * the conn is closed.
 */
void cproxy_process_b2b_downstream(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->item == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));
    cb_assert(c->substate == bin_no_state);

    downstream *d = c->extra;
    cb_assert(d);

    c->cmd_curr       = -1;
    c->cmd_start      = NULL;
    c->cmd_start_time = msec_current_time;
    c->cmd_retries    = 0;

    int      extlen  = c->binary_header.request.extlen;
    int      keylen  = c->binary_header.request.keylen;
    uint32_t bodylen = c->binary_header.request.bodylen;

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_b2b_downstream %x %d %d %u\n",
                       c->sfd, c->cmd, extlen, keylen, bodylen);
    }

    cb_assert(bodylen >= (uint32_t) keylen + extlen);

    process_bin_noreply(c); /* Map quiet c->cmd values into non-quiet. */

    /* Read everything we can before getting into the actual */
    /* processing: the item holds the entire response message */
    /* (the header + body). */
    char *item_key     = "q";
    int   item_key_len = 1;

    c->item = item_alloc(item_key, item_key_len, 0, 0,
                         sizeof(c->binary_header) + bodylen);
    if (c->item == NULL) {
        d->ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    item *resp_item = c->item;
    void *raw_hdr   = c->rcurr;

    cb_assert(resp_item->refcount == 1);

    memcpy(ITEM_data(resp_item), raw_hdr, sizeof(c->binary_header));

    if (bodylen == 0) {
        /* Since we have no body bytes, we can go immediately to */
        /* the nread completed processing step. */
        cproxy_process_b2b_downstream_nread(c);
        return;
    }

    c->ritem    = ITEM_data(resp_item) + sizeof(c->binary_header);
    c->rlbytes  = bodylen;
    c->substate = bin_read_set_value;

    conn_set_state(c, conn_nread);
}
/* Do the actual work of forwarding the command from an
 * upstream binary conn to its assigned binary downstream.
 *
 * For a non-broadcast, uncorked request that carries a key, the target
 * server index is computed from the key; otherwise -1 is passed to
 * cproxy_connect_downstream().  Once connected, every downstream conn
 * is prepared for writing, corked quiet commands are flushed, and the
 * request is either broadcast (FLUSH, NOOP, STAT) or forwarded to a
 * single server.
 *
 * Returns the result of the broadcast/forward step, true when
 * cproxy_connect_downstream() returns -1, and false on a server-index
 * lookup failure, a write-prep failure or a connect failure.
 */
bool cproxy_forward_b2b_downstream(downstream *d) {
    int nc;
    int server_index;
    conn *uc;

    cb_assert(d != NULL);
    cb_assert(d->ptd != NULL);
    cb_assert(d->ptd->proxy != NULL);
    cb_assert(d->downstream_conns != NULL);
    cb_assert(d->downstream_used == 0);
    cb_assert(d->multiget == NULL);
    cb_assert(d->merger == NULL);

    d->downstream_used_start = 0;

    uc = d->upstream_conn;

    /* Check uc before the first dereference (the log line below). */
    cb_assert(uc != NULL);

    if (settings.verbose > 2) {
        moxi_log_write("%d: cproxy_forward_b2b_downstream %x\n",
                       uc->sfd, uc->cmd);
    }

    cb_assert(uc->state == conn_pause);
    cb_assert(uc->cmd >= 0);
    cb_assert(uc->cmd_start == NULL);
    cb_assert(uc->thread != NULL);
    cb_assert(uc->thread->base != NULL);
    cb_assert(uc->noreply == false);
    cb_assert(IS_BINARY(uc->protocol));
    cb_assert(IS_PROXY(uc->protocol));

    server_index = -1;

    if (cproxy_is_broadcast_cmd(uc->cmd) == false && uc->corked == NULL) {
        item *it = uc->item;
        protocol_binary_request_header *req;
        char *key;
        int key_len;

        cb_assert(it != NULL);

        /* The key sits after the header and the extras. */
        req = (protocol_binary_request_header *) ITEM_data(it);
        key = ((char *) req) + sizeof(*req) + req->request.extlen;
        key_len = ntohs(req->request.keylen);

        if (key_len > 0) {
            server_index = cproxy_server_index(d, key, key_len, NULL);
            if (server_index < 0) {
                return false;
            }
        }
    }

    nc = cproxy_connect_downstream(d, uc->thread, server_index);

    /* NOTE(review): -1 is reported as success here; presumably the */
    /* request is completed elsewhere in that case -- confirm against */
    /* cproxy_connect_downstream(). */
    if (nc == -1) {
        return true;
    }

    if (nc > 0) {
        int i;
        int nconns;

        cb_assert(d->downstream_conns != NULL);

        if (d->usec_start == 0 &&
            d->ptd->behavior_pool.base.time_stats) {
            d->usec_start = usec_now();
        }

        nconns = mcs_server_count(&d->mst);

        for (i = 0; i < nconns; i++) {
            conn *c = d->downstream_conns[i];
            if (c != NULL && c != NULL_CONN) {
                cb_assert(c->state == conn_pause);
                cb_assert(c->item == NULL);

                if (cproxy_prep_conn_for_write(c) == false) {
                    d->ptd->stats.stats.err_downstream_write_prep++;
                    cproxy_close_conn(c);
                    return false;
                }
            }
        }

        /* Uncork the saved-up quiet binary commands. */
        cproxy_binary_uncork_cmds(d, uc);

        if (uc->cmd == PROTOCOL_BINARY_CMD_FLUSH ||
            uc->cmd == PROTOCOL_BINARY_CMD_NOOP ||
            uc->cmd == PROTOCOL_BINARY_CMD_STAT) {
            return cproxy_broadcast_b2b_downstream(d, uc);
        }

        return cproxy_forward_b2b_simple_downstream(d, uc);
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: cproxy_forward_b2b_downstream connect failed\n",
                       uc->sfd);
    }

    return false;
}
/* Forward the keys of one or more upstream ascii get/gets commands to
 * the downstream conns that own them.
 *
 * d            downstream; d->multiget must be NULL on entry
 * uc           head of the list of upstream conns (linked via ->next)
 * emit_start   called once per downstream conn before its first key
 * emit_skey    called per key, with the key's preceding space included
 * emit_end     called per downstream conn that had something queued
 * front_cache  cache consulted for plain 'get' keys
 *
 * Per key: stats are updated; a front-cache hit (plain get only) or a
 * "self" hit is answered directly on the upstream conn; otherwise the
 * key is emitted to its downstream conn, unless the multiget hash table
 * (created only when several upstream conns are batched) shows it was
 * already requested.
 *
 * Returns true when at least one downstream conn was scheduled for
 * writing.  Returns false immediately if a downstream conn cannot be
 * prepared for writing.
 */
bool multiget_ascii_downstream(downstream *d, conn *uc,
    int (*emit_start)(conn *c, char *cmd, int cmd_len),
    int (*emit_skey)(conn *c, char *skey, int skey_len),
    int (*emit_end)(conn *c),
    mcache *front_cache) {
    assert(d != NULL);
    assert(d->downstream_conns != NULL);
    assert(d->multiget == NULL);
    assert(uc != NULL);
    assert(uc->noreply == false);

    proxy_td *ptd = d->ptd;
    assert(ptd != NULL);

    proxy_stats_cmd *psc_get =
        &ptd->stats.stats_cmd[STATS_CMD_TYPE_REGULAR][STATS_CMD_GET];
    proxy_stats_cmd *psc_get_key =
        &ptd->stats.stats_cmd[STATS_CMD_TYPE_REGULAR][STATS_CMD_GET_KEY];

    int nwrite = 0;
    int nconns = mcs_server_count(&d->mst);

    for (int i = 0; i < nconns; i++) {
        if (d->downstream_conns[i] != NULL &&
            cproxy_prep_conn_for_write(d->downstream_conns[i]) == false) {
            d->ptd->stats.stats.err_downstream_write_prep++;
            cproxy_close_conn(d->downstream_conns[i]);
            return false;
        }
    }

    if (uc->next != NULL) {
        // More than one upstream conn, so we need a hashtable
        // to track keys for de-deplication.
        //
        d->multiget = genhash_init(128, skeyhash_ops);
        if (settings.verbose > 1) {
            fprintf(stderr, "cproxy multiget hash table new\n");
        }
    }

    // Snapshot the volatile only once.
    //
    uint32_t msec_current_time_snapshot = msec_current_time;

    int   uc_num = 0;
    conn *uc_cur = uc;

    while (uc_cur != NULL) {
        assert(uc_cur->cmd == -1);
        assert(uc_cur->item == NULL);
        assert(uc_cur->state == conn_pause);
        assert(IS_ASCII(uc_cur->protocol));
        assert(IS_PROXY(uc_cur->protocol));

        char *command = uc_cur->cmd_start;
        assert(command != NULL);

        char *space = strchr(command, ' ');
        assert(space > command);

        int cmd_len = space - command;
        assert(cmd_len == 3 || cmd_len == 4); // Either get or gets.

        int cas_emit = (command[3] == 's');

        if (settings.verbose > 1) {
            fprintf(stderr, "forward multiget %s (%d %d)\n",
                    command, cmd_len, uc_num);
        }

        while (space != NULL) {
            char *key = space + 1;
            char *next_space = strchr(key, ' ');
            int   key_len;

            if (next_space != NULL) {
                key_len = next_space - key;
            } else {
                key_len = strlen(key);

                // We've reached the last key.
                //
                psc_get->read_bytes += (key - command + key_len);
            }

            // This key_len check helps skip consecutive spaces.
            //
            if (key_len > 0) {
                ptd->stats.stats.tot_multiget_keys++;

                psc_get_key->seen++;
                psc_get_key->read_bytes += key_len;

                // Update key-based statistics.
                //
                bool do_key_stats =
                    matcher_check(&ptd->key_stats_matcher,
                                  key, key_len, true) == true &&
                    matcher_check(&ptd->key_stats_unmatcher,
                                  key, key_len, false) == false;

                if (do_key_stats) {
                    touch_key_stats(ptd, key, key_len,
                                    msec_current_time_snapshot,
                                    STATS_CMD_TYPE_REGULAR,
                                    STATS_CMD_GET_KEY,
                                    1, 0, 0,
                                    key_len, 0);
                }

                // Handle a front cache hit by queuing response.
                //
                // Note, front cache stats are part of mcache.
                //
                if (!cas_emit) {
                    item *it = mcache_get(front_cache, key, key_len,
                                          msec_current_time_snapshot);
                    if (it != NULL) {
                        assert(it->nkey == key_len);
                        assert(strncmp(ITEM_key(it), key, it->nkey) == 0);

                        cproxy_upstream_ascii_item_response(it, uc_cur, 0);

                        psc_get_key->hits++;
                        psc_get_key->write_bytes += it->nbytes;

                        if (do_key_stats) {
                            touch_key_stats(ptd, key, key_len,
                                            msec_current_time_snapshot,
                                            STATS_CMD_TYPE_REGULAR,
                                            STATS_CMD_GET_KEY,
                                            0, 1, 0,
                                            0, it->nbytes);
                        }

                        // The refcount was inc'ed by mcache_get() for us.
                        //
                        item_remove(it);

                        goto loop_next;
                    }
                }

                bool self = false;

                conn *c = cproxy_find_downstream_conn(d, key, key_len,
                                                      &self);
                if (c != NULL) {
                    if (self) {
                        // Optimization for talking with ourselves,
                        // to avoid extra network hop.
                        //
                        ptd->stats.stats.tot_optimize_self++;

                        item *it = item_get(key, key_len);
                        if (it != NULL) {
                            cproxy_upstream_ascii_item_response(it, uc_cur,
                                                                cas_emit);

                            psc_get_key->hits++;
                            psc_get_key->write_bytes += it->nbytes;

                            if (do_key_stats) {
                                touch_key_stats(ptd, key, key_len,
                                                msec_current_time_snapshot,
                                                STATS_CMD_TYPE_REGULAR,
                                                STATS_CMD_GET_KEY,
                                                0, 1, 0,
                                                0, it->nbytes);
                            }

                            // The refcount was inc'ed by item_get() for us.
                            //
                            item_remove(it);

                            // key is delimited by key_len, not by a NUL,
                            // so print it with an explicit precision.
                            //
                            if (settings.verbose > 1) {
                                fprintf(stderr,
                                        "optimize self multiget hit: %.*s\n",
                                        key_len, key);
                            }
                        } else {
                            psc_get_key->misses++;

                            if (do_key_stats) {
                                touch_key_stats(ptd, key, key_len,
                                                msec_current_time_snapshot,
                                                STATS_CMD_TYPE_REGULAR,
                                                STATS_CMD_GET_KEY,
                                                0, 0, 1,
                                                0, 0);
                            }

                            if (settings.verbose > 1) {
                                fprintf(stderr,
                                        "optimize self multiget miss: %.*s\n",
                                        key_len, key);
                            }
                        }

                        goto loop_next;
                    }

                    // See if we've already requested this key via
                    // the multiget hash table, in order to
                    // de-deplicate repeated keys.
                    //
                    bool first_request = true;

                    if (d->multiget != NULL) {
                        // TODO: Use Trond's allocator here.
                        //
                        multiget_entry *entry =
                            calloc(1, sizeof(multiget_entry));
                        if (entry != NULL) {
                            entry->upstream_conn = uc_cur;
                            entry->opaque = 0;
                            entry->hits = 0;
                            entry->next = genhash_find(d->multiget, key);

                            genhash_update(d->multiget, key, entry);

                            if (entry->next != NULL) {
                                first_request = false;
                            }
                        } else {
                            // TODO: Handle out of multiget entry memory.
                        }
                    }

                    if (first_request) {
                        assert(c->item == NULL);
                        assert(c->state == conn_pause);
                        assert(IS_PROXY(c->protocol));
                        assert(c->ilist != NULL);
                        assert(c->isize > 0);

                        if (c->msgused <= 1 &&
                            c->msgbytes <= 0) {
                            emit_start(c, command, cmd_len);
                        }

                        // Provide the preceding space as optimization
                        // for ascii-to-ascii configuration.
                        //
                        emit_skey(c, key - 1, key_len + 1);
                    } else {
                        ptd->stats.stats.tot_multiget_keys_dedupe++;

                        // Print straight from the command buffer with a
                        // precision instead of copying into a fixed-size
                        // stack buffer, so an over-long key cannot
                        // overflow anything.
                        //
                        if (settings.verbose > 1) {
                            fprintf(stderr,
                                    "%d cproxy multiget dedpue: %.*s\n",
                                    uc_cur->sfd, key_len, key);
                        }
                    }
                } else {
                    // TODO: Handle when downstream conn is down.
                }
            }

        loop_next:
            space = next_space;
        }

        uc_num++;
        uc_cur = uc_cur->next;
    }

    // Kick off the write on every downstream conn that had something
    // queued for it.
    //
    for (int i = 0; i < nconns; i++) {
        conn *c = d->downstream_conns[i];
        if (c != NULL &&
            (c->msgused > 1 ||
             c->msgbytes > 0)) {
            emit_end(c);

            conn_set_state(c, conn_mwrite);
            c->write_and_go = conn_new_cmd;

            if (update_event(c, EV_WRITE | EV_PERSIST)) {
                nwrite++;

                if (uc->noreply) {
                    c->write_and_go = conn_pause;
                }
            } else {
                if (settings.verbose > 1) {
                    fprintf(stderr,
                            "Couldn't update cproxy write event\n");
                }

                d->ptd->stats.stats.err_oom++;
                cproxy_close_conn(c);
            }
        }
    }

    if (settings.verbose > 1) {
        fprintf(stderr, "forward multiget nwrite %d out of %d\n",
                nwrite, nconns);
    }

    d->downstream_used_start = nwrite;
    d->downstream_used       = nwrite;

    if (cproxy_dettach_if_noreply(d, uc) == false) {
        d->upstream_suffix = "END\r\n";

        cproxy_start_downstream_timeout(d, NULL);
    }

    return nwrite > 0;
}
/* Forward a simple one-liner command downstream.
 * For example, get, incr/decr, delete, etc.
 * The response, though, might be a simple line or
 * multiple VALUE+END lines.
 *
 * Dispatch, keyed on uc->cmd_curr:
 *  - GETK / GETKQ / GETL: delegated to multiget_ascii_downstream().
 *  - FLUSH: broadcast via cproxy_broadcast_a2a_downstream() with an
 *    "OK\r\n" suffix.
 *  - STAT: broadcast with a "RESET\r\n" suffix when the text after
 *    the first five characters starts with " reset"; otherwise
 *    broadcast with an "END\r\n" suffix and, on success, install a
 *    fresh hash in d->merger.
 *  - anything else: written verbatim to the single downstream conn
 *    chosen by cproxy_find_downstream_conn() for the command's key.
 *
 * d       - downstream to forward through; d->multiget and d->merger
 *           must be NULL on entry.
 * command - the one-line command text to forward.
 * uc      - upstream conn; must have no pending item, and its
 *           cmd_curr must be set (not -1).
 *
 * Returns the delegate's result for the get/flush/stats paths.
 * For the single-conn path, returns true once the write event was
 * registered on the downstream conn; returns false when no
 * downstream conn was found, or when write preparation or event
 * registration failed (in which case that conn is closed).
 */
bool cproxy_forward_a2a_simple_downstream(downstream *d,
                                          char *command, conn *uc) {
    cb_assert(d != NULL);
    cb_assert(d->ptd != NULL);
    cb_assert(d->ptd->proxy != NULL);
    cb_assert(d->downstream_conns != NULL);
    cb_assert(command != NULL);
    cb_assert(uc != NULL);
    cb_assert(uc->item == NULL);
    cb_assert(uc->cmd_curr != (protocol_binary_command) -1);
    cb_assert(d->multiget == NULL);
    cb_assert(d->merger == NULL);

    /* Handles get and gets. */
    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_GETK ||
        uc->cmd_curr == PROTOCOL_BINARY_CMD_GETKQ ||
        uc->cmd_curr == PROTOCOL_BINARY_CMD_GETL) {
        /* Only use front_cache for 'get', not for 'gets'. */
        mcache *front_cache =
            (command[3] == ' ') ? &d->ptd->proxy->front_cache : NULL;

        return multiget_ascii_downstream(d, uc,
                                         a2a_multiget_start,
                                         a2a_multiget_skey,
                                         a2a_multiget_end,
                                         front_cache);
    }

    cb_assert(uc->next == NULL);

    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_FLUSH) {
        return cproxy_broadcast_a2a_downstream(d, command, uc, "OK\r\n");
    }

    if (uc->cmd_curr == PROTOCOL_BINARY_CMD_STAT) {
        /* command + 5 skips the verb (assumes it is the 5-character */
        /* "stats" -- confirm against the upstream parser). */
        if (strncmp(command + 5, " reset", 6) == 0) {
            return cproxy_broadcast_a2a_downstream(d, command, uc,
                                                   "RESET\r\n");
        }

        if (!cproxy_broadcast_a2a_downstream(d, command, uc, "END\r\n")) {
            return false;
        }

        /* NOTE(review): presumably this hash collects the replies of */
        /* the broadcast so they can be merged -- confirm at the */
        /* response-handling site. */
        d->merger = genhash_init(512, skeyhash_ops);
        return true;
    }

    /* TODO: Inefficient repeated scan_tokens. */
    int cmd_len = 0;
    token_t tokens[MAX_TOKENS];
    size_t ntokens = scan_tokens(command, tokens, MAX_TOKENS, &cmd_len);

    /* Validate the token count BEFORE touching tokens[KEY_TOKEN]: */
    /* tokens[] is an uninitialized automatic array, so with too few */
    /* tokens the key slot may never have been written. */
    if (ntokens <= 1) {
        /* This was checked long ago, while parsing */
        /* the upstream conn. */
        cb_assert(false);
        return false;
    }

    char *key = tokens[KEY_TOKEN].value;
    int key_len = tokens[KEY_TOKEN].length;

    /* Assuming we're already connected to downstream. */

    if (!strcmp(command, "version")) {
        /* fake key for version command handling */
        key = "v";
        key_len = 1;
    }

    conn *c = cproxy_find_downstream_conn(d, key, key_len, NULL);
    if (c == NULL) {
        return false;
    }

    if (!cproxy_prep_conn_for_write(c)) {
        d->ptd->stats.stats.err_downstream_write_prep++;
        cproxy_close_conn(c);
        return false;
    }

    cb_assert(c->state == conn_pause);

    out_string(c, command);

    if (settings.verbose > 1) {
        moxi_log_write("forwarding to %d, noreply %d\n",
                       c->sfd, uc->noreply);
    }

    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
        if (settings.verbose > 1) {
            moxi_log_write("Couldn't update cproxy write event\n");
        }

        d->ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return false;
    }

    d->downstream_used_start = 1;
    d->downstream_used = 1;

    if (cproxy_dettach_if_noreply(d, uc) == false) {
        cproxy_start_downstream_timeout(d, c);
    } else {
        c->write_and_go = conn_pause;

        /* Do mcache_delete() here only during a noreply, */
        /* otherwise for with-reply requests, we could */
        /* be in a race with other clients repopulating */
        /* the front_cache. For with-reply requests, we */
        /* clear the front_cache when we get a success reply. */
        cproxy_front_cache_delete(d->ptd, key, key_len);
    }

    return true;
}