void slabs_alloc_test(void) { unsigned int total_chunk = 0; const char *key = "charliezhao"; size_t nkey = strlen(key) + 1; item *ptr = (item *)slabs_alloc(1024, slabs_clsid(1024), &total_chunk); strcpy(ITEM_key(ptr), key); ptr->nkey = nkey; strcpy(ITEM_data(ptr), "xuechaozhao"); uint32_t hv = jenkins_hash(key, strlen(key)); assoc_insert(ptr, hv); for(int i = 0; i <= 10922; ++i) { void *ptr = slabs_alloc(96, slabs_clsid(96), &total_chunk); if(ptr == NULL) { fprintf(stderr, "i: %7d slabs_alloc fail\n", i); break; } else { slabs_free(ptr, 96, slabs_clsid(96)); } } item *ptr2 = assoc_find(key, nkey, hv); fprintf(stdout, "key:%20s value:%20s\n", ITEM_key(ptr2), ITEM_data(ptr2)); }
/*
 * Queue the binary request held in `it` for writing on downstream
 * connection `c`.
 *
 * uc      - upstream connection the request came from (used for logging).
 * d       - downstream whose per-thread stats are bumped on failure.
 * it      - item whose data area starts with a binary request header.
 * c       - downstream connection to write to.
 * self    - unused.
 * vbucket - when >= 0, stored (via htons) in the header's reserved field.
 *
 * Returns true when the item was queued and the write event registered.
 * On any failure err_oom is incremented, `c` is closed and false is returned.
 */
bool b2b_forward_item_vbucket(conn *uc, downstream *d, item *it,
                              conn *c, bool self, int vbucket) {
    (void)self;

    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);
    assert(uc->noreply == false);
    assert(c != NULL);

    /* Assuming we're already connected to downstream. */
    /* TODO: Optimize to self codepath. */

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b_forward_item_vbucket %x to %d, vbucket %d\n",
                       uc->sfd, uc->cmd, c->sfd, vbucket);
    }

    protocol_binary_request_header *req =
        (protocol_binary_request_header *) ITEM_data(it);

    if (vbucket >= 0) {
        req->request.reserved = htons(vbucket);
    }

    if (add_conn_item(c, it) != true) {
        goto fail;
    }

    /* The caller keeps its refcount, and we need our own. */
    it->refcount++;

    if (add_iov(c, ITEM_data(it), it->nbytes) != 0) {
        goto fail;
    }

    conn_set_state(c, conn_mwrite);
    c->write_and_go = conn_new_cmd;

    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
        goto fail;
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b_forward %x to %d success\n",
                       uc->sfd, uc->cmd, c->sfd);
    }
    return true;

fail:
    d->ptd->stats.stats.err_oom++;
    cproxy_close_conn(c);
    return false;
}
int item_test() { int maxi = 0; //test set. for(int i = 0; i < 10; i++) { char key[1024]; memset(key, 0, 1024); sprintf(key, "charlie_%d", i); const size_t nkey = strlen(key) + 1; const int flags = 0; const time_t exptime = 0; const int nbytes = 1024; uint32_t cur_hv = jenkins_hash((void *)key, nkey); item *it = do_item_alloc((const char *)key, nkey, flags, exptime, nbytes, cur_hv); if(it == NULL) { fprintf(stderr, "\033[31malloc fail\033[0m"); maxi = i; break; } char val[1024]; sprintf(val, "%d", i); memcpy(ITEM_data(it), (void *)&val, strlen(val)+1); } //test get. for(int i = 0; i < 10; ++i) { char key[1024]; memset(key, 0, 1024); sprintf(key, "charlie_%d", i); const size_t nkey = strlen(key) + 1; uint32_t cur_hv = jenkins_hash((void *)key, nkey); item *it = assoc_find(key, nkey, cur_hv); if(it == NULL) { fprintf(stderr, "\033[31mget fail\033[0m"); return -1; } int val = 0; memcpy((void *)&val, ITEM_data(it), sizeof(val)); if(i&0x1) { fprintf(stdout, "del key:%s value:%d\n", ITEM_key(it), val); do_item_unlink(it, cur_hv); lru_traverse(NULL); } } return 0; }
/*
 * Put a chunked item (header `it` plus its chain of item_chunk objects)
 * onto the free list of slab class `p` as one linked run.
 *
 * it   - header object; its data area holds the first item_chunk.
 * size - amount subtracted from p->requested.
 * id   - not read by this function.
 * p    - slab class whose free list (slots), free count (sl_curr) and
 *        requested counter are updated.
 */
static void do_slabs_free_chunked(item *it, const size_t size, unsigned int id,
                                  slabclass_t *p) {
    /* Taken before the header's flags are rewritten below. */
    item_chunk *head = (item_chunk *) ITEM_data(it);

    /* NOTE(review): realsize is accumulated but never read afterwards. */
    size_t realsize = size;
    for (item_chunk *walk = head; walk; walk = walk->next) {
        realsize += sizeof(item_chunk);
    }

    unsigned int chunks_found = 1;   /* the header itself */

    it->it_flags = ITEM_SLABBED;
    it->slabs_clsid = 0;
    it->prev = 0;
    it->next = (item *) head->next;
    assert(it->next);
    /* top chunk should already point back to head */
    assert(it->next && (void*)it->next->prev == (void*)head);

    item_chunk *tail = head->next;
    tail->prev = (item_chunk *)it;

    /* Mark every chunk as free and stop on the last one in the chain. */
    for (;;) {
        assert(tail->it_flags == ITEM_CHUNK);
        tail->it_flags = ITEM_SLABBED;
        tail->slabs_clsid = 0;
        chunks_found++;
        if (!tail->next) {
            break;
        }
        tail = tail->next;
    }

    /* must have had nothing hanging off of the final chunk */
    assert(tail && tail->next == 0);

    /* Tail chunk, link the freelist here. */
    tail->next = p->slots;
    if (tail->next) {
        tail->next->prev = tail;
    }

    p->slots = it;
    p->sl_curr += chunks_found;
    p->requested -= size;
}
/*
 * Forward the binary request stored in `it` to the downstream connection
 * selected by the request's key.
 *
 * uc - upstream connection that produced the request.
 * d  - downstream used to pick and drive the target connection.
 * it - item whose data area starts with a binary request header, followed
 *      by the extras and then the key.
 *
 * Returns true when the request was queued on a downstream connection;
 * false when the key is empty, no connection was found, or queuing failed.
 */
bool b2b_forward_item(conn *uc, downstream *d, item *it) {
    int vbucket = -1;
    bool local;
    conn *c;
    protocol_binary_request_header *req;
    char *key;
    int keylen;

    cb_assert(uc != NULL);
    cb_assert(uc->next == NULL);
    cb_assert(uc->noreply == false);
    cb_assert(it != NULL);

    req = (protocol_binary_request_header *) ITEM_data(it);
    key = ((char *) req) + sizeof(*req) + req->request.extlen;
    keylen = ntohs(req->request.keylen);

    if (settings.verbose > 2) {
        char buf[300];
        /*
         * keylen comes straight from the request header and can be as large
         * as 65535, so the copy into the fixed log buffer must be clamped.
         */
        size_t ncopy = (size_t) keylen;
        if (ncopy > sizeof(buf) - 1) {
            ncopy = sizeof(buf) - 1;
        }
        memcpy(buf, key, ncopy);
        buf[ncopy] = '\0';

        moxi_log_write("%d: b2b_forward_item nbytes %u, extlen %d, keylen %d opcode %x key (%s)\n",
                       uc->sfd, it->nbytes, req->request.extlen,
                       keylen, req->request.opcode, buf);

        cproxy_dump_header(uc->sfd, (char *) req);
    }

    if (key == NULL || keylen <= 0) {
        return false; /* We don't know how to hash an empty key. */
    }

    c = cproxy_find_downstream_conn_ex(d, key, keylen, &local, &vbucket);
    if (c != NULL) {
        if (local) {
            uc->hit_local = true;
        }

        if (b2b_forward_item_vbucket(uc, d, it, c, vbucket) == true) {
            d->downstream_used_start = 1;
            d->downstream_used = 1;

            cproxy_start_downstream_timeout(d, c);

            return true;
        }
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b_forward_item failed (%d)\n",
                       uc->sfd, (c != NULL));
    }

    return false;
}
/*
 * Finish a set/add/replace once the value has been read into c->item.
 *
 * Bumps stats.set_cmds, checks that the data ends in "\r\n", applies the
 * add/replace rules against any existing item with the same key, and
 * replies "STORED", "NOT_STORED" or a CLIENT_ERROR line.
 * When the item is not stored it is freed here. In every case c->item is
 * cleared before returning.
 */
void complete_nread(conn *c) {
    item *it = c->item;
    int comm = c->item_comm;
    item *old_it;
    time_t now = time(0);
    char *refusal;

    stats.set_cmds++;

    if (strncmp(ITEM_data(it) + it->nbytes - 2, "\r\n", 2) != 0) {
        refusal = "CLIENT_ERROR bad data chunk";
        goto reject;
    }

    old_it = assoc_find(ITEM_key(it));

    /* Treat flushed or expired entries as if they were not there. */
    if (old_it && settings.oldest_live &&
        old_it->time <= settings.oldest_live) {
        item_unlink(old_it);
        old_it = 0;
    }
    if (old_it && old_it->exptime && old_it->exptime < now) {
        item_unlink(old_it);
        old_it = 0;
    }

    if (old_it && comm == NREAD_ADD) {
        item_update(old_it);
        refusal = "NOT_STORED";
        goto reject;
    }

    if (!old_it && comm == NREAD_REPLACE) {
        refusal = "NOT_STORED";
        goto reject;
    }

    if (old_it && (old_it->it_flags & ITEM_DELETED) &&
        (comm == NREAD_REPLACE || comm == NREAD_ADD)) {
        refusal = "NOT_STORED";
        goto reject;
    }

    if (old_it) {
        item_replace(old_it, it);
    } else {
        item_link(it);
    }

    c->item = 0;
    out_string(c, "STORED");
    return;

reject:
    out_string(c, refusal);
    item_free(it);
    c->item = 0;
    return;
}
/*
 * Return a chunked item to the slab free lists.
 *
 * The header object `it` goes back to the class recorded in its first
 * chunk's orig_clsid; every following chunk goes back to the class named by
 * its own slabs_clsid. Each returned object is pushed on the front of its
 * class's slots list, and the class's sl_curr and requested counters are
 * adjusted.
 *
 * The `size` parameter is not read by this function.
 */
static void do_slabs_free_chunked(item *it, const size_t size) {
    /* Taken before the header's flags are rewritten below. */
    item_chunk *cur = (item_chunk *) ITEM_data(it);
    slabclass_t *cls;
    size_t header_bytes;

    it->it_flags = ITEM_SLABBED;
    it->slabs_clsid = 0;
    it->prev = 0;

    /* header object's original classid is stored in chunk. */
    cls = &slabclass[cur->orig_clsid];

    /* Step to the first attached chunk (NULL for a header with none). */
    cur = cur->next;
    if (cur) {
        cur->prev = 0;
    }

    /* return the header object. */
    /* TODO: This is in three places, here and in do_slabs_free(). */
    it->next = cls->slots;
    if (it->next) {
        it->next->prev = it;
    }
    cls->slots = it;
    cls->sl_curr++;

    /* TODO: macro */
    header_bytes = it->nkey + 1 + it->nsuffix + sizeof(item) + sizeof(item_chunk);
    if (settings.use_cas) {
        header_bytes += sizeof(uint64_t);
    }
    cls->requested -= header_bytes;

    /* Hand each remaining chunk back to its own slab class. */
    while (cur) {
        item_chunk *following;

        assert(cur->it_flags == ITEM_CHUNK);
        cur->it_flags = ITEM_SLABBED;
        cls = &slabclass[cur->slabs_clsid];
        cur->slabs_clsid = 0;
        following = cur->next;

        cur->prev = 0;
        cur->next = cls->slots;
        if (cur->next) {
            cur->next->prev = cur;
        }
        cls->slots = cur;
        cls->sl_curr++;
        cls->requested -= cur->size + sizeof(item_chunk);

        cur = following;
    }
}
/*
 * Look up `key` and copy the stored value into `dst`.
 *
 * key, nkey - key bytes and their length.
 * dst       - destination buffer; receives it->nbytes bytes, i.e. two more
 *             than the length reported through `len`.
 * len       - out: set to it->nbytes - 2.
 *
 * Returns true when the key was found, false otherwise. On a miss neither
 * `dst` nor `len` is touched.
 */
int process_get_command(char *key, size_t nkey, char *dst, int *len)
{
    item *found = item_get(key, nkey);
    char *payload;

    if (!found) {
        return false;
    }

    item_update(found);
    payload = ITEM_data(found);
    *len = found->nbytes - 2;
    memcpy(dst, payload, found->nbytes);
    item_remove(found);

    return true;
}
/* Called once the value of a set/add/replace command has been read.
 * The command has been stored in c->cmd, and the item is ready in c->item.
 *
 * A value that ends in "\r\n" pauses the upstream connection for its
 * downstream; anything else is answered with a CLIENT_ERROR line.
 */
void cproxy_process_upstream_ascii_nread(conn *c) {
    assert(c != NULL);
    assert(c->next == NULL);

    item *it = c->item;

    assert(it != NULL);

    // pthread_mutex_lock(&c->thread->stats.mutex);
    // c->thread->stats.slab_stats[it->slabs_clsid].set_cmds++;
    // pthread_mutex_unlock(&c->thread->stats.mutex);

    if (strncmp(ITEM_data(it) + it->nbytes - 2, "\r\n", 2) != 0) {
        out_string(c, "CLIENT_ERROR bad data chunk");
        return;
    }

    proxy_td *ptd = c->extra;

    assert(ptd != NULL);

    cproxy_pause_upstream_for_downstream(ptd, c);
}
/*
 * Run the connection state machine for `c`.
 *
 * Loops over c->state until a handler sets the local `exit` flag. Within the
 * outer switch, `break` leaves the switch and the loop re-dispatches on the
 * (possibly changed) c->state; `continue` restarts the loop directly.
 *
 * States handled here:
 *   conn_listening - accept one client, make it non-blocking, wrap it in a
 *                    new conn.
 *   conn_read      - try to parse a command, else try to read more input.
 *   conn_nread     - read c->rlbytes bytes to c->rcurr, then complete_nread().
 *   conn_swallow   - discard c->sbytes bytes of input.
 *   conn_write     - write c->wbytes bytes from c->wcurr.
 *   conn_mwrite    - write the reply of a multi-item get.
 *   conn_closing   - close the connection and stop.
 */
void drive_machine(conn *c) {
    int exit = 0;   /* loop-termination flag (a local, not the libc exit()) */
    int sfd, flags = 1;
    socklen_t addrlen;
    struct sockaddr addr;
    conn *newc;
    int res;

    while (!exit) {
        /* printf("state %d\n", c->state);*/
        switch(c->state) {
        case conn_listening:
            addrlen = sizeof(addr);
            if ((sfd = accept(c->sfd, &addr, &addrlen)) == -1) {
                if (errno == EAGAIN || errno == EWOULDBLOCK) {
                    /* nothing pending: return to the caller */
                    exit = 1;
                    break;
                } else {
                    perror("accept()");
                }
                /* NOTE(review): state is unchanged here, so the loop calls
                 * accept() again right away after any other error. */
                break;
            }
            if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||
                fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {
                perror("setting O_NONBLOCK");
                close(sfd);
                break;
            }
            newc = conn_new(sfd, conn_read, EV_READ | EV_PERSIST);
            if (!newc) {
                if (settings.verbose > 0)
                    fprintf(stderr, "couldn't create new connection\n");
                close(sfd);
                break;
            }
            /* stay in conn_listening; the next pass tries another accept() */
            break;

        case conn_read:
            if (try_read_command(c)) {
                continue;
            }
            if (try_read_network(c)) {
                continue;
            }
            /* we have no command line and no data to read from network */
            if (!update_event(c, EV_READ | EV_PERSIST)) {
                if (settings.verbose > 0)
                    fprintf(stderr, "Couldn't update event\n");
                c->state = conn_closing;
                break;
            }
            exit = 1;
            break;

        case conn_nread:
            /* we are reading rlbytes into rcurr; */
            if (c->rlbytes == 0) {
                complete_nread(c);
                break;
            }
            /* first check if we have leftovers in the conn_read buffer */
            if (c->rbytes > 0) {
                /* tocopy = min(rbytes, rlbytes) */
                int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
                memcpy(c->rcurr, c->rbuf, tocopy);
                c->rcurr += tocopy;
                c->rlbytes -= tocopy;
                if (c->rbytes > tocopy) {
                    /* slide the unconsumed remainder to the buffer start */
                    memmove(c->rbuf, c->rbuf+tocopy, c->rbytes - tocopy);
                }
                c->rbytes -= tocopy;
                break;
            }

            /*  now try reading from the socket */
            res = read(c->sfd, c->rcurr, c->rlbytes);
            if (res > 0) {
                stats.bytes_read += res;
                c->rcurr += res;
                c->rlbytes -= res;
                break;
            }
            if (res == 0) { /* end of stream */
                c->state = conn_closing;
                break;
            }
            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                if (!update_event(c, EV_READ | EV_PERSIST)) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't update event\n");
                    c->state = conn_closing;
                    break;
                }
                exit = 1;
                break;
            }
            /* otherwise we have a real error, on which we close the connection */
            if (settings.verbose > 0)
                fprintf(stderr, "Failed to read, and not due to blocking\n");
            c->state = conn_closing;
            break;

        case conn_swallow:
            /* we are reading sbytes and throwing them away */
            if (c->sbytes == 0) {
                c->state = conn_read;
                break;
            }

            /* first check if we have leftovers in the conn_read buffer */
            if (c->rbytes > 0) {
                /* tocopy = min(rbytes, sbytes); the bytes are only dropped */
                int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;
                c->sbytes -= tocopy;
                if (c->rbytes > tocopy) {
                    memmove(c->rbuf, c->rbuf+tocopy, c->rbytes - tocopy);
                }
                c->rbytes -= tocopy;
                break;
            }

            /*  now try reading from the socket */
            /* read at most min(rsize, sbytes) bytes into rbuf as scratch */
            res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);
            if (res > 0) {
                stats.bytes_read += res;
                c->sbytes -= res;
                break;
            }
            if (res == 0) { /* end of stream */
                c->state = conn_closing;
                break;
            }
            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                if (!update_event(c, EV_READ | EV_PERSIST)) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't update event\n");
                    c->state = conn_closing;
                    break;
                }
                exit = 1;
                break;
            }
            /* otherwise we have a real error, on which we close the connection */
            if (settings.verbose > 0)
                fprintf(stderr, "Failed to read, and not due to blocking\n");
            c->state = conn_closing;
            break;

        case conn_write:
            /* we are writing wbytes bytes starting from wcurr */
            if (c->wbytes == 0) {
                /* everything sent: release the buffer and move on */
                if (c->write_and_free) {
                    free(c->write_and_free);
                    c->write_and_free = 0;
                }
                c->state = c->write_and_go;
                if (c->state == conn_read)
                    set_cork(c, 0);
                break;
            }
            res = write(c->sfd, c->wcurr, c->wbytes);
            if (res > 0) {
                stats.bytes_written += res;
                c->wcurr += res;
                c->wbytes -= res;
                break;
            }
            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                if (!update_event(c, EV_WRITE | EV_PERSIST)) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't update event\n");
                    c->state = conn_closing;
                    break;
                }
                exit = 1;
                break;
            }
            /* if res==0 or res==-1 and error is not EAGAIN or EWOULDBLOCK,
               we have a real error, on which we close the connection */
            if (settings.verbose > 0)
                fprintf(stderr, "Failed to write, and not due to blocking\n");
            c->state = conn_closing;
            break;

        case conn_mwrite:
            /*
             * we're writing ibytes bytes from iptr. iptr alternates between
             * ibuf, where we build a string "VALUE...", and ITEM_data(it) for the
             * current item. When we finish a chunk, we choose the next one using
             * ipart, which has the following semantics: 0 - start the loop, 1 -
             * we finished ibuf, go to current ITEM_data(it); 2 - we finished ITEM_data(it),
             * move to the next item and build its ibuf; 3 - we finished all items,
             * write "END".
             */
            if (c->ibytes > 0) {
                res = write(c->sfd, c->iptr, c->ibytes);
                if (res > 0) {
                    stats.bytes_written += res;
                    c->iptr += res;
                    c->ibytes -= res;
                    break;
                }
                if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
                        if (settings.verbose > 0)
                            fprintf(stderr, "Couldn't update event\n");
                        c->state = conn_closing;
                        break;
                    }
                    exit = 1;
                    break;
                }
                /* if res==0 or res==-1 and error is not EAGAIN or EWOULDBLOCK,
                   we have a real error, on which we close the connection */
                if (settings.verbose > 0)
                    fprintf(stderr, "Failed to write, and not due to blocking\n");
                c->state = conn_closing;
                break;
            } else {
                item *it;
                /* we finished a chunk, decide what to do next */
                switch (c->ipart) {
                case 1:
                    /* header sent: send the item's data next */
                    it = *(c->icurr);
                    assert((it->it_flags & ITEM_SLABBED) == 0);
                    c->iptr = ITEM_data(it);
                    c->ibytes = it->nbytes;
                    c->ipart = 2;
                    break;
                case 2:
                    /* data sent: drop the item, advance to the next one */
                    it = *(c->icurr);
                    item_remove(it);
                    c->ileft--;
                    if (c->ileft <= 0) {
                        c->ipart = 3;
                        break;
                    } else {
                        c->icurr++;
                    }
                    /* FALL THROUGH */
                case 0:
                    /* build the "VALUE ..." header line for the current item */
                    it = *(c->icurr);
                    assert((it->it_flags & ITEM_SLABBED) == 0);
                    c->ibytes = sprintf(c->ibuf, "VALUE %s %u %u\r\n", ITEM_key(it), it->flags, it->nbytes - 2);
                    if (settings.verbose > 1)
                        fprintf(stderr, ">%d sending key %s\n", c->sfd, ITEM_key(it));
                    c->iptr = c->ibuf;
                    c->ipart = 1;
                    break;
                case 3:
                    out_string(c, "END");
                    break;
                }
            }
            break;

        case conn_closing:
            conn_close(c);
            exit = 1;
            break;
        }

    }

    return;
}
/**
 * Queue an ascii "VALUE ..." response for item `it` on upstream conn `uc`.
 *
 * @param it       item to send; its data must end in "\r\n".
 * @param uc       paused ascii proxy upstream connection.
 * @param cas_emit  1: emit CAS.
 *                  0: do not emit CAS.
 *                 -1: data driven (emit unless the item's CAS equals
 *                     CPROXY_NOT_CAS).
 */
void cproxy_upstream_ascii_item_response(item *it, conn *uc,
                                         int cas_emit) {
    assert(it != NULL);
    assert(uc != NULL);
    assert(uc->state == conn_pause);
    assert(uc->funcs != NULL);
    assert(IS_ASCII(uc->protocol));
    assert(IS_PROXY(uc->protocol));

    if (settings.verbose > 2) {
        char key[KEY_MAX_LENGTH + 10];
        assert(it->nkey <= KEY_MAX_LENGTH);
        memcpy(key, ITEM_key(it), it->nkey);
        key[it->nkey] = '\0';

        moxi_log_write("<%d cproxy ascii item response, key %s\n",
                       uc->sfd, key);
    }

    if (strncmp(ITEM_data(it) + it->nbytes - 2, "\r\n", 2) != 0) {
        if (settings.verbose > 1) {
            moxi_log_write("ERROR: unexpected downstream data block");
        }
        return;
    }

    /* TODO: Need to clean up half-written add_iov()'s. */
    /*       Consider closing the upstream_conns? */

    uint64_t cas = ITEM_get_cas(it);
    int omit_cas = (cas_emit == 0) ||
                   (cas_emit < 0 && cas == CPROXY_NOT_CAS);

    if (omit_cas) {
        if (!add_conn_item(uc, it)) {
            return;
        }
        it->refcount++;

        /* The suffix and the data are sent with a single iov entry. */
        if (add_iov(uc, "VALUE ", 6) == 0 &&
            add_iov(uc, ITEM_key(it), it->nkey) == 0 &&
            add_iov(uc, ITEM_suffix(it),
                    it->nsuffix + it->nbytes) == 0) {
            if (settings.verbose > 2) {
                moxi_log_write("<%d cproxy ascii item response success\n",
                               uc->sfd);
            }
        }
        return;
    }

    char *suffix = add_conn_suffix(uc);
    if (suffix == NULL) {
        return;
    }
    sprintf(suffix, " %llu\r\n", (unsigned long long) cas);

    if (!add_conn_item(uc, it)) {
        return;
    }
    it->refcount++;

    /* Two bytes fewer of the suffix are sent; the CAS text follows it. */
    if (add_iov(uc, "VALUE ", 6) == 0 &&
        add_iov(uc, ITEM_key(it), it->nkey) == 0 &&
        add_iov(uc, ITEM_suffix(it), it->nsuffix - 2) == 0 &&
        add_iov(uc, suffix, strlen(suffix)) == 0 &&
        add_iov(uc, ITEM_data(it), it->nbytes) == 0) {
        if (settings.verbose > 2) {
            moxi_log_write("<%d cproxy ascii item response ok\n",
                           uc->sfd);
        }
    }
}
/*
 * Store `it` according to the command `comm`.
 *
 * it   - item carrying the new key and data.
 * comm - one of the NREAD_* command codes.
 * hv   - hash value of the key.
 *
 * Rules visible here:
 *   - NREAD_ADD on an existing key only refreshes the old item.
 *   - NREAD_REPLACE / NREAD_APPEND / NREAD_PREPEND on a missing key do nothing.
 *   - NREAD_APPEND / NREAD_PREPEND build a new item joining old and new data.
 *   - anything else replaces the old item or links the new one.
 *
 * Returns STORED when an item was linked or replaced, NOT_STORED otherwise
 * (including when allocating the joined item fails).
 */
enum store_item_type do_store_item(item *it, int comm, const uint32_t hv)
{
    char *key = ITEM_key(it);
    item *old_it = do_item_get(key, it->nkey, hv);
    item *new_it = NULL;
    enum store_item_type stored = NOT_STORED;
    const int joins = (comm == NREAD_APPEND || comm == NREAD_PREPEND);
    int flags;

    if (old_it != NULL && comm == NREAD_ADD) {
        do_item_update(old_it);
        goto release;
    }

    if (!old_it && (comm == NREAD_REPLACE || joins)) {
        goto release;
    }

    if (joins) {
        item *head;
        item *tail;

        flags = (int) strtol(ITEM_suffix(old_it), (char **) NULL, 10);
        new_it = do_item_alloc(key, it->nkey, flags, old_it->exptime,
                               ITEM_data(it),
                               it->nbytes + old_it->nbytes - 2, hv);
        if (!new_it) {
            if (old_it) {
                do_item_remove(old_it);
            }
            return NOT_STORED;
        }

        /* Append keeps the old data first; prepend puts the new data first. */
        if (comm == NREAD_APPEND) {
            head = old_it;
            tail = it;
        } else {
            head = it;
            tail = old_it;
        }

        /* The second copy starts 2 bytes before the end of the first. */
        memcpy(ITEM_data(new_it), ITEM_data(head), head->nbytes);
        memcpy(ITEM_data(new_it) + head->nbytes - 2, ITEM_data(tail),
               tail->nbytes);

        it = new_it;
    }

    if (old_it != NULL) {
        item_replace(old_it, it, hv);
    } else {
        do_item_link(it, hv);
    }
    stored = STORED;

release:
    if (old_it != NULL) {
        do_item_remove(old_it);
    }
    if (new_it != NULL) {
        do_item_remove(new_it);
    }
    return stored;
}
void protocol_stats_foreach_write(const void *key, const void *value, void *user_data) { char *line = (char *) value; conn *uc = (conn *) user_data; int nline; cb_assert(line != NULL); cb_assert(uc != NULL); (void)key; nline = strlen(line); if (nline > 0) { item *it; if (settings.verbose > 2) { moxi_log_write("%d: cproxy_stats writing: %s\n", uc->sfd, line); } if (IS_BINARY(uc->protocol)) { token_t line_tokens[MAX_TOKENS]; size_t line_ntokens = scan_tokens(line, line_tokens, MAX_TOKENS, NULL); if (line_ntokens == 4) { uint16_t key_len = line_tokens[NAME_TOKEN].length; uint32_t data_len = line_tokens[VALUE_TOKEN].length; it = item_alloc("s", 1, 0, 0, sizeof(protocol_binary_response_stats) + key_len + data_len); if (it != NULL) { protocol_binary_response_stats *header = (protocol_binary_response_stats *) ITEM_data(it); memset(ITEM_data(it), 0, it->nbytes); header->message.header.response.magic = (uint8_t) PROTOCOL_BINARY_RES; header->message.header.response.opcode = uc->binary_header.request.opcode; header->message.header.response.keylen = (uint16_t) htons(key_len); header->message.header.response.bodylen = htonl(key_len + data_len); header->message.header.response.opaque = uc->opaque; memcpy((ITEM_data(it)) + sizeof(protocol_binary_response_stats), line_tokens[NAME_TOKEN].value, key_len); memcpy((ITEM_data(it)) + sizeof(protocol_binary_response_stats) + key_len, line_tokens[VALUE_TOKEN].value, data_len); if (add_conn_item(uc, it)) { add_iov(uc, ITEM_data(it), it->nbytes); if (settings.verbose > 2) { moxi_log_write("%d: cproxy_stats writing binary", uc->sfd); cproxy_dump_header(uc->sfd, ITEM_data(it)); } return; } item_remove(it); } } return; } it = item_alloc("s", 1, 0, 0, nline + 2); if (it != NULL) { strncpy(ITEM_data(it), line, nline); strncpy(ITEM_data(it) + nline, "\r\n", 2); if (add_conn_item(uc, it)) { add_iov(uc, ITEM_data(it), nline + 2); return; } item_remove(it); } } }
/*
 * Handle one ascii response line received from a downstream server.
 *
 * c    - downstream connection; c->extra is its downstream.
 * line - the received line (same pointer as c->rcurr).
 *
 * Dispatch by line prefix:
 *   "VALUE "                   - allocate an item and switch to conn_nread to
 *                                read the value; swallow the value when the
 *                                item cannot be set up; close the connection
 *                                when the line cannot be parsed.
 *   "END"                      - pause the connection.
 *   "OK"                       - pause; flush the front cache when the
 *                                upstream command was a flush.
 *   "STAT "/"ITEM "/"PREFIX "  - merge the line, or forward it unchanged to
 *                                the upstream when merging fails.
 *   anything else              - pause and relay the line to the upstream.
 */
void cproxy_process_a2a_downstream(conn *c, char *line) {
    assert(c != NULL);
    assert(c->next == NULL);
    assert(c->extra != NULL);
    assert(c->cmd == -1);
    assert(c->item == NULL);
    assert(line != NULL);
    assert(line == c->rcurr);
    assert(IS_ASCII(c->protocol));
    assert(IS_PROXY(c->protocol));

    if (settings.verbose > 1)
        fprintf(stderr, "<%d cproxy_process_a2a_downstream %s\n",
                c->sfd, line);

    downstream *d = c->extra;

    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);

    if (strncmp(line, "VALUE ", 6) == 0) {
        token_t      tokens[MAX_TOKENS];
        size_t       ntokens;
        unsigned int flags;
        int          clen = 0;
        int          vlen;
        uint64_t     cas = CPROXY_NOT_CAS;

        ntokens = scan_tokens(line, tokens, MAX_TOKENS, &clen);

        /* A count of 5 or 6 accounts for the extra termination token. */
        if (ntokens < 5 ||
            ntokens > 6 ||
            tokens[KEY_TOKEN].length > KEY_MAX_LENGTH ||
            !safe_strtoul(tokens[2].value, (uint32_t *) &flags) ||
            !safe_strtoul(tokens[3].value, (uint32_t *) &vlen)) {
            /* We don't know how much to swallow, so close the downstream. */
            /* The conn_closing should release the downstream, */
            /* which should write a suffix/error to the upstream. */
            conn_set_state(c, conn_closing);
            return;
        }

        char  *key  = tokens[KEY_TOKEN].value;
        size_t nkey = tokens[KEY_TOKEN].length;

        item *it = item_alloc(key, nkey, flags, 0, vlen + 2);

        if (it == NULL) {
            if (settings.verbose > 1)
                fprintf(stderr, "cproxy could not item_alloc size %u\n",
                        vlen + 2);
        } else if (ntokens == 5 ||
                   safe_strtoull(tokens[4].value, &cas)) {
            ITEM_set_cas(it, cas);

            c->item   = it;
            c->ritem  = ITEM_data(it);
            c->rlbytes = it->nbytes;
            c->cmd    = -1;

            conn_set_state(c, conn_nread);

            return; /* Success. */
        } else {
            if (settings.verbose > 1)
                fprintf(stderr, "cproxy could not parse cas\n");

            item_remove(it);
        }

        c->sbytes = vlen + 2; /* Number of bytes to swallow. */

        conn_set_state(c, conn_swallow);

        /* Note, eventually, we'll see an END later. */
        return;
    }

    if (strncmp(line, "END", 3) == 0) {
        conn_set_state(c, conn_pause);
        return;
    }

    if (strncmp(line, "OK", 2) == 0) {
        conn_set_state(c, conn_pause);

        /* TODO: Handle flush_all's expiration parameter against */
        /* the front_cache. */
        /* */
        /* TODO: We flush the front_cache too often, inefficiently */
        /* on every downstream flush_all OK response, rather than */
        /* on just the last flush_all OK response. */
        conn *uc = d->upstream_conn;
        if (uc != NULL &&
            uc->cmd_curr == PROTOCOL_BINARY_CMD_FLUSH) {
            mcache_flush_all(&d->ptd->proxy->front_cache, 0);
        }
        return;
    }

    if (strncmp(line, "STAT ", 5) == 0 ||
        strncmp(line, "ITEM ", 5) == 0 ||
        strncmp(line, "PREFIX ", 7) == 0) {
        assert(d->merger != NULL);

        conn *uc = d->upstream_conn;
        if (uc != NULL) {
            assert(uc->next == NULL);

            if (protocol_stats_merge_line(d->merger, line) == false) {
                /* Forward the line as-is if we couldn't merge it. */
                int nline = strlen(line);

                item *it = item_alloc("s", 1, 0, 0, nline + 2);
                if (it != NULL) {
                    strncpy(ITEM_data(it), line, nline);
                    strncpy(ITEM_data(it) + nline, "\r\n", 2);

                    if (add_conn_item(uc, it)) {
                        add_iov(uc, ITEM_data(it), nline + 2);
                    } else {
                        /* Not attached to the conn: release it here. */
                        item_remove(it);
                    }
                }
            }
        }

        conn_set_state(c, conn_new_cmd);
        return;
    }

    conn_set_state(c, conn_pause);

    /* The upstream conn might be NULL when closed already */
    /* or while handling a noreply. */
    conn *uc = d->upstream_conn;
    if (uc == NULL) {
        return;
    }

    assert(uc->next == NULL);

    out_string(uc, line);

    if (!update_event(uc, EV_WRITE | EV_PERSIST)) {
        if (settings.verbose > 1)
            fprintf(stderr,
                    "Can't update upstream write event\n");

        d->ptd->stats.stats.err_oom++;
        cproxy_close_conn(uc);
    }

    cproxy_del_front_cache_key_ascii_response(d, line,
                                              uc->cmd_start);
}
/* Called once the header+body of an upstream binary request has been read
 * into an item.
 *
 * SASL_AUTH is handed to cproxy_sasl_plain_auth(), SASL_STEP is answered
 * with an auth error, and a STAT request whose key is "proxy buckets" is
 * served by process_bin_proxy_stats(). Quiet (noreply) commands are corked,
 * except FLUSHQ, which closes the connection. Every other request pauses
 * the upstream connection for its downstream.
 */
void cproxy_process_upstream_binary_nread(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->cmd_start == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));

    protocol_binary_request_header *header =
        (protocol_binary_request_header *) &c->binary_header;

    /* These three are only used by the verbose log line below. */
    int      extlen  = header->request.extlen;
    int      keylen  = header->request.keylen;
    uint32_t bodylen = header->request.bodylen;

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary_nread %x %d %d %u\n",
                       c->sfd, c->cmd, extlen, keylen, bodylen);
    }

    /* pthread_mutex_lock(&c->thread->stats.mutex); */
    /* c->thread->stats.slab_stats[it->slabs_clsid].set_cmds++; */
    /* pthread_mutex_unlock(&c->thread->stats.mutex); */

    proxy_td *ptd = c->extra;
    cb_assert(ptd != NULL);

    switch (header->request.opcode) {
    case PROTOCOL_BINARY_CMD_SASL_AUTH: {
        item *it = c->item;
        cb_assert(it);

        cproxy_sasl_plain_auth(c, (char *) ITEM_data(it));
        return;
    }

    case PROTOCOL_BINARY_CMD_SASL_STEP:
        write_bin_error(c, PROTOCOL_BINARY_RESPONSE_AUTH_ERROR, 0);
        return;

    case PROTOCOL_BINARY_CMD_STAT: {
        char *subcommand = binary_get_key(c);
        size_t nkey = c->binary_header.request.keylen;
        if (nkey == 13 && memcmp(subcommand, "proxy buckets", 13) == 0) {
            process_bin_proxy_stats(c);
            return;
        }
        /* Any other stats request takes the normal path below. */
        break;
    }

    default:
        break;
    }

    if (!c->noreply) {
        cb_assert(c->item == NULL || ((item *) c->item)->refcount == 1);

        cproxy_pause_upstream_for_downstream(ptd, c);
        return;
    }

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary_nread "
                       "corking quiet command %x %d\n",
                       c->sfd, c->cmd, (c->corked != NULL));
    }

    /* TODO: We currently don't support binary FLUSHQ. */
    /* Rather than having the downstream connections get */
    /* into a wonky state, prevent it. */
    if (header->request.opcode == PROTOCOL_BINARY_CMD_FLUSHQ) {
        /* Note: don't use cproxy_close_conn(c), as it goes */
        /* through the drive_machine() loop again. */
        /* cproxy_close_conn(c); */
        conn_set_state(c, conn_closing);
        return;
    }

    /* Hold onto or 'cork' all the binary quiet commands */
    /* until there's a later non-quiet command. */
    if (!cproxy_binary_cork_cmd(c)) {
        ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    conn_set_state(c, conn_new_cmd);
}
/* Called when we receive a binary response header from
 * a downstream server, via try_read_command()/drive_machine().
 *
 * Resets the per-command fields of `c`, allocates an item large enough for
 * the whole response (header + body), copies the header into it and either
 * switches to conn_nread for the body or, when there is no body, goes
 * straight to cproxy_process_b2b_downstream_nread(). If the item cannot be
 * allocated, err_oom is bumped and the connection is closed.
 */
void cproxy_process_b2b_downstream(conn *c) {
    downstream *d;
    int extlen;
    int keylen;
    uint32_t bodylen;
    item *it;
    void *rb;

    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->item == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));
    cb_assert(c->substate == bin_no_state);

    d = c->extra;
    cb_assert(d);

    c->cmd_curr       = -1;
    c->cmd_start      = NULL;
    c->cmd_start_time = msec_current_time;
    c->cmd_retries    = 0;

    extlen  = c->binary_header.request.extlen;
    keylen  = c->binary_header.request.keylen;
    bodylen = c->binary_header.request.bodylen;

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_b2b_downstream %x %d %d %u\n",
                       c->sfd, c->cmd, extlen, keylen, bodylen);
    }

    cb_assert(bodylen >= (uint32_t) keylen + extlen);

    process_bin_noreply(c); /* Map quiet c->cmd values into non-quiet. */

    /* Our approach is to read everything we can before */
    /* getting into big switch/case statements for the */
    /* actual processing. */

    /* Alloc an item and continue with an rest-of-body nread if */
    /* necessary.  The item will hold the entire response message */
    /* (the header + body). */
    c->item = item_alloc("q", 1, 0, 0,
                         sizeof(c->binary_header) + bodylen);
    if (c->item == NULL) {
        d->ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    it = c->item;
    rb = c->rcurr;

    cb_assert(it->refcount == 1);

    memcpy(ITEM_data(it), rb, sizeof(c->binary_header));

    if (bodylen == 0) {
        /* Since we have no body bytes, we can go immediately to */
        /* the nread completed processing step. */
        cproxy_process_b2b_downstream_nread(c);
        return;
    }

    /* The body is read in right behind the header copied above. */
    c->ritem = ITEM_data(it) + sizeof(c->binary_header);
    c->rlbytes = bodylen;
    c->substate = bin_read_set_value;

    conn_set_state(c, conn_nread);
}
/* Used for broadcast commands, like no-op, flush_all or stats.
 *
 * Forwards uc->item to every usable downstream connection of `d`. When at
 * least one forward succeeded, a zeroed binary response header (magic,
 * opcode and opaque filled in) is attached to `uc` and recorded as the
 * downstream's upstream_suffix, and the downstream timeout is started.
 *
 * Returns true only when that suffix item was attached; false when nothing
 * was forwarded, the item could not be allocated, or it could not be added
 * to `uc`.
 */
bool cproxy_broadcast_b2b_downstream(downstream *d, conn *uc) {
    int nwrite = 0;
    int nconns;
    int i;
    item *it;
    protocol_binary_response_header *header;

    cb_assert(d != NULL);
    cb_assert(d->ptd != NULL);
    cb_assert(d->ptd->proxy != NULL);
    cb_assert(d->downstream_conns != NULL);
    cb_assert(uc != NULL);
    cb_assert(uc->next == NULL);
    cb_assert(uc->noreply == false);

    nconns = mcs_server_count(&d->mst);

    for (i = 0; i < nconns; i++) {
        conn *c = d->downstream_conns[i];
        if (c == NULL || c == NULL_CONN) {
            continue;
        }
        if (b2b_forward_item_vbucket(uc, d, uc->item, c, -1) == true) {
            nwrite++;
        }
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b broadcast nwrite %d out of %d\n",
                       uc->sfd, nwrite, nconns);
    }

    if (nwrite <= 0) {
        return false;
    }

    /* TODO: Handle binary 'stats reset' sub-command. */

    if (uc->cmd == PROTOCOL_BINARY_CMD_STAT &&
        d->merger == NULL) {
        d->merger = genhash_init(128, skeyhash_ops);
    }

    it = item_alloc("h", 1, 0, 0,
                    sizeof(protocol_binary_response_header));
    if (it == NULL) {
        return false;
    }

    header = (protocol_binary_response_header *) ITEM_data(it);

    memset(ITEM_data(it), 0, it->nbytes);

    header->response.magic  = (uint8_t) PROTOCOL_BINARY_RES;
    header->response.opcode = uc->binary_header.request.opcode;
    header->response.opaque = uc->opaque;

    if (!add_conn_item(uc, it)) {
        item_remove(it);
        return false;
    }

    d->upstream_suffix     = ITEM_data(it);
    d->upstream_suffix_len = it->nbytes;
    d->upstream_status     = PROTOCOL_BINARY_RESPONSE_SUCCESS;
    d->target_host_ident   = NULL;

    if (settings.verbose > 2) {
        moxi_log_write("%d: b2b broadcast upstream_suffix", uc->sfd);
        cproxy_dump_header(uc->sfd, ITEM_data(it));
    }

    /* TODO: Handle FLUSHQ (quiet binary flush_all). */

    d->downstream_used_start = nwrite;
    d->downstream_used       = nwrite;

    cproxy_start_downstream_timeout(d, NULL);

    return true;
}
/*
 * Parse and execute one text-protocol command line on connection c.
 *
 * c       - the client connection; replies go out through out_string()
 *           or by switching c->state.
 * command - the NUL-terminated command line (it is printed with %s and
 *           handed to strncmp()/sscanf() below).
 *
 * Recognized verbs: add/set/replace, incr/decr, get, delete, stats,
 * flush_all, version, quit and "slabs reassign".  Anything else is
 * answered with "ERROR".
 */
void process_command(conn *c, char *command) {

    int comm = 0;
    int incr = 0;

    /*
     * for commands set/add/replace, we build an item and read the data
     * directly into it, then continue in nread_complete().
     */

    if (settings.verbose > 1)
        fprintf(stderr, "<%d %s\n", c->sfd, command);

    /* All incoming commands will require a response, so we cork at the beginning,
       and uncork at the very end (usually by means of out_string) */
    set_cork(c, 1);

    if ((strncmp(command, "add ", 4) == 0 && (comm = NREAD_ADD)) ||
        (strncmp(command, "set ", 4) == 0 && (comm = NREAD_SET)) ||
        (strncmp(command, "replace ", 8) == 0 && (comm = NREAD_REPLACE))) {

        char key[251];
        int flags;
        time_t expire;
        int len, res;
        item *it;

        res = sscanf(command, "%*s %250s %u %ld %d\n", key, &flags, &expire, &len);
        /* A negative length is rejected here: it would otherwise be
         * passed on to item_alloc() as len+2. */
        if (res != 4 || strlen(key) == 0 || len < 0) {
            out_string(c, "CLIENT_ERROR bad command line format");
            return;
        }
        expire = realtime(expire);
        it = item_alloc(key, flags, expire, len+2);
        if (it == 0) {
            out_string(c, "SERVER_ERROR out of memory");
            /* swallow the data line */
            c->write_and_go = conn_swallow;
            c->sbytes = len+2;
            return;
        }

        /* Read the value (plus its trailing "\r\n") straight into the item. */
        c->item_comm = comm;
        c->item = it;
        c->rcurr = ITEM_data(it);
        c->rlbytes = it->nbytes;
        c->state = conn_nread;
        return;
    }

    if ((strncmp(command, "incr ", 5) == 0 && (incr = 1)) ||
        (strncmp(command, "decr ", 5) == 0)) {
        char temp[32];
        unsigned int value;
        item *it;
        unsigned int delta;
        char key[251];
        int res;
        char *ptr;
        char *end;
        time_t now = time(0);

        res = sscanf(command, "%*s %250s %u\n", key, &delta);
        if (res != 2 || strlen(key) == 0) {
            out_string(c, "CLIENT_ERROR bad command line format");
            return;
        }

        it = assoc_find(key);
        if (it && (it->it_flags & ITEM_DELETED)) {
            it = 0;
        }
        if (it && it->exptime && it->exptime < now) {
            item_unlink(it);
            it = 0;
        }

        if (!it) {
            out_string(c, "NOT_FOUND");
            return;
        }

        /* Skip ahead to the first digit.  The previous test used
         * (*ptr<'0' && *ptr>'9'), which can never be true, so nothing
         * was ever skipped.  The scan is bounded by the item's data so
         * a value without any digit cannot walk past the item. */
        ptr = ITEM_data(it);
        end = ptr + it->nbytes;
        while (ptr < end && *ptr && (*ptr < '0' || *ptr > '9')) ptr++;

        value = (ptr < end) ? atoi(ptr) : 0;

        if (incr)
            value += delta;
        else {
            /* decr clamps at zero rather than wrapping. */
            if (delta >= value) value = 0;
            else value -= delta;
        }

        sprintf(temp, "%u", value);
        res = strlen(temp);
        if (res + 2 > it->nbytes) { /* need to realloc */
            item *new_it;
            new_it = item_alloc(ITEM_key(it), it->flags, it->exptime, res + 2);
            if (new_it == 0) {
                out_string(c, "SERVER_ERROR out of memory");
                return;
            }
            memcpy(ITEM_data(new_it), temp, res);
            memcpy(ITEM_data(new_it) + res, "\r\n", 2);
            item_replace(it, new_it);
        } else { /* replace in-place */
            /* Pad with spaces up to the existing trailing "\r\n". */
            memcpy(ITEM_data(it), temp, res);
            memset(ITEM_data(it) + res, ' ', it->nbytes-res-2);
        }
        out_string(c, temp);
        return;
    }

    if (strncmp(command, "get ", 4) == 0) {

        char *start = command + 4;
        char key[251];
        int next;
        int i = 0;
        item *it;
        time_t now = time(0);

        /* One iteration per whitespace-separated key on the line. */
        while (sscanf(start, " %250s%n", key, &next) >= 1) {
            start += next;
            stats.get_cmds++;
            it = assoc_find(key);
            if (it && (it->it_flags & ITEM_DELETED)) {
                it = 0;
            }
            if (settings.oldest_live && it &&
                it->time <= settings.oldest_live) {
                item_unlink(it);
                it = 0;
            }
            if (it && it->exptime && it->exptime < now) {
                item_unlink(it);
                it = 0;
            }

            if (it) {
                if (i >= c->isize) {
                    /* Grow the per-conn item list; stop collecting on failure. */
                    item **new_list = realloc(c->ilist, sizeof(item *)*c->isize*2);
                    if (new_list) {
                        c->isize *= 2;
                        c->ilist = new_list;
                    } else break;
                }
                stats.get_hits++;
                it->refcount++;
                item_update(it);
                *(c->ilist + i) = it;
                i++;
            } else stats.get_misses++;
        }
        c->icurr = c->ilist;
        c->ileft = i;
        if (c->ileft) {
            c->ipart = 0;
            c->state = conn_mwrite;
            c->ibytes = 0;
            return;
        } else {
            out_string(c, "END");
            return;
        }
    }

    if (strncmp(command, "delete ", 7) == 0) {
        char key[251];
        item *it;
        int res;
        time_t exptime = 0;

        res = sscanf(command, "%*s %250s %ld", key, &exptime);
        /* Without a parsed key, `key` is uninitialized; do not look it up. */
        if (res < 1) {
            out_string(c, "CLIENT_ERROR bad command line format");
            return;
        }
        it = assoc_find(key);
        if (!it) {
            out_string(c, "NOT_FOUND");
            return;
        }

        if (exptime == 0) {
            item_unlink(it);
            out_string(c, "DELETED");
            return;
        }

        if (delcurr >= deltotal) {
            item **new_delete = realloc(todelete, sizeof(item *) * deltotal * 2);
            if (new_delete) {
                todelete = new_delete;
                deltotal *= 2;
            } else {
                /*
                 * can't delete it immediately, user wants a delay,
                 * but we ran out of memory for the delete queue
                 */
                out_string(c, "SERVER_ERROR out of memory");
                return;
            }
        }

        exptime = realtime(exptime);

        it->refcount++;
        /* use its expiration time as its deletion time now */
        it->exptime = exptime;
        it->it_flags |= ITEM_DELETED;
        todelete[delcurr++] = it;
        out_string(c, "DELETED");
        return;
    }

    if (strncmp(command, "stats", 5) == 0) {
        process_stat(c, command);
        return;
    }

    if (strcmp(command, "flush_all") == 0) {
        settings.oldest_live = time(0);
        out_string(c, "OK");
        return;
    }

    if (strcmp(command, "version") == 0) {
        out_string(c, "VERSION " VERSION);
        return;
    }

    if (strcmp(command, "quit") == 0) {
        c->state = conn_closing;
        return;
    }

    if (strncmp(command, "slabs reassign ", 15) == 0) {
        int src, dst;
        char *start = command+15;
        if (sscanf(start, "%u %u\r\n", &src, &dst) == 2) {
            int rv = slabs_reassign(src, dst);
            if (rv == 1) {
                out_string(c, "DONE");
                return;
            }
            if (rv == 0) {
                out_string(c, "CANT");
                return;
            }
            if (rv == -1) {
                out_string(c, "BUSY");
                return;
            }
        }
        out_string(c, "CLIENT_ERROR bogus command");
        return;
    }

    out_string(c, "ERROR");
    return;
}
/* One pass of the slab page mover.
 *
 * Examines up to slab_bulk_check chunks starting at slab_rebal.slab_pos,
 * classifies each one (enum move_status) and acts on it, then advances
 * slab_pos by one chunk of the source class.  Runs with slabs_lock held
 * except for the window inside MOVE_FROM_LRU where the lock is dropped.
 *
 * Returns the number of chunks this pass had to leave for a later pass
 * (was_busy): busy or lock-contended items, plus items that were
 * unlinked and freed instead of being copied.
 *
 * refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    /* Source slab class of the page being moved. */
    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* Chunk is sitting on the free list: pull it off. */
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        /* Give back the reference and the trylock taken above;
                         * only MOVE_FROM_LRU keeps both into the switch. */
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                /* Amount to subtract from s_cls->requested once slabs_lock
                 * is re-taken below; stays 0 when nothing was copied. */
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        /* Mark the old chunk as cleared for the move. */
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        /* Drop the reference taken on the head item above. */
                        refcount_decr(it);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        /* Step to the next chunk; stop early at the end of the page. */
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            /* Whole page walked with nothing left busy. */
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
/* Do the actual work of forwarding the command from an
 * upstream binary conn to its assigned binary downstream.
 *
 * d - the downstream whose upstream_conn holds the request to forward;
 *     it must be idle (downstream_used == 0, no multiget or merger).
 *
 * Returns the result of the broadcast or simple forward helper once
 * connections are prepared.  Returns true without forwarding when
 * cproxy_connect_downstream() reports -1, and false when the key maps
 * to no server, a downstream conn cannot be prepared for writing, or
 * no downstream connection is available.
 */
bool cproxy_forward_b2b_downstream(downstream *d) {
    int nc;
    int server_index;
    conn *uc;

    cb_assert(d != NULL);
    cb_assert(d->ptd != NULL);
    cb_assert(d->ptd->proxy != NULL);
    cb_assert(d->downstream_conns != NULL);
    cb_assert(d->downstream_used == 0);
    cb_assert(d->multiget == NULL);
    cb_assert(d->merger == NULL);

    d->downstream_used_start = 0;

    uc = d->upstream_conn;

    /* Check uc before the verbose log below dereferences it. */
    cb_assert(uc != NULL);

    if (settings.verbose > 2) {
        moxi_log_write("%d: cproxy_forward_b2b_downstream %x\n",
                       uc->sfd, uc->cmd);
    }

    cb_assert(uc->state == conn_pause);
    cb_assert(uc->cmd >= 0);
    cb_assert(uc->cmd_start == NULL);
    cb_assert(uc->thread != NULL);
    cb_assert(uc->thread->base != NULL);
    cb_assert(uc->noreply == false);
    cb_assert(IS_BINARY(uc->protocol));
    cb_assert(IS_PROXY(uc->protocol));

    /* -1 means "no specific server"; it is only replaced for a keyed,
     * non-broadcast command with nothing corked. */
    server_index = -1;

    if (cproxy_is_broadcast_cmd(uc->cmd) == false && uc->corked == NULL) {
        item *it = uc->item;
        protocol_binary_request_header *req;
        char *key;
        int key_len;

        cb_assert(it != NULL);

        /* The item holds the raw request: header, then extras, then key. */
        req = (protocol_binary_request_header *) ITEM_data(it);
        key = ((char *) req) + sizeof(*req) + req->request.extlen;
        key_len = ntohs(req->request.keylen);

        if (key_len > 0) {
            server_index = cproxy_server_index(d, key, key_len, NULL);
            if (server_index < 0) {
                return false;
            }
        }
    }

    nc = cproxy_connect_downstream(d, uc->thread, server_index);
    if (nc == -1) {
        /* NOTE(review): -1 is reported to the caller as success;
         * presumably the connect is still in progress - confirm. */
        return true;
    }

    if (nc > 0) {
        int i;
        int nconns;

        cb_assert(d->downstream_conns != NULL);

        if (d->usec_start == 0 &&
            d->ptd->behavior_pool.base.time_stats) {
            d->usec_start = usec_now();
        }

        nconns = mcs_server_count(&d->mst);

        for (i = 0; i < nconns; i++) {
            conn *c = d->downstream_conns[i];
            if (c != NULL &&
                c != NULL_CONN) {
                cb_assert(c->state == conn_pause);
                cb_assert(c->item == NULL);

                if (cproxy_prep_conn_for_write(c) == false) {
                    d->ptd->stats.stats.err_downstream_write_prep++;
                    cproxy_close_conn(c);
                    return false;
                }
            }
        }

        /* Uncork the saved-up quiet binary commands. */
        cproxy_binary_uncork_cmds(d, uc);

        if (uc->cmd == PROTOCOL_BINARY_CMD_FLUSH ||
            uc->cmd == PROTOCOL_BINARY_CMD_NOOP ||
            uc->cmd == PROTOCOL_BINARY_CMD_STAT) {
            return cproxy_broadcast_b2b_downstream(d, uc);
        }

        return cproxy_forward_b2b_simple_downstream(d, uc);
    }

    if (settings.verbose > 2) {
        moxi_log_write("%d: cproxy_forward_b2b_downstream connect failed\n",
                       uc->sfd);
    }

    return false;
}
/* Try to move one item from the COLD LRU tail of class clsid out to
 * external storage, replacing it in the hash table with a small header
 * item that records where the data went.
 *
 * storage  - opaque handle passed through to the extstore_* calls.
 * clsid    - slab class whose COLD LRU tail is pulled.
 * item_age - 0 to write regardless of age, otherwise the item is only
 *            written when current_time - it->time exceeds this value.
 *
 * Returns 1 when an item was written and replaced, 0 otherwise.
 */
static int storage_write(void *storage, const int clsid, const int item_age) {
    int did_moves = 0;
    struct lru_pull_tail_return it_info;

    it_info.it = NULL;
    lru_pull_tail(clsid, COLD_LRU, 0, LRU_PULL_RETURN_ITEM, 0, &it_info);
    /* Item is locked, and we have a reference to it. */
    if (it_info.it == NULL) {
        return did_moves;
    }

    obj_io io;
    item *it = it_info.it;
    /* First, storage for the header object */
    size_t orig_ntotal = ITEM_ntotal(it);
    uint32_t flags;
    /* Skip items that are already headers, and items that are too young. */
    if ((it->it_flags & ITEM_HDR) == 0 &&
            (item_age == 0 || current_time - it->time > item_age)) {
        FLAGS_CONV(it, flags);
        item *hdr_it = do_item_alloc(ITEM_key(it), it->nkey, flags, it->exptime, sizeof(item_hdr));
        /* Run the storage write understanding the start of the item is dirty.
         * We will fill it (time/exptime/etc) from the header item on read.
         */
        if (hdr_it != NULL) {
            int bucket = (it->it_flags & ITEM_CHUNKED) ?
                PAGE_BUCKET_CHUNKED : PAGE_BUCKET_DEFAULT;
            // Compress soon to expire items into similar pages.
            if (it->exptime - current_time < settings.ext_low_ttl) {
                bucket = PAGE_BUCKET_LOWTTL;
            }
            hdr_it->it_flags |= ITEM_HDR;
            io.len = orig_ntotal;
            io.mode = OBJ_IO_WRITE;
            // NOTE: when the item is read back in, the slab mover
            // may see it. Important to have refcount>=2 or ~ITEM_LINKED
            assert(it->refcount >= 2);
            // NOTE: write bucket vs free page bucket will disambiguate once
            // lowttl feature is better understood.
            if (extstore_write_request(storage, bucket, bucket, &io) == 0) {
                // cuddle the hash value into the time field so we don't have
                // to recalculate it.
                item *buf_it = (item *) io.buf;
                buf_it->time = it_info.hv;
                // copy from past the headers + time headers.
                // TODO: should be in items.c
                if (it->it_flags & ITEM_CHUNKED) {
                    // Need to loop through the item and copy
                    item_chunk *sch = (item_chunk *) ITEM_schunk(it);
                    int remain = orig_ntotal;
                    int copied = 0;
                    // copy original header
                    int hdrtotal = ITEM_ntotal(it) - it->nbytes;
                    memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET, hdrtotal - STORE_OFFSET);
                    copied = hdrtotal;
                    // copy data in like it were one large object.
                    while (sch && remain) {
                        assert(remain >= sch->used);
                        memcpy((char *)io.buf+copied, sch->data, sch->used);
                        // FIXME: use one variable?
                        remain -= sch->used;
                        copied += sch->used;
                        sch = sch->next;
                    }
                } else {
                    memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET, io.len-STORE_OFFSET);
                }
                // crc what we copied so we can do it sequentially.
                // The checksum is stored in the buffered copy's exptime field.
                buf_it->it_flags &= ~ITEM_LINKED;
                buf_it->exptime = crc32c(0, (char*)io.buf+STORE_OFFSET, orig_ntotal-STORE_OFFSET);
                extstore_write(storage, &io);
                // Record where the object landed inside the header item's data.
                item_hdr *hdr = (item_hdr *) ITEM_data(hdr_it);
                hdr->page_version = io.page_version;
                hdr->page_id = io.page_id;
                hdr->offset  = io.offset;
                // overload nbytes for the header it
                hdr_it->nbytes = it->nbytes;
                /* success! Now we need to fill relevant data into the new
                 * header and replace. Most of this requires the item lock
                 */
                /* CAS gets set while linking. Copy post-replace */
                item_replace(it, hdr_it, it_info.hv);
                ITEM_set_cas(hdr_it, ITEM_get_cas(it));
                do_item_remove(hdr_it);
                did_moves = 1;
                LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EXTSTORE_WRITE, it, bucket);
            } else {
                /* Failed to write for some reason, can't continue. */
                slabs_free(hdr_it, ITEM_ntotal(hdr_it), ITEM_clsid(hdr_it));
            }
        }
    }
    /* Release the reference and the item lock obtained via lru_pull_tail(). */
    do_item_remove(it);
    item_unlock(it_info.hv);
    return did_moves;
}
/* We reach here after nread'ing a header+body into an item. */ void cproxy_process_b2b_downstream_nread(conn *c) { conn *uc; item *it; downstream *d; protocol_binary_response_header *header; int extlen; int keylen; uint32_t bodylen; int status; int opcode; cb_assert(c != NULL); cb_assert(c->cmd >= 0); cb_assert(c->next == NULL); cb_assert(c->cmd_start == NULL); cb_assert(IS_BINARY(c->protocol)); cb_assert(IS_PROXY(c->protocol)); header = (protocol_binary_response_header *) &c->binary_header; extlen = header->response.extlen; keylen = header->response.keylen; bodylen = header->response.bodylen; status = ntohs(header->response.status); opcode = header->response.opcode; if (settings.verbose > 2) { moxi_log_write("<%d cproxy_process_b2b_downstream_nread %x %x %d %d %u %d %x\n", c->sfd, c->cmd, opcode, extlen, keylen, bodylen, c->noreply, status); } d = c->extra; cb_assert(d != NULL); cb_assert(d->ptd != NULL); cb_assert(d->ptd->proxy != NULL); /* TODO: Need to handle quiet binary command error response, */ /* in the right order. */ /* TODO: Need to handle not-my-vbucket error during a quiet cmd. */ uc = d->upstream_conn; it = c->item; /* Clear c->item because we either move it to the upstream or */ /* item_remove() it on error. */ c->item = NULL; cb_assert(it != NULL); cb_assert(it->refcount == 1); if (cproxy_binary_ignore_reply(c, header, it)) { return; } if (c->noreply) { conn_set_state(c, conn_new_cmd); } else { conn_set_state(c, conn_pause); if (opcode == PROTOCOL_BINARY_CMD_NOOP || opcode == PROTOCOL_BINARY_CMD_FLUSH) { goto done; } if (opcode == PROTOCOL_BINARY_CMD_STAT) { if (status == PROTOCOL_BINARY_RESPONSE_SUCCESS) { if (keylen > 0) { if (d->merger != NULL) { char *key = (ITEM_data(it)) + sizeof(*header) + extlen; char *val = key + keylen; protocol_stats_merge_name_val(d->merger, "STAT", 4, key, keylen, val, bodylen - keylen - extlen); } conn_set_state(c, conn_new_cmd); /* Get next STATS response. 
*/ } } goto done; } /* If the client is still there, we should handle */ /* a not-my-vbucket error with a possible retry. */ if (uc != NULL && status == PROTOCOL_BINARY_RESPONSE_NOT_MY_VBUCKET) { int max_retries; protocol_binary_request_header *req; int vbucket; int sindex; if (settings.verbose > 2) { moxi_log_write("<%d cproxy_process_b2b_downstream_nread not-my-vbucket, " "cmd: %x %d\n", c->sfd, header->response.opcode, uc->item != NULL); } cb_assert(uc->item != NULL); req = (protocol_binary_request_header *)ITEM_data((item*)uc->item); vbucket = ntohs(req->request.reserved); sindex = downstream_conn_index(d, c); if (settings.verbose > 2) { moxi_log_write("<%d cproxy_process_b2b_downstream_nread not-my-vbucket, " "cmd: %x not multi-key get, sindex %d, vbucket %d, retries %d\n", c->sfd, header->response.opcode, sindex, vbucket, uc->cmd_retries); } mcs_server_invalid_vbucket(&d->mst, sindex, vbucket); /* As long as the upstream is still open and we haven't */ /* retried too many times already. */ max_retries = cproxy_max_retries(d); if (uc->cmd_retries < max_retries) { uc->cmd_retries++; d->upstream_retry++; d->ptd->stats.stats.tot_retry_vbucket++; goto done; } if (settings.verbose > 2) { moxi_log_write("%d: cproxy_process_b2b_downstream_nread not-my-vbucket, " "cmd: %x skipping retry %d >= %d\n", c->sfd, header->response.opcode, uc->cmd_retries, max_retries); } } } /* Write the response to the upstream connection. 
*/ if (uc != NULL) { if (settings.verbose > 2) { moxi_log_write("<%d cproxy_process_b2b_downstream_nread got %u\n", c->sfd, it->nbytes); cproxy_dump_header(c->sfd, ITEM_data(it)); } if (add_conn_item(uc, it) == true) { it->refcount++; if (add_iov(uc, ITEM_data(it), it->nbytes) == 0) { /* If we got a quiet response, however, don't change the */ /* upstream connection's state (should be in paused state), */ /* as we expect the downstream server to provide a */ /* verbal/non-quiet response that moves the downstream */ /* conn through the conn_pause countdown codepath. */ if (c->noreply == false) { cproxy_update_event_write(d, uc); conn_set_state(uc, conn_mwrite); } goto done; } } d->ptd->stats.stats.err_oom++; cproxy_close_conn(uc); } done: if (it != NULL) { item_remove(it); } }
/* Walk a page that was read back from external storage during
 * compaction and re-write the objects that are still referenced.
 *
 * storage       - opaque handle passed through to the extstore_* calls.
 * l             - logger used for the end-of-read log entry.
 * drop_unread   - when true, objects whose header item sits in the COLD
 *                 LRU are skipped instead of re-written.
 * readback_buf  - the page contents; items are laid out back to back.
 * page_id,
 * page_version  - identity of the page being compacted; a header item
 *                 must match both to count as still valid.
 * read_size     - number of bytes available in readback_buf.
 *
 * Updates the extstore_compact_{lost,rescues,skipped} stats.
 */
static void storage_compact_readback(void *storage, logger *l,
        bool drop_unread, char *readback_buf,
        uint32_t page_id, uint64_t page_version, uint64_t read_size) {
    uint64_t offset = 0;
    unsigned int rescues = 0;
    unsigned int lost = 0;
    unsigned int skipped = 0;

    while (offset < read_size) {
        item *hdr_it = NULL;
        item_hdr *hdr = NULL;
        item *it = (item *)(readback_buf+offset);
        unsigned int ntotal;
        // probably zeroed out junk at the end of the wbuf
        if (it->nkey == 0) {
            break;
        }

        ntotal = ITEM_ntotal(it);
        // The stored copy's time field is read back as the hash value.
        uint32_t hv = (uint32_t)it->time;
        item_lock(hv);
        // We don't have a conn and don't need to do most of do_item_get
        hdr_it = assoc_find(ITEM_key(it), it->nkey, hv);
        if (hdr_it != NULL) {
            bool do_write = false;
            refcount_incr(hdr_it);

            // Check validity but don't bother removing it.
            if ((hdr_it->it_flags & ITEM_HDR) && !item_is_flushed(hdr_it) &&
                   (hdr_it->exptime == 0 || hdr_it->exptime > current_time)) {
                hdr = (item_hdr *)ITEM_data(hdr_it);
                if (hdr->page_id == page_id && hdr->page_version == page_version) {
                    // Item header is still completely valid.
                    extstore_delete(storage, page_id, page_version, 1, ntotal);
                    // drop inactive items.
                    if (drop_unread && GET_LRU(hdr_it->slabs_clsid) == COLD_LRU) {
                        do_write = false;
                        skipped++;
                    } else {
                        do_write = true;
                    }
                }
            }

            if (do_write) {
                bool do_update = false;
                int tries;
                obj_io io;
                io.len = ntotal;
                io.mode = OBJ_IO_WRITE;
                // Up to 10 attempts to get a write buffer, 1ms apart.
                for (tries = 10; tries > 0; tries--) {
                    if (extstore_write_request(storage, PAGE_BUCKET_COMPACT, PAGE_BUCKET_COMPACT, &io) == 0) {
                        memcpy(io.buf, it, io.len);
                        extstore_write(storage, &io);
                        do_update = true;
                        break;
                    } else {
                        usleep(1000);
                    }
                }

                if (do_update) {
                    // NOTE(review): this tests the refcount stored in the
                    // readback copy (`it`), not the live header item
                    // (`hdr_it`) whose fields are updated below - confirm
                    // which one is intended.
                    if (it->refcount == 2) {
                        hdr->page_version = io.page_version;
                        hdr->page_id = io.page_id;
                        hdr->offset = io.offset;
                        rescues++;
                    } else {
                        lost++;
                        // TODO: re-alloc and replace header.
                    }
                } else {
                    lost++;
                }
            }

            do_item_remove(hdr_it);
        }

        item_unlock(hv);
        offset += ntotal;
        // Stop when what remains is smaller than an item header.
        if (read_size - offset < sizeof(struct _stritem))
            break;
    }

    STATS_LOCK();
    stats.extstore_compact_lost += lost;
    stats.extstore_compact_rescues += rescues;
    stats.extstore_compact_skipped += skipped;
    STATS_UNLOCK();
    LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_READ_END,
            NULL, page_id, offset, rescues, lost, skipped);
}
/* Entry point for a binary request header received from an upstream
 * client.
 *
 * VERSION and QUIT are dispatched directly.  For everything else an
 * item is allocated to hold the whole request (header + body); the
 * header is copied in and the conn either starts an nread for the
 * body or, with an empty body, is paused for the downstream (or, for
 * SASL_LIST_MECHS, answered with "PLAIN").
 */
void cproxy_process_upstream_binary(conn *c) {
    cb_assert(c != NULL);
    cb_assert(c->cmd >= 0);
    cb_assert(c->next == NULL);
    cb_assert(c->item == NULL);
    cb_assert(IS_BINARY(c->protocol));
    cb_assert(IS_PROXY(c->protocol));

    proxy_td *ptd = c->extra;
    cb_assert(ptd != NULL);

    if (!cproxy_prep_conn_for_write(c)) {
        ptd->stats.stats.err_upstream_write_prep++;
        conn_set_state(c, conn_closing);
        return;
    }

    /* Fresh command: clear the per-command tracking fields. */
    c->cmd_curr = -1;
    c->cmd_start = NULL;
    c->cmd_start_time = msec_current_time;
    c->cmd_retries = 0;

    int ext_bytes = c->binary_header.request.extlen;
    int key_bytes = c->binary_header.request.keylen;
    uint32_t body_bytes = c->binary_header.request.bodylen;

    cb_assert(body_bytes >= (uint32_t) key_bytes + ext_bytes);

    if (settings.verbose > 2) {
        moxi_log_write("<%d cproxy_process_upstream_binary %x %d %d %u\n",
                       c->sfd, c->cmd, ext_bytes, key_bytes, body_bytes);
    }

    process_bin_noreply(c); /* Map quiet c->cmd values into non-quiet. */

    if (c->cmd == PROTOCOL_BINARY_CMD_VERSION ||
        c->cmd == PROTOCOL_BINARY_CMD_QUIT) {
        dispatch_bin_command(c);
        return;
    }

    /* The item holds the entire request message (header + body); the */
    /* one-byte key "u" is only a placeholder for the allocator. */
    item *req_item = item_alloc("u", 1, 0, 0,
                                sizeof(c->binary_header) + body_bytes);
    c->item = req_item;

    if (req_item == NULL) {
        if (settings.verbose > 2) {
            moxi_log_write("<%d cproxy_process_upstream_binary OOM\n",
                           c->sfd);
        }
        ptd->stats.stats.err_oom++;
        cproxy_close_conn(c);
        return;
    }

    cb_assert(req_item->refcount == 1);

    memcpy(ITEM_data(req_item), c->rcurr, sizeof(c->binary_header));

    if (body_bytes > 0) {
        /* Body still pending: read it in right after the header. */
        c->ritem = ITEM_data(req_item) + sizeof(c->binary_header);
        c->rlbytes = body_bytes;
        c->substate = bin_read_set_value;

        conn_set_state(c, conn_nread);
        return;
    }

    /* No body bytes, so go immediately to the nread completed */
    /* processing step. */
    if (c->binary_header.request.opcode == PROTOCOL_BINARY_CMD_SASL_LIST_MECHS) {
        /* TODO: One day handle more than just PLAIN sasl auth. */
        write_bin_response(c, "PLAIN", 0, 0, strlen("PLAIN"));
        return;
    }

    cproxy_pause_upstream_for_downstream(ptd, c);
}
/*
 * Append a "rep" replication command for item `it` to c's write buffer.
 *
 * Bytes written at c->wcurr + c->wbytes:
 *   "rep " <key> ' ' <flags> ' ' <exptime> ' ' <nbytes-2> ' ' <cas> "\r\n" <data>
 * where <flags> is the first token of the item's suffix and <data> is
 * the item's nbytes bytes of data.
 *
 * Returns 0 on success, -1 when replication_alloc() fails.
 */
static int replication_rep(conn *c, item *it)
{
    int exp = 0;
    int len = 0;
    char *s = "rep ";
    char *n = "\r\n";
    char *p = NULL;
    char flag[40];
    size_t flaglen;

    if (it->exptime)
        exp = it->exptime + stats.started;

    /* Extract the flags token from the item suffix into flag[]. */
    flag[0] = 0;
    p = ITEM_suffix(it);
    if (p != NULL && it->nsuffix >= 2) {
        /* Copy the suffix without its last two bytes, never writing
         * past flag[]; an nsuffix below 2 would otherwise underflow. */
        size_t copylen = (size_t)it->nsuffix - 2;
        size_t lead = 0;

        if (copylen > sizeof(flag) - 1)
            copylen = sizeof(flag) - 1;
        memcpy(flag, p, copylen);
        flag[copylen] = 0;

        /* Drop leading blanks.  The terminator is moved as well so no
         * stale tail bytes stay behind the shifted text. */
        flaglen = strlen(flag);
        while (lead < flaglen && flag[lead] <= ' ')
            lead++;
        memmove(flag, &flag[lead], flaglen - lead + 1);

        /* Cut at the first blank: only the flags token remains. */
        for (p = flag; *p > ' '; p++)
            ;
        *p = 0;
    }
    flaglen = strlen(flag);

    /* Work out how much room the command needs.
     * NOTE(review): replication_get_num(NULL, v) is used here as
     * "length of v when printed" - confirm against its definition.
     * The trailing strlen(n) has no matching write below. */
    len += strlen(s);
    len += it->nkey;
    len += 1;
    len += flaglen;
    len += 1;
    len += replication_get_num(NULL, exp);
    len += 1;
    len += replication_get_num(NULL, it->nbytes - 2);
    len += 1;
    len += replication_get_num(NULL, it->cas_id);
    len += strlen(n);
    len += it->nbytes;
    len += strlen(n);
    if (replication_alloc(c, len) == -1) {
        fprintf(stderr, "replication: rep alloc error\n");
        return(-1);
    }

    /* Emit the command after whatever is already queued. */
    p = c->wcurr + c->wbytes;
    memcpy(p, s, strlen(s));
    p += strlen(s);
    memcpy(p, ITEM_key(it), it->nkey);
    p += it->nkey;
    *(p++) = ' ';
    memcpy(p, flag, flaglen);
    p += flaglen;
    *(p++) = ' ';
    p += replication_get_num(p, exp);
    *(p++) = ' ';
    p += replication_get_num(p, it->nbytes - 2);
    *(p++) = ' ';
    p += replication_get_num(p, it->cas_id);
    memcpy(p, n, strlen(n));
    p += strlen(n);
    memcpy(p, ITEM_data(it), it->nbytes);
    p += it->nbytes;
    c->wbytes = p - c->wcurr;
    return(0);
}
/* Forward an upstream command that came with item data,
 * like set/add/replace/etc.
 *
 * d   - downstream to forward through (already connected).
 * cmd - the NREAD_* command code; converted to a verb by nread_text().
 * it  - item carrying key, suffix (flags + length) and data.
 * uc  - the upstream conn that issued the command.
 *
 * Returns true when the request was handled locally ("self") or
 * queued for writing on a downstream conn, false otherwise.
 */
bool cproxy_forward_a2a_item_downstream(downstream *d, short cmd,
                                        item *it, conn *uc) {
    assert(d != NULL);
    assert(d->ptd != NULL);
    assert(d->ptd->proxy != NULL);
    assert(d->downstream_conns != NULL);
    assert(it != NULL);
    assert(uc != NULL);
    assert(uc->next == NULL);

    // Assuming we're already connected to downstream.
    //
    bool self = false;

    conn *c = cproxy_find_downstream_conn(d, ITEM_key(it), it->nkey,
                                          &self);
    if (c != NULL) {
        if (self) {
            cproxy_optimize_to_self(d, uc, uc->cmd_start);
            complete_nread_ascii(uc);
            return true;
        }

        if (cproxy_prep_conn_for_write(c)) {
            assert(c->state == conn_pause);

            char *verb = nread_text(cmd);
            assert(verb != NULL);

            // The suffix is split at the blank that separates the
            // flags part from the length part.  Nothing is derived
            // from a pointer until it is known to be non-NULL.
            //
            char *str_flags   = ITEM_suffix(it);
            char *str_length  = (str_flags != NULL ?
                                 strchr(str_flags + 1, ' ') : NULL);
            int   len_flags   = 0;
            int   len_length  = 0;
            if (str_length != NULL) {
                len_flags  = str_length - str_flags;
                len_length = it->nsuffix - len_flags - 2;
            }
            char *str_exptime = add_conn_suffix(c);
            char *str_cas     = (cmd == NREAD_CAS ? add_conn_suffix(c) : NULL);

            if (str_flags != NULL &&
                str_length != NULL &&
                len_flags > 1 &&
                len_length > 1 &&
                str_exptime != NULL &&
                (cmd != NREAD_CAS ||
                 str_cas != NULL)) {
                sprintf(str_exptime, " %u", it->exptime);

                if (str_cas != NULL)
                    sprintf(str_cas, " %llu",
                            (unsigned long long) ITEM_get_cas(it));

                // Request line pieces in wire order, followed by
                // the item data.
                //
                if (add_iov(c, verb, strlen(verb)) == 0 &&
                    add_iov(c, ITEM_key(it), it->nkey) == 0 &&
                    add_iov(c, str_flags, len_flags) == 0 &&
                    add_iov(c, str_exptime, strlen(str_exptime)) == 0 &&
                    add_iov(c, str_length, len_length) == 0 &&
                    (str_cas == NULL ||
                     add_iov(c, str_cas, strlen(str_cas)) == 0) &&
                    (uc->noreply == false ||
                     add_iov(c, " noreply", 8) == 0) &&
                    add_iov(c, ITEM_data(it) - 2, it->nbytes + 2) == 0) {
                    conn_set_state(c, conn_mwrite);
                    c->write_and_go = conn_new_cmd;

                    if (update_event(c, EV_WRITE | EV_PERSIST)) {
                        d->downstream_used_start = 1;
                        d->downstream_used       = 1;

                        if (cproxy_dettach_if_noreply(d, uc) == false) {
                            cproxy_start_downstream_timeout(d, c);

                            // During a synchronous (with-reply) SET,
                            // handle fire-&-forget SET optimization.
                            //
                            if (cmd == NREAD_SET &&
                                cproxy_optimize_set_ascii(d, uc,
                                                          ITEM_key(it),
                                                          it->nkey)) {
                                d->ptd->stats.stats.tot_optimize_sets++;
                            }
                        } else {
                            c->write_and_go = conn_pause;

                            mcache_delete(&d->ptd->proxy->front_cache,
                                          ITEM_key(it), it->nkey);
                        }

                        return true;
                    }
                }

                d->ptd->stats.stats.err_oom++;
                cproxy_close_conn(c);
            } else {
                // TODO: Handle this weird error case.
            }
        } else {
            d->ptd->stats.stats.err_downstream_write_prep++;
            cproxy_close_conn(c);
        }

        if (settings.verbose > 1)
            fprintf(stderr, "Proxy item write out of memory");
    }

    return false;
}
item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags, const rel_time_t exptime, const int nbytes) { int i; uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; /* If no memory is available, attempt a direct LRU juggle/eviction */ /* This is a race in order to simplify lru_pull_tail; in cases where * locked items are on the tail, you want them to fall out and cause * occasional OOM's, rather than internally work around them. * This also gives one fewer code path for slab alloc/free */ /* TODO: if power_largest, try a lot more times? or a number of times * based on how many chunks the new object should take up? * or based on the size of an object lru_pull_tail() says it evicted? * This is a classical GC problem if "large items" are of too varying of * sizes. This is actually okay here since the larger the data, the more * bandwidth it takes, the more time we can loop in comparison to serving * and replacing small items. 
*/ for (i = 0; i < 10; i++) { uint64_t total_bytes; /* Try to reclaim memory first */ if (!settings.lru_maintainer_thread) { lru_pull_tail(id, COLD_LRU, 0, 0); } it = slabs_alloc(ntotal, id, &total_bytes, 0); if (settings.expirezero_does_not_evict) total_bytes -= noexp_lru_size(id); if (it == NULL) { if (settings.lru_maintainer_thread) { lru_pull_tail(id, HOT_LRU, total_bytes, 0); lru_pull_tail(id, WARM_LRU, total_bytes, 0); if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT) <= 0) break; } else { if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT) <= 0) break; } } else { break; } } if (i > 0) { pthread_mutex_lock(&lru_locks[id]); itemstats[id].direct_reclaims += i; pthread_mutex_unlock(&lru_locks[id]); } if (it == NULL) { pthread_mutex_lock(&lru_locks[id]); itemstats[id].outofmemory++; pthread_mutex_unlock(&lru_locks[id]); return NULL; } assert(it->slabs_clsid == 0); //assert(it != heads[id]); /* Refcount is seeded to 1 by slabs_alloc() */ it->next = it->prev = 0; /* Items are initially loaded into the HOT_LRU. This is '0' but I want at * least a note here. Compiler (hopefully?) optimizes this out. */ if (settings.lru_maintainer_thread) { if (exptime == 0 && settings.expirezero_does_not_evict) { id |= NOEXP_LRU; } else { id |= HOT_LRU; } } else { /* There is only COLD in compat-mode */ id |= COLD_LRU; } it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags |= settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; /* Need to shuffle the pointer stored in h_next into it->data. 
*/ if (it->it_flags & ITEM_CHUNKED) { item_chunk *chunk = (item_chunk *) ITEM_data(it); chunk->next = (item_chunk *) it->h_next; chunk->prev = 0; chunk->head = it; /* Need to chain back into the head's chunk */ chunk->next->prev = chunk; chunk->size = chunk->next->size - ((char *)chunk - (char *)it); chunk->used = 0; assert(chunk->size > 0); } it->h_next = 0; return it; }