// Send protocol header to the requesting client. static int batch_send_header(int fd, size_t len) { as_proto proto; proto.version = PROTO_VERSION; proto.type = PROTO_TYPE_AS_MSG; proto.sz = len; as_proto_swap(&proto); return batch_send(fd, (uint8_t*) &proto, 8, MSG_NOSIGNAL | MSG_MORE); }
// Write the contents of a buffer builder to a client socket.
//
// The first 8 bytes of bb_r->buf are overwritten with an as_proto header
// (sz = used_sz - 8) on every call, then bytes [*offset, used_sz) are sent.
//
// On EAGAIN the send is retried after a 100us sleep. In non-blocking mode,
// once more than AS_NETIO_MAX_IO_RETRY retries have happened, the current
// position is stored in *offset and AS_NETIO_CONTINUE is returned so the
// caller can resume later. *offset is not written on any other path.
//
// Returns AS_NETIO_OK when everything was sent, AS_NETIO_CONTINUE as described
// above, or AS_NETIO_IO_ERR when send() fails with anything but EAGAIN.
int
as_netio_send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, bool blocking)
{
#if defined(USE_SYSTEMTAP)
    uint64_t nodeid = g_config.self_node;
#endif

    uint8_t *buf = bb_r->buf;
    uint32_t len = bb_r->used_sz;

    // Stamp the protocol header at the front of the buffer.
    as_proto hdr;

    hdr.version = PROTO_VERSION;
    hdr.type = PROTO_TYPE_AS_MSG;
    hdr.sz = len - 8;
    as_proto_swap(&hdr);
    memcpy(bb_r->buf, &hdr, 8);

    uint32_t pos = *offset;

    ASD_QUERY_SENDPACKET_STARTING(nodeid, pos, len);

    int n_retries = 0;

    cf_detail(AS_PROTO," Start At %p %d %d", buf, pos, len);

    while (pos < len) {
        int rv = send(fd_h->fd, buf + pos, len - pos, MSG_NOSIGNAL);

        if (rv > 0) {
            pos += rv;
            continue;
        }

        // Nothing was written - anything other than EAGAIN is fatal.
        if (errno != EAGAIN) {
            cf_debug(AS_PROTO, "Packet send response error returned %d errno %d fd %d", rv, errno, fd_h->fd);
            return AS_NETIO_IO_ERR;
        }

        // Non-blocking callers give up after enough retries and resume later.
        if (!blocking && n_retries > AS_NETIO_MAX_IO_RETRY) {
            *offset = pos;
            cf_detail(AS_PROTO," End At %p %d %d", buf, pos, len);
            ASD_QUERY_SENDPACKET_CONTINUE(nodeid, pos);
            return AS_NETIO_CONTINUE;
        }

        // Bigger packets, so try a few extra times.
        n_retries++;
        usleep(100);
    }

    ASD_QUERY_SENDPACKET_FINISHED(nodeid);

    return AS_NETIO_OK;
}
// Security is an enterprise feature. If we receive a security message from a // client here, quickly return AS_SEC_ERR_NOT_SUPPORTED. The client may choose // to continue using this (unsecured) socket. void as_security_transact(as_transaction* tr) { // We don't need the request, since we're ignoring it. cf_free(tr->msgp); tr->msgp = NULL; // Set up a simple response with a single as_sec_msg that has no fields. size_t resp_size = sizeof(as_proto) + sizeof(as_sec_msg); uint8_t resp[resp_size]; // Fill out the as_proto fields. as_proto* p_resp_proto = (as_proto*)resp; p_resp_proto->version = PROTO_VERSION; p_resp_proto->type = PROTO_TYPE_SECURITY; p_resp_proto->sz = sizeof(as_sec_msg); // Switch to network byte order. as_proto_swap(p_resp_proto); uint8_t* p_proto_body = resp + sizeof(as_proto); memset((void*)p_proto_body, 0, sizeof(as_sec_msg)); // Fill out the relevant as_sec_msg fields. as_sec_msg* p_sec_msg = (as_sec_msg*)p_proto_body; p_sec_msg->scheme = AS_SEC_MSG_SCHEME; p_sec_msg->result = AS_SEC_ERR_NOT_SUPPORTED; // Send the complete response. cf_socket *sock = &tr->from.proto_fd_h->sock; if (cf_socket_send_all(sock, resp, resp_size, MSG_NOSIGNAL, CF_SOCKET_TIMEOUT) < 0) { cf_warning(AS_SECURITY, "fd %d send failed, errno %d", CSFD(sock), errno); as_end_of_transaction_force_close(tr->from.proto_fd_h); tr->from.proto_fd_h = NULL; return; } as_end_of_transaction_ok(tr->from.proto_fd_h); tr->from.proto_fd_h = NULL; }
// Write a query response buffer to the client socket.
//
// Every call re-stamps the first 8 bytes of bb_r->buf with an as_proto header
// (sz = used_sz - 8) and then sends bytes [*offset, used_sz).
//
// EAGAIN is retried after a 100us sleep. When `blocking` is false and more
// than AS_NETIO_MAX_IO_RETRY retries have occurred, the position reached is
// saved in *offset and AS_NETIO_CONTINUE is returned. Any other send() failure
// logs a warning, shuts the socket down in both directions, and returns
// AS_NETIO_ERR. Returns AS_NETIO_OK once the whole buffer has been sent.
int
as_query__send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, bool blocking)
{
    uint8_t *buf = bb_r->buf;
    uint32_t len = bb_r->used_sz;

    // Protocol header goes at the very front of the buffer.
    as_proto hdr;

    hdr.version = PROTO_VERSION;
    hdr.type = PROTO_TYPE_AS_MSG;
    hdr.sz = len - 8;
    as_proto_swap(&hdr);
    memcpy(bb_r->buf, &hdr, 8);

    uint32_t pos = *offset;
    int attempts = 0;

    cf_detail(AS_PROTO," Start At %p %d %d", buf, pos, len);

    while (pos < len) {
        int rv = send(fd_h->fd, buf + pos, len - pos, MSG_NOSIGNAL );

        if (rv > 0) {
            pos += rv;
            continue;
        }

        if (errno != EAGAIN) {
            cf_warning(AS_PROTO, "Packet send response error returned %d errno %d fd %d", rv, errno, fd_h->fd);
            shutdown(fd_h->fd, SHUT_RDWR);
            return AS_NETIO_ERR;
        }

        if (!blocking && attempts > AS_NETIO_MAX_IO_RETRY) {
            // Remember where we stopped so the caller can pick up from here.
            *offset = pos;
            cf_detail(AS_PROTO," End At %p %d %d", buf, pos, len);
            return AS_NETIO_CONTINUE;
        }

        // Bigger packets, so try a few extra times.
        attempts++;
        usleep(100);
    }

    return AS_NETIO_OK;
}
int as_msg_send_fin(int fd, uint32_t result_code) { cl_msg m; m.proto.version = PROTO_VERSION; m.proto.type = PROTO_TYPE_AS_MSG; m.proto.sz = sizeof(as_msg); as_proto_swap(&m.proto); m.msg.header_sz = sizeof(as_msg); m.msg.info1 = 0; m.msg.info2 = 0; m.msg.info3 = AS_MSG_INFO3_LAST; m.msg.unused = 0; m.msg.result_code = result_code; m.msg.generation = 0; m.msg.record_ttl = 0; m.msg.transaction_ttl = 0; m.msg.n_fields = 0; m.msg.n_ops = 0; as_msg_swap_header(&m.msg); return as_msg_send_response(fd, (uint8_t*) &m, sizeof(m), MSG_NOSIGNAL); }
// Security is an enterprise feature. If we receive a security message from a // client here, quickly return AS_SEC_ERR_NOT_SUPPORTED. The client may choose // to continue using this (unsecured) socket. void as_security_transact(as_transaction* tr) { // We don't need the request, since we're ignoring it. cf_free(tr->msgp); tr->msgp = NULL; // Set up a simple response with a single as_sec_msg that has no fields. size_t resp_size = sizeof(as_proto) + sizeof(as_sec_msg); uint8_t resp[resp_size]; // Fill out the as_proto fields. as_proto* p_resp_proto = (as_proto*)resp; p_resp_proto->version = PROTO_VERSION; p_resp_proto->type = PROTO_TYPE_SECURITY; p_resp_proto->sz = sizeof(as_sec_msg); // Switch to network byte order. as_proto_swap(p_resp_proto); uint8_t* p_proto_body = resp + sizeof(as_proto); memset((void*)p_proto_body, 0, sizeof(as_sec_msg)); // Fill out the relevant as_sec_msg fields. as_sec_msg* p_sec_msg = (as_sec_msg*)p_proto_body; p_sec_msg->scheme = AS_SEC_MSG_SCHEME; p_sec_msg->result = AS_SEC_ERR_NOT_SUPPORTED; // Send the complete response. uint8_t* p_write = resp; uint8_t* p_end = resp + resp_size; int fd = tr->from.proto_fd_h->fd; while (p_write < p_end) { int rv = send(fd, (void*)p_write, p_end - p_write, MSG_NOSIGNAL); if (rv > 0) { p_write += rv; } else if (rv == 0) { cf_warning(AS_SECURITY, "fd %d send returned 0", fd); as_end_of_transaction_force_close(tr->from.proto_fd_h); tr->from.proto_fd_h = NULL; return; } // rv < 0 else if (errno == EAGAIN || errno == EWOULDBLOCK) { usleep(1); } else { cf_warning(AS_SECURITY, "fd %d send failed, errno %d", fd, errno); as_end_of_transaction_force_close(tr->from.proto_fd_h); tr->from.proto_fd_h = NULL; return; } } as_end_of_transaction_ok(tr->from.proto_fd_h); tr->from.proto_fd_h = NULL; }
// Build a complete response message: as_proto + as_msg header, optional
// transaction-id and set-name fields, then one as_msg_op per bin slot.
//
// Parameters:
//   result_code, generation, void_time - copied into the as_msg header
//       (void_time goes into record_ttl).
//   ops, bins, bin_count - bin_count parallel slots. For slot i, bins[i] (if
//       non-NULL) supplies name, version and particle; otherwise ops[i] (if
//       non-NULL) supplies just the name and the op is sent with a NULL
//       particle type.
//   ns - namespace, used for bin-name lookup and the single_bin setting.
//   msgp_in, msg_sz_in - optional caller buffer and its size. If msgp_in is
//       NULL or *msg_sz_in is smaller than the required size, a buffer is
//       obtained with cf_malloc() instead. On success *msg_sz_in is set to the
//       total message size.
//   trid - if non-zero, echoed back as an AS_MSG_FIELD_TYPE_TRID field.
//   setname - if non-NULL, sent as an AS_MSG_FIELD_TYPE_SET field.
//
// Returns the message, or NULL (0) if cf_malloc() fails, in which case
// *msg_sz_in is left untouched.
// NOTE(review): presumably the caller compares the result with msgp_in to
// decide whether it must free the buffer - confirm against callers.
cl_msg *
as_msg_make_response_msg(uint32_t result_code, uint32_t generation,
        uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count,
        as_namespace *ns, cl_msg *msgp_in, size_t *msg_sz_in, uint64_t trid,
        const char *setname)
{
    int setname_len = 0;

    // Pass 1 - figure out the size of the entire buffer.
    int msg_sz = sizeof(cl_msg);

    msg_sz += sizeof(as_msg_op) * bin_count; // the bin headers

    for (uint16_t i = 0; i < bin_count; i++) {
        if (bins[i]) {
            msg_sz += ns->single_bin ? 0 :
                    strlen(as_bin_get_name_from_id(ns, bins[i]->id));

            uint32_t psz;

            if (as_bin_is_hidden(bins[i])) {
                psz = 0;
            }
            else {
                bool tojson = (as_bin_get_particle_type(bins[i]) ==
                        AS_PARTICLE_TYPE_LUA_BLOB);

                _as_particle_tobuf(bins[i], 0, &psz, tojson); // get size
            }

            msg_sz += psz;
        }
        else if (ops[i]) { // no bin, only op, no particle size
            msg_sz += ops[i]->name_sz;
        }
        else {
            // Neither a bin nor an op for this slot. Only the fixed op header
            // is reserved; pass 2 emits an empty-name NULL op to match.
            cf_warning(AS_PROTO, "internal error!");
        }
    }

    // If a transaction-id is sent by the client, we should send it back in a
    // field.
    if (trid != 0) {
        msg_sz += (sizeof(as_msg_field) + sizeof(trid));
    }

    // If setname is present, we will send it as a field. Account for its space
    // overhead.
    if (setname != NULL) {
        setname_len = strlen(setname);
        msg_sz += (sizeof(as_msg_field) + setname_len);
    }

    // Most cases are small messages - use the caller's buffer if we can.
    byte *b;

    if ((0 == msgp_in) || (*msg_sz_in < msg_sz)) {
        b = cf_malloc(msg_sz);

        if (!b) {
            return(0);
        }
    }
    else {
        b = (byte *) msgp_in;
    }

    *msg_sz_in = msg_sz;

    // Pass 2 - fill the buffer. Set up the header first.
    byte *buf = b; // current buffer pointer
    cl_msg *msgp = (cl_msg *) buf;

    msgp->proto.version = PROTO_VERSION;
    msgp->proto.type = PROTO_TYPE_AS_MSG;
    msgp->proto.sz = msg_sz - sizeof(as_proto);
    as_proto_swap(&msgp->proto);

    as_msg *m = &msgp->msg;

    m->header_sz = sizeof(as_msg);
    m->info1 = 0;
    m->info2 = 0;
    m->info3 = 0;
    m->unused = 0;
    m->result_code = result_code;
    m->generation = generation;
    m->record_ttl = void_time;
    m->transaction_ttl = 0;
    m->n_ops = bin_count;
    m->n_fields = 0;

    // Count the number of fields that we are going to send back.
    if (trid != 0) {
        m->n_fields++;
    }

    if (setname != NULL) {
        m->n_fields++;
    }

    as_msg_swap_header(m);

    buf += sizeof(cl_msg);

    // If we have to send back the transaction-id, we have fields to send back.
    if (trid != 0) {
        as_msg_field *trfield = (as_msg_field *) buf;

        // Allow space for the message field header.
        buf += sizeof(as_msg_field);

        // Fill the field header.
        trfield->type = AS_MSG_FIELD_TYPE_TRID;

        // Copy the transaction-id as field data in network byte order
        // (big-endian).
        uint64_t trid_nbo = __cpu_to_be64(trid);

        trfield->field_sz = sizeof(trid_nbo);
        memcpy(trfield->data, &trid_nbo, sizeof(trid_nbo));
        as_msg_swap_field(trfield);

        // Allow space for the message field data.
        buf += sizeof(trid_nbo);
    }

    // If we have to send back the setname, we have fields to send back.
    if (setname != NULL) {
        as_msg_field *trfield = (as_msg_field *) buf;

        // Allow space for the message field header.
        buf += sizeof(as_msg_field);

        // Fill the field header. The name is copied without its terminator.
        trfield->type = AS_MSG_FIELD_TYPE_SET;
        trfield->field_sz = setname_len + 1;
        memcpy(trfield->data, setname, setname_len);
        as_msg_swap_field(trfield);

        // Allow space for the message field data.
        buf += setname_len;
    }

    // Over all bins, copy into the buffer.
    for (uint16_t i = 0; i < bin_count; i++) {
        as_msg_op *op = (as_msg_op *)buf;

        buf += sizeof(as_msg_op);

        op->op = AS_MSG_OP_READ;

        if (bins[i]) {
            op->version = as_bin_get_version(bins[i], ns->single_bin);
            op->name_sz = as_bin_memcpy_name(ns, op->name, bins[i]);
        }
        else if (ops[i]) {
            op->version = 0;
            memcpy(op->name, ops[i]->name, ops[i]->name_sz);
            op->name_sz = ops[i]->name_sz;
        }
        else {
            // Slot has neither bin nor op (already warned about in pass 1).
            // Do not dereference ops[i]; emit an empty name, which is exactly
            // what pass 1 reserved space for.
            op->version = 0;
            op->name_sz = 0;
        }

        buf += op->name_sz;

        // Since there are two variable bits, the size is everything after the
        // data bytes - and this is only the head, we're patching up the rest
        // in a minute.
        op->op_sz = 4 + op->name_sz;

        if (bins[i] && as_bin_inuse(bins[i])) {
            op->particle_type =
                    as_particle_type_convert(as_bin_get_particle_type(bins[i]));

            // Size remaining in buffer, for safety.
            uint32_t psz = msg_sz - (buf - b);

            if (as_bin_is_hidden(bins[i])) {
                op->particle_type = AS_PARTICLE_TYPE_NULL;
                psz = 0; // packet of size NULL
            }
            else {
                bool tojson = (as_bin_get_particle_type(bins[i]) ==
                        AS_PARTICLE_TYPE_LUA_BLOB);

                if (0 != _as_particle_tobuf(bins[i], buf, &psz, tojson)) {
                    cf_warning(AS_PROTO, "particle to buf: could not copy data!");
                }
            }

            buf += psz;
            op->op_sz += psz;
        }
        else {
            op->particle_type = AS_PARTICLE_TYPE_NULL;
        }

        as_msg_swap_op(op);
    }

    return((cl_msg *) b);
}
// Incoming messages start here. // - Could get a request that we need to service. // - Could get a response to one of our requests - need to find the request and // send the real response to the remote end. int proxy_msg_fn(cf_node id, msg *m, void *udata) { int rv; if (cf_rc_count((void*)m) == 0) { cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m); return -1; } uint32_t op = 99999; msg_get_uint32(m, PROXY_FIELD_OP, &op); uint32_t transaction_id = 0; msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id); cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id); switch (op) { case PROXY_OP_REQUEST: { cf_atomic_int_incr(&g_config.proxy_action); #ifdef DEBUG cf_debug(AS_PROXY, "Proxy_msg: received request"); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy msg"); #endif #endif cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "proxy msg function: no digest, problem"); as_fabric_msg_put(m); return 0; } cl_msg *msgp; size_t as_msg_sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) { cf_info(AS_PROXY, "proxy msg function: no as msg, problem"); as_fabric_msg_put(m); return 0; } uint64_t cluster_key = 0; if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) { cf_info(AS_PROXY, "proxy msg function: no cluster key, problem"); as_fabric_msg_put(m); return 0; } // This is allowed to fail - this is a new field, and gets defaulted // to 0 if it doesn't exist. uint32_t timeout_ms = 0; msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms); // cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms); // Put the as_msg on the normal queue for processing. // INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.incoming_cluster_key = cluster_key; tr.end_time = (timeout_ms != 0) ? 
((uint64_t)timeout_ms * 1000000) + tr.start_time : 0; tr.proxy_node = id; tr.proxy_msg = m; // Check here if this is shipped op. uint32_t info = 0; msg_get_uint32(m, PROXY_FIELD_INFO, &info); if (info & PROXY_INFO_SHIPPED_OP) { tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP; cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received"); } else { cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid); } MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); } break; case PROXY_OP_RESPONSE: { #ifdef DEBUG // Got the response from the actual endpoint. cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy response"); #endif #endif // Look up the element. proxy_request pr; bool free_msg = true; if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) { // Found the element (sometimes we get two acks so it's OK for // an ack to not find the transaction). if (pr.wr) { as_proxy_shipop_response_hdlr(m, &pr, &free_msg); } else { as_proto *proto; size_t proto_sz; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "msg get buf failed!"); } #ifdef DEBUG_VERBOSE cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd); for (size_t _i = 0; _i < proto_sz; _i++) { fprintf(stderr, " %x", ((byte *)proto)[_i]); if (_i % 16 == 15) { fprintf(stderr, "\n"); } } #endif #ifdef EXTRA_CHECKS as_proto proto_copy = *proto; as_proto_swap(&proto_copy); if (proto_copy.sz + 8 != proto_sz) { cf_info(AS_PROXY, "BONE BONE BONE!!!"); cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz); } #endif // Write to the file descriptor. 
cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd); cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0"); if (pr.batch_shared) { cf_digest* digest; size_t digest_sz = 0; if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) { as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz); as_proxy_set_stat_counters(0); } else { cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id); as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN); as_proxy_set_stat_counters(-1); } cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); } else { size_t pos = 0; while (pos < proto_sz) { rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL); if (rv > 0) { pos += rv; } else if (rv < 0) { if (errno != EWOULDBLOCK) { // Common message when a client aborts. cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } usleep(1); // yield } else { cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } } as_proxy_set_stat_counters(0); SendFin: cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); // Return the fabric message or the direct file descriptor - // after write and complete. 
pr.fd_h->t_inprogress = false; AS_RELEASE_FILE_HANDLE(pr.fd_h); pr.fd_h = 0; } as_fabric_msg_put(pr.fab_msg); pr.fab_msg = 0; } } else { cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id); as_proxy_set_stat_counters(-1); } if (free_msg) { as_fabric_msg_put(m); } } break; case PROXY_OP_REDIRECT: { // Sometimes the destination we proxied a request to isn't able to // satisfy it (for example, their copy of the partition in question // might be desync). cf_node new_dst = 0; msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst); cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst); // Look in the proxy retransmit hash for the tid. proxy_request *pr; pthread_mutex_t *pr_lock; int r = 0; if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) { cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id); as_fabric_msg_put(m); return -1; } if (g_config.self_node == new_dst) { // Although we don't know we're the final destination, undo the // proxy-nature and put back on the main queue. Dangerous, as it // leaves open the possibility of a looping message. cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } cl_msg *msgp; sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } // Put the as_msg on the normal queue for processing. 
// INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.start_time = pr->start_time; // start time tr.end_time = pr->end_time; tr.proto_fd_h = pr->fd_h; tr.batch_shared = pr->batch_shared; tr.batch_index = pr->batch_index; MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); as_fabric_msg_put(pr->fab_msg); shash_delete_lockfree(g_proxy_hash, &transaction_id); } else { // Change the destination, update the retransmit time. pr->dest = new_dst; pr->xmit_ms = cf_getms() + 1; // Send it. msg_incr_ref(pr->fab_msg); if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) { cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv); as_fabric_msg_put(pr->fab_msg); } } pthread_mutex_unlock(pr_lock); } as_fabric_msg_put(m); break; default: cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op); msg_dump(m, "proxy received unknown msg"); as_fabric_msg_put(m); break; } // end switch return 0; } // end proxy_msg_fn()
// Set of threads which talk to client over the connection for doing the needful // processing. Note that once fd is assigned to a thread all the work on that fd // is done by that thread. Fair fd usage is expected of the client. First thread // is special - also does accept [listens for new connections]. It is the only // thread which does it. void * thr_demarshal(void *arg) { cf_socket_cfg *s, *ls; // Create my epoll fd, register in the global list. struct epoll_event ev; int nevents, i, n, epoll_fd; cf_clock last_fd_print = 0; #if defined(USE_SYSTEMTAP) uint64_t nodeid = g_config.self_node; #endif // Early stage aborts; these will cause faults in process scope. cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument"); s = &g_config.socket; ls = &g_config.localhost_socket; #ifdef USE_JEM int orig_arena; if (0 > (orig_arena = jem_get_arena())) { cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!"); } else { cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena); } #endif // Figure out my thread index. pthread_t self = pthread_self(); int thr_id; for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) { if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) break; } if (thr_id == MAX_DEMARSHAL_THREADS) { cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!"); return(0); } // First thread accepts new connection at interface socket. 
if (thr_id == 0) { demarshal_file_handle_init(); epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); memset(&ev, 0, sizeof (ev)); ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = s->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port); if (ls->sock) { ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = ls->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port); } } else { epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); } g_demarshal_args->epoll_fd[thr_id] = epoll_fd; cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id); int id_cntr = 0; // Demarshal transactions from the socket. for ( ; ; ) { struct epoll_event events[EPOLL_SZ]; cf_detail(AS_DEMARSHAL, "calling epoll"); nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1); if (0 > nevents) { cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno)); } cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents); uint64_t now_ns = cf_getns(); uint64_t now_ms = now_ns / 1000000; // Iterate over all events. for (i = 0; i < nevents; i++) { if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) { // Accept new connections on the service socket. int csocket = -1; struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); char cpaddr[64]; if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) { // This means we're out of file descriptors - could be a SYN // flood attack or misbehaving client. 
Eventually we'd like // to make the reaper fairer, but for now we'll just have to // ignore the accept error and move on. if ((errno == EMFILE) || (errno == ENFILE)) { if (last_fd_print != (cf_getms() / 1000L)) { cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit"); last_fd_print = cf_getms() / 1000L; } continue; } cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); } // Get the client IP address in string form. if (caddr.sin_family == AF_INET) { if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else if (caddr.sin_family == AF_INET6) { struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr; if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else { cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family); } cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket); // Validate the limit of protocol connections we allow. uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed; if (conns_open > g_config.n_proto_fd_max) { if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); last_fd_print = cf_getms(); } shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Set the socket to nonblocking. if (-1 == cf_socket_set_nonblocking(csocket)) { cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Create as_file_handle and queue it up in epoll_fd for further // communication on one of the demarshal threads. 
as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); if (!fd_h) { cf_crash(AS_DEMARSHAL, "malloc"); } sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port)); fd_h->fd = csocket; fd_h->last_used = cf_getms(); fd_h->reap_me = false; fd_h->trans_active = false; fd_h->proto = 0; fd_h->proto_unread = 0; fd_h->fh_info = 0; fd_h->security_filter = as_security_filter_create(); // Insert into the global table so the reaper can manage it. Do // this before queueing it up for demarshal threads - once // EPOLL_CTL_ADD is done it's difficult to back out (if insert // into global table fails) because fd state could be anything. cf_rc_reserve(fd_h); pthread_mutex_lock(&g_file_handle_a_LOCK); int j; bool inserted = true; if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { inserted = false; } else { g_file_handle_a[j] = fd_h; } pthread_mutex_unlock(&g_file_handle_a_LOCK); if (!inserted) { cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; cf_rc_free(fd_h); // will free even with ref-count of 2 } else { // Place the client socket in the event queue. memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ; ev.data.ptr = fd_h; // Round-robin pick up demarshal thread epoll_fd and add // this new connection to epoll. 
int id; while (true) { id = (id_cntr++) % g_demarshal_args->num_threads; if (g_demarshal_args->epoll_fd[id] != 0) { break; } } fd_h->epoll_fd = g_demarshal_args->epoll_fd[id]; if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) { cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads); pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); } else { cf_atomic_int_incr(&g_config.proto_connections_opened); } } } else { bool has_extra_ref = false; as_file_handle *fd_h = events[i].data.ptr; if (fd_h == 0) { cf_info(AS_DEMARSHAL, "event with null handle, continuing"); goto NextEvent; } cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events); // Process data on an existing connection: this might be more // activity on an already existing transaction, so we have some // state to manage. as_proto *proto_p = 0; int fd = fd_h->fd; if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events); // no longer in use: out of epoll etc goto NextEvent_FD_Cleanup; } if (fd_h->trans_active) { goto NextEvent; } // If pointer is NULL, then we need to create a transaction and // store it in the buffer. if (fd_h->proto == NULL) { as_proto proto; int sz; /* Get the number of available bytes */ if (-1 == ioctl(fd, FIONREAD, &sz)) { cf_info(AS_DEMARSHAL, "unable to get number of available bytes"); goto NextEvent_FD_Cleanup; } // If we don't have enough data to fill the message buffer, // just wait and we'll come back to this one. However, we'll // let messages with zero size through, since they are // likely errors. We don't cleanup the FD in this case since // we'll get more data on it. 
if (sz < sizeof(as_proto) && sz != 0) { goto NextEvent; } // Do a preliminary read of the header into a stack- // allocated structure, so that later on we can allocate the // entire message buffer. if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) { cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno); goto NextEvent_FD_Cleanup; } if (proto.version != PROTO_VERSION && // For backward compatibility, allow version 0 with // security messages. ! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) { cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", fd_h->client, proto.version); goto NextEvent_FD_Cleanup; } // Swap the necessary elements of the as_proto. as_proto_swap(&proto); if (proto.sz > PROTO_SIZE_MAX) { cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64, fd_h->client, PROTO_SIZE_MAX, proto.sz); goto NextEvent_FD_Cleanup; } #ifdef USE_JEM // Attempt to peek the namespace and set the JEMalloc arena accordingly. size_t peeked_data_sz = 0; size_t min_field_sz = sizeof(uint32_t) + sizeof(char); size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz; size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.) uint8_t peekbuf[peekbuf_sz]; if (PROTO_TYPE_AS_MSG == proto.type) { size_t offset = sizeof(as_msg); // Number of bytes to peek from the socket. // size_t peek_sz = peekbuf_sz; // Peak up to the size of the peek buffer. size_t peek_sz = MIN(proto.sz, peekbuf_sz); // Peek only up to the minimum necessary number of bytes. if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) { // That's actually legitimate. The as_proto may have gone into one // packet, the as_msg into the next one, which we haven't yet received. // This just "never happened" without async. 
cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz); } if (peeked_data_sz > min_as_msg_sz) { // cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz); if (peeked_data_sz > proto.sz) { cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd); log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz); goto NextEvent_FD_Cleanup; } if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) { jem_set_arena(orig_arena); } else { uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0; bool found = false; // cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields); while (!found && (field_num < n_fields)) { as_msg_field *field = (as_msg_field *) (&peekbuf[offset]); uint32_t value_sz = ntohl(field->field_sz) - 1; // cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset); // cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz); // cf_debug(AS_DEMARSHAL, "\ttype %d", field->type); if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) { if (value_sz >= AS_ID_NAMESPACE_SZ) { cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz); goto NextEvent_FD_Cleanup; } char ns[AS_ID_NAMESPACE_SZ]; found = true; memcpy(ns, field->data, value_sz); ns[value_sz] = '\0'; // cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num); jem_set_arena(as_namespace_get_jem_arena(ns)); } else { // cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type); field_num++; offset += sizeof(as_msg_field) + value_sz; if (offset >= peeked_data_sz) { break; } } } if (!found) { cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz); jem_set_arena(orig_arena); } } } else { jem_set_arena(orig_arena); } } else { jem_set_arena(orig_arena); } #endif // Allocate the complete message buffer. 
proto_p = cf_malloc(sizeof(as_proto) + proto.sz); cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno)); memcpy(proto_p, &proto, sizeof(as_proto)); #ifdef USE_JEM // Jam in the peeked data. if (peeked_data_sz) { memcpy(proto_p->data, &peekbuf, peeked_data_sz); } fd_h->proto_unread = proto_p->sz - peeked_data_sz; #else fd_h->proto_unread = proto_p->sz; #endif fd_h->proto = (void *) proto_p; } else { proto_p = fd_h->proto; } if (fd_h->proto_unread > 0) { // Read the data. n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0); if (0 >= n) { if (errno == EAGAIN) { continue; } cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno)); goto NextEvent_FD_Cleanup; } // Decrement bytes-unread counter. cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread); fd_h->proto_unread -= n; } // Check for a finished read. if (0 == fd_h->proto_unread) { // It's only really live if it's injecting a transaction. fd_h->last_used = now_ms; thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress fd_h->proto = 0; fd_h->proto_unread = 0; // INIT_TR as_transaction tr; as_transaction_init(&tr, NULL, (cl_msg *)proto_p); cf_rc_reserve(fd_h); has_extra_ref = true; tr.proto_fd_h = fd_h; tr.start_time = now_ns; // set transaction start time tr.preprocessed = false; if (! as_proto_is_valid_type(proto_p)) { cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); // We got a proto message type we don't recognize, so it // may not do any good to send back an as_msg error, but // it's the best we can do. At least we can keep the fd. 
as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } if (g_config.microbenchmarks) { histogram_insert_data_point(g_config.demarshal_hist, now_ns); tr.microbenchmark_time = cf_getns(); } // Check if it's compressed. if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { // Decompress it - allocate buffer to hold decompressed // packet. uint8_t *decompressed_buf = NULL; size_t decompressed_buf_size = 0; int rv = 0; if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv); cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Count the packets. cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1); // Free the compressed packet since we'll be using the // decompressed packet from now on. cf_free(proto_p); proto_p = NULL; // Get original packet. tr.msgp = (cl_msg *)decompressed_buf; as_proto_swap(&(tr.msgp->proto)); if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } } // Security protocol transactions. if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) { as_security_transact(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Info protocol requests. 
if (tr.msgp->proto.type == PROTO_TYPE_INFO) { if (as_info(&tr)) { cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer"); goto NextEvent_FD_Cleanup; } cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp); // Fast path for batch requests. if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) { as_batch_queue_task(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Either process the transaction directly in this thread, // or queue it for processing by another thread (tsvc/info). if (0 != thr_tsvc_process_or_enqueue(&tr)) { cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread"); goto NextEvent_FD_Cleanup; } else { cf_atomic_int_incr(&g_config.proto_transactions); } } // Jump the proto message free & FD cleanup. If we get here, the // above operations went smoothly. The message free & FD cleanup // job is handled elsewhere as directed by // thr_tsvc_process_or_enqueue(). goto NextEvent; NextEvent_FD_Cleanup: // If we allocated memory for the incoming message, free it. if (proto_p) { cf_free(proto_p); fd_h->proto = 0; } // If fd has extra reference for transaction, release it. if (has_extra_ref) { cf_rc_release(fd_h); } // Remove the fd from the events list. if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) { cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)", fd, epoll_fd, errno, cf_strerror(errno)); } pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); NextEvent: ; } // We should never be canceled externally, but just in case... pthread_testcancel(); } } return NULL; }
// Build a client response message: as_proto header, as_msg header, optional
// fields (transaction id, set name), then one as_msg_op per bin.
//
// result_code, generation - stored in the as_msg header.
// void_time               - stored in the as_msg header's record_ttl.
// ops                     - if not NULL, ops[i] supplies the op code and name
//                           for the i-th response op.
// bins                    - bins[i] supplies the value of the i-th response
//                           op. When ops is NULL the op is a read op and its
//                           name is filled in by as_bin_memcpy_name(); a NULL
//                           bins[i] with ops also NULL triggers cf_crash().
// bin_count               - number of ops written (stored in n_ops).
// ns                      - namespace, used to look up bin names.
// msgp_in, msg_sz_in      - optional caller-supplied buffer and its size. The
//                           message is built in msgp_in only if msgp_in is not
//                           NULL and *msg_sz_in is at least the size needed;
//                           otherwise a buffer is obtained from cf_malloc().
//                           msg_sz_in itself is always dereferenced.
// trid                    - if non-zero, a transaction id field is appended.
// setname                 - if not NULL, a set name field is appended.
//
// Returns a pointer to the buffer holding the message (msgp_in or the newly
// allocated buffer - compare against msgp_in to tell which) and sets
// *msg_sz_in to the total message size. Returns NULL if the allocation fails,
// in which case *msg_sz_in is left untouched.
cl_msg *
as_msg_make_response_msg(uint32_t result_code, uint32_t generation,
		uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count,
		as_namespace *ns, cl_msg *msgp_in, size_t *msg_sz_in, uint64_t trid,
		const char *setname)
{
	// Pass 1 - work out how many bytes the whole message needs.
	size_t msg_sz = sizeof(cl_msg);

	msg_sz += sizeof(as_msg_op) * bin_count;

	for (uint16_t i = 0; i < bin_count; i++) {
		if (ops) {
			msg_sz += ops[i]->name_sz;
		}
		else if (bins[i]) {
			msg_sz += ns->single_bin ?
					0 : strlen(as_bin_get_name_from_id(ns, bins[i]->id));
		}
		else {
			cf_crash(AS_PROTO, "making response message with null bin and op");
		}

		if (bins[i]) {
			msg_sz += as_bin_particle_client_value_size(bins[i]);
		}
	}

	if (trid != 0) {
		msg_sz += sizeof(as_msg_field) + sizeof(trid);
	}

	uint32_t setname_len = 0;

	if (setname) {
		setname_len = strlen(setname);
		msg_sz += sizeof(as_msg_field) + setname_len;
	}

	// Use the caller's buffer when it is present and big enough, otherwise
	// allocate one of exactly the required size.
	uint8_t *b;

	if (! msgp_in || *msg_sz_in < msg_sz) {
		b = cf_malloc(msg_sz);

		if (! b) {
			return NULL;
		}
	}
	else {
		b = (uint8_t *)msgp_in;
	}

	*msg_sz_in = msg_sz;

	// Pass 2 - write the message. buf is the write cursor, b stays at the
	// start of the buffer for the return value.
	uint8_t *buf = b;
	cl_msg *msgp = (cl_msg *)buf;

	msgp->proto.version = PROTO_VERSION;
	msgp->proto.type = PROTO_TYPE_AS_MSG;
	msgp->proto.sz = msg_sz - sizeof(as_proto);
	as_proto_swap(&msgp->proto);

	as_msg *m = &msgp->msg;

	m->header_sz = sizeof(as_msg);
	m->info1 = 0;
	m->info2 = 0;
	m->info3 = 0;
	m->unused = 0;
	m->result_code = result_code;
	m->generation = generation;
	m->record_ttl = void_time;
	m->transaction_ttl = 0;
	m->n_ops = bin_count;
	m->n_fields = 0;

	buf += sizeof(cl_msg);

	if (trid != 0) {
		m->n_fields++;

		as_msg_field *trfield = (as_msg_field *)buf;

		// field_sz counts the type byte plus the data.
		trfield->field_sz = 1 + sizeof(uint64_t);
		trfield->type = AS_MSG_FIELD_TYPE_TRID;

		// The field data lives at an arbitrary byte offset in the buffer, so
		// copy the big-endian value in rather than storing through a cast
		// uint64_t pointer (which would be a potentially misaligned access).
		uint64_t trid_be = cf_swap_to_be64(trid);

		memcpy(trfield->data, &trid_be, sizeof(trid_be));

		buf += sizeof(as_msg_field) + sizeof(uint64_t);
		as_msg_swap_field(trfield);
	}

	if (setname) {
		m->n_fields++;

		as_msg_field *set_field = (as_msg_field *)buf;

		set_field->field_sz = 1 + setname_len;
		set_field->type = AS_MSG_FIELD_TYPE_SET;
		memcpy(set_field->data, setname, setname_len);

		buf += sizeof(as_msg_field) + setname_len;
		as_msg_swap_field(set_field);
	}

	// Swap the header only now - n_fields was still being updated above.
	as_msg_swap_header(m);

	for (uint16_t i = 0; i < bin_count; i++) {
		as_msg_op *op = (as_msg_op *)buf;

		op->version = 0;

		if (ops) {
			op->op = ops[i]->op;
			memcpy(op->name, ops[i]->name, ops[i]->name_sz);
			op->name_sz = ops[i]->name_sz;
		}
		else {
			op->op = AS_MSG_OP_READ;
			op->name_sz = as_bin_memcpy_name(ns, op->name, bins[i]);
		}

		op->op_sz = 4 + op->name_sz;

		buf += sizeof(as_msg_op) + op->name_sz;

		// Writes the bin's value after the op name; its return value is the
		// number of bytes to advance the cursor by. Must run before the op is
		// byte-swapped, since it is handed the op in host order.
		buf += as_bin_particle_to_client(bins[i], op);

		as_msg_swap_op(op);
	}

	return (cl_msg *)b;
}