// Called when writes encounter a "doomed" record, to delete the doomed record // and create a new one in place without giving up the record lock. // FIXME - won't be able to "rescue" with future sindex method - will go away. void as_record_rescue(as_index_ref *r_ref, as_namespace *ns) { record_delete_adjust_sindex(r_ref->r, ns); as_record_destroy(r_ref->r, ns); as_index_clear_record_info(r_ref->r); cf_atomic64_incr(&ns->n_objects); }
// Bump exactly one of the namespace's client-read counters, selected by
// result_code: success, timeout, not-found, or (for anything else) error.
static inline void
client_read_update_stats(as_namespace* ns, uint8_t result_code)
{
	if (result_code == AS_OK) {
		cf_atomic64_incr(&ns->n_client_read_success);
		return;
	}

	if (result_code == AS_ERR_TIMEOUT) {
		cf_atomic64_incr(&ns->n_client_read_timeout);
		return;
	}

	if (result_code == AS_ERR_NOT_FOUND) {
		cf_atomic64_incr(&ns->n_client_read_not_found);
		return;
	}

	// Every other result code is accounted as a generic read error.
	cf_atomic64_incr(&ns->n_client_read_error);
}
// Returns: // 1 - created new record // 0 - found existing record // -1 - failure - could not allocate arena stage int as_record_get_create(as_index_tree *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns) { int rv = as_index_get_insert_vlock(tree, keyd, r_ref); if (rv == 1) { cf_atomic64_incr(&ns->n_objects); } return rv; }
//------------------------------------------------ // Insert a data point. Points out of range will // end up in the bucket at the appropriate end. // void linear_histogram_insert_data_point(linear_histogram *h, uint64_t point) { int64_t offset = point - h->start; int64_t bucket = 0; if (offset > 0) { bucket = offset / h->bucket_width; if (bucket >= (int64_t)h->num_buckets) { bucket = h->num_buckets - 1; } } cf_atomic64_incr(&h->counts[bucket]); }
// Create an LDT sub-record for lrecord.
//
// Takes a free slot from lrecord, then repeatedly randomizes a copy of the head
// record's digest (stamped with lrecord->version) and tries to create a record
// under it, for up to LDT_SUBRECORD_RANDOMIZER_MAX_RETRIES attempts.
//
// Returns the slot's c_urec_p (after as_val_reserve()) on success. On failure
// the slot is destroyed and NULL is returned. Crashes if no free slot exists.
as_rec *
crec_create(ldt_record *lrecord)
{
	// Start from the head record's key digest; each attempt randomizes it.
	udf_record *head_urec = (udf_record *) as_rec_source(lrecord->h_urec);
	as_namespace *ns = head_urec->tr->rsv.ns;
	cf_digest keyd = head_urec->r_ref->r->key;

	ldt_slot *slot = slot_lookup_free(lrecord, "crec_create");

	if (slot == NULL) {
		cf_crash(AS_LDT, "Allocation error !!!");
	}

	slot_init(slot, lrecord);

	int retry_cnt = 0;

	while (retry_cnt++ < LDT_SUBRECORD_RANDOMIZER_MAX_RETRIES) {
		as_ldt_digest_randomizer(&keyd);
		as_ldt_subdigest_setversion(&keyd, lrecord->version);
		slot_setup_digest(slot, &keyd);

		// rv == 0 - created
		// rv == 1 - a record already exists under this digest, retry
		// anything else - failure
		int rv = as_aerospike_rec_create(lrecord->as, slot->c_urec_p);

		if (rv == 1) {
			// Digest collision - count it and try another random digest.
			cf_atomic64_incr(&ns->lstats.ldt_randomizer_retry);
			continue;
		}

		if (rv != 0) {
			cf_warning(AS_LDT, "crec_create: LDT Sub-Record Create Error [rv=%d]... Fail", rv);
			break;
		}

		cf_detail_digest(AS_LDT, &keyd, "Crec Create:Ptr(%p) Digest: version %ld", slot->c_urec_p, lrecord->version);
		as_val_reserve(slot->c_urec_p);

		return slot->c_urec_p;
	}

	// Hard failure or retries exhausted - give the slot back.
	slot_destroy(slot, lrecord);
	cf_warning_digest(AS_LDT, &keyd, "ldt_aerospike_crec_create : Create failed after %d retries", retry_cnt);

	return NULL;
}
//------------------------------------------------ // Insert a time interval data point. The interval // is time elapsed since start_ns, converted to // milliseconds or microseconds as appropriate. // Assumes start_ns was obtained via cf_getns() // some time ago. Generates a histogram with // either: // // bucket millisecond range // ------ ----------------- // 0 0 to 1 (more exactly, 0.999999) // 1 1 to 2 (more exactly, 1.999999) // 2 2 to 4 (more exactly, 3.999999) // 3 4 to 8 (more exactly, 7.999999) // 4 8 to 16 (more exactly, 15.999999) // etc. // // or: // // bucket microsecond range // ------ ----------------- // 0 0 to 1 (more exactly, 0.999) // 1 1 to 2 (more exactly, 1.999) // 2 2 to 4 (more exactly, 3.999) // 3 4 to 8 (more exactly, 7.999) // 4 8 to 16 (more exactly, 15.999) // etc. // void histogram_insert_data_point(histogram *h, uint64_t start_ns) { uint64_t end_ns = cf_getns(); uint64_t delta_t = (end_ns - start_ns) / h->time_div; int bucket = 0; if (delta_t != 0) { bucket = msb(delta_t); if (start_ns > end_ns) { // Either the clock went backwards, or wrapped. (Assume the former, // since it takes ~580 years from 0 to wrap.) cf_warning(AS_INFO, "clock went backwards: start %lu end %lu", start_ns, end_ns); bucket = 0; } } cf_atomic64_incr(&h->counts[bucket]); }
// Find set_name in the namespace's set vmap, adding it if it is not there yet,
// and account for one more element in that set.
//
// On success *p_set_id receives the vmap index plus one.
//
// Returns:
//  0 - success
// -1 - null/over-long name, vmap full, or other vmap error
// -2 - apply_restrictions is true and the existing set is being deleted or has
//      breached its stop-writes limit
int
as_namespace_get_create_set(as_namespace *ns, const char *set_name, uint16_t *p_set_id, bool apply_restrictions)
{
	if (set_name == NULL) {
		// Should be impossible.
		cf_warning(AS_NAMESPACE, "null set name");
		return -1;
	}

	uint32_t idx;
	cf_vmapx_err result = cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx);

	if (result != CF_VMAPX_OK && result != CF_VMAPX_ERR_NAME_NOT_FOUND) {
		// Should be impossible.
		cf_warning(AS_NAMESPACE, "unexpected error %d", result);
		return -1;
	}

	if (result == CF_VMAPX_ERR_NAME_NOT_FOUND) {
		as_set new_set;

		memset(&new_set, 0, sizeof(new_set));

		// Check name length just once, here at insertion. (Other vmap calls are
		// safe if name is too long - they return CF_VMAPX_ERR_NAME_NOT_FOUND.)
		// The struct was zeroed, so a non-zero last byte means the name filled
		// the whole buffer without room for a terminator.
		strncpy(new_set.name, set_name, AS_SET_NAME_MAX_SIZE);

		if (new_set.name[AS_SET_NAME_MAX_SIZE - 1] != 0) {
			new_set.name[AS_SET_NAME_MAX_SIZE - 1] = 0;
			cf_info(AS_NAMESPACE, "set name %s... too long", new_set.name);
			return -1;
		}

		// The new set starts out already counting the element being added.
		new_set.num_elements = 1;
		result = cf_vmapx_put_unique(ns->p_sets_vmap, &new_set, &idx);

		if (result == CF_VMAPX_OK) {
			// Freshly inserted - no restrictions to check, count already set.
			*p_set_id = (uint16_t)(idx + 1);
			return 0;
		}

		if (result == CF_VMAPX_ERR_FULL) {
			cf_info(AS_NAMESPACE, "at set names limit, can't add %s", new_set.name);
			return -1;
		}

		if (result != CF_VMAPX_ERR_NAME_EXISTS) {
			// Currently, remaining errors are all some form of out-of-memory.
			cf_info(AS_NAMESPACE, "error %d, can't add %s", result, new_set.name);
			return -1;
		}

		// CF_VMAPX_ERR_NAME_EXISTS - the name got into the vmap between our
		// lookup and our insert. Treat it as an existing set, below.
	}

	// The set was already in the vmap - fetch it via its index.
	as_set *p_set;

	result = cf_vmapx_get_by_index(ns->p_sets_vmap, idx, (void**)&p_set);

	if (result != CF_VMAPX_OK) {
		// Should be impossible - just verified idx.
		cf_warning(AS_NAMESPACE, "unexpected error %d", result);
		return -1;
	}

	// If requested, fail if emptying set or stop-writes limit is breached.
	if (apply_restrictions) {
		if (IS_SET_DELETED(p_set) || as_set_stop_writes(p_set)) {
			return -2;
		}
	}

	// The set passed all tests - need to increment its num_elements.
	cf_atomic64_incr(&p_set->num_elements);

	*p_set_id = (uint16_t)(idx + 1);

	return 0;
}
//------------------------------------------------ // Insert a raw data point. Generates a histogram // with: // // bucket value range // ------ ----------- // 0 0 // 1 1 // 2 2, 3 // 3 4 to 7 // 4 8 to 15 // etc. // void histogram_insert_raw(histogram *h, uint64_t value) { cf_atomic64_incr(&h->counts[msb(value)]); }
// Set of threads which talk to client over the connection for doing the needful // processing. Note that once fd is assigned to a thread all the work on that fd // is done by that thread. Fair fd usage is expected of the client. First thread // is special - also does accept [listens for new connections]. It is the only // thread which does it. void * thr_demarshal(void *arg) { cf_socket_cfg *s, *ls, *xs; // Create my epoll fd, register in the global list. struct epoll_event ev; int nevents, i, n, epoll_fd; cf_clock last_fd_print = 0; #if defined(USE_SYSTEMTAP) uint64_t nodeid = g_config.self_node; #endif // Early stage aborts; these will cause faults in process scope. cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument"); s = &g_config.socket; ls = &g_config.localhost_socket; xs = &g_config.xdr_socket; #ifdef USE_JEM int orig_arena; if (0 > (orig_arena = jem_get_arena())) { cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!"); } else { cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena); } #endif // Figure out my thread index. pthread_t self = pthread_self(); int thr_id; for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) { if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) break; } if (thr_id == MAX_DEMARSHAL_THREADS) { cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!"); return(0); } // First thread accepts new connection at interface socket. 
if (thr_id == 0) { demarshal_file_handle_init(); epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) { cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); } memset(&ev, 0, sizeof (ev)); ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = s->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev)) { cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); } cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port); if (ls->sock) { ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = ls->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev)) { cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); } cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port); } if (xs->sock) { ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = xs->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, xs->sock, &ev)) { cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); } cf_info(AS_DEMARSHAL, "Service also listening on XDR info socket %s:%d", xs->addr, xs->port); } } else { epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) { cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); } } g_demarshal_args->epoll_fd[thr_id] = epoll_fd; cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id); int id_cntr = 0; // Demarshal transactions from the socket. for ( ; ; ) { struct epoll_event events[EPOLL_SZ]; cf_detail(AS_DEMARSHAL, "calling epoll"); nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1); if (0 > nevents) { cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno)); } cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents); uint64_t now_ns = cf_getns(); uint64_t now_ms = now_ns / 1000000; // Iterate over all events. 
for (i = 0; i < nevents; i++) { if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd) || (xs->sock == events[i].data.fd)) { // Accept new connections on the service socket. int csocket = -1; struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); char cpaddr[64]; if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) { // This means we're out of file descriptors - could be a SYN // flood attack or misbehaving client. Eventually we'd like // to make the reaper fairer, but for now we'll just have to // ignore the accept error and move on. if ((errno == EMFILE) || (errno == ENFILE)) { if (last_fd_print != (cf_getms() / 1000L)) { cf_warning(AS_DEMARSHAL, "Hit OS file descriptor limit (EMFILE on accept). Consider raising limit for uid %d", g_config.uid); last_fd_print = cf_getms() / 1000L; } continue; } cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); } // Get the client IP address in string form. if (caddr.sin_family == AF_INET) { if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else if (caddr.sin_family == AF_INET6) { struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr; if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else { cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family); } cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket); // Validate the limit of protocol connections we allow. 
uint32_t conns_open = g_stats.proto_connections_opened - g_stats.proto_connections_closed; if (xs->sock != events[i].data.fd && conns_open > g_config.n_proto_fd_max) { if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); last_fd_print = cf_getms(); } shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Set the socket to nonblocking. if (-1 == cf_socket_set_nonblocking(csocket)) { cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Create as_file_handle and queue it up in epoll_fd for further // communication on one of the demarshal threads. as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); if (!fd_h) { cf_crash(AS_DEMARSHAL, "malloc"); } sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port)); fd_h->fd = csocket; fd_h->last_used = cf_getms(); fd_h->reap_me = false; fd_h->trans_active = false; fd_h->proto = 0; fd_h->proto_unread = 0; fd_h->fh_info = 0; fd_h->security_filter = as_security_filter_create(); // Insert into the global table so the reaper can manage it. Do // this before queueing it up for demarshal threads - once // EPOLL_CTL_ADD is done it's difficult to back out (if insert // into global table fails) because fd state could be anything. cf_rc_reserve(fd_h); pthread_mutex_lock(&g_file_handle_a_LOCK); int j; bool inserted = true; if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { inserted = false; } else { g_file_handle_a[j] = fd_h; } pthread_mutex_unlock(&g_file_handle_a_LOCK); if (!inserted) { cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; cf_rc_free(fd_h); // will free even with ref-count of 2 } else { // Place the client socket in the event queue. 
memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ; ev.data.ptr = fd_h; // Round-robin pick up demarshal thread epoll_fd and add // this new connection to epoll. int id = (id_cntr++) % g_demarshal_args->num_threads; fd_h->epoll_fd = g_demarshal_args->epoll_fd[id]; if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) { cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads); pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); } else { cf_atomic64_incr(&g_stats.proto_connections_opened); } } } else { bool has_extra_ref = false; as_file_handle *fd_h = events[i].data.ptr; if (fd_h == 0) { cf_info(AS_DEMARSHAL, "event with null handle, continuing"); goto NextEvent; } cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events); // Process data on an existing connection: this might be more // activity on an already existing transaction, so we have some // state to manage. as_proto *proto_p = 0; int fd = fd_h->fd; if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events); // no longer in use: out of epoll etc goto NextEvent_FD_Cleanup; } if (fd_h->trans_active) { goto NextEvent; } // If pointer is NULL, then we need to create a transaction and // store it in the buffer. if (fd_h->proto == NULL) { as_proto proto; int sz; /* Get the number of available bytes */ if (-1 == ioctl(fd, FIONREAD, &sz)) { cf_info(AS_DEMARSHAL, "unable to get number of available bytes"); goto NextEvent_FD_Cleanup; } // If we don't have enough data to fill the message buffer, // just wait and we'll come back to this one. However, we'll // let messages with zero size through, since they are // likely errors. 
We don't cleanup the FD in this case since // we'll get more data on it. if (sz < sizeof(as_proto) && sz != 0) { goto NextEvent; } // Do a preliminary read of the header into a stack- // allocated structure, so that later on we can allocate the // entire message buffer. if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) { cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno); goto NextEvent_FD_Cleanup; } if (proto.version != PROTO_VERSION && // For backward compatibility, allow version 0 with // security messages. ! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) { cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", fd_h->client, proto.version); goto NextEvent_FD_Cleanup; } // Swap the necessary elements of the as_proto. as_proto_swap(&proto); if (proto.sz > PROTO_SIZE_MAX) { cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64, fd_h->client, PROTO_SIZE_MAX, (uint64_t)proto.sz); goto NextEvent_FD_Cleanup; } #ifdef USE_JEM // Attempt to peek the namespace and set the JEMalloc arena accordingly. size_t peeked_data_sz = 0; size_t min_field_sz = sizeof(uint32_t) + sizeof(char); size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz; size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.) uint8_t peekbuf[peekbuf_sz]; if (PROTO_TYPE_AS_MSG == proto.type) { size_t offset = sizeof(as_msg); // Number of bytes to peek from the socket. // size_t peek_sz = peekbuf_sz; // Peak up to the size of the peek buffer. size_t peek_sz = MIN(proto.sz, peekbuf_sz); // Peek only up to the minimum necessary number of bytes. if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) { // That's actually legitimate. The as_proto may have gone into one // packet, the as_msg into the next one, which we haven't yet received. 
// This just "never happened" without async. cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz); } if (peeked_data_sz > min_as_msg_sz) { // cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz); if (peeked_data_sz > proto.sz) { cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd); log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz); goto NextEvent_FD_Cleanup; } if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) { jem_set_arena(orig_arena); } else { uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0; bool found = false; // cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields); while (!found && (field_num < n_fields)) { as_msg_field *field = (as_msg_field *) (&peekbuf[offset]); uint32_t value_sz = ntohl(field->field_sz) - 1; // cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset); // cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz); // cf_debug(AS_DEMARSHAL, "\ttype %d", field->type); if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) { if (value_sz >= AS_ID_NAMESPACE_SZ) { cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz); goto NextEvent_FD_Cleanup; } char ns[AS_ID_NAMESPACE_SZ]; found = true; memcpy(ns, field->data, value_sz); ns[value_sz] = '\0'; // cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num); jem_set_arena(as_namespace_get_jem_arena(ns)); } else { // cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type); field_num++; offset += sizeof(as_msg_field) + value_sz; if (offset >= peeked_data_sz) { break; } } } if (!found) { cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz); jem_set_arena(orig_arena); } } } else { jem_set_arena(orig_arena); } } else { jem_set_arena(orig_arena); } #endif // Allocate the complete 
message buffer. proto_p = cf_malloc(sizeof(as_proto) + proto.sz); cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno)); memcpy(proto_p, &proto, sizeof(as_proto)); #ifdef USE_JEM // Jam in the peeked data. if (peeked_data_sz) { memcpy(proto_p->data, &peekbuf, peeked_data_sz); } fd_h->proto_unread = proto_p->sz - peeked_data_sz; #else fd_h->proto_unread = proto_p->sz; #endif fd_h->proto = (void *) proto_p; } else { proto_p = fd_h->proto; } if (fd_h->proto_unread > 0) { // Read the data. n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0); if (0 >= n) { if (errno == EAGAIN) { continue; } cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno)); goto NextEvent_FD_Cleanup; } // Decrement bytes-unread counter. cf_detail(AS_DEMARSHAL, "read fd %d (%d %"PRIu64")", fd, n, fd_h->proto_unread); fd_h->proto_unread -= n; } // Check for a finished read. if (0 == fd_h->proto_unread) { // It's only really live if it's injecting a transaction. fd_h->last_used = now_ms; thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress fd_h->proto = 0; fd_h->proto_unread = 0; cf_rc_reserve(fd_h); has_extra_ref = true; // Info protocol requests. if (proto_p->type == PROTO_TYPE_INFO) { as_info_transaction it = { fd_h, proto_p, now_ns }; as_info(&it); goto NextEvent; } // INIT_TR as_transaction tr; as_transaction_init_head(&tr, NULL, (cl_msg *)proto_p); tr.origin = FROM_CLIENT; tr.from.proto_fd_h = fd_h; tr.start_time = now_ns; if (! as_proto_is_valid_type(proto_p)) { cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); // We got a proto message type we don't recognize, so it // may not do any good to send back an as_msg error, but // it's the best we can do. At least we can keep the fd. 
as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); goto NextEvent; } // Check if it's compressed. if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { // Decompress it - allocate buffer to hold decompressed // packet. uint8_t *decompressed_buf = NULL; size_t decompressed_buf_size = 0; int rv = 0; if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv); cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); goto NextEvent; } // Free the compressed packet since we'll be using the // decompressed packet from now on. cf_free(proto_p); proto_p = NULL; // Get original packet. tr.msgp = (cl_msg *)decompressed_buf; as_proto_swap(&(tr.msgp->proto)); if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", tr.msgp->proto.version, tr.msgp->proto.type, (uint64_t)tr.msgp->proto.sz, decompressed_buf_size); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); goto NextEvent; } } // If it's an XDR connection and we haven't yet modified the connection settings, ... if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG && as_transaction_is_xdr(&tr) && (fd_h->fh_info & FH_INFO_XDR) == 0) { // ... modify them. if (thr_demarshal_config_xdr(fd_h->fd) != 0) { cf_warning(AS_DEMARSHAL, "Failed to configure XDR connection"); goto NextEvent_FD_Cleanup; } fd_h->fh_info |= FH_INFO_XDR; } // Security protocol transactions. if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) { as_security_transact(&tr); goto NextEvent; } // For now only AS_MSG's contribute to this benchmark. 
if (g_config.svc_benchmarks_enabled) { tr.benchmark_time = histogram_insert_data_point(g_stats.svc_demarshal_hist, now_ns); } // Fast path for batch requests. if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) { as_batch_queue_task(&tr); goto NextEvent; } // Swap as_msg fields and bin-ops to host order, and flag // which fields are present, to reduce re-parsing. if (! as_transaction_demarshal_prepare(&tr)) { as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_PARAMETER); goto NextEvent; } ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp, as_transaction_trid(&tr)); // Either process the transaction directly in this thread, // or queue it for processing by another thread (tsvc/info). if (0 != thr_tsvc_process_or_enqueue(&tr)) { cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread"); goto NextEvent_FD_Cleanup; } } // Jump the proto message free & FD cleanup. If we get here, the // above operations went smoothly. The message free & FD cleanup // job is handled elsewhere as directed by // thr_tsvc_process_or_enqueue(). goto NextEvent; NextEvent_FD_Cleanup: // If we allocated memory for the incoming message, free it. if (proto_p) { cf_free(proto_p); fd_h->proto = 0; } // If fd has extra reference for transaction, release it. if (has_extra_ref) { cf_rc_release(fd_h); } // Remove the fd from the events list. if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) { cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)", fd, epoll_fd, errno, cf_strerror(errno)); } pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); NextEvent: ; } // We should never be canceled externally, but just in case... pthread_testcancel(); } } return NULL; }