/*
 * Gets information back from any one of the nodes in the cluster.
 *
 * Nodes are tried in array order until one answers the info request or the
 * overall deadline passes.
 *
 * cluster      - cluster whose node list is reserved for the duration of the call
 * names        - info names passed through to citrusleaf_info_host_auth()
 * values_r     - receives the response buffer of the first node that answers;
 *                only written on success
 * send_asis    - passed through to citrusleaf_info_host_auth()
 * check_bounds - passed through to citrusleaf_info_host_auth()
 * timeout_ms   - total time budget in milliseconds, 0 selects 100 ms
 *
 * Returns 0 when a node answered, -2 when the deadline passed, and -1 when no
 * node answered (including an empty node list).
 */
int citrusleaf_info_cluster(as_cluster *cluster, char *names, char **values_r, bool send_asis, bool check_bounds, int timeout_ms)
{
    if (timeout_ms == 0) {
        timeout_ms = 100; // milliseconds
    }

    // A single clock reading is used both for the deadline test and for the
    // remaining-time computation. Reading the clock twice allowed a tick
    // between the two reads, so the remaining time handed to the node request
    // could come out as 0 or wrap around.
    uint64_t now = cf_getms();
    uint64_t end = now + timeout_ms;
    int ret = -1;

    as_nodes* nodes = as_nodes_reserve(cluster);

    for (uint32_t i = 0; i < nodes->size; i++) {
        as_node* node = nodes->array[i];
        struct sockaddr_in* sa_in = as_node_get_address(node);
        char* values = 0;

        if (citrusleaf_info_host_auth(cluster, sa_in, names, &values, (int)(end - now), send_asis, check_bounds) == 0) {
            *values_r = values;
            ret = 0;
            break;
        }

        now = cf_getms();

        if (now >= end) {
            ret = -2;
            break;
        }
    }

    as_nodes_release(nodes);

    return ret;
}
/*
 * Reports whether the time tracker registered for the calling thread has
 * passed its end time.
 *
 * The tracker is looked up through the timer_tlskey thread-specific key; the
 * as_tt argument is not used. A missing or incomplete tracker (no end_time
 * callback or no udata) is reported as timed out.
 *
 * Returns true when timed out, false otherwise.
 */
bool udf_timer_timedout(const as_timer *as_tt)
{
    time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey);

    if (!tt || !tt->end_time || !tt->udata) {
        return true;
    }

    // Sample both values once so the logged numbers are the ones compared.
    uint64_t now = cf_getms();
    uint64_t end = tt->end_time(tt);

    if (now > end) {
        // 64-bit values are printed through an explicit cast; the previous
        // "%ld" did not match the argument types.
        cf_debug(AS_UDF, "UDF Timed Out [%llu:%llu]", (unsigned long long)now, (unsigned long long)end);
        return true;
    }

    return false;
}
/*
 * Starts an async command, either directly (when already running on the
 * command's event loop thread) or by queueing it to that event loop.
 *
 * On failure the command is freed here and an error status is returned via
 * as_error_set_message(); otherwise AEROSPIKE_OK is returned.
 */
as_status as_event_command_execute(as_event_command* cmd, as_error* err)
{
    ck_pr_inc_32(&cmd->cluster->async_pending);

    // The validity check must follow the increment above to avoid a race with
    // as_cluster_destroy().
    if (!cmd->cluster->valid) {
        as_event_command_free(cmd);
        return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Client shutting down");
    }

    // Thread handles are compared as plain values for performance. If
    // portability becomes an issue, switch to
    // "pthread_equal(event_loop->thread, pthread_self())".
    if (cmd->event_loop->thread == pthread_self()) {
        // Already on the event loop thread: begin processing right away.
        as_event_command_begin(cmd);
        return AEROSPIKE_OK;
    }

    if (cmd->timeout_ms) {
        // The first 8 bytes of the command are not used yet, so the current
        // time is stashed there.
        *(uint64_t*)cmd = cf_getms();
    }

    // Hand the command to the event loop thread through its queue.
    if (as_event_send(cmd)) {
        return AEROSPIKE_OK;
    }

    as_event_command_free(cmd);
    return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Failed to queue command");
}
/*
 * Completion handling for a read that has returned from aio_read.
 *
 * Does nothing once g_running is cleared. Otherwise it decrements the queued
 * read counter, returns the fd to its device, records the elapsed times in
 * the read histograms, frees the buffer when g_use_valloc is set, and pushes
 * the info structure's address onto async_info_queue.
 */
static void process_read(as_async_info_t *info)
{
    if (!g_running) {
        return;
    }

    cf_atomic_int_decr(&g_read_reqs_queued);

    uint64_t stop_time = cf_getms();

    fd_put(info->p_readreq.p_device, info->fd);

    if (stop_time != -1) {
        histogram_insert_data_point(g_p_raw_read_histogram,
                safe_delta_ms(info->raw_start_time, stop_time));
        histogram_insert_data_point(g_p_read_histogram,
                safe_delta_ms(info->p_readreq.start_time, stop_time));
        histogram_insert_data_point(info->p_readreq.p_device->p_raw_read_histogram,
                safe_delta_ms(info->raw_start_time, stop_time));
    }

    if (g_use_valloc) {
        if (info->p_buffer) {
            free(info->p_buffer);
        }
    }

    // The queue receives the pointer value itself, copied from this local.
    uintptr_t info_addr = (uintptr_t)info;

    cf_queue_push(async_info_queue, (void*)&info_addr);
}
//------------------------------------------------ // Do one large block write operation and report. // static void write_and_report_large_block(device* p_device) { salter* p_salter; if (g_num_write_buffers > 1) { p_salter = &g_salters[rand_32() % g_num_write_buffers]; pthread_mutex_lock(&p_salter->lock); *(uint32_t*)p_salter->p_buffer = p_salter->stamp++; } else { p_salter = &g_salters[0]; } uint64_t offset = random_large_block_offset(p_device); uint64_t start_time = cf_getms(); uint64_t stop_time = write_to_device(p_device, offset, g_large_block_ops_bytes, p_salter->p_buffer); if (g_num_write_buffers > 1) { pthread_mutex_unlock(&p_salter->lock); } if (stop_time != -1) { histogram_insert_data_point(g_p_large_block_write_histogram, safe_delta_ms(start_time, stop_time)); } }
// Process a batch request. static void batch_process_request(batch_transaction* btr) { // Keep the reaper at bay. btr->fd_h->last_used = cf_getms(); cf_buf_builder* bb = 0; batch_build_response(btr, &bb); int fd = btr->fd_h->fd; if (bb) { int brv = batch_send_header(fd, bb->used_sz); if (brv == 0) { brv = batch_send(fd, bb->buf, bb->used_sz, MSG_NOSIGNAL | MSG_MORE); if (brv == 0) { brv = batch_send_final(fd, 0); } } cf_buf_builder_free(bb); } else { cf_info(AS_BATCH, " batch request: returned no local responses"); batch_send_final(fd, 0); } batch_transaction_done(btr); }
/*
 * Sends an info command to the nodes of the cluster, in array order, and
 * invokes the callback with each node's response.
 *
 * cluster      - cluster whose node list is reserved for the duration of the call
 * command      - command string, owned by the caller, passed to each node
 * send_asis    - passed through to citrusleaf_info_host_auth()
 * check_bounds - passed through to citrusleaf_info_host_auth()
 * timeout_ms   - total time budget in milliseconds, 0 selects 100 ms
 * udata        - caller-owned pointer handed back unchanged to the callback
 * callback     - called for every node whose request returned 0; 'value' is
 *                the response buffer, which is freed by this function as soon
 *                as the callback returns, so the callback must not free it or
 *                keep the pointer
 *
 * Nodes whose request fails are skipped without invoking the callback.
 *
 * Returns 0 when the loop ran to completion, -1 when the callback returned
 * false, and -2 when the deadline passed.
 */
int citrusleaf_info_cluster_foreach(
    as_cluster *cluster, const char *command, bool send_asis, bool check_bounds, int timeout_ms, void *udata,
    bool (*callback)(const as_node * node, const struct sockaddr_in * sa_in, const char *command, char *value, void *udata)
    )
{
    if (timeout_ms == 0) {
        timeout_ms = 100; // milliseconds
    }

    // A single clock reading is used both for the deadline test and for the
    // remaining-time computation. Reading the clock twice allowed a tick
    // between the two reads, so the remaining time handed to the node request
    // could come out as 0 or wrap around.
    uint64_t now = cf_getms();
    uint64_t end = now + timeout_ms;
    int ret = 0;

    as_nodes* nodes = as_nodes_reserve(cluster);

    for (uint32_t i = 0; i < nodes->size; i++) {
        as_node* node = nodes->array[i];
        struct sockaddr_in* sa_in = as_node_get_address(node);
        char* value = 0;

        if (citrusleaf_info_host_auth(cluster, sa_in, (char *)command, &value, (int)(end - now), send_asis, check_bounds) == 0) {
            bool status = callback(node, sa_in, command, value, udata);

            // free() accepts a null pointer, so no guard is needed.
            free(value);

            if (! status) {
                ret = -1;
                break;
            }
        }

        now = cf_getms();

        if (now >= end) {
            ret = -2;
            break;
        }
    }

    as_nodes_release(nodes);

    return ret;
}
/*
 * Returns the time remaining until the end time of the time tracker
 * registered for the calling thread, in the units of cf_getms().
 *
 * The tracker is looked up through the timer_tlskey thread-specific key; the
 * as_tt argument is not used. The result is never 0: it is 1 when the tracker
 * is missing or incomplete, and 1 when the end time has been reached or
 * passed.
 */
uint64_t udf_timer_timeslice(const as_timer *as_tt)
{
    time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey);

    if (!tt || !tt->end_time || !tt->udata) {
        return 1;
    }

    uint64_t end = tt->end_time(tt);
    uint64_t now = cf_getms();

    // Compare before subtracting: the difference is unsigned, so subtracting
    // a later 'now' from 'end' would wrap to a huge value instead of going
    // negative.
    return (end > now) ? end - now : 1;
}
//------------------------------------------------ // Do one large block read operation and report. // static void read_and_report_large_block(device* p_device) { uint64_t offset = random_large_block_offset(p_device); uint64_t start_time = cf_getms(); uint64_t stop_time = read_from_device(p_device, offset, g_large_block_ops_bytes, p_device->p_large_block_read_buffer); if (stop_time != -1) { histogram_insert_data_point(g_p_large_block_read_histogram, safe_delta_ms(start_time, stop_time)); } }
/*
 * Polls a socket until it is ready, the deadline passes, or polling fails.
 *
 * fd             - socket to poll
 * socket_timeout - per-poll timeout; when a deadline is set it only applies
 *                  if non-zero and shorter than the time left
 * deadline       - absolute time on the cf_getms() clock, 0 for no deadline
 * read           - passed through to as_poll_socket()
 *
 * Returns 0 when the socket is ready, 1 when the deadline has passed, or the
 * negative value returned by as_poll_socket() on error.
 */
static int wait_socket(as_socket_fd fd, uint32_t socket_timeout, uint64_t deadline, bool read)
{
    as_poll poll;
    as_poll_init(&poll, fd);

    int result;

    for (;;) {
        uint32_t wait_ms = socket_timeout;

        if (deadline > 0) {
            uint64_t now = cf_getms();

            if (now >= deadline) {
                result = 1; // timeout
                break;
            }

            uint32_t remaining = (uint32_t)(deadline - now);

            // Wait for the time left unless a non-zero socket timeout is shorter.
            if (socket_timeout == 0 || socket_timeout >= remaining) {
                wait_ms = remaining;
            }
        }

        result = as_poll_socket(&poll, fd, wait_ms, read);

        if (result > 0) {
            result = 0; // success
            break;
        }

        if (result < 0) {
            break; // error
        }

        // result == 0: the poll timed out. Loop again in case that happened
        // before the real deadline.
    }

    as_poll_destroy(&poll);

    return result;
}
/*
 * Sends an admin command to a random node and parses the multi-block reply
 * into 'list'.
 *
 * The timeout comes from 'policy' when given, otherwise from the client's
 * default admin policy; non-positive values select DEFAULT_TIMEOUT. On
 * success the connection is put back on the node; on any send or read
 * failure it is closed. The node is released on every path after it has been
 * obtained.
 *
 * Returns the status of the first step that failed, or the status of
 * as_admin_read_blocks() when everything succeeded.
 */
static as_status as_admin_read_list(aerospike* as, as_error* err, const as_policy_admin* policy, uint8_t* command, uint8_t* end, as_admin_parse_fn parse_fn, as_vector* list)
{
    int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;

    if (timeout_ms <= 0) {
        timeout_ms = DEFAULT_TIMEOUT;
    }

    uint64_t deadline_ms = cf_getms() + timeout_ms;
    as_node* node = as_node_get_random(as->cluster);

    if (! node) {
        return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Failed to find server node.");
    }

    int fd;
    as_status status = as_node_get_connection(err, node, &fd);

    if (status) {
        as_node_release(node);
        return status;
    }

    status = as_admin_send(err, fd, command, end, deadline_ms);

    if (! status) {
        status = as_admin_read_blocks(err, fd, deadline_ms, parse_fn, list);
    }

    if (status) {
        as_close(fd);
    }
    else {
        as_node_put_connection(node, fd);
    }

    as_node_release(node);
    return status;
}
//------------------------------------------------ // Do one device write operation. // static uint64_t write_to_device(device* p_device, uint64_t offset, uint32_t size, uint8_t* p_buffer) { int fd = fd_get(p_device); if (fd == -1) { return -1; } if (lseek(fd, offset, SEEK_SET) != offset || write(fd, p_buffer, size) != (ssize_t)size) { close(fd); fprintf(stdout, "ERROR: seek & write\n"); return -1; } uint64_t stop_ms = cf_getms(); fd_put(p_device, fd); return stop_ms; }
// Re-arm an existing rw_request from a transaction: restore the origin
// pointer, copy generation and void-time, install the completion callback,
// restart the retransmit timer and clear every destination's completion flag.
void repl_write_reset_rw(rw_request* rw, as_transaction* tr, repl_write_done_cb cb)
{
    // rw->from.any was set null in tr setup, so restore it here - it is
    // needed for the response to origin. (Note that tr->from.any will be null
    // here in respond-on-master-complete mode.)
    rw->from.any = tr->from.any;

    rw->repl_write_cb = cb;

    rw->generation = tr->generation;
    rw->void_time = tr->void_time;

    // TODO - is this better than not resetting? Note - xmit_ms not volatile.
    rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
    rw->retry_interval_ms = g_config.transaction_retry_ms;

    int i = 0;

    while (i < rw->n_dest_nodes) {
        rw->dest_complete[i++] = false;
    }
}
//------------------------------------------------ // Runs in every device large-block write thread, // executes large-block writes at a constant rate. // static void* run_large_block_writes(void* pv_device) { device* p_device = (device*)pv_device; uint64_t count = 0; while (g_running) { write_and_report_large_block(p_device); count++; int sleep_ms = (int) (((count * 1000 * g_num_devices) / g_large_block_ops_per_sec) - (cf_getms() - g_run_start_ms)); if (sleep_ms > 0) { usleep((uint32_t)sleep_ms * 1000); } } return (0); }
/*
 * Sends the admin command held in [buffer, end) to a random node and reads
 * the user blocks of the reply into 'users'.
 *
 * The timeout comes from 'policy' when given, otherwise from the client's
 * default admin policy; non-positive values select DEFAULT_TIMEOUT.
 *
 * Returns CITRUSLEAF_FAIL_CLIENT when no node is available, the non-zero
 * status from as_node_get_connection(), CITRUSLEAF_FAIL_TIMEOUT when the
 * send fails, or otherwise the result of as_read_user_blocks(). The
 * connection goes back to the node only when that result is >= 0; in every
 * other case it is closed.
 */
static int as_read_users(aerospike* as, const as_policy_admin* policy, uint8_t* buffer, uint8_t* end, as_vector* /*<as_user_roles*>*/ users)
{
    int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;

    if (timeout_ms <= 0) {
        timeout_ms = DEFAULT_TIMEOUT;
    }

    uint64_t deadline_ms = cf_getms() + timeout_ms;
    as_node* node = as_node_get_random(as->cluster);

    if (! node) {
        return CITRUSLEAF_FAIL_CLIENT;
    }

    int fd;
    int status = as_node_get_connection(node, &fd);

    if (status) {
        as_node_release(node);
        return status;
    }

    if (as_send(fd, buffer, end, deadline_ms, timeout_ms)) {
        cf_close(fd);
        as_node_release(node);
        return CITRUSLEAF_FAIL_TIMEOUT;
    }

    status = as_read_user_blocks(fd, buffer, deadline_ms, timeout_ms, users);

    if (status < 0) {
        cf_close(fd);
    }
    else {
        as_node_put_connection(node, fd);
    }

    as_node_release(node);
    return status;
}
// Populate a rw_request from a transaction for replica writes. Ownership of
// the message and of the origin pointer moves from tr to rw (tr's pointers
// are nulled), the partition reservation is copied, the callbacks are
// installed and the retransmit timer is started. Marking the request as set
// up is deliberately the very last step.
void repl_write_setup_rw(rw_request* rw, as_transaction* tr, repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb)
{
    // Hand the message over to the rw_request.
    rw->msgp = tr->msgp;
    tr->msgp = NULL;

    rw->msg_fields = tr->msg_fields;
    rw->origin = tr->origin;
    rw->from_flags = tr->from_flags;

    // Hand the origin over to the rw_request.
    rw->from.any = tr->from.any;
    rw->from_data.any = tr->from_data.any;
    tr->from.any = NULL;

    rw->start_time = tr->start_time;
    rw->benchmark_time = tr->benchmark_time;

    as_partition_reservation_copy(&rw->rsv, &tr->rsv);
    // Hereafter, rw_request must release reservation - happens in destructor.

    rw->end_time = tr->end_time;
    rw->generation = tr->generation;
    rw->void_time = tr->void_time;

    rw->repl_write_cb = repl_write_cb;
    rw->timeout_cb = timeout_cb;

    rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
    rw->retry_interval_ms = g_config.transaction_retry_ms;

    int i = 0;

    while (i < rw->n_dest_nodes) {
        rw->dest_complete[i++] = false;
    }

    // Allow retransmit thread to destroy rw_request as soon as we unlock.
    rw->is_set_up = true;
}
/*
 * Sends the admin command held in [buffer, end) to a random node and reads
 * the HEADER_SIZE-byte reply header back into 'buffer'.
 *
 * The timeout comes from 'policy' when given, otherwise from the client's
 * default admin policy; non-positive values select DEFAULT_TIMEOUT.
 *
 * Returns CITRUSLEAF_FAIL_CLIENT when no node is available, the non-zero
 * status from as_node_get_connection(), CITRUSLEAF_FAIL_TIMEOUT when the send
 * or the read fails (the connection is closed), or otherwise
 * buffer[RESULT_CODE] from the reply (the connection goes back to the node).
 */
static int as_execute(aerospike* as, const as_policy_admin* policy, uint8_t* buffer, uint8_t* end)
{
    int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;

    if (timeout_ms <= 0) {
        timeout_ms = DEFAULT_TIMEOUT;
    }

    uint64_t deadline_ms = cf_getms() + timeout_ms;
    as_node* node = as_node_get_random(as->cluster);

    if (! node) {
        return CITRUSLEAF_FAIL_CLIENT;
    }

    int fd;
    int status = as_node_get_connection(node, &fd);

    if (status) {
        as_node_release(node);
        return status;
    }

    // The read is only attempted when the send succeeded.
    if (as_send(fd, buffer, end, deadline_ms, timeout_ms) ||
            cf_socket_read_timeout(fd, buffer, HEADER_SIZE, deadline_ms, timeout_ms)) {
        cf_close(fd);
        as_node_release(node);
        return CITRUSLEAF_FAIL_TIMEOUT;
    }

    as_node_put_connection(node, fd);
    as_node_release(node);
    return buffer[RESULT_CODE];
}
int as_authenticate(int fd, const char* user, const char* credential, int timeout_ms) { uint8_t buffer[STACK_BUF_SZ]; uint8_t* p = buffer + 8; p = write_header(p, AUTHENTICATE, 2); p = write_field_string(p, USER, user); p = write_field_string(p, CREDENTIAL, credential); if (timeout_ms == 0) { timeout_ms = DEFAULT_TIMEOUT; } uint64_t deadline_ms = cf_getms() + timeout_ms; if (as_send(fd, buffer, p, deadline_ms, timeout_ms)) { return CITRUSLEAF_FAIL_TIMEOUT; } if (cf_socket_read_timeout(fd, buffer, HEADER_SIZE, deadline_ms, timeout_ms)) { return CITRUSLEAF_FAIL_TIMEOUT; } return buffer[RESULT_CODE]; }
// Make a callback for a specified number of elements in the tree, from outside // the tree lock. void as_index_reduce_partial(as_index_tree *tree, uint32_t sample_count, as_index_reduce_fn cb, void *udata) { pthread_mutex_lock(&tree->reduce_lock); // For full reduce, get the number of elements inside the tree lock. if (sample_count == AS_REDUCE_ALL) { sample_count = tree->elements; } if (sample_count == 0) { pthread_mutex_unlock(&tree->reduce_lock); return; } size_t sz = sizeof(as_index_ph_array) + (sizeof(as_index_ph) * sample_count); as_index_ph_array *v_a; uint8_t buf[64 * 1024]; if (sz > 64 * 1024) { v_a = cf_malloc(sz); if (! v_a) { pthread_mutex_unlock(&tree->reduce_lock); return; } } else { v_a = (as_index_ph_array*)buf; } v_a->alloc_sz = sample_count; v_a->pos = 0; uint64_t start_ms = cf_getms(); // Recursively, fetch all the value pointers into this array, so we can make // all the callbacks outside the big lock. if (tree->root->left_h != tree->sentinel_h) { as_index_reduce_traverse(tree, tree->root->left_h, tree->sentinel_h, v_a); } cf_debug(AS_INDEX, "as_index_reduce_traverse took %"PRIu64" ms", cf_getms() - start_ms); pthread_mutex_unlock(&tree->reduce_lock); for (uint32_t i = 0; i < v_a->pos; i++) { as_index_ref r_ref; r_ref.skip_lock = false; r_ref.r = v_a->indexes[i].r; r_ref.r_h = v_a->indexes[i].r_h; olock_vlock(g_config.record_locks, &r_ref.r->key, &r_ref.olock); cf_atomic_int_incr(&g_config.global_record_lock_count); // Callback MUST call as_record_done() to unlock and release record. cb(&r_ref, udata); } if (v_a != (as_index_ph_array*)buf) { cf_free(v_a); } }
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
// - Could get a redirect telling us to send one of our requests elsewhere.
//
// id    - node the fabric message came from
// m     - the fabric message; released via as_fabric_msg_put() on every path
//         except where it is attached to a transaction (request case) or
//         where the ship-op response handler clears free_msg
// udata - not used in this function
//
// Returns -1 for a zero-refcount message and for redirect failures (unknown
// transaction, missing digest or proto in the stored message), 0 otherwise.
int proxy_msg_fn(cf_node id, msg *m, void *udata)
{
    int rv;

    if (cf_rc_count((void*)m) == 0) {
        cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m);
        return -1;
    }

    // 99999 is the fallback when the op field is absent; it matches no case
    // below and so lands in 'default'.
    uint32_t op = 99999;
    msg_get_uint32(m, PROXY_FIELD_OP, &op);

    uint32_t transaction_id = 0;
    msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

    cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

    switch (op) {
        case PROXY_OP_REQUEST:
        {
            cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
            cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
            msg_dump(m, "incoming proxy msg");
#endif
#endif

            cf_digest *key;
            size_t sz = 0;

            if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
                cf_info(AS_PROXY, "proxy msg function: no digest, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            cl_msg *msgp;
            size_t as_msg_sz = 0;

            if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
                cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            uint64_t cluster_key = 0;

            if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
                // NOTE(review): msgp was fetched with MSG_GET_COPY_MALLOC
                // just above and is not released on this path - looks like a
                // leak; confirm ownership rules of msg_get_buf().
                cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            // This is allowed to fail - this is a new field, and gets defaulted
            // to 0 if it doesn't exist.
            uint32_t timeout_ms = 0;
            msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
            // cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms);

            // Put the as_msg on the normal queue for processing.
            // INIT_TR
            as_transaction tr;
            as_transaction_init(&tr, key, msgp);
            tr.incoming_cluster_key = cluster_key;
            // A timeout of 0 means no end time; otherwise the millisecond
            // timeout is scaled by 1000000 and added to the start time.
            tr.end_time = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
            tr.proxy_node = id;
            // The fabric message travels with the transaction, so it is not
            // released in this case.
            tr.proxy_msg = m;

            // Check here if this is shipped op.
            uint32_t info = 0;
            msg_get_uint32(m, PROXY_FIELD_INFO, &info);

            if (info & PROXY_INFO_SHIPPED_OP) {
                tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
                cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
            } else {
                cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
            }

            MICROBENCHMARK_RESET();

            thr_tsvc_enqueue(&tr);
        }
        break;

        case PROXY_OP_RESPONSE:
        {
#ifdef DEBUG
            // Got the response from the actual endpoint.
            cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
            msg_dump(m, "incoming proxy response");
#endif
#endif

            // Look up the element. The hash entry is copied out into 'pr' and
            // removed from the hash in one step.
            proxy_request pr;
            bool free_msg = true;

            if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
                // Found the element (sometimes we get two acks so it's OK for
                // an ack to not find the transaction).
                if (pr.wr) {
                    // Ship-op response - the handler decides through free_msg
                    // whether the message is released below.
                    as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
                } else {
                    as_proto *proto;
                    size_t proto_sz;

                    if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
                        // NOTE(review): execution continues after this
                        // failure, and 'proto' / 'proto_sz' have no
                        // initializer - the code below would then use
                        // indeterminate values. Needs a decision on the
                        // intended failure handling.
                        cf_info(AS_PROXY, "msg get buf failed!");
                    }

#ifdef DEBUG_VERBOSE
                    cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);
                    for (size_t _i = 0; _i < proto_sz; _i++) {
                        fprintf(stderr, " %x", ((byte *)proto)[_i]);
                        if (_i % 16 == 15) {
                            fprintf(stderr, "\n");
                        }
                    }
#endif

#ifdef EXTRA_CHECKS
                    as_proto proto_copy = *proto;
                    as_proto_swap(&proto_copy);
                    if (proto_copy.sz + 8 != proto_sz) {
                        cf_info(AS_PROXY, "BONE BONE BONE!!!");
                        cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
                    }
#endif

                    // Write to the file descriptor.
                    // NOTE(review): pr.fd_h is dereferenced here before the
                    // batch check below - confirm it is always non-null for
                    // batch sub-transactions.
                    cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
                    cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

                    if (pr.batch_shared) {
                        // Batch sub-transaction: hand the result (or an error)
                        // to the batch layer instead of writing to a socket.
                        cf_digest* digest;
                        size_t digest_sz = 0;

                        if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
                            as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
                            as_proxy_set_stat_counters(0);
                        } else {
                            cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
                            as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
                            as_proxy_set_stat_counters(-1);
                        }

                        cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
                    } else {
                        // Send the whole proto, retrying on EWOULDBLOCK after
                        // a short yield. Any other error, or a zero return,
                        // shuts the socket down and skips to SendFin.
                        size_t pos = 0;

                        while (pos < proto_sz) {
                            rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

                            if (rv > 0) {
                                pos += rv;
                            } else if (rv < 0) {
                                if (errno != EWOULDBLOCK) {
                                    // Common message when a client aborts.
                                    // NOTE(review): proto_sz and pos are
                                    // size_t but are printed with %d.
                                    cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
                                    shutdown(pr.fd_h->fd, SHUT_RDWR);
                                    as_proxy_set_stat_counters(-1);
                                    goto SendFin;
                                }

                                usleep(1); // yield
                            } else {
                                // NOTE(review): proto_sz and pos are size_t
                                // but are printed with %d.
                                cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
                                shutdown(pr.fd_h->fd, SHUT_RDWR);
                                as_proxy_set_stat_counters(-1);
                                goto SendFin;
                            }
                        }

                        as_proxy_set_stat_counters(0);

SendFin:
                        cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

                        // Return the fabric message or the direct file descriptor -
                        // after write and complete.
                        pr.fd_h->t_inprogress = false;
                        AS_RELEASE_FILE_HANDLE(pr.fd_h);
                        pr.fd_h = 0;
                    }

                    as_fabric_msg_put(pr.fab_msg);
                    pr.fab_msg = 0;
                }
            } else {
                cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
                as_proxy_set_stat_counters(-1);
            }

            if (free_msg) {
                as_fabric_msg_put(m);
            }
        }
        break;

        case PROXY_OP_REDIRECT:
        {
            // Sometimes the destination we proxied a request to isn't able to
            // satisfy it (for example, their copy of the partition in question
            // might be desync).
            cf_node new_dst = 0;
            msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);

            cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

            // Look in the proxy retransmit hash for the tid. On success the
            // entry stays in the hash and pr_lock is held until the unlock at
            // the end of this case (or on the early-return paths).
            proxy_request *pr;
            pthread_mutex_t *pr_lock;
            // 'r' only receives the lookup result; it is not read afterwards.
            int r = 0;

            if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
                cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
                as_fabric_msg_put(m);
                return -1;
            }

            if (g_config.self_node == new_dst) {
                // Although we don't know we're the final destination, undo the
                // proxy-nature and put back on the main queue. Dangerous, as it
                // leaves open the possibility of a looping message.
                cf_digest *key;
                size_t sz = 0;

                if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
                    cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
                    pthread_mutex_unlock(pr_lock);
                    as_fabric_msg_put(m);
                    return -1;
                }

                cl_msg *msgp;
                sz = 0;

                if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
                    cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
                    pthread_mutex_unlock(pr_lock);
                    as_fabric_msg_put(m);
                    return -1;
                }

                // Put the as_msg on the normal queue for processing.
                // INIT_TR
                as_transaction tr;
                as_transaction_init(&tr, key, msgp);
                tr.start_time = pr->start_time; // start time
                tr.end_time = pr->end_time;
                tr.proto_fd_h = pr->fd_h;
                tr.batch_shared = pr->batch_shared;
                tr.batch_index = pr->batch_index;

                MICROBENCHMARK_RESET();

                thr_tsvc_enqueue(&tr);

                // The stored request is finished: release its fabric message
                // and remove the entry while still holding its lock.
                as_fabric_msg_put(pr->fab_msg);
                shash_delete_lockfree(g_proxy_hash, &transaction_id);
            } else {
                // Change the destination, update the retransmit time.
                pr->dest = new_dst;
                pr->xmit_ms = cf_getms() + 1;

                // Send it. An extra reference is taken for the send and
                // dropped again if the send fails.
                msg_incr_ref(pr->fab_msg);

                if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
                    cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
                    as_fabric_msg_put(pr->fab_msg);
                }
            }

            pthread_mutex_unlock(pr_lock);
        }
        as_fabric_msg_put(m);
        break;

        default:
            cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
            msg_dump(m, "proxy received unknown msg");
            as_fabric_msg_put(m);
            break;
    } // end switch

    return 0;
} // end proxy_msg_fn()
// Ship a write request's operation to another node as a proxy request.
//
// Builds a PROXY_OP_REQUEST fabric message flagged PROXY_INFO_SHIPPED_OP,
// records a retransmit entry for it in the proxy hash under a fresh
// transaction id, and sends it to 'dst'. The write request's message buffer
// is handed off to the fabric message, so wr->msgp is null afterwards.
//
// dst - destination node; must not be 0 (crashes otherwise)
// wr  - write request being shipped; a reference is reserved for the hash
//       entry
//
// Returns -1 if no fabric message could be obtained or the hash insert
// failed, 0 otherwise - including when the fabric send itself failed, in
// which case the hash entry is left in place.
int as_proxy_shipop(cf_node dst, write_request *wr)
{
    as_partition_id pid = as_partition_getid(wr->keyd);

    if (dst == 0) {
        cf_crash(AS_PROXY, "the destination should never be zero");
    }

    // Create a fabric message, fill it out.
    msg *m = as_fabric_msg_get(M_TYPE_PROXY);

    if (!m) {
        return -1;
    }

    uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

    msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
    msg_set_uint32(m, PROXY_FIELD_TID, tid);
    msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
    msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp, as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
    msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
    msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);

    // The buffer was handed off to the fabric message above.
    wr->msgp = 0;

    // If it is shipped op.
    uint32_t info = 0;

    info |= PROXY_INFO_SHIPPED_OP;
    msg_set_uint32(m, PROXY_FIELD_INFO, info);

    cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)", wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

    // Fill out a retransmit structure, insert into the retransmit hash.
    msg_incr_ref(m);

    // Zero the whole structure first: it is copied into the hash as a unit,
    // and members this function does not assign (such as 'ns') would
    // otherwise be stored with indeterminate values.
    proxy_request pr = { 0 };

    pr.start_time = wr->start_time;
    pr.end_time = (wr->end_time != 0) ? wr->end_time : pr.start_time + g_config.transaction_max_ns;
    cf_rc_reserve(wr);
    pr.wr = wr;
    pr.fab_msg = m;
    pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
    pr.retry_interval_ms = g_config.transaction_retry_ms;
    pr.dest = dst;
    pr.pid = pid;
    pr.fd_h = NULL;
    pr.batch_shared = NULL;
    pr.batch_index = 0;

    if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
        // NOTE(review): the fabric message references and the reservation on
        // wr taken above are not released on this path - cleanup still needed.
        cf_info(AS_PROXY, " shash_put failed, need cleanup code");
        return -1;
    }

    // Send to the remote node.
    int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);

    if (rv != 0) {
        cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d", *(uint64_t *)&wr->keyd, rv);
        as_fabric_msg_put(m);
    }

    wr->shipped_op_initiator = true;
    cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

    return 0;
}
// Make a request to another node. // // Note: there's a cheat here. 'as_msg' is used in a raw form, and includes // structured data (version - type - nfields - sz ...) which should be made more // wire-protocol-friendly. int as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key) { cf_detail(AS_PROXY, "proxy divert"); cf_atomic_int_incr(&g_config.stat_proxy_reqs); if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) { cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr); } as_partition_id pid = as_partition_getid(tr->keyd); if (dst == 0) { // Get the list of replicas. dst = as_partition_getreplica_read(ns, pid); } // Create a fabric message, fill it out. msg *m = as_fabric_msg_get(M_TYPE_PROXY); if (!m) { return -1; } uint32_t tid = cf_atomic32_incr(&g_proxy_tid); msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST); msg_set_uint32(m, PROXY_FIELD_TID, tid); msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY); msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC; msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype); msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key); msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl); tr->msgp = 0; cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, m, dst); // Fill out a retransmit structure, insert into the retransmit hash. msg_incr_ref(m); proxy_request pr; pr.start_time = tr->start_time; pr.end_time = (tr->end_time != 0) ? 
tr->end_time : pr.start_time + g_config.transaction_max_ns; pr.fd_h = tr->proto_fd_h; tr->proto_fd_h = 0; pr.fab_msg = m; pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms; pr.retry_interval_ms = g_config.transaction_retry_ms; pr.dest = dst; pr.pid = pid; pr.ns = ns; pr.wr = NULL; pr.batch_shared = tr->batch_shared; pr.batch_index = tr->batch_index; if (0 != shash_put(g_proxy_hash, &tid, &pr)) { cf_debug(AS_PROXY, " shash_put failed, need cleanup code"); return -1; } // Send to the remote node. int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM); if (rv != 0) { cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv); as_fabric_msg_put(m); } cf_atomic_int_incr(&g_config.proxy_initiate); return 0; }
//Same as do_the_full_monte, but only till the command is sent to the node. //Most of the code is duplicated. Bad. int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key, const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata) { cl_async_work *workitem = NULL; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); int progress_timeout_ms; uint64_t deadline_ms; uint64_t starttime, endtime; bool network_error; int fd = -1; int rv = CITRUSLEAF_FAIL_CLIENT; //Assume that this is a failure; // as_msg msg; cf_digest d_ret; cl_cluster_node *node = 0; #if ONEASYNCFD if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) { //cf_error("Async hashtab is full. Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #else //If the async buffer is at the max limit, do not entertain more requests. if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) { //cf_error("Async buffer is full. 
Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #endif //Allocate memory for work item that will be added to the async work list if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) { cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER); } else { workitem = malloc(sizeof(cl_async_work)); if (workitem == NULL) { return CITRUSLEAF_FAIL_CLIENT; } } //Compile the write buffer to be sent to the cluster if (n_values && ( values || operations) ){ cl_compile(info1, info2, 0, ns, set, key, digest, values?*values:NULL, operator, operations?*operations:NULL, *n_values , &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); }else{ cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); } deadline_ms = 0; progress_timeout_ms = 0; if (cl_w_p && cl_w_p->timeout_ms) { deadline_ms = cf_getms() + cl_w_p->timeout_ms; // policy: if asking for a long timeout, give enough time to try twice if (cl_w_p->timeout_ms > 700) { progress_timeout_ms = cl_w_p->timeout_ms / 2; } else { progress_timeout_ms = cl_w_p->timeout_ms; } } else { progress_timeout_ms = g_async_nw_progress_timeout; } //Initialize the async work unit workitem->trid = *trid; workitem->deadline = deadline_ms; workitem->starttime = cf_getms(); workitem->udata = udata; as_msg *msgp; // Hate special cases, but we have to clear the verify bit on delete verify if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) { msgp = (as_msg *)wr_buf; msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY; } if (asc->compression_stat.compression_threshold > 0 && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) { /* Compression is enabled. * Packet size is above threshold. * Compress the data */ uint8_t *compressed_buf = NULL; size_t compressed_buf_sz = 0; // Contstruct packet for compressed data. 
cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz); if (compressed_buf) { // If original packet size is > 16k, cl_compile had allocated memory for it. // Free that memory. // cf_packet_compression will allocate memory for compressed packet if (wr_buf != wr_stack_buf) { free(wr_buf); } // Update stats. citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz); wr_buf = compressed_buf; wr_buf_sz = compressed_buf_sz; //memcpy (wr_buf, compressed_buf, compressed_buf_sz); //wr_buf_sz = compressed_buf_sz; //free (compressed_buf); } //else compression failed, continue with uncompressed packet else { // Set compression stat citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz); } } int try = 0; // retry request based on the write_policy do { network_error = false; try++; #ifdef DEBUG if (try > 1) { cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self()); } #endif // Get an FD from a cluster. First get the probable node for the given digest. node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? 
true : false); if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, retrying"); #endif usleep(10000); //Sleep for 10ms goto Retry; } // Now get the dedicated async FD of this node starttime = cf_getms(); fd = cl_cluster_node_fd_get(node, true); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime)); } if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() ); #endif usleep(1000); goto Retry; } // Send the command to the node starttime = cf_getms(); rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime)); } if (rv != 0) { cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)", rv,fd,errno,(uint64_t)pthread_self()); if (rv != ETIMEDOUT) network_error = true; goto Retry; } goto Ok; Retry: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. As this is a * network error, its is better to wait for some significant * time before retrying. 
*/ sleep(1); //Sleep for 1sec #if ONEASYNCFD //Do not close the FD #else cf_error("async sender: Closing the fd %d because of network error", fd); cf_close(fd); fd = -1; #endif } if (fd != -1) { cf_error("async sender: Closing the fd %d because of retry", fd); cf_close(fd); fd = -1; } if (node) { cl_cluster_node_put(node); node = 0; } if (deadline_ms && (deadline_ms < cf_getms() ) ) { #ifdef DEBUG cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms()); #endif rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } } while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) ); Error: #ifdef DEBUG cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d", (int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), (int)(deadline_ms - cf_getms() ), rv ); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } #if ONEASYNCFD //Do not close the FD #else //If it is a network error, the fd would be closed and set to -1. //So, we reach this place with a valid FD in case of timeout. if (fd != -1) { cf_error("async sender: Closing the fd %d because of timeout", fd); cf_close(fd); } #endif return(rv); Ok: /* * We cannot release the node here as the asyc FD associated * with this node may get closed. We should do it only when * we got back the ack for the async command that we just did. */ //As we sent the command successfully, add it to the async work list workitem->node = node; workitem->fd = fd; //We are storing only the pointer to the workitem #if ONEASYNCFD if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) { //This should always succeed. 
cf_error("Unable to add unique entry into the hash table"); } cf_queue_push(node->asyncwork_q, &workitem); //Also put in the node's q #else cf_queue_push(g_cl_async_q, &workitem); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } rv = CITRUSLEAF_OK; return rv; } int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads) { // int num_threads; if (0 == cf_atomic32_get(g_async_initialized)) { cf_error("Async client not initialized cannot reinit"); return -1; } if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS; } // If number of thread is increased create more threads if (num_receiver_threads > g_async_num_threads) { unsigned int i; for (i = g_async_num_threads; i < num_receiver_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } } else { // else just reset the number the async threads will kill themselves cf_atomic32_set(&g_async_num_threads, num_receiver_threads); } cf_atomic32_set(&g_async_q_szlimit , size_limit); return ( 0 ); } int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn) { int i, num_threads; //Make sure that we do the initialization only once if (1 == cf_atomic32_incr(&g_async_initialized)) { // Start the receiver threads num_threads = num_receiver_threads; if (num_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_threads = MAX_ASYNC_RECEIVER_THREADS; } #if ONEASYNCFD g_async_h_szlimit = size_limit * 3; //Max number of elements in the hash table g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) { cf_error("Failed to initialize the async work hastable"); 
cf_atomic32_decr(&g_async_initialized); return -1; } #else // create work queue g_async_q_szlimit = size_limit; if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to initialize the async work queue"); cf_atomic32_decr(&g_async_initialized); return -1; } for (i=0; i<num_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } g_async_num_threads = num_threads; #endif if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to create memory pool for workitems"); return -1; } g_fail_cb_fn = fail_cb_fn; g_success_cb_fn = success_cb_fn; // Initialize the stats g_async_stats.retries = 0; g_async_stats.dropouts = 0; } return(0); }
static void* async_receiver_fn(void *thdata) { int rv = -1; bool network_error = false; cl_async_work *workitem = NULL; // cl_async_work *tmpworkitem = NULL; as_msg msg; cf_queue *q_to_use = NULL; cl_cluster_node *thisnode = NULL; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = rd_stack_buf; size_t rd_buf_sz = 0; uint64_t acktrid; // uint64_t starttime, endtime; int progress_timeout_ms; unsigned int thread_id = cf_atomic32_incr(&g_thread_count); if (thdata == NULL) { q_to_use = g_cl_async_q; } else { thisnode = (cl_cluster_node *)thdata; q_to_use = thisnode->asyncwork_q; } //Infinite loop which keeps picking work items from the list and try to find the end result while(1) { network_error = false; #if ONEASYNCFD if(thisnode->dunned == true) { do { rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT); if (rv == CF_QUEUE_OK) { cl_cluster_node_put(thisnode); free(workitem); } } while (rv == CF_QUEUE_OK); //We want to delete all the workitems of this node shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode); break; } #endif //This call will block if there is no element in the queue cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER); //TODO: What if the node gets dunned while this pop call is blocked ? 
#if ONEASYNCFD //cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d", // cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab)); #endif // If we have no progress in 50ms, we should move to the next workitem // and revisit this workitem at a later stage progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT; // Read into this fine cl_msg, which is the short header rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms); if (rv) { #if DEBUG cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd); #endif if (rv != ETIMEDOUT) { cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd); network_error = true; goto Error; } else { goto Retry; } } #ifdef DEBUG_VERBOSE dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg)); #endif cl_proto_swap(&msg.proto); cl_msg_swap_header(&msg.m); // second read for the remainder of the message rd_buf_sz = msg.proto.sz - msg.m.header_sz; if (rd_buf_sz > 0) { if (rd_buf_sz > sizeof(rd_stack_buf)) { rd_buf = malloc(rd_buf_sz); if (!rd_buf) { cf_error("malloc fail: trying %zu",rd_buf_sz); rv = -1; goto Error; } } rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms); if (rv) { //We already read some part of the message before but failed to read the //remaining data for whatever reason (network error or timeout). We cannot //reread as we already read partial data. Declare this as error. 
cf_error("Timeout after reading the header but before reading the body"); goto Error; } #ifdef DEBUG_VERBOSE dump_buf("read body from cluster", rd_buf, rd_buf_sz); #endif } rv = CITRUSLEAF_OK; goto Ok; Retry: //We are trying to postpone the reading if (workitem->deadline && workitem->deadline < cf_getms()) { cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64, workitem->deadline, cf_getms()); //cf_error("async receiver: Workitem missed the final deadline"); rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } else { //We have time. Push the element back to the queue to be considered later cf_queue_push(q_to_use, &workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } cf_atomic_int_incr(&g_async_stats.retries); continue; Error: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. */ } #if ONEASYNCFD //Do not close FD #else //We do not know the state of FD. It may have pending data to be read. //We cannot reuse the FD. So, close it to be on safe side. cf_error("async receiver: Closing the fd %d because of error", workitem->fd); cf_close(workitem->fd); workitem->fd = -1; #endif cf_atomic_int_incr(&g_async_stats.dropouts); //Continue down with what we do during an Ok //Inform the caller that there is no response from the server for this workitem. //No response does not mean that the work is not done. The work might be //successfully completed on the server side, we just didnt get response for it. if (g_fail_cb_fn) { g_fail_cb_fn(workitem->udata, rv, workitem->starttime); } Ok: //rd_buf may not be there during an error condition. if (rd_buf && (rv == CITRUSLEAF_OK)) { //As of now, async functionality is there only for put call. //In put call, we do not get anything back other than the trid field. 
//So, just pass variable to get back the trid and ignore others. if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) { rv = CITRUSLEAF_FAIL_UNKNOWN; } else { rv = msg.m.result_code; if (workitem->trid != acktrid) { #if ONEASYNCFD //It is likely that we may get response for a different trid. //Just delete the correct one from the queue //put back the current workitem back in the queue. shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem); cf_queue_delete(q_to_use, &tmpworkitem, true); cf_queue_push(q_to_use, &workitem); //From now on workitem will be the one for which we got ack workitem = tmpworkitem; #endif #ifdef DEBUG cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d", workitem->trid, acktrid, workitem->fd); #endif } } if (g_success_cb_fn) { g_success_cb_fn(workitem->udata, rv, workitem->starttime); } } //Remember to put back the FD into the pool, if it is re-usable. if (workitem->fd != -1) { cl_cluster_node_fd_put(workitem->node, workitem->fd, true); } //Also decrement the reference count for this node cl_cluster_node_put(workitem->node); #if ONEASYNCFD //Delete the item from the global hashtable if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) { #if DEBUG cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid); #endif } #endif //Push it back into the free pool. If the attempt fails, free it. if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) { free(workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } // Kick this thread out if its ID is greater than total if (thread_id > cf_atomic32_get(g_async_num_threads)) { cf_atomic32_decr(&g_thread_count); return NULL; } }//The infnite loop return NULL; }
/* cf_queue_pop
 * Remove the element at the head of the queue and copy it into buf.
 *
 * q       - the queue (NULL is rejected with -1)
 * buf     - destination, must hold at least q->elementsz bytes
 * ms_wait - CF_QUEUE_NOWAIT : return CF_QUEUE_EMPTY at once if empty
 *           > 0             : wait at most that many milliseconds
 *           anything else   : wait forever (CF_QUEUE_FOREVER and, as
 *                             documented for this call, any other value < 0)
 *
 * Waiting is only possible on thread-safe queues built without
 * EXTERNAL_LOCKS; with EXTERNAL_LOCKS anything but CF_QUEUE_NOWAIT is
 * rejected with -1.
 *
 * Returns 0 when an element was copied, CF_QUEUE_EMPTY when the queue is (or
 * stays) empty, -1 on bad arguments.
 * */
int
cf_queue_pop(cf_queue *q, void *buf, int ms_wait)
{
	if (NULL == q) {
		cf_error("cf_queue_pop: try passing in a queue");
		return(-1);
	}

#ifdef EXTERNAL_LOCKS
	if (ms_wait != CF_QUEUE_NOWAIT) {   // this implementation won't wait
		cf_error("cf_queue_pop: only nowait supported");
		return(-1);
	}
#endif // EXTERNAL_LOCKS

	QUEUE_LOCK(q);

	// Absolute deadline, only meaningful (and only read) when ms_wait > 0.
	struct timespec tp;
	if (ms_wait > 0) {
#ifdef OSX
		// using the cl generic functions defined in cf_clock.h. It is going
		// to have slightly less resolution than the pure linux version.
		// NOTE(review): presumes cf_getms() counts from the same epoch the
		// condition variable's clock uses - confirm.
		uint64_t endms = cf_getms() + ms_wait;
		tp.tv_sec = endms / 1000;
		// Sub-second part of the DEADLINE, not of the wait interval.
		tp.tv_nsec = (endms % 1000) * 1000000;
#else // linux
		clock_gettime( CLOCK_REALTIME, &tp);
		tp.tv_sec += ms_wait / 1000;
		tp.tv_nsec += (ms_wait % 1000) * 1000000;
		// tv_nsec must stay below one second, exactly 1000000000 is invalid.
		if (tp.tv_nsec >= 1000000000) {
			tp.tv_nsec -= 1000000000;
			tp.tv_sec++;
		}
#endif
	}

	/* FIXME error checking */
	/* Note that we apparently have to use a while() loop.  Careful reading
	 * of the pthread_cond_signal() documentation says that AT LEAST ONE
	 * waiting thread will be awakened... */
	if (q->threadsafe) {
#ifdef EXTERNAL_LOCKS
		if (CF_Q_EMPTY(q)) {
			QUEUE_UNLOCK(q);
			return(CF_QUEUE_EMPTY);
		}
#else
		while (CF_Q_EMPTY(q)) {
			if (CF_QUEUE_NOWAIT == ms_wait) {
				pthread_mutex_unlock(&q->LOCK);
				return(CF_QUEUE_EMPTY);
			}
			else if (ms_wait > 0) {
				// A zero return is just a wake-up (possibly spurious, or the
				// element was taken by another consumer): keep waiting for
				// the same deadline. Give up only when the wait itself
				// fails or times out.
				if (0 != pthread_cond_timedwait(&q->CV, &q->LOCK, &tp) && CF_Q_EMPTY(q)) {
					pthread_mutex_unlock(&q->LOCK);
					return(CF_QUEUE_EMPTY);
				}
			}
			else {
				// CF_QUEUE_FOREVER, or any other non-positive value: never
				// hand an uninitialized deadline to the timed wait.
				pthread_cond_wait(&q->CV, &q->LOCK);
			}
		}
#endif // EXTERNAL_LOCKS
	} else if (CF_Q_EMPTY(q)) {
		// Pair the QUEUE_LOCK above, exactly like the success path does.
		QUEUE_UNLOCK(q);
		return(CF_QUEUE_EMPTY);
	}

	memcpy(buf, CF_Q_ELEM_PTR(q,q->read_offset), q->elementsz);
	q->read_offset++;

	// interesting idea - this probably keeps the cache fresher
	// because the queue is fully empty just make it all zero
	if (q->read_offset == q->write_offset) {
		q->read_offset = q->write_offset = 0;
	}

	QUEUE_UNLOCK(q);

	return(0);
}
// Set of threads which talk to client over the connection for doing the needful // processing. Note that once fd is assigned to a thread all the work on that fd // is done by that thread. Fair fd usage is expected of the client. First thread // is special - also does accept [listens for new connections]. It is the only // thread which does it. void * thr_demarshal(void *arg) { cf_socket_cfg *s, *ls; // Create my epoll fd, register in the global list. struct epoll_event ev; int nevents, i, n, epoll_fd; cf_clock last_fd_print = 0; #if defined(USE_SYSTEMTAP) uint64_t nodeid = g_config.self_node; #endif // Early stage aborts; these will cause faults in process scope. cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument"); s = &g_config.socket; ls = &g_config.localhost_socket; #ifdef USE_JEM int orig_arena; if (0 > (orig_arena = jem_get_arena())) { cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!"); } else { cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena); } #endif // Figure out my thread index. pthread_t self = pthread_self(); int thr_id; for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) { if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) break; } if (thr_id == MAX_DEMARSHAL_THREADS) { cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!"); return(0); } // First thread accepts new connection at interface socket. 
if (thr_id == 0) { demarshal_file_handle_init(); epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); memset(&ev, 0, sizeof (ev)); ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = s->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port); if (ls->sock) { ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = ls->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port); } } else { epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); } g_demarshal_args->epoll_fd[thr_id] = epoll_fd; cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id); int id_cntr = 0; // Demarshal transactions from the socket. for ( ; ; ) { struct epoll_event events[EPOLL_SZ]; cf_detail(AS_DEMARSHAL, "calling epoll"); nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1); if (0 > nevents) { cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno)); } cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents); uint64_t now_ns = cf_getns(); uint64_t now_ms = now_ns / 1000000; // Iterate over all events. for (i = 0; i < nevents; i++) { if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) { // Accept new connections on the service socket. int csocket = -1; struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); char cpaddr[64]; if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) { // This means we're out of file descriptors - could be a SYN // flood attack or misbehaving client. 
Eventually we'd like // to make the reaper fairer, but for now we'll just have to // ignore the accept error and move on. if ((errno == EMFILE) || (errno == ENFILE)) { if (last_fd_print != (cf_getms() / 1000L)) { cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit"); last_fd_print = cf_getms() / 1000L; } continue; } cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); } // Get the client IP address in string form. if (caddr.sin_family == AF_INET) { if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else if (caddr.sin_family == AF_INET6) { struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr; if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else { cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family); } cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket); // Validate the limit of protocol connections we allow. uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed; if (conns_open > g_config.n_proto_fd_max) { if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); last_fd_print = cf_getms(); } shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Set the socket to nonblocking. if (-1 == cf_socket_set_nonblocking(csocket)) { cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Create as_file_handle and queue it up in epoll_fd for further // communication on one of the demarshal threads. 
as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); if (!fd_h) { cf_crash(AS_DEMARSHAL, "malloc"); } sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port)); fd_h->fd = csocket; fd_h->last_used = cf_getms(); fd_h->reap_me = false; fd_h->trans_active = false; fd_h->proto = 0; fd_h->proto_unread = 0; fd_h->fh_info = 0; fd_h->security_filter = as_security_filter_create(); // Insert into the global table so the reaper can manage it. Do // this before queueing it up for demarshal threads - once // EPOLL_CTL_ADD is done it's difficult to back out (if insert // into global table fails) because fd state could be anything. cf_rc_reserve(fd_h); pthread_mutex_lock(&g_file_handle_a_LOCK); int j; bool inserted = true; if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { inserted = false; } else { g_file_handle_a[j] = fd_h; } pthread_mutex_unlock(&g_file_handle_a_LOCK); if (!inserted) { cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; cf_rc_free(fd_h); // will free even with ref-count of 2 } else { // Place the client socket in the event queue. memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ; ev.data.ptr = fd_h; // Round-robin pick up demarshal thread epoll_fd and add // this new connection to epoll. 
int id; while (true) { id = (id_cntr++) % g_demarshal_args->num_threads; if (g_demarshal_args->epoll_fd[id] != 0) { break; } } fd_h->epoll_fd = g_demarshal_args->epoll_fd[id]; if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) { cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads); pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); } else { cf_atomic_int_incr(&g_config.proto_connections_opened); } } } else { bool has_extra_ref = false; as_file_handle *fd_h = events[i].data.ptr; if (fd_h == 0) { cf_info(AS_DEMARSHAL, "event with null handle, continuing"); goto NextEvent; } cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events); // Process data on an existing connection: this might be more // activity on an already existing transaction, so we have some // state to manage. as_proto *proto_p = 0; int fd = fd_h->fd; if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events); // no longer in use: out of epoll etc goto NextEvent_FD_Cleanup; } if (fd_h->trans_active) { goto NextEvent; } // If pointer is NULL, then we need to create a transaction and // store it in the buffer. if (fd_h->proto == NULL) { as_proto proto; int sz; /* Get the number of available bytes */ if (-1 == ioctl(fd, FIONREAD, &sz)) { cf_info(AS_DEMARSHAL, "unable to get number of available bytes"); goto NextEvent_FD_Cleanup; } // If we don't have enough data to fill the message buffer, // just wait and we'll come back to this one. However, we'll // let messages with zero size through, since they are // likely errors. We don't cleanup the FD in this case since // we'll get more data on it. 
if (sz < sizeof(as_proto) && sz != 0) { goto NextEvent; } // Do a preliminary read of the header into a stack- // allocated structure, so that later on we can allocate the // entire message buffer. if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) { cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno); goto NextEvent_FD_Cleanup; } if (proto.version != PROTO_VERSION && // For backward compatibility, allow version 0 with // security messages. ! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) { cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", fd_h->client, proto.version); goto NextEvent_FD_Cleanup; } // Swap the necessary elements of the as_proto. as_proto_swap(&proto); if (proto.sz > PROTO_SIZE_MAX) { cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64, fd_h->client, PROTO_SIZE_MAX, proto.sz); goto NextEvent_FD_Cleanup; } #ifdef USE_JEM // Attempt to peek the namespace and set the JEMalloc arena accordingly. size_t peeked_data_sz = 0; size_t min_field_sz = sizeof(uint32_t) + sizeof(char); size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz; size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.) uint8_t peekbuf[peekbuf_sz]; if (PROTO_TYPE_AS_MSG == proto.type) { size_t offset = sizeof(as_msg); // Number of bytes to peek from the socket. // size_t peek_sz = peekbuf_sz; // Peak up to the size of the peek buffer. size_t peek_sz = MIN(proto.sz, peekbuf_sz); // Peek only up to the minimum necessary number of bytes. if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) { // That's actually legitimate. The as_proto may have gone into one // packet, the as_msg into the next one, which we haven't yet received. // This just "never happened" without async. 
cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz); } if (peeked_data_sz > min_as_msg_sz) { // cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz); if (peeked_data_sz > proto.sz) { cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd); log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz); goto NextEvent_FD_Cleanup; } if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) { jem_set_arena(orig_arena); } else { uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0; bool found = false; // cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields); while (!found && (field_num < n_fields)) { as_msg_field *field = (as_msg_field *) (&peekbuf[offset]); uint32_t value_sz = ntohl(field->field_sz) - 1; // cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset); // cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz); // cf_debug(AS_DEMARSHAL, "\ttype %d", field->type); if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) { if (value_sz >= AS_ID_NAMESPACE_SZ) { cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz); goto NextEvent_FD_Cleanup; } char ns[AS_ID_NAMESPACE_SZ]; found = true; memcpy(ns, field->data, value_sz); ns[value_sz] = '\0'; // cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num); jem_set_arena(as_namespace_get_jem_arena(ns)); } else { // cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type); field_num++; offset += sizeof(as_msg_field) + value_sz; if (offset >= peeked_data_sz) { break; } } } if (!found) { cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz); jem_set_arena(orig_arena); } } } else { jem_set_arena(orig_arena); } } else { jem_set_arena(orig_arena); } #endif // Allocate the complete message buffer. 
proto_p = cf_malloc(sizeof(as_proto) + proto.sz); cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno)); memcpy(proto_p, &proto, sizeof(as_proto)); #ifdef USE_JEM // Jam in the peeked data. if (peeked_data_sz) { memcpy(proto_p->data, &peekbuf, peeked_data_sz); } fd_h->proto_unread = proto_p->sz - peeked_data_sz; #else fd_h->proto_unread = proto_p->sz; #endif fd_h->proto = (void *) proto_p; } else { proto_p = fd_h->proto; } if (fd_h->proto_unread > 0) { // Read the data. n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0); if (0 >= n) { if (errno == EAGAIN) { continue; } cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno)); goto NextEvent_FD_Cleanup; } // Decrement bytes-unread counter. cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread); fd_h->proto_unread -= n; } // Check for a finished read. if (0 == fd_h->proto_unread) { // It's only really live if it's injecting a transaction. fd_h->last_used = now_ms; thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress fd_h->proto = 0; fd_h->proto_unread = 0; // INIT_TR as_transaction tr; as_transaction_init(&tr, NULL, (cl_msg *)proto_p); cf_rc_reserve(fd_h); has_extra_ref = true; tr.proto_fd_h = fd_h; tr.start_time = now_ns; // set transaction start time tr.preprocessed = false; if (! as_proto_is_valid_type(proto_p)) { cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); // We got a proto message type we don't recognize, so it // may not do any good to send back an as_msg error, but // it's the best we can do. At least we can keep the fd. 
as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } if (g_config.microbenchmarks) { histogram_insert_data_point(g_config.demarshal_hist, now_ns); tr.microbenchmark_time = cf_getns(); } // Check if it's compressed. if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { // Decompress it - allocate buffer to hold decompressed // packet. uint8_t *decompressed_buf = NULL; size_t decompressed_buf_size = 0; int rv = 0; if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv); cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Count the packets. cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1); // Free the compressed packet since we'll be using the // decompressed packet from now on. cf_free(proto_p); proto_p = NULL; // Get original packet. tr.msgp = (cl_msg *)decompressed_buf; as_proto_swap(&(tr.msgp->proto)); if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } } // Security protocol transactions. if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) { as_security_transact(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Info protocol requests. 
if (tr.msgp->proto.type == PROTO_TYPE_INFO) { if (as_info(&tr)) { cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer"); goto NextEvent_FD_Cleanup; } cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp); // Fast path for batch requests. if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) { as_batch_queue_task(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Either process the transaction directly in this thread, // or queue it for processing by another thread (tsvc/info). if (0 != thr_tsvc_process_or_enqueue(&tr)) { cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread"); goto NextEvent_FD_Cleanup; } else { cf_atomic_int_incr(&g_config.proto_transactions); } } // Jump the proto message free & FD cleanup. If we get here, the // above operations went smoothly. The message free & FD cleanup // job is handled elsewhere as directed by // thr_tsvc_process_or_enqueue(). goto NextEvent; NextEvent_FD_Cleanup: // If we allocated memory for the incoming message, free it. if (proto_p) { cf_free(proto_p); fd_h->proto = 0; } // If fd has extra reference for transaction, release it. if (has_extra_ref) { cf_rc_release(fd_h); } // Remove the fd from the events list. if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) { cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)", fd, epoll_fd, errno, cf_strerror(errno)); } pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); NextEvent: ; } // We should never be canceled externally, but just in case... pthread_testcancel(); } } return NULL; }
/**
 * Send a prepared command to a cluster node and parse the reply, retrying on
 * transient failures.
 *
 * @param cluster            cluster to talk to
 * @param err                error holder; reset when a retry finally succeeds
 * @param cn                 target description: either a fixed node (cn->node,
 *                           not released here) or the ns/digest/write/replica
 *                           used to look up a node on every attempt
 * @param command            wire buffer; on retry the 32-bit big-endian
 *                           timeout at byte offset 22 is rewritten with the
 *                           time remaining
 * @param command_len        number of bytes of command to send
 * @param timeout_ms         total time budget, turned into a deadline by
 *                           as_socket_deadline()
 * @param retry              maximum number of retries
 * @param parse_results_fn   called to read and parse the server reply
 * @param parse_results_data opaque argument handed to parse_results_fn
 *
 * @return the status produced by parse_results_fn when a reply was read, or
 *         AEROSPIKE_ERR_TIMEOUT when retries or time ran out.
 *
 * A retry happens when no node is available (10 ms pause), no connection can
 * be obtained (1 ms pause), the socket write fails, or parsing reports
 * AEROSPIKE_ERR_TIMEOUT. Connections that may hold unread data are closed
 * rather than returned to the pool.
 */
as_status
as_command_execute(as_cluster* cluster, as_error * err, as_command_node* cn, uint8_t* command, size_t command_len,
				   uint32_t timeout_ms, uint32_t retry,
				   as_parse_results_fn parse_results_fn, void* parse_results_data
)
{
	uint64_t deadline_ms = as_socket_deadline(timeout_ms);
	uint32_t sleep_between_retries_ms = 0;
	uint32_t failed_nodes = 0;
	uint32_t failed_conns = 0;
	uint32_t iterations = 0;
	bool release_node;

	// Execute command until successful, timed out or maximum iterations have been reached.
	while (true) {
		as_node* node;

		if (cn->node) {
			node = cn->node;
			release_node = false;
		}
		else {
			node = as_node_get(cluster, cn->ns, cn->digest, cn->write, cn->replica);
			release_node = true;
		}

		if (!node) {
			failed_nodes++;
			sleep_between_retries_ms = 10;
			goto Retry;
		}

		int fd;
		as_status status = as_node_get_connection(err, node, deadline_ms, &fd);

		if (status) {
			if (release_node) {
				as_node_release(node);
			}
			failed_conns++;
			sleep_between_retries_ms = 1;
			goto Retry;
		}

		// Send command.
		status = as_socket_write_deadline(err, fd, command, command_len, deadline_ms);

		if (status) {
			// Socket errors are considered temporary anomalies.  Retry.
			// Close socket to flush out possible garbage.  Do not put back in pool.
			as_close(fd);
			if (release_node) {
				as_node_release(node);
			}
			sleep_between_retries_ms = 0;
			goto Retry;
		}

		// Parse results returned by server.
		status = parse_results_fn(err, fd, deadline_ms, parse_results_data);

		if (status == AEROSPIKE_OK) {
			// Reset error code if retry had occurred.
			if (iterations > 0) {
				as_error_reset(err);
			}
		}
		else {
			switch (status) {
				// Retry on timeout.
				case AEROSPIKE_ERR_TIMEOUT:
					as_close(fd);
					if (release_node) {
						as_node_release(node);
					}
					sleep_between_retries_ms = 0;
					goto Retry;

				// Close socket on errors that can leave unread data in socket.
				case AEROSPIKE_ERR_QUERY_ABORTED:
				case AEROSPIKE_ERR_SCAN_ABORTED:
				case AEROSPIKE_ERR_CLIENT_ABORT:
				case AEROSPIKE_ERR_CLIENT:
					as_close(fd);
					if (release_node) {
						as_node_release(node);
					}
					err->code = status;
					return status;

				default:
					err->code = status;
					break;
			}
		}

		// Put connection back in pool.
		as_node_put_connection(node, fd, cluster->conn_queue_size);

		// Release resources.
		if (release_node) {
			as_node_release(node);
		}
		return status;

Retry:
		// Check if max retries reached.
		if (++iterations > retry) {
			break;
		}

		// Check for client timeout.
		if (deadline_ms > 0) {
			// Signed 64-bit arithmetic: once the deadline has passed the
			// result is simply negative, with no reliance on how a wrapped
			// unsigned value narrows to int.
			int64_t remaining_ms = (int64_t)deadline_ms - (int64_t)cf_getms() - (int64_t)sleep_between_retries_ms;

			if (remaining_ms <= 0) {
				break;
			}

			// Reset timeout in send buffer (destined for server). The field
			// is not 4-byte aligned, so copy the bytes instead of storing
			// through a casted pointer.
			uint32_t timeout_be = cf_swap_to_be32((uint32_t)remaining_ms);
			memcpy(command + 22, &timeout_be, sizeof(timeout_be));
		}

		if (sleep_between_retries_ms > 0) {
			// Sleep before trying again.
			usleep(sleep_between_retries_ms * 1000);
		}
	}

	return as_error_update(err, AEROSPIKE_ERR_TIMEOUT,
		"Client timeout: timeout=%u iterations=%u failedNodes=%u failedConns=%u",
		timeout_ms, iterations, failed_nodes, failed_conns);
}
// Put batch request on a separate batch queue. int as_batch(as_transaction* tr) { as_msg* msg = &tr->msgp->msg; as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE); if (! nsfp) { cf_warning(AS_BATCH, "Batch namespace is required."); return -1; } as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY); if (! dfp) { cf_warning(AS_BATCH, "Batch digests are required."); return -1; } uint n_digests = dfp->field_sz / sizeof(cf_digest); if (n_digests > g_config.batch_max_requests) { cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests); return -1; } batch_transaction btr; btr.trid = tr->trid; btr.end_time = tr->end_time; btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA); btr.ns = as_namespace_get_bymsgfield(nsfp); if (! btr.ns) { cf_warning(AS_BATCH, "Batch namespace is required."); return -1; } // Create the master digest table. btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests)); if (! btr.digests) { cf_warning(AS_BATCH, "Failed to allocate memory for batch digests."); return -1; } batch_digests* bmd = btr.digests; bmd->n_digests = n_digests; uint8_t* digest_field_data = dfp->data; for (int i = 0; i < n_digests; i++) { bmd->digest[i].done = false; bmd->digest[i].node = 0; memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest)); digest_field_data += sizeof(cf_digest); } btr.binlist = as_binlist_from_op(msg); btr.fd_h = tr->proto_fd_h; tr->proto_fd_h = 0; btr.fd_h->last_used = cf_getms(); cf_atomic_int_incr(&g_config.batch_initiate); cf_queue_push(g_batch_queue, &btr); return 0; }
/* cf_socket_init_client
 * Connect a socket to a remote endpoint.
 *
 * Creates an AF_INET socket of type s->proto, connects it to s->addr:s->port
 * and stores the descriptor in s->sock. The connect is started in
 * non-blocking mode and then waited for INLINE with epoll, so the call blocks
 * the caller until the connection completes, fails, or `timeout` has elapsed
 * (measured with cf_getms(), i.e. milliseconds).
 *
 * On success the socket has TCP_NODELAY set and is switched back to blocking
 * mode; returns 0. On failure returns -1; for connect failures the socket is
 * closed and s->sock is set to -1. */
int
cf_socket_init_client(cf_socket_cfg *s, int timeout)
{
	cf_assert(s, CF_SOCKET, CF_CRITICAL, "invalid argument");

	if (0 > (s->sock = socket(AF_INET, s->proto, 0))) {
		cf_warning(CF_SOCKET, "socket: %s", cf_strerror(errno));
		return(-1);
	}

	fcntl(s->sock, F_SETFD, FD_CLOEXEC);  /* close on exec */
	fcntl(s->sock, F_SETFL, O_NONBLOCK);  /* non-blocking */

	// Try tuning the window: must be done before connect
//	int flag = (1024 * 32);
//	setsockopt(s->sock, SOL_SOCKET, SO_SNDBUF, &flag, sizeof(flag) );
//	setsockopt(s->sock, SOL_SOCKET, SO_RCVBUF, &flag, sizeof(flag) );

	memset(&s->saddr, 0, sizeof(s->saddr));
	s->saddr.sin_family = AF_INET;

	int rv = inet_pton(AF_INET, s->addr, &s->saddr.sin_addr.s_addr);

	if (rv < 0) {
		cf_warning(CF_SOCKET, "inet_pton: %s", cf_strerror(errno));
		close(s->sock);
		return(-1);
	} else if (rv == 0) {
		cf_warning(CF_SOCKET, "inet_pton: invalid ip %s", s->addr);
		close(s->sock);
		return(-1);
	}

	s->saddr.sin_port = htons(s->port);

	rv = connect(s->sock, (struct sockaddr *)&s->saddr, sizeof(s->saddr));

	// Capture errno immediately: the logging call below may overwrite it
	// before the EINPROGRESS test is made.
	int connect_errno = errno;

	cf_debug(CF_SOCKET, "connect: rv %d errno %s", rv, cf_strerror(connect_errno));

	if (rv < 0) {
		int epoll_fd = -1;

		// Make sure the failure log reports connect()'s own error.
		errno = connect_errno;

		if (connect_errno == EINPROGRESS) {
			cf_clock start = cf_getms();

			if (0 > (epoll_fd = epoll_create(1))) {
				cf_warning(CF_SOCKET, "epoll_create() failed (errno %d: \"%s\")", errno, cf_strerror(errno));
				goto Fail;
			}

			struct epoll_event event;
			memset(&event, 0, sizeof(struct epoll_event));
			event.data.fd = s->sock;
			event.events = EPOLLOUT;

			if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &event)) {
				cf_warning(CF_SOCKET, "epoll_ctl(ADD) of client socket failed (errno %d: \"%s\")", errno, cf_strerror(errno));
				goto Fail;
			}

			int tries = 0;

			do {
				int nevents = 0;
				int wait_ms = 1;
				struct epoll_event events[1];
				int max_events = (int)(sizeof(events) / sizeof(events[0]));

				if (0 > (nevents = epoll_wait(epoll_fd, events, max_events, wait_ms))) {
					if (errno == EINTR) {
						cf_debug(CF_SOCKET, "epoll_wait() on client socket encountered EINTR ~~ Retrying!");
						goto Retry;
					} else {
						cf_warning(CF_SOCKET, "epoll_wait() on client socket failed (errno %d: \"%s\") ~~ Failing!", errno, cf_strerror(errno));
						goto Fail;
					}
				} else {
					if (nevents == 0) {
						cf_debug(CF_SOCKET, "epoll_wait() returned no events ~~ Retrying!");
						goto Retry;
					}

					if (nevents != 1) {
						cf_warning(CF_SOCKET, "epoll_wait() returned %d events ~~ only 1 expected, so ignoring others!", nevents);
					}

					if (events[0].data.fd == s->sock) {
						if (events[0].events & EPOLLOUT) {
							// Writability alone does not prove the connect succeeded: a
							// failed connect is reported as writable too. SO_ERROR holds
							// the real outcome of the asynchronous connect.
							int so_error = 0;
							socklen_t so_error_len = sizeof(so_error);

							if (0 > getsockopt(s->sock, SOL_SOCKET, SO_ERROR, &so_error, &so_error_len)) {
								cf_warning(CF_SOCKET, "getsockopt(SO_ERROR) on client socket failed (errno %d: \"%s\")", errno, cf_strerror(errno));
								goto Fail;
							}

							if (so_error != 0) {
								cf_debug(CF_SOCKET, "epoll_wait() on client socket detected connect error %d ~~ Failing!", so_error);
								errno = so_error;
								goto Fail;
							}

							cf_debug(CF_SOCKET, "epoll_wait() on client socket ready for write detected ~~ Succeeding!");
						} else {
							// (Note: ERR and HUP events are automatically waited for as well.)
							if (events[0].events & (EPOLLERR | EPOLLHUP)) {
								cf_debug(CF_SOCKET, "epoll_wait() on client socket detected failure event 0x%x ~~ Failing!", events[0].events);
							} else {
								cf_warning(CF_SOCKET, "epoll_wait() on client socket detected non-write events 0x%x ~~ Failing!", events[0].events);
							}
							goto Fail;
						}
					} else {
						cf_warning(CF_SOCKET, "epoll_wait() on client socket returned event on unknown socket %d ~~ Retrying!", events[0].data.fd);
						goto Retry;
					}

					if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_DEL, s->sock, &event)) {
						cf_warning(CF_SOCKET, "epoll_ctl(DEL) on client socket failed (errno %d: \"%s\")", errno, cf_strerror(errno));
					}
					close(epoll_fd);
					goto Success;
				}

Retry:
				cf_debug(CF_SOCKET, "Connect epoll loop: Retry #%d", tries++);

				if (start + timeout < cf_getms()) {
					cf_warning(CF_SOCKET, "Error in delayed connect() to %s:%d: timed out", s->addr, s->port);
					errno = ETIMEDOUT;
					goto Fail;
				}
			} while (1);
		}

Fail:
		cf_debug(CF_SOCKET, "connect failed to %s:%d : %s", s->addr, s->port, cf_strerror(errno));

		// Descriptor 0 is valid, so only -1 means "no epoll instance to close".
		if (epoll_fd >= 0) {
			close(epoll_fd);
		}

		close(s->sock);
		s->sock = -1;
		return(-1);
	} else {
		cf_debug(CF_SOCKET, "client socket connect() to %s:%d in 1 try!", s->addr, s->port);
	}

Success:	;
	// regarding this: calling here doesn't seem terribly effective.
	// on the fabric threads, it seems important to set no-delay much later
	int flag = 1;
	setsockopt(s->sock, SOL_TCP, TCP_NODELAY, &flag, sizeof(flag));

	long farg = fcntl(s->sock, F_GETFL, 0);
	fcntl(s->sock, F_SETFL, farg & (~O_NONBLOCK));  /* blocking again */

	return(0);
}
// Keep track of the connections, since they're precious. Kill anything that // hasn't been used in a while. The file handle array keeps a reference count, // and allows a reaper to run through and find the ones to reap. The table is // only written by the demarshal threads, and only read by the reaper thread. void * thr_demarshal_reaper_fn(void *arg) { uint64_t last = cf_getms(); while (true) { uint64_t now = cf_getms(); uint inuse_cnt = 0; uint64_t kill_ms = g_config.proto_fd_idle_ms; bool refresh = false; if (now - last > (uint64_t)(g_config.sec_cfg.privilege_refresh_period * 1000)) { refresh = true; last = now; } pthread_mutex_lock(&g_file_handle_a_LOCK); for (int i = 0; i < g_file_handle_a_sz; i++) { if (g_file_handle_a[i]) { as_file_handle *fd_h = g_file_handle_a[i]; if (refresh) { as_security_refresh(fd_h); } // Reap, if asked to. if (fd_h->reap_me) { cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", fd_h->fd); g_file_handle_a[i] = 0; cf_queue_push(g_freeslot, &i); as_release_file_handle(fd_h); fd_h = 0; } // Reap if past kill time. else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) { if (fd_h->fh_info & FH_INFO_DONOT_REAP) { cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", fd_h->fd); inuse_cnt++; continue; } shutdown(fd_h->fd, SHUT_RDWR); // will trigger epoll errors cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", fd_h->fd); g_file_handle_a[i] = 0; cf_queue_push(g_freeslot, &i); as_release_file_handle(fd_h); fd_h = 0; cf_atomic_int_incr(&g_config.reaper_count); } else { inuse_cnt++; } } } pthread_mutex_unlock(&g_file_handle_a_LOCK); if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) { cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse", g_file_handle_a_sz, inuse_cnt); } // Validate the system statistics. 
if (g_config.proto_connections_opened - g_config.proto_connections_closed != inuse_cnt) { cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %d in stats vs %d calculated", g_config.proto_connections_opened - g_config.proto_connections_closed, inuse_cnt); } sleep(1); } return NULL; }