/**
 * Initialize UDF from tr->udata. This is for internal UDF transactions.
 *
 * Returns:
 *   0 if found
 *  -1 if not found
 */
int
udf_rw_call_init_internal(udf_call *call, as_transaction *tr)
{
    udf_call *ucall = NULL;

    if (tr->udata.req_type == UDF_SCAN_REQUEST) {
        ucall = as_scan_get_udf_call(tr->udata.req_udata);
    }
    else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
        ucall = as_query_get_udf_call(tr->udata.req_udata);
    }

    if (ucall) {
        // Note - size by the destination buffer, not the source.
        strncpy(call->def.filename, ucall->def.filename,
                sizeof(call->def.filename));
        strncpy(call->def.function, ucall->def.function,
                sizeof(call->def.function));

        call->tr = tr;
        call->def.arglist = ucall->def.arglist;
        call->def.type = ucall->def.type;

        if (tr->udata.req_type == UDF_SCAN_REQUEST) {
            cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
        }
        else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
            cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
        }

        return 0;
    }

    return -1;
}
// If there's an element with specified digest in the tree, return a locked
// and reserved reference to it in index_ref.
//
// Returns:
//   0 - found (reference returned in index_ref)
//  -1 - not found (index_ref untouched)
int
as_index_get_vlock(as_index_tree *tree, cf_digest *keyd,
        as_index_ref *index_ref)
{
    pthread_mutex_lock(&tree->lock);

    int rv = as_index_search_lockless(tree, keyd, &index_ref->r,
            &index_ref->r_h);

    if (rv != 0) {
        pthread_mutex_unlock(&tree->lock);
        return rv;
    }

    as_index_reserve(index_ref->r);
    cf_atomic_int_incr(&g_config.global_record_ref_count);

    pthread_mutex_unlock(&tree->lock);

    if (! index_ref->skip_lock) {
        olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
        cf_atomic_int_incr(&g_config.global_record_lock_count);
    }

    return 0;
}
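/*
 * Illustrative caller sketch (not part of the original source): shows the
 * reserve/lock discipline the function above implies. Assumes
 * as_record_done() is the standard way to drop both the record reference and
 * the olock, as the reduce path below requires of its callbacks.
 * "example_read_record" is a hypothetical name.
 */
static void
example_read_record(as_index_tree *tree, as_namespace *ns, cf_digest *keyd)
{
    as_index_ref r_ref;
    r_ref.skip_lock = false;

    if (as_index_get_vlock(tree, keyd, &r_ref) != 0) {
        return; // not found - index_ref untouched
    }

    // ... safely use r_ref.r here - it's reserved and olock'd ...

    // Must release the reference and unlock - as_record_done() does both.
    as_record_done(&r_ref, ns);
}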
void
as_transaction_error(as_transaction* tr, uint32_t error_code)
{
    if (tr->proto_fd_h) {
        if (tr->batch_shared) {
            as_batch_add_error(tr->batch_shared, tr->batch_index, error_code);
            // Clear this transaction's msgp so calling code does not free it.
            tr->msgp = 0;
        }
        else {
            as_msg_send_reply(tr->proto_fd_h, error_code, 0, 0, NULL, NULL, 0,
                    NULL, NULL, as_transaction_trid(tr), NULL);
            tr->proto_fd_h = 0;
            MICROBENCHMARK_HIST_INSERT_P(error_hist);
            cf_atomic_int_incr(&g_config.err_tsvc_requests);

            if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) {
                cf_atomic_int_incr(&g_config.err_tsvc_requests_timeout);
            }
        }
    }
    else if (tr->proxy_msg) {
        as_proxy_send_response(tr->proxy_node, tr->proxy_msg, error_code, 0, 0,
                NULL, NULL, 0, NULL, as_transaction_trid(tr), NULL);
        tr->proxy_msg = NULL;
    }
    else if (tr->udata.req_udata) {
        if (udf_rw_needcomplete(tr)) {
            udf_rw_complete(tr, error_code, __FILE__, __LINE__);
        }
    }
}
void
as_proxy_set_stat_counters(int rv)
{
    if (rv == 0) {
        cf_atomic_int_incr(&g_config.stat_proxy_success);
    }
    else {
        cf_atomic_int_incr(&g_config.stat_proxy_errs);
    }
}
/*
 * Internal function: workhorse that sends the response back to the client
 * after UDF execution.
 *
 * Callers:
 *   send_success
 *   send_failure
 *
 * Assumption: the call is set up properly, pointing to the tr.
 *
 * Special handling: if this is a background scan UDF job, do not send any
 * response to the client. If it is a scan job, do not clean up the fd - the
 * scan thread does that after the scan is finished.
 */
static int
send_response(udf_call *call, const char *key, size_t klen, int vtype,
        void *val, size_t vlen)
{
    as_transaction *tr = call->transaction;
    as_namespace *ns = tr->rsv.ns;
    uint32_t generation = tr->generation;
    uint sp_sz = 1024 * 16;
    uint32_t void_time = 0;
    uint written_sz = 0;
    bool keep_fd = false;
    as_bin stack_bin;
    as_bin *bin = &stack_bin;

    // Space for the stack particles.
    uint8_t stack_particle_buf[sp_sz];
    uint8_t *sp_p = stack_particle_buf;

    if (call->udf_type == AS_SCAN_UDF_OP_BACKGROUND) {
        // Background UDF scan - do not send any result back.
        cf_detail(AS_UDF, "UDF: Background transaction, send no result back. "
                "Parent job id [%"PRIu64"]",
                ((tscan_job*)(tr->udata.req_udata))->tid);

        if (strncmp(key, "FAILURE", 8) == 0) {
            cf_atomic_int_incr(&((tscan_job*)(tr->udata.req_udata))->n_obj_udf_failed);
        }
        else if (strncmp(key, "SUCCESS", 8) == 0) {
            cf_atomic_int_incr(&((tscan_job*)(tr->udata.req_udata))->n_obj_udf_success);
        }

        return 0;
    }
    else if (call->udf_type == AS_SCAN_UDF_OP_UDF) {
        // Do not release the fd now - the scan releases it after all its
        // internal UDF transactions are done.
        cf_detail(AS_UDF, "UDF: Internal udf transaction, do not release fd");
        keep_fd = true;
    }

    if (0 != make_send_bin(ns, bin, &sp_p, sp_sz, key, klen, vtype, val, vlen)) {
        return -1;
    }

    // single_transaction_response() releases the file descriptor - take an
    // extra reference if we need to keep it.
    if (keep_fd && tr->proto_fd_h) {
        cf_rc_reserve(tr->proto_fd_h);
    }

    single_transaction_response(tr, ns, NULL/*ops*/, &bin, 1, generation,
            void_time, &written_sz, NULL);

    // Clean up.
    // TODO: check: is bin_inuse valid only when data_in_memory?
    // There must be another way to determine if the particle is used?
    if (as_bin_inuse(bin)) {
        as_particle_destroy(&stack_bin, ns->storage_data_in_memory);
    }

    if (sp_p != stack_particle_buf) {
        cf_free(sp_p);
    }

    return 0;
} // end send_response()
void
as_index_reduce_traverse(as_index_tree *tree, cf_arenax_handle r_h,
        cf_arenax_handle sentinel_h, as_index_ph_array *v_a)
{
    if (v_a->pos >= v_a->alloc_sz) {
        return;
    }

    as_index *r = RESOLVE_H(r_h);

    if (r->left_h != sentinel_h) {
        as_index_reduce_traverse(tree, r->left_h, sentinel_h, v_a);
    }

    // Re-check - the left subtree may have filled the array.
    if (v_a->pos >= v_a->alloc_sz) {
        return;
    }

    as_index_reserve(r);
    cf_atomic_int_incr(&g_config.global_record_ref_count);

    v_a->indexes[v_a->pos].r = r;
    v_a->indexes[v_a->pos].r_h = r_h;
    v_a->pos++;

    if (r->right_h != sentinel_h) {
        as_index_reduce_traverse(tree, r->right_h, sentinel_h, v_a);
    }
}
// Process one queue's batch requests.
void*
batch_process_queue(void* q_to_wait_on)
{
    cf_queue* worker_queue = (cf_queue*)q_to_wait_on;
    batch_transaction btr;
    uint64_t start;

    while (1) {
        if (cf_queue_pop(worker_queue, &btr, CF_QUEUE_FOREVER) != 0) {
            cf_crash(AS_BATCH, "Failed to pop from batch worker queue.");
        }

        // Check for timeouts.
        if (btr.end_time != 0 && cf_getns() > btr.end_time) {
            cf_atomic_int_incr(&g_config.batch_timeout);

            if (btr.fd_h) {
                as_msg_send_reply(btr.fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
                        0, 0, 0, 0, 0, 0, 0, btr.trid, NULL);
                btr.fd_h = 0;
            }

            batch_transaction_done(&btr);
            continue;
        }

        // Process batch request.
        start = cf_getns();
        batch_process_request(&btr);
        histogram_insert_data_point(g_config.batch_q_process_hist, start);
    }

    return 0;
}
// TODO - deprecate this when swap is moved out into thr_demarshal.c!
void
as_transaction_error_unswapped(as_transaction* tr, uint32_t error_code)
{
    if (tr->batch_shared) {
        as_batch_add_error(tr->batch_shared, tr->batch_index, error_code);
        // Clear this transaction's msgp so calling code does not free it.
        tr->msgp = 0;
    }
    else {
        as_msg_send_reply(tr->proto_fd_h, error_code, 0, 0, NULL, NULL, 0,
                NULL, NULL, 0, NULL);
        tr->proto_fd_h = 0;
        MICROBENCHMARK_HIST_INSERT_P(error_hist);
        cf_atomic_int_incr(&g_config.err_tsvc_requests);

        if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) {
            cf_atomic_int_incr(&g_config.err_tsvc_requests_timeout);
        }
    }
}
/**
 * Get UDF call object pointer from parent job via tr->from.iudf_orig.
 */
udf_call *
udf_rw_call_def_init_internal(udf_call *call, as_transaction *tr)
{
    // TODO - wouldn't need this if we bailed early on losing race vs. timeout.
    if (! tr->from.iudf_orig) {
        return NULL; // can happen on timeout
    }

    call->def = &tr->from.iudf_orig->def;

    if (tr->from.iudf_orig->type == UDF_SCAN_REQUEST) {
        cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
    }
    else if (tr->from.iudf_orig->type == UDF_QUERY_REQUEST) {
        cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
    }

    return call;
}
/*
 * Create a tree "stub" for the storage-has-index case.
 * Returns:  1 = new
 *           0 = success (found)
 *          -1 = fail
 */
int
as_index_ref_initialize(as_index_tree *tree, cf_digest *key,
        as_index_ref *index_ref, bool create_p, as_namespace *ns)
{
    // Allocate memory for the new node and set the node parameters.
    cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);

    if (0 == n_h) {
        // cf_debug(AS_INDEX," malloc failed ");
        return -1;
    }

    as_index *n = RESOLVE_H(n_h);
    n->key = *key;
    n->rc = 1;
    n->left_h = n->right_h = tree->sentinel_h;
    n->color = AS_RED;
    n->parent_h = tree->sentinel_h;

    if (AS_STORAGE_ENGINE_KV == ns->storage_type) {
        // Careful here - this is now unsigned.
        n->storage_key.kv.file_id = STORAGE_INVALID_FILE_ID;
    }
    else {
        cf_crash(AS_INDEX, "non-KV storage type ns %s key %p", ns->name, key);
    }

    index_ref->r = n;
    index_ref->r_h = n_h;

    if (! index_ref->skip_lock) {
        olock_vlock(g_config.record_locks, key, &(index_ref->olock));
        cf_atomic_int_incr(&g_config.global_record_lock_count);
    }

    as_index_reserve(n);
    cf_atomic_int_add(&g_config.global_record_ref_count, 2);

    int rv = ! as_storage_record_exists(ns, key);

    // Unlock if not found and we're not creating it.
    if (rv && ! create_p) {
        if (! index_ref->skip_lock) {
            pthread_mutex_unlock(index_ref->olock);
            cf_atomic_int_decr(&g_config.global_record_lock_count);
        }

        as_index_release(n);
        cf_atomic_int_decr(&g_config.global_record_ref_count);
        cf_arenax_free(tree->arena, n_h);
        index_ref->r = 0;
        index_ref->r_h = 0;
    }

    return rv;
}
// TODO - really? we can't hide this behind an XDR stub?
bool
xdr_allows_write(as_transaction* tr)
{
    if (as_transaction_is_xdr(tr)) {
        if (tr->rsv.ns->ns_allow_xdr_writes) {
            return true;
        }
    }
    else {
        if (tr->rsv.ns->ns_allow_nonxdr_writes) {
            return true;
        }
    }

    cf_atomic_int_incr(&tr->rsv.ns->n_fail_xdr_forbidden);

    return false;
}
//------------------------------------------------
// Runs in thr_add_readreqs, adds readreq objects
// to all read queues in an even, random spread.
//
static void*
run_add_readreqs(void* pv_unused)
{
    uint64_t count = 0;

    while (g_running) {
        if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
            fprintf(stdout, "ERROR: too many read reqs queued\n");
            fprintf(stdout, "drive(s) can't keep up - test stopped\n");
            g_running = false;
            break;
        }

        uint32_t random_queue_index = rand_32() % g_num_queues;
        uint32_t random_device_index = g_queue_per_device ?
                random_queue_index : rand_32() % g_num_devices;

        device* p_random_device = &g_devices[random_device_index];
        readreq* p_readreq = malloc(sizeof(readreq));

        p_readreq->p_device = p_random_device;
        p_readreq->offset = random_read_offset(p_random_device);
        p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
        p_readreq->start_time = cf_getus();

        cf_queue_push(g_readqs[random_queue_index].p_req_queue, &p_readreq);

        count++;

        int sleep_us = (int)
                (((count * 1000000) / g_read_reqs_per_sec) -
                        (cf_getus() - g_run_start_us));

        if (sleep_us > 0) {
            usleep((uint32_t)sleep_us);
        }

        if (sleep_us != 0) {
            fprintf(stdout, "%" PRIu64 ", sleep_us = %d\n", count, sleep_us);
        }
    }

    return 0;
}
// Process one batch request.
static void
batch_worker(void* udata)
{
    batch_transaction* btr = (batch_transaction*)udata;

    // Check for timeouts.
    if (btr->end_time != 0 && cf_getns() > btr->end_time) {
        cf_atomic_int_incr(&g_config.batch_timeout);

        if (btr->fd_h) {
            as_msg_send_reply(btr->fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
                    0, 0, 0, 0, 0, 0, 0, btr->trid, NULL);
            btr->fd_h = 0;
        }

        batch_transaction_done(btr, false);
        return;
    }

    // Process batch request.
    uint64_t start = cf_getns();
    batch_process_request(btr);
    histogram_insert_data_point(g_config.batch_q_process_hist, start);
}
// Helper to release transaction file handles.
void
as_release_file_handle(as_file_handle *proto_fd_h)
{
    int rc = cf_rc_release(proto_fd_h);

    if (rc > 0) {
        return;
    }
    else if (rc < 0) {
        cf_warning(AS_PROTO, "release file handle: negative ref-count %d", rc);
        return;
    }

    close(proto_fd_h->fd);
    proto_fd_h->fh_info &= ~FH_INFO_DONOT_REAP;
    proto_fd_h->fd = -1;

    if (proto_fd_h->proto) {
        as_proto *p = proto_fd_h->proto;

        if ((p->version != PROTO_VERSION) || (p->type >= PROTO_TYPE_MAX)) {
            cf_warning(AS_PROTO, "release file handle: bad proto buf, corruption");
        }
        else {
            cf_free(proto_fd_h->proto);
            proto_fd_h->proto = NULL;
        }
    }

    if (proto_fd_h->security_filter) {
        as_security_filter_destroy(proto_fd_h->security_filter);
        proto_fd_h->security_filter = NULL;
    }

    cf_rc_free(proto_fd_h);
    cf_atomic_int_incr(&g_config.proto_connections_closed);
}
static void*
generate_async_reads(void* aio_context)
{
    uint64_t count = 0;

    while (g_running) {
        /* Create the struct of info needed at the process_read end */
        uintptr_t info_ptr;

        if (cf_queue_pop(async_info_queue, (void*)&info_ptr, CF_QUEUE_NOWAIT) !=
                CF_QUEUE_OK) {
            fprintf(stdout, "Error: Could not pop info struct\n");
            return (void*)(-1);
        }

        as_async_info_t *info = (as_async_info_t*)info_ptr;
        memset(info, 0, sizeof(as_async_info_t));

        /* Generate the actual read request */
        uint32_t random_device_index = rand_32() % g_num_devices;
        device* p_random_device = &g_devices[random_device_index];
        readreq* p_readreq = &(info->p_readreq);

        if (p_readreq == NULL) {
            fprintf(stdout, "Error: p_readreq null\n");
            goto fail;
        }

        p_readreq->p_device = p_random_device;
        p_readreq->offset = random_read_offset(p_random_device);
        p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
        p_readreq->start_time = cf_getms();

        /* Async read */
        if (g_use_valloc) {
            uint8_t* p_buffer = cf_valloc(p_readreq->size);
            info->p_buffer = p_buffer;

            if (p_buffer) {
                uint64_t raw_start_time = cf_getms();
                info->raw_start_time = raw_start_time;

                if (read_async_from_device(info, *(aio_context_t *)aio_context) < 0) {
                    fprintf(stdout, "Error: Async read failed\n");
                    free(p_buffer);
                    goto fail;
                }
            }
            else {
                fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
            }
        }
        else {
            uint8_t stack_buffer[p_readreq->size + 4096];
            uint8_t* p_buffer = align_4096(stack_buffer);
            info->p_buffer = p_buffer;

            uint64_t raw_start_time = cf_getms();
            info->raw_start_time = raw_start_time;

            if (read_async_from_device(info, *(aio_context_t*)aio_context) < 0) {
                fprintf(stdout, "Error: Async read failed\n");
                goto fail;
            }
        }

        if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
            fprintf(stdout, "ERROR: too many read reqs queued\n");
            fprintf(stdout, "drive(s) can't keep up - test stopped\n");
            g_running = false;
            return (void*)-1;
        }

        count++;

        int sleep_ms = (int)
                (((count * 1000) / g_read_reqs_per_sec) -
                        (cf_getms() - g_run_start_ms));

        if (sleep_ms > 0) {
            usleep((uint32_t)sleep_ms * 1000);
        }

        continue;

        /* Rollback for failure */
    fail:
        if (info) {
            uintptr_t temp = (uintptr_t)info;
            cf_queue_push(async_info_queue, (void*)&temp);
        }
    }

    return 0;
}
// If there's an element with specified digest in the tree, return a locked
// and reserved reference to it in index_ref. If not, create an element with
// this digest, insert it into the tree, and return a locked and reserved
// reference to it in index_ref.
//
// Returns:
//   1 - created and inserted (reference returned in index_ref)
//   0 - found already existing (reference returned in index_ref)
//  -1 - error (index_ref untouched)
int
as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd,
        as_index_ref *index_ref)
{
    int cmp = 0;
    bool retry;

    // Save parents as we search for the specified element's insertion point.
    as_index_ele eles[64];
    as_index_ele *ele;

    do {
        ele = eles;

        pthread_mutex_lock(&tree->lock);

        // Search for the specified element, or a parent to insert it under.

        ele->parent = NULL; // we'll never look this far up
        ele->me_h = tree->root_h;
        ele->me = tree->root;

        cf_arenax_handle t_h = tree->root->left_h;
        as_index *t = RESOLVE_H(t_h);

        while (t_h != tree->sentinel_h) {
            ele++;
            ele->parent = ele - 1;
            ele->me_h = t_h;
            ele->me = t;

            if ((cmp = cf_digest_compare(keyd, &t->key)) == 0) {
                // The element already exists, simply return it.

                as_index_reserve(t);
                cf_atomic_int_incr(&g_config.global_record_ref_count);

                pthread_mutex_unlock(&tree->lock);

                if (! index_ref->skip_lock) {
                    olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
                    cf_atomic_int_incr(&g_config.global_record_lock_count);
                }

                index_ref->r = t;
                index_ref->r_h = t_h;

                return 0;
            }

            t_h = cmp > 0 ? t->left_h : t->right_h;
            t = RESOLVE_H(t_h);
        }

        // We didn't find the tree element, so we'll be inserting it.

        retry = false;

        if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
            // The tree is being reduced - could take long, unlock so reads and
            // overwrites aren't blocked.
            pthread_mutex_unlock(&tree->lock);

            // Wait until the tree reduce is done...
            pthread_mutex_lock(&tree->reduce_lock);
            pthread_mutex_unlock(&tree->reduce_lock);

            // ... and start over - we unlocked, so the tree may have changed.
            retry = true;
        }
    } while (retry);

    // Create a new element and insert it.

    // Make the new element.
    cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);

    if (n_h == 0) {
        cf_warning(AS_INDEX, "arenax alloc failed");
        pthread_mutex_unlock(&tree->reduce_lock);
        pthread_mutex_unlock(&tree->lock);
        return -1;
    }

    as_index *n = RESOLVE_H(n_h);

    n->rc = 2; // one for create (eventually balanced by delete), one for caller
    cf_atomic_int_add(&g_config.global_record_ref_count, 2);

    n->key = *keyd;
    n->left_h = n->right_h = tree->sentinel_h; // n starts as a leaf element
    n->color = AS_RED; // n's color starts as red

    // Make sure we can detect that the record isn't initialized.
    as_index_clear_record_info(n);

    // Insert the new element n under parent ele.
    if (ele->me == tree->root || 0 < cmp) {
        ele->me->left_h = n_h;
    }
    else {
        ele->me->right_h = n_h;
    }

    ele++;
    ele->parent = ele - 1;
    ele->me_h = n_h;
    ele->me = n;

    // Rebalance the tree as needed.
    as_index_insert_rebalance(tree, ele);

    tree->elements++;

    pthread_mutex_unlock(&tree->reduce_lock);
    pthread_mutex_unlock(&tree->lock);

    if (! index_ref->skip_lock) {
        olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
        cf_atomic_int_incr(&g_config.global_record_lock_count);
    }

    index_ref->r = n;
    index_ref->r_h = n_h;

    return 1;
}
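/*
 * Illustrative caller sketch (hypothetical, not in the original source):
 * distinguishes the three return codes of as_index_get_insert_vlock() above.
 * Assumes as_record_done() is the standard release for the reference and
 * olock taken in both the "found" and "created" cases.
 */
static int
example_get_or_create(as_index_tree *tree, as_namespace *ns, cf_digest *keyd)
{
    as_index_ref r_ref;
    r_ref.skip_lock = false;

    int rv = as_index_get_insert_vlock(tree, keyd, &r_ref);

    if (rv == -1) {
        return -1; // arena allocation failed - nothing to release
    }

    if (rv == 1) {
        // Created - record info is cleared, caller must initialize it.
    }
    // rv == 0 - found existing, already initialized.

    as_record_done(&r_ref, ns); // drop reference and olock in both cases

    return 0;
}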
// Keep track of the connections, since they're precious. Kill anything that
// hasn't been used in a while. The file handle array keeps a reference count,
// and allows a reaper to run through and find the ones to reap. The table is
// only written by the demarshal threads, and only read by the reaper thread.
void *
thr_demarshal_reaper_fn(void *arg)
{
    uint64_t last = cf_getms();

    while (true) {
        uint64_t now = cf_getms();
        uint inuse_cnt = 0;
        uint64_t kill_ms = g_config.proto_fd_idle_ms;
        bool refresh = false;

        if (now - last > (uint64_t)(g_config.sec_cfg.privilege_refresh_period * 1000)) {
            refresh = true;
            last = now;
        }

        pthread_mutex_lock(&g_file_handle_a_LOCK);

        for (int i = 0; i < g_file_handle_a_sz; i++) {
            if (g_file_handle_a[i]) {
                as_file_handle *fd_h = g_file_handle_a[i];

                if (refresh) {
                    as_security_refresh(fd_h);
                }

                // Reap, if asked to.
                if (fd_h->reap_me) {
                    cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", fd_h->fd);
                    g_file_handle_a[i] = 0;
                    cf_queue_push(g_freeslot, &i);
                    as_release_file_handle(fd_h);
                    fd_h = 0;
                }
                // Reap if past kill time.
                else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) {
                    if (fd_h->fh_info & FH_INFO_DONOT_REAP) {
                        cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", fd_h->fd);
                        inuse_cnt++;
                        continue;
                    }

                    shutdown(fd_h->fd, SHUT_RDWR); // will trigger epoll errors
                    cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", fd_h->fd);
                    g_file_handle_a[i] = 0;
                    cf_queue_push(g_freeslot, &i);
                    as_release_file_handle(fd_h);
                    fd_h = 0;
                    cf_atomic_int_incr(&g_config.reaper_count);
                }
                else {
                    inuse_cnt++;
                }
            }
        }

        pthread_mutex_unlock(&g_file_handle_a_LOCK);

        if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) {
            cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse",
                    g_file_handle_a_sz, inuse_cnt);
        }

        // Validate the system statistics.
        if (g_config.proto_connections_opened - g_config.proto_connections_closed != inuse_cnt) {
            cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %d in stats vs %d calculated",
                    g_config.proto_connections_opened - g_config.proto_connections_closed,
                    inuse_cnt);
        }

        sleep(1);
    }

    return NULL;
}
// Make a callback for a specified number of elements in the tree, from outside
// the tree lock.
void
as_index_reduce_partial(as_index_tree *tree, uint32_t sample_count,
        as_index_reduce_fn cb, void *udata)
{
    pthread_mutex_lock(&tree->reduce_lock);

    // For full reduce, get the number of elements inside the tree lock.
    if (sample_count == AS_REDUCE_ALL) {
        sample_count = tree->elements;
    }

    if (sample_count == 0) {
        pthread_mutex_unlock(&tree->reduce_lock);
        return;
    }

    size_t sz = sizeof(as_index_ph_array) + (sizeof(as_index_ph) * sample_count);
    as_index_ph_array *v_a;
    uint8_t buf[64 * 1024];

    if (sz > 64 * 1024) {
        v_a = cf_malloc(sz);

        if (! v_a) {
            pthread_mutex_unlock(&tree->reduce_lock);
            return;
        }
    }
    else {
        v_a = (as_index_ph_array*)buf;
    }

    v_a->alloc_sz = sample_count;
    v_a->pos = 0;

    uint64_t start_ms = cf_getms();

    // Recursively fetch all the value pointers into this array, so we can
    // make all the callbacks outside the big lock.
    if (tree->root->left_h != tree->sentinel_h) {
        as_index_reduce_traverse(tree, tree->root->left_h, tree->sentinel_h, v_a);
    }

    cf_debug(AS_INDEX, "as_index_reduce_traverse took %"PRIu64" ms", cf_getms() - start_ms);

    pthread_mutex_unlock(&tree->reduce_lock);

    for (uint32_t i = 0; i < v_a->pos; i++) {
        as_index_ref r_ref;

        r_ref.skip_lock = false;
        r_ref.r = v_a->indexes[i].r;
        r_ref.r_h = v_a->indexes[i].r_h;

        olock_vlock(g_config.record_locks, &r_ref.r->key, &r_ref.olock);
        cf_atomic_int_incr(&g_config.global_record_lock_count);

        // Callback MUST call as_record_done() to unlock and release record.
        cb(&r_ref, udata);
    }

    if (v_a != (as_index_ph_array*)buf) {
        cf_free(v_a);
    }
}
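/*
 * Minimal sketch of a reduce callback (hypothetical): the signature matches
 * the cb(&r_ref, udata) call above, and honors the requirement that the
 * callback call as_record_done() to unlock and release each record. Passing
 * the namespace via udata is an assumption for illustration.
 */
static void
example_reduce_cb(as_index_ref *r_ref, void *udata)
{
    as_namespace *ns = (as_namespace *)udata;

    // ... inspect r_ref->r while it's reserved and locked ...

    as_record_done(r_ref, ns);
}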
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
    as_partition_id pid = as_partition_getid(wr->keyd);

    if (dst == 0) {
        cf_crash(AS_PROXY, "the destination should never be zero");
    }

    // Create a fabric message, fill it out.
    msg *m = as_fabric_msg_get(M_TYPE_PROXY);

    if (! m) {
        return -1;
    }

    uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

    msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
    msg_set_uint32(m, PROXY_FIELD_TID, tid);
    msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
    msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp,
            as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
    msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
    msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);

    wr->msgp = 0;

    // Mark it as a shipped op.
    uint32_t info = 0;
    info |= PROXY_INFO_SHIPPED_OP;
    msg_set_uint32(m, PROXY_FIELD_INFO, info);

    cf_detail_digest(AS_PROXY, &wr->keyd,
            "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
            wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

    // Fill out a retransmit structure, insert into the retransmit hash.
    msg_incr_ref(m);

    proxy_request pr;

    pr.start_time = wr->start_time;
    pr.end_time = (wr->end_time != 0) ?
            wr->end_time : pr.start_time + g_config.transaction_max_ns;

    cf_rc_reserve(wr);
    pr.wr = wr;

    pr.fab_msg = m;
    pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
    pr.retry_interval_ms = g_config.transaction_retry_ms;
    pr.dest = dst;
    pr.pid = pid;
    pr.fd_h = NULL;
    pr.batch_shared = NULL;
    pr.batch_index = 0;

    if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
        cf_info(AS_PROXY, " shash_put failed, need cleanup code");
        return -1;
    }

    // Send to the remote node.
    int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);

    if (rv != 0) {
        cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d",
                *(uint64_t *)&wr->keyd, rv);
        as_fabric_msg_put(m);
    }

    wr->shipped_op_initiator = true;
    cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

    return 0;
}
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
    int rv;

    if (cf_rc_count((void*)m) == 0) {
        cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naughty %p", m);
        return -1;
    }

    uint32_t op = 99999;
    msg_get_uint32(m, PROXY_FIELD_OP, &op);
    uint32_t transaction_id = 0;
    msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

    cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

    switch (op) {
        case PROXY_OP_REQUEST:
        {
            cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
            cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
            msg_dump(m, "incoming proxy msg");
#endif
#endif

            cf_digest *key;
            size_t sz = 0;

            if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
                cf_info(AS_PROXY, "proxy msg function: no digest, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            cl_msg *msgp;
            size_t as_msg_sz = 0;

            if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
                cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            uint64_t cluster_key = 0;

            if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
                cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
                as_fabric_msg_put(m);
                return 0;
            }

            // This is allowed to fail - this is a new field, and gets defaulted
            // to 0 if it doesn't exist.
            uint32_t timeout_ms = 0;
            msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
            // cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d", timeout_ms);

            // Put the as_msg on the normal queue for processing.
            // INIT_TR
            as_transaction tr;
            as_transaction_init(&tr, key, msgp);

            tr.incoming_cluster_key = cluster_key;
            tr.end_time = (timeout_ms != 0) ?
                    ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
            tr.proxy_node = id;
            tr.proxy_msg = m;

            // Check here if this is a shipped op.
            uint32_t info = 0;
            msg_get_uint32(m, PROXY_FIELD_INFO, &info);

            if (info & PROXY_INFO_SHIPPED_OP) {
                tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
                cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
            }
            else {
                cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
            }

            MICROBENCHMARK_RESET();

            thr_tsvc_enqueue(&tr);
        }
        break;

        case PROXY_OP_RESPONSE:
        {
#ifdef DEBUG
            // Got the response from the actual endpoint.
            cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
            msg_dump(m, "incoming proxy response");
#endif
#endif

            // Look up the element.
            proxy_request pr;
            bool free_msg = true;

            if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
                // Found the element (sometimes we get two acks so it's OK for
                // an ack to not find the transaction).

                if (pr.wr) {
                    as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
                }
                else {
                    as_proto *proto;
                    size_t proto_sz;

                    if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
                        cf_info(AS_PROXY, "msg get buf failed!");
                    }

#ifdef DEBUG_VERBOSE
                    cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);

                    for (size_t _i = 0; _i < proto_sz; _i++) {
                        fprintf(stderr, " %x", ((byte *)proto)[_i]);
                        if (_i % 16 == 15) {
                            fprintf(stderr, "\n");
                        }
                    }
#endif

#ifdef EXTRA_CHECKS
                    as_proto proto_copy = *proto;
                    as_proto_swap(&proto_copy);

                    if (proto_copy.sz + 8 != proto_sz) {
                        cf_info(AS_PROXY, "BONE BONE BONE!!!");
                        cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
                    }
#endif

                    // Write to the file descriptor.
                    cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
                    cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

                    if (pr.batch_shared) {
                        cf_digest* digest;
                        size_t digest_sz = 0;

                        if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
                            as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
                            as_proxy_set_stat_counters(0);
                        }
                        else {
                            cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
                            as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
                            as_proxy_set_stat_counters(-1);
                        }

                        cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
                    }
                    else {
                        size_t pos = 0;

                        while (pos < proto_sz) {
                            rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

                            if (rv > 0) {
                                pos += rv;
                            }
                            else if (rv < 0) {
                                if (errno != EWOULDBLOCK) {
                                    // Common message when a client aborts.
                                    cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d",
                                            pr.fd_h->fd, proto_sz, pos, rv, errno);
                                    shutdown(pr.fd_h->fd, SHUT_RDWR);
                                    as_proxy_set_stat_counters(-1);
                                    goto SendFin;
                                }
                                usleep(1); // yield
                            }
                            else {
                                cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d",
                                        pr.fd_h->fd, proto_sz, pos);
                                shutdown(pr.fd_h->fd, SHUT_RDWR);
                                as_proxy_set_stat_counters(-1);
                                goto SendFin;
                            }
                        }

                        as_proxy_set_stat_counters(0);

SendFin:
                        cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

                        // Return the fabric message or the direct file
                        // descriptor - after write and complete.
                        pr.fd_h->t_inprogress = false;
                        AS_RELEASE_FILE_HANDLE(pr.fd_h);
                        pr.fd_h = 0;
                    }

                    as_fabric_msg_put(pr.fab_msg);
                    pr.fab_msg = 0;
                }
            }
            else {
                cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
                as_proxy_set_stat_counters(-1);
            }

            if (free_msg) {
                as_fabric_msg_put(m);
            }
        }
        break;

        case PROXY_OP_REDIRECT:
        {
            // Sometimes the destination we proxied a request to isn't able to
            // satisfy it (for example, their copy of the partition in question
            // might be desync).

            cf_node new_dst = 0;
            msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
            cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

            // Look in the proxy retransmit hash for the tid.
            proxy_request *pr;
            pthread_mutex_t *pr_lock;
            int r = 0;

            if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
                cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
                as_fabric_msg_put(m);
                return -1;
            }

            if (g_config.self_node == new_dst) {
                // Although we don't know we're the final destination, undo the
                // proxy-nature and put back on the main queue. Dangerous, as it
                // leaves open the possibility of a looping message.

                cf_digest *key;
                size_t sz = 0;

                if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
                    cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
                    pthread_mutex_unlock(pr_lock);
                    as_fabric_msg_put(m);
                    return -1;
                }

                cl_msg *msgp;
                sz = 0;

                if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
                    cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
                    pthread_mutex_unlock(pr_lock);
                    as_fabric_msg_put(m);
                    return -1;
                }

                // Put the as_msg on the normal queue for processing.
                // INIT_TR
                as_transaction tr;
                as_transaction_init(&tr, key, msgp);

                tr.start_time = pr->start_time;
                tr.end_time = pr->end_time;
                tr.proto_fd_h = pr->fd_h;
                tr.batch_shared = pr->batch_shared;
                tr.batch_index = pr->batch_index;

                MICROBENCHMARK_RESET();

                thr_tsvc_enqueue(&tr);

                as_fabric_msg_put(pr->fab_msg);
                shash_delete_lockfree(g_proxy_hash, &transaction_id);
            }
            else {
                // Change the destination, update the retransmit time.
                pr->dest = new_dst;
                pr->xmit_ms = cf_getms() + 1;

                // Send it.
                msg_incr_ref(pr->fab_msg);

                if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
                    cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
                    as_fabric_msg_put(pr->fab_msg);
                }
            }

            pthread_mutex_unlock(pr_lock);
        }

        as_fabric_msg_put(m);
        break;

        default:
            cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
            msg_dump(m, "proxy received unknown msg");
            as_fabric_msg_put(m);
            break;
    } // end switch

    return 0;
} // end proxy_msg_fn()
// Set of threads which talk to clients over their connections, doing the
// needed processing. Note that once a fd is assigned to a thread, all work on
// that fd is done by that thread. Fair fd usage is expected of the client.
// The first thread is special - it also does accept [listens for new
// connections]. It is the only thread which does so.
void *
thr_demarshal(void *arg)
{
    cf_socket_cfg *s, *ls;
    // Create my epoll fd, register in the global list.
    struct epoll_event ev;
    int nevents, i, n, epoll_fd;
    cf_clock last_fd_print = 0;

#if defined(USE_SYSTEMTAP)
    uint64_t nodeid = g_config.self_node;
#endif

    // Early stage aborts; these will cause faults in process scope.
    cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument");
    s = &g_config.socket;
    ls = &g_config.localhost_socket;

#ifdef USE_JEM
    int orig_arena;

    if (0 > (orig_arena = jem_get_arena())) {
        cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!");
    }
    else {
        cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena);
    }
#endif

    // Figure out my thread index.
    pthread_t self = pthread_self();
    int thr_id;

    for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
        if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) {
            break;
        }
    }

    if (thr_id == MAX_DEMARSHAL_THREADS) {
        cf_debug(AS_FABRIC, "Demarshal thread could not figure out its own ID - exiting!");
        return 0;
    }

    // First thread accepts new connections at the interface socket.
    if (thr_id == 0) {
        demarshal_file_handle_init();
        epoll_fd = epoll_create(EPOLL_SZ);

        if (epoll_fd == -1) {
            cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
        }

        memset(&ev, 0, sizeof(ev));
        ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
        ev.data.fd = s->sock;

        if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev)) {
            cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
        }

        cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port);

        if (ls->sock) {
            ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
            ev.data.fd = ls->sock;

            if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev)) {
                cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
            }

            cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port);
        }
    }
    else {
        epoll_fd = epoll_create(EPOLL_SZ);

        if (epoll_fd == -1) {
            cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
        }
    }

    g_demarshal_args->epoll_fd[thr_id] = epoll_fd;
    cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);

    int id_cntr = 0;

    // Demarshal transactions from the socket.
    for ( ; ; ) {
        struct epoll_event events[EPOLL_SZ];

        cf_detail(AS_DEMARSHAL, "calling epoll");

        nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1);

        if (0 > nevents) {
            cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno));
        }

        cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);

        uint64_t now_ns = cf_getns();
        uint64_t now_ms = now_ns / 1000000;

        // Iterate over all events.
        for (i = 0; i < nevents; i++) {
            if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) {
                // Accept new connections on the service socket.
                int csocket = -1;
                struct sockaddr_in caddr;
                socklen_t clen = sizeof(caddr);
                char cpaddr[64];

                if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) {
                    // This means we're out of file descriptors - could be a SYN
                    // flood attack or misbehaving client. Eventually we'd like
                    // to make the reaper fairer, but for now we'll just have to
                    // ignore the accept error and move on.
                    if ((errno == EMFILE) || (errno == ENFILE)) {
                        if (last_fd_print != (cf_getms() / 1000L)) {
                            cf_info(AS_DEMARSHAL, " warning: hit OS file descriptor limit (EMFILE on accept), consider raising limit");
                            last_fd_print = cf_getms() / 1000L;
                        }
                        continue;
                    }
                    cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno);
                }

                // Get the client IP address in string form.
                if (caddr.sin_family == AF_INET) {
                    if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) {
                        cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
                    }
                }
                else if (caddr.sin_family == AF_INET6) {
                    struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr;

                    if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) {
                        cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
                    }
                }
                else {
                    cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family);
                }

                cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket);

                // Validate the limit of protocol connections we allow.
                uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed;

                if (conns_open > g_config.n_proto_fd_max) {
                    if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs
                        cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open);
                        last_fd_print = cf_getms();
                    }
                    shutdown(csocket, SHUT_RDWR);
                    close(csocket);
                    csocket = -1;
                    continue;
                }

                // Set the socket to nonblocking.
                if (-1 == cf_socket_set_nonblocking(csocket)) {
                    cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode");
                    shutdown(csocket, SHUT_RDWR);
                    close(csocket);
                    csocket = -1;
                    continue;
                }

                // Create as_file_handle and queue it up in epoll_fd for further
                // communication on one of the demarshal threads.
                as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle));

                if (! fd_h) {
                    cf_crash(AS_DEMARSHAL, "malloc");
                }

                sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port));
                fd_h->fd = csocket;
                fd_h->last_used = cf_getms();
                fd_h->reap_me = false;
                fd_h->trans_active = false;
                fd_h->proto = 0;
                fd_h->proto_unread = 0;
                fd_h->fh_info = 0;
                fd_h->security_filter = as_security_filter_create();

                // Insert into the global table so the reaper can manage it. Do
                // this before queueing it up for demarshal threads - once
                // EPOLL_CTL_ADD is done it's difficult to back out (if insert
                // into global table fails) because fd state could be anything.
                cf_rc_reserve(fd_h);

                pthread_mutex_lock(&g_file_handle_a_LOCK);

                int j;
                bool inserted = true;

                if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) {
                    inserted = false;
                }
                else {
                    g_file_handle_a[j] = fd_h;
                }

                pthread_mutex_unlock(&g_file_handle_a_LOCK);

                if (! inserted) {
                    cf_info(AS_DEMARSHAL, "unable to add socket to file handle table");
                    shutdown(csocket, SHUT_RDWR);
                    close(csocket);
                    csocket = -1;
                    cf_rc_free(fd_h); // will free even with ref-count of 2
                }
                else {
                    // Place the client socket in the event queue.
                    memset(&ev, 0, sizeof(ev));
                    ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
                    ev.data.ptr = fd_h;

                    // Round-robin pick up demarshal thread epoll_fd and add
                    // this new connection to epoll.
                    int id;

                    while (true) {
                        id = (id_cntr++) % g_demarshal_args->num_threads;

                        if (g_demarshal_args->epoll_fd[id] != 0) {
                            break;
                        }
                    }

                    fd_h->epoll_fd = g_demarshal_args->epoll_fd[id];

                    if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) {
                        cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads);
                        pthread_mutex_lock(&g_file_handle_a_LOCK);
                        fd_h->reap_me = true;
                        as_release_file_handle(fd_h);
                        fd_h = 0;
                        pthread_mutex_unlock(&g_file_handle_a_LOCK);
                    }
                    else {
                        cf_atomic_int_incr(&g_config.proto_connections_opened);
                    }
                }
            }
            else {
                bool has_extra_ref = false;
                as_file_handle *fd_h = events[i].data.ptr;

                if (fd_h == 0) {
                    cf_info(AS_DEMARSHAL, "event with null handle, continuing");
                    goto NextEvent;
                }

                cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events);

                // Process data on an existing connection: this might be more
                // activity on an already existing transaction, so we have some
                // state to manage.
                as_proto *proto_p = 0;
                int fd = fd_h->fd;

                if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
                    cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events);
                    // No longer in use: out of epoll etc.
                    goto NextEvent_FD_Cleanup;
                }

                if (fd_h->trans_active) {
                    goto NextEvent;
                }

                // If pointer is NULL, then we need to create a transaction and
                // store it in the buffer.
                if (fd_h->proto == NULL) {
                    as_proto proto;
                    int sz;

                    // Get the number of available bytes.
                    if (-1 == ioctl(fd, FIONREAD, &sz)) {
                        cf_info(AS_DEMARSHAL, "unable to get number of available bytes");
                        goto NextEvent_FD_Cleanup;
                    }

                    // If we don't have enough data to fill the message buffer,
                    // just wait and we'll come back to this one. However, we'll
                    // let messages with zero size through, since they are
                    // likely errors. We don't cleanup the FD in this case since
                    // we'll get more data on it.
                    if (sz < sizeof(as_proto) && sz != 0) {
                        goto NextEvent;
                    }

                    // Do a preliminary read of the header into a stack-
                    // allocated structure, so that later on we can allocate the
                    // entire message buffer.
                    if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) {
                        cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno);
                        goto NextEvent_FD_Cleanup;
                    }

                    if (proto.version != PROTO_VERSION &&
                            // For backward compatibility, allow version 0 with
                            // security messages.
                            ! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) {
                        cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", fd_h->client, proto.version);
                        goto NextEvent_FD_Cleanup;
                    }

                    // Swap the necessary elements of the as_proto.
                    as_proto_swap(&proto);

                    if (proto.sz > PROTO_SIZE_MAX) {
                        cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64,
                                fd_h->client, PROTO_SIZE_MAX, proto.sz);
                        goto NextEvent_FD_Cleanup;
                    }

#ifdef USE_JEM
                    // Attempt to peek the namespace and set the JEMalloc arena accordingly.
                    size_t peeked_data_sz = 0;
                    size_t min_field_sz = sizeof(uint32_t) + sizeof(char);
                    size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz;
                    size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.)
                    uint8_t peekbuf[peekbuf_sz];

                    if (PROTO_TYPE_AS_MSG == proto.type) {
                        size_t offset = sizeof(as_msg);
                        // Number of bytes to peek from the socket.
//                      size_t peek_sz = peekbuf_sz; // Peek up to the size of the peek buffer.
                        size_t peek_sz = MIN(proto.sz, peekbuf_sz); // Peek only up to the minimum necessary number of bytes.

                        if (! (peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) {
                            // That's actually legitimate. The as_proto may have
                            // gone into one packet, the as_msg into the next
                            // one, which we haven't yet received. This just
                            // "never happened" without async.
                            cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz);
                        }

                        if (peeked_data_sz > min_as_msg_sz) {
//                          cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz);
                            if (peeked_data_sz > proto.sz) {
                                cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd);
                                log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz);
                                goto NextEvent_FD_Cleanup;
                            }

                            if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) {
                                jem_set_arena(orig_arena);
                            }
                            else {
                                uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0;
                                bool found = false;
//                              cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields);
                                while (! found && (field_num < n_fields)) {
                                    as_msg_field *field = (as_msg_field *) (&peekbuf[offset]);
                                    uint32_t value_sz = ntohl(field->field_sz) - 1;
//                                  cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset);
//                                  cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz);
//                                  cf_debug(AS_DEMARSHAL, "\ttype %d", field->type);
                                    if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) {
                                        if (value_sz >= AS_ID_NAMESPACE_SZ) {
                                            cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz);
                                            goto NextEvent_FD_Cleanup;
                                        }

                                        char ns[AS_ID_NAMESPACE_SZ];
                                        found = true;
                                        memcpy(ns, field->data, value_sz);
                                        ns[value_sz] = '\0';
//                                      cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num);
                                        jem_set_arena(as_namespace_get_jem_arena(ns));
                                    }
                                    else {
//                                      cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type);
                                        field_num++;
                                        offset += sizeof(as_msg_field) + value_sz;

                                        if (offset >= peeked_data_sz) {
                                            break;
                                        }
                                    }
                                }

                                if (! found) {
                                    cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz);
                                    jem_set_arena(orig_arena);
                                }
                            }
                        }
                        else {
                            jem_set_arena(orig_arena);
                        }
                    }
                    else {
                        jem_set_arena(orig_arena);
                    }
#endif

                    // Allocate the complete message buffer.
                    proto_p = cf_malloc(sizeof(as_proto) + proto.sz);
                    cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno));
                    memcpy(proto_p, &proto, sizeof(as_proto));

#ifdef USE_JEM
                    // Jam in the peeked data.
                    if (peeked_data_sz) {
                        memcpy(proto_p->data, &peekbuf, peeked_data_sz);
                    }

                    fd_h->proto_unread = proto_p->sz - peeked_data_sz;
#else
                    fd_h->proto_unread = proto_p->sz;
#endif
                    fd_h->proto = (void *) proto_p;
                }
                else {
                    proto_p = fd_h->proto;
                }

                if (fd_h->proto_unread > 0) {
                    // Read the data.
                    n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0);

                    if (0 >= n) {
                        if (errno == EAGAIN) {
                            continue;
                        }
                        cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno));
                        goto NextEvent_FD_Cleanup;
                    }

                    // Decrement bytes-unread counter.
                    cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread);
                    fd_h->proto_unread -= n;
                }

                // Check for a finished read.
                if (0 == fd_h->proto_unread) {
                    // It's only really live if it's injecting a transaction.
                    fd_h->last_used = now_ms;

                    thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress

                    fd_h->proto = 0;
                    fd_h->proto_unread = 0;

                    // INIT_TR
                    as_transaction tr;
                    as_transaction_init(&tr, NULL, (cl_msg *)proto_p);

                    cf_rc_reserve(fd_h);
                    has_extra_ref = true;
                    tr.proto_fd_h = fd_h;
                    tr.start_time = now_ns; // set transaction start time
                    tr.preprocessed = false;

                    if (! as_proto_is_valid_type(proto_p)) {
                        cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type);
                        // We got a proto message type we don't recognize, so it
                        // may not do any good to send back an as_msg error, but
                        // it's the best we can do. At least we can keep the fd.
                        as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
                        cf_atomic_int_incr(&g_config.proto_transactions);
                        goto NextEvent;
                    }

                    if (g_config.microbenchmarks) {
                        histogram_insert_data_point(g_config.demarshal_hist, now_ns);
                        tr.microbenchmark_time = cf_getns();
                    }

                    // Check if it's compressed.
                    if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) {
                        // Decompress it - allocate buffer to hold decompressed
                        // packet.
                        uint8_t *decompressed_buf = NULL;
                        size_t decompressed_buf_size = 0;
                        int rv = 0;

                        if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) {
                            cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv);
                            cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p");
                            as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
                            cf_atomic_int_incr(&g_config.proto_transactions);
                            goto NextEvent;
                        }

                        // Count the packets.
                        cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1);

                        // Free the compressed packet since we'll be using the
                        // decompressed packet from now on.
                        cf_free(proto_p);
                        proto_p = NULL;

                        // Get original packet.
                        tr.msgp = (cl_msg *)decompressed_buf;
                        as_proto_swap(&(tr.msgp->proto));

                        if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) {
                            cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]",
                                    tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size);
                            as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
                            cf_atomic_int_incr(&g_config.proto_transactions);
                            goto NextEvent;
                        }
                    }

                    // Security protocol transactions.
                    if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
                        as_security_transact(&tr);
                        cf_atomic_int_incr(&g_config.proto_transactions);
                        goto NextEvent;
                    }

                    // Info protocol requests.
                    if (tr.msgp->proto.type == PROTO_TYPE_INFO) {
                        if (as_info(&tr)) {
                            cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer");
                            goto NextEvent_FD_Cleanup;
                        }

                        cf_atomic_int_incr(&g_config.proto_transactions);
                        goto NextEvent;
                    }

                    ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp);

                    // Fast path for batch requests.
                    if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
                        as_batch_queue_task(&tr);
                        cf_atomic_int_incr(&g_config.proto_transactions);
                        goto NextEvent;
                    }

                    // Either process the transaction directly in this thread,
                    // or queue it for processing by another thread (tsvc/info).
                    if (0 != thr_tsvc_process_or_enqueue(&tr)) {
                        cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread");
                        goto NextEvent_FD_Cleanup;
                    }
                    else {
                        cf_atomic_int_incr(&g_config.proto_transactions);
                    }
                }

                // Jump the proto message free & FD cleanup. If we get here, the
                // above operations went smoothly. The message free & FD cleanup
                // job is handled elsewhere as directed by
                // thr_tsvc_process_or_enqueue().
                goto NextEvent;

NextEvent_FD_Cleanup:
                // If we allocated memory for the incoming message, free it.
                if (proto_p) {
                    cf_free(proto_p);
                    fd_h->proto = 0;
                }

                // If fd has extra reference for transaction, release it.
                if (has_extra_ref) {
                    cf_rc_release(fd_h);
                }

                // Remove the fd from the events list.
                if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) {
                    cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)",
                            fd, epoll_fd, errno, cf_strerror(errno));
                }

                pthread_mutex_lock(&g_file_handle_a_LOCK);
                fd_h->reap_me = true;
                as_release_file_handle(fd_h);
                fd_h = 0;
                pthread_mutex_unlock(&g_file_handle_a_LOCK);

NextEvent:
                ;
            }

            // We should never be canceled externally, but just in case...
            pthread_testcancel();
        }
    }

    return NULL;
}
// Build response to batch request.
static void
batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r)
{
    as_namespace* ns = btr->ns;
    batch_digests *bmds = btr->digests;
    bool get_data = btr->get_data;
    uint32_t yield_count = 0;

    for (int i = 0; i < bmds->n_digests; i++) {
        batch_digest *bmd = &bmds->digest[i];

        if (bmd->done == false) {
            // Try to get the key.
            as_partition_reservation rsv;
            AS_PARTITION_RESERVATION_INIT(rsv);
            cf_node other_node = 0;
            uint64_t cluster_key;

            if (! *bb_r) {
                *bb_r = cf_buf_builder_create_size(1024 * 4);
            }

            int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key);

            if (rv == 0) {
                cf_atomic_int_incr(&g_config.batch_tree_count);

                as_index_ref r_ref;
                r_ref.skip_lock = false;
                int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns);

                if (rec_rv == 0) {
                    as_index *r = r_ref.r;

                    // Check to see this isn't an expired record waiting to die.
                    if (r->void_time && r->void_time < as_record_void_time_get()) {
                        as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
                    }
                    else {
                        // Make sure it's brought in from storage if necessary.
                        as_storage_rd rd;

                        if (get_data) {
                            as_storage_record_open(ns, r, &rd, &r->key);
                            rd.n_bins = as_bin_get_n_bins(r, &rd);
                        }

                        // Note: this array must stay in scope until the
                        // response for this record has been built, since in the
                        // get data w/ record on device case, it's copied by
                        // reference directly into the record descriptor.
                        as_bin stack_bins[!get_data || rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

                        if (get_data) {
                            // Figure out which bins you want - for now, all.
                            rd.bins = as_bin_get_all(r, &rd, stack_bins);
                            rd.n_bins = as_bin_inuse_count(&rd);
                        }

                        as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist);

                        if (get_data) {
                            as_storage_record_close(r, &rd);
                        }
                    }

                    as_record_done(&r_ref, ns);
                }
                else {
                    // TODO - what about empty records?
                    cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd);
                    as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
                }

                bmd->done = true;

                as_partition_release(&rsv);
                cf_atomic_int_decr(&g_config.batch_tree_count);
            }
            else {
                cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);

                as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);

                if (other_node != 0) {
                    bmd->node = other_node;
                    cf_debug(AS_BATCH, "other_node is: %p.", other_node);
                }
                else {
                    cf_debug(AS_BATCH, "other_node is NULL.");
                }
            }

            yield_count++;

            if (yield_count % g_config.batch_priority == 0) {
                usleep(1);
            }
        }
    }
}
static void*
async_receiver_fn(void *thdata)
{
    int rv = -1;
    bool network_error = false;
    cl_async_work *workitem = NULL;
    // cl_async_work *tmpworkitem = NULL;
    as_msg msg;
    cf_queue *q_to_use = NULL;
    cl_cluster_node *thisnode = NULL;

    uint8_t rd_stack_buf[STACK_BUF_SZ];
    uint8_t *rd_buf = rd_stack_buf;
    size_t rd_buf_sz = 0;

    uint64_t acktrid;
    // uint64_t starttime, endtime;
    int progress_timeout_ms;
    unsigned int thread_id = cf_atomic32_incr(&g_thread_count);

    if (thdata == NULL) {
        q_to_use = g_cl_async_q;
    }
    else {
        thisnode = (cl_cluster_node *)thdata;
        q_to_use = thisnode->asyncwork_q;
    }

    // Infinite loop which keeps picking work items from the list and tries to
    // find the end result.
    while (1) {
        network_error = false;

#if ONEASYNCFD
        if (thisnode->dunned == true) {
            do {
                rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);

                if (rv == CF_QUEUE_OK) {
                    cl_cluster_node_put(thisnode);
                    free(workitem);
                }
            } while (rv == CF_QUEUE_OK);

            // We want to delete all the workitems of this node.
            shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
            break;
        }
#endif

        // This call will block if there is no element in the queue.
        cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
        // TODO: What if the node gets dunned while this pop call is blocked?

#if ONEASYNCFD
        // cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
        //     cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

        // If we have no progress in 50ms, we should move to the next workitem
        // and revisit this workitem at a later stage.
        progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

        // Read the short header into this cl_msg.
        rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg),
                workitem->deadline, progress_timeout_ms);

        if (rv) {
#if DEBUG
            cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
#endif
            if (rv != ETIMEDOUT) {
                cf_error("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
                network_error = true;
                goto Error;
            }
            else {
                goto Retry;
            }
        }

#ifdef DEBUG_VERBOSE
        dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif

        cl_proto_swap(&msg.proto);
        cl_msg_swap_header(&msg.m);

        // Second read for the remainder of the message.
        rd_buf_sz = msg.proto.sz - msg.m.header_sz;

        if (rd_buf_sz > 0) {
            if (rd_buf_sz > sizeof(rd_stack_buf)) {
                rd_buf = malloc(rd_buf_sz);

                if (! rd_buf) {
                    cf_error("malloc fail: trying %zu", rd_buf_sz);
                    rv = -1;
                    goto Error;
                }
            }

            rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms);

            if (rv) {
                // We already read some part of the message but failed to read
                // the remaining data, for whatever reason (network error or
                // timeout). We cannot re-read as we already read partial data.
                // Declare this as an error.
                cf_error("Timeout after reading the header but before reading the body");
                goto Error;
            }

#ifdef DEBUG_VERBOSE
            dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif
        }

        rv = CITRUSLEAF_OK;
        goto Ok;

Retry:
        // We are trying to postpone the reading.
        if (workitem->deadline && workitem->deadline < cf_getms()) {
            cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
                    workitem->deadline, cf_getms());
            // cf_error("async receiver: Workitem missed the final deadline");
            rv = CITRUSLEAF_FAIL_TIMEOUT;
            goto Error;
        }
        else {
            // We have time. Push the element back on the queue to be
            // considered later.
            cf_queue_push(q_to_use, &workitem);
        }

        // If we allocated memory in this loop, release it.
        if (rd_buf && (rd_buf != rd_stack_buf)) {
            free(rd_buf);
            rd_buf = rd_stack_buf; // reset for the next iteration
        }

        cf_atomic_int_incr(&g_async_stats.retries);

        continue;

Error:
        if (network_error == true) {
            /*
             * In case of async work (for XDS), it may be extreme to dun a
             * node in case of network error. We just clean up things and
             * retry connecting to the remote cluster. The network error may
             * be a transient one.
             */
        }

#if ONEASYNCFD
        // Do not close the FD.
#else
        // We do not know the state of the FD. It may have pending data to be
        // read. We cannot reuse the FD. So, close it to be on the safe side.
        cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
        cf_close(workitem->fd);
        workitem->fd = -1;
#endif

        cf_atomic_int_incr(&g_async_stats.dropouts);

        // Continue down with what we do during an Ok.

        // Inform the caller that there is no response from the server for
        // this workitem. No response does not mean that the work is not done.
        // The work might have completed successfully on the server side - we
        // just didn't get the response for it.
        if (g_fail_cb_fn) {
            g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
        }

Ok:
        // rd_buf may not be there during an error condition.
        if (rd_buf && (rv == CITRUSLEAF_OK)) {
            // As of now, async functionality is there only for the put call.
            // In the put call, we do not get anything back other than the
            // trid field. So, just pass a variable to get back the trid and
            // ignore the others.
            if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
                rv = CITRUSLEAF_FAIL_UNKNOWN;
            }
            else {
                rv = msg.m.result_code;

                if (workitem->trid != acktrid) {
#if ONEASYNCFD
                    // It is likely that we get a response for a different
                    // trid. Just delete the correct one from the queue and
                    // put the current workitem back on the queue.
                    shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
                    cf_queue_delete(q_to_use, &tmpworkitem, true);
                    cf_queue_push(q_to_use, &workitem);
                    // From now on, workitem is the one for which we got the ack.
                    workitem = tmpworkitem;
#endif
#ifdef DEBUG
                    cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
                            workitem->trid, acktrid, workitem->fd);
#endif
                }
            }

            if (g_success_cb_fn) {
                g_success_cb_fn(workitem->udata, rv, workitem->starttime);
            }
        }

        // Remember to put the FD back into the pool, if it is reusable.
        if (workitem->fd != -1) {
            cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
        }

        // Also decrement the reference count for this node.
        cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
        // Delete the item from the global hashtable.
        if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) {
#if DEBUG
            cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid);
#endif
        }
#endif

        // Push it back into the free pool. If the attempt fails, free it.
        if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
            free(workitem);
        }

        // If we allocated memory in this loop, release it.
        if (rd_buf && (rd_buf != rd_stack_buf)) {
            free(rd_buf);
            rd_buf = rd_stack_buf; // reset for the next iteration
        }

        // Kick this thread out if its ID is greater than the total.
        if (thread_id > cf_atomic32_get(g_async_num_threads)) {
            cf_atomic32_decr(&g_thread_count);
            return NULL;
        }
    } // end infinite loop

    return NULL;
}
/*
 * Based on the execute optype, the result of the UDF call, and whether the
 * UDF operation succeeded, update the UDF stats and LDT stats.
 *
 * Parameters:
 * 		ns         - namespace whose LDT stats to update
 * 		op         - execute optype
 * 		ret        - return value of the UDF execution
 * 		is_success - whether the UDF operation was successful
 *
 * Returns: nothing
 */
void udf_rw_update_stats(as_namespace *ns, udf_optype op, int ret, bool is_success) {
	if (UDF_OP_IS_LDT(op)) {
		if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&ns->ldt_read_reqs);
		else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&ns->ldt_delete_reqs);
		else if (UDF_OP_IS_WRITE(op))  cf_atomic_int_incr(&ns->ldt_write_reqs);

		if (ret == 0) {
			if (is_success) {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&ns->ldt_read_success);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&ns->ldt_delete_success);
				else if (UDF_OP_IS_WRITE(op))  cf_atomic_int_incr(&ns->ldt_write_success);
			} else {
				cf_atomic_int_incr(&ns->ldt_errs);
			}
		} else {
			cf_atomic_int_incr(&g_config.udf_lua_errs);
		}
	} else {
		if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_reqs);
		else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_reqs);
		else if (UDF_OP_IS_WRITE(op))  cf_atomic_int_incr(&g_config.udf_write_reqs);

		if (ret == 0) {
			if (is_success) {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_success);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_success);
				else if (UDF_OP_IS_WRITE(op))  cf_atomic_int_incr(&g_config.udf_write_success);
			} else {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_errs_other);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_errs_other);
				else if (UDF_OP_IS_WRITE(op))  cf_atomic_int_incr(&g_config.udf_write_errs_other);
			}
		} else {
			cf_info(AS_UDF, "lua error, ret:%d", ret);
			cf_atomic_int_incr(&g_config.udf_lua_errs);
		}
	}
}
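/*
 * udf_rw_update_stats() classifies the operation with the UDF_OP_IS_*
 * predicates. The real macros and the udf_optype enum live in the server
 * headers; the sketch below only illustrates their shape, with hypothetical
 * names and values.
 */
typedef enum {
	SKETCH_UDF_OP_READ,
	SKETCH_UDF_OP_WRITE,
	SKETCH_UDF_OP_DELETE,
	SKETCH_UDF_OP_LDT_READ,
	SKETCH_UDF_OP_LDT_WRITE,
	SKETCH_UDF_OP_LDT_DELETE
} sketch_udf_optype;

// Illustrative predicates only - not the server's definitions.
#define SKETCH_OP_IS_LDT(op)    ((op) >= SKETCH_UDF_OP_LDT_READ)
#define SKETCH_OP_IS_READ(op)   ((op) == SKETCH_UDF_OP_READ   || (op) == SKETCH_UDF_OP_LDT_READ)
#define SKETCH_OP_IS_WRITE(op)  ((op) == SKETCH_UDF_OP_WRITE  || (op) == SKETCH_UDF_OP_LDT_WRITE)
#define SKETCH_OP_IS_DELETE(op) ((op) == SKETCH_UDF_OP_DELETE || (op) == SKETCH_UDF_OP_LDT_DELETE)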
/**
 * Initialize a new UDF call. This populates the udf_call from information
 * in the current transaction. If the passed-in transaction has req_udata it
 * is assumed to be internal, and the UDF information is picked from the
 * udata associated with it. TODO: Do not overload; define a flag for this.
 *
 * Parameters:
 * 		call - the udf_call to populate
 * 		tr   - the transaction to build the udf_call from
 *
 * Returns:
 * 		0 on success (call populated; caller must clean it up)
 * 		1 on failure
 */
int udf_call_init(udf_call * call, as_transaction * tr) {
	call->active = false;
	call->udf_type = AS_SCAN_UDF_NONE;
	as_msg_field * filename = NULL;
	as_msg_field * function = NULL;
	as_msg_field * arglist = NULL;

	if (tr->udata.req_udata) {
		udf_call *ucall = NULL;
		if (tr->udata.req_type == UDF_SCAN_REQUEST) {
			ucall = &((tscan_job *)(tr->udata.req_udata))->call;
		} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
			ucall = as_query_get_udf_call(tr->udata.req_udata);
		}
		if (ucall) {
			strncpy(call->filename, ucall->filename, sizeof(call->filename));
			strncpy(call->function, ucall->function, sizeof(call->function));
			call->transaction = tr;
			call->active = true;
			call->arglist = ucall->arglist;
			call->udf_type = ucall->udf_type;
			if (tr->udata.req_type == UDF_SCAN_REQUEST) {
				cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
			} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
				cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
			}
		}
		// TODO: return proper macros.
		return 0;
	}

	// Check the type of UDF.
	as_msg_field * op = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_OP);

	if (!op) {
		// Normal UDF operation, no special type.
		call->udf_type = 0;
	} else {
		// We got a UDF type over the protocol.
		byte optype;
		memcpy(&optype, (byte *)op->data, sizeof(optype));

		if (optype == AS_SCAN_UDF_OP_UDF) {
			cf_debug(AS_UDF, "UDF scan op received");
			call->udf_type = AS_SCAN_UDF_OP_UDF;
		} else if (optype == AS_SCAN_UDF_OP_BACKGROUND) {
			cf_debug(AS_UDF, "UDF scan background op received");
			call->udf_type = AS_SCAN_UDF_OP_BACKGROUND;
		} else {
			cf_warning(AS_UDF, "Undefined udf type received over protocol");
			goto Cleanup;
		}
	}

	filename = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_FILENAME);
	if (filename) {
		function = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_FUNCTION);
		if (function) {
			arglist = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_ARGLIST);
			if (arglist) {
				call->transaction = tr;

				as_msg_field_get_strncpy(filename, &call->filename[0], sizeof(call->filename));
				as_msg_field_get_strncpy(function, &call->function[0], sizeof(call->function));

				call->arglist = arglist;
				call->active = true;
				cf_detail(AS_UDF, "UDF Request Unpacked %s %s", call->filename, call->function);
				return 0;
			}
		}
	}

Cleanup:
	call->transaction = NULL;
	call->filename[0] = 0;
	call->function[0] = 0;
	call->arglist = NULL;
	return 1;
}
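/*
 * A hedged usage sketch of udf_call_init()'s contract: 0 means the call was
 * populated (and must be cleaned up by the caller), nonzero means this is
 * not a valid UDF transaction. dispatch_udf() is a hypothetical stand-in
 * for whatever executes the call; only udf_call_init() itself comes from
 * the code above, and the server headers are assumed for the types.
 */
static int handle_possible_udf(as_transaction *tr) {
	udf_call call;

	if (udf_call_init(&call, tr) != 0) {
		return -1; // not a UDF transaction, or malformed UDF fields
	}

	// call.filename, call.function and call.arglist are now valid.
	return dispatch_udf(&call); // hypothetical dispatcher
}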
// Put a batch request on the batch thread pool's queue.
int as_batch_direct_queue_task(as_transaction* tr) {
	cf_atomic_int_incr(&g_config.batch_initiate);

	if (g_config.n_batch_threads <= 0) {
		cf_warning(AS_BATCH, "batch-threads has been disabled.");
		return AS_PROTO_RESULT_FAIL_BATCH_DISABLED;
	}

	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);
	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return AS_PROTO_RESULT_FAIL_NAMESPACE;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return AS_PROTO_RESULT_FAIL_PARAMETER;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);
	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);
	btr.complete = false;

	btr.ns = as_namespace_get_bymsgfield(nsfp);
	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return AS_PROTO_RESULT_FAIL_NAMESPACE;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;
	uint8_t* digest_field_data = dfp->data;

	for (uint i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	int status = as_thread_pool_queue_task_fixed(&batch_direct_thread_pool, &btr);
	if (status) {
		cf_warning(AS_BATCH, "Batch enqueue failed");
		// Don't leak on failure - free the digest table and give the fd
		// handle back to the transaction so the caller's error path can
		// respond to the client.
		cf_free(btr.digests);
		tr->proto_fd_h = btr.fd_h;
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	return 0;
}
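/*
 * The digest array arrives as a single message field whose payload is a
 * packed sequence of fixed-size digests, so the count is simply
 * field_sz / sizeof(cf_digest). A self-contained sketch of that unpacking,
 * assuming a 20-byte digest (the size of a RIPEMD-160 cf_digest); names
 * here are illustrative, not the server's.
 */
#include <stdint.h>
#include <string.h>

#define SKETCH_DIGEST_SZ 20 // assumed sizeof(cf_digest)

typedef struct { uint8_t d[SKETCH_DIGEST_SZ]; } sketch_digest;

// Copies each packed digest out of the field payload, as the loop above
// does. Returns the digest count, or 0 if it exceeds the caller's maximum.
static uint32_t sketch_unpack_digests(const uint8_t *data, uint32_t field_sz,
		sketch_digest *out, uint32_t max_out) {
	uint32_t n = field_sz / SKETCH_DIGEST_SZ;
	if (n > max_out) {
		return 0;
	}
	for (uint32_t i = 0; i < n; i++) {
		memcpy(out[i].d, data + (size_t)i * SKETCH_DIGEST_SZ, SKETCH_DIGEST_SZ);
	}
	return n;
}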
/**
 * aerospike::create(record)
 * Function: udf_aerospike_rec_create
 *
 * Parameters:
 * 		as  - as_aerospike
 * 		rec - as_rec
 *
 * Return Values:
 * 		1 if the record is already being read, or already exists on a create
 * 		otherwise the return value of udf_aerospike__execute_updates()
 *
 * Description:
 * 		Create a new record in local storage.
 * 		The record will only be created if it does not already exist.
 * 		This assumes the record has a digest that is valid for local storage.
 *
 * 		Synchronization: object lock acquired by the transaction thread
 * 		executing the UDF. Partition reservation takes place just before the
 * 		transaction starts executing (see as_partition_reserve_udf in
 * 		thr_tsvc.c).
 *
 * Callers:
 * 		lua interfacing function, mod_lua_aerospike_rec_create
 * 		The return value of udf_aerospike_rec_create is pushed onto the lua stack.
 *
 * Notes:
 * 		The 'read' and 'exists' flags of udf_record are set to true.
 */
static int udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) {
	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
	if (ret) {
		return ret;
	}

	udf_record * urecord = (udf_record *) as_rec_source(rec);

	// Make sure the record isn't already successfully read.
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists");
		return 1;
	}
	as_transaction *tr = urecord->tr;
	as_index_ref *r_ref = urecord->r_ref;
	as_storage_rd *rd = urecord->rd;
	as_index_tree *tree = tr->rsv.tree;
	bool is_subrec = false;

	if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
		tree = tr->rsv.sub_tree;
		is_subrec = true;
	}

	// Make sure we got the record as a create.
	bool is_create = false;
	int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns, is_subrec);
	cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord",
			(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : "");

	// rv 0 means the record exists, 1 means create, < 0 means fail.
	// TODO: Verify correct result codes.
	if (rv == 1) {
		is_create = true;
	} else if (rv == 0) {
		// If it's an expired record, pretend it's a fresh create.
		if (as_record_is_expired(r_ref->r)) {
			as_record_destroy(r_ref->r, tr->rsv.ns);
			as_record_initialize(r_ref, tr->rsv.ns);
			cf_atomic_int_incr(&tr->rsv.ns->n_objects);
			is_create = true;
		} else {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2");
			as_record_done(r_ref, tr->rsv.ns);
			// DO NOT change this return value - 1 has special meaning for the caller.
			return 1;
		}
	} else if (rv < 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv);
		return rv;
	}

	// Associate the set name with the storage record and index.
	if (tr->msgp) {
		// Set the set name on the index; if that fails, delete the newly
		// created index entry and close the record.
		int rv_set = as_transaction_has_set(tr) ?
				as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0;
		if (rv_set != 0) {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname");
			if (is_create) {
				as_index_delete(tree, &tr->keyd);
			}
			as_record_done(r_ref, tr->rsv.ns);
			return 4;
		}
	}

	urecord->flag |= UDF_RECORD_FLAG_OPEN;
	cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);

	as_index *r = r_ref->r;
	// Open up storage.
	as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r,
			urecord->rd, &urecord->tr->keyd);

	cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p",
			urecord->r_ref->r, urecord->rd);

	// If the message has a key, apply it to the record.
	if (! get_msg_key(tr, rd)) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key");
		if (is_create) {
			as_index_delete(tree, &tr->keyd);
		}
		as_record_done(r_ref, tr->rsv.ns);
		urecord->flag &= ~UDF_RECORD_FLAG_OPEN;
		return 4;
	}

	// If multi-bin storage, we will use urecord->stack_bins, so set the size
	// appropriately.
	if (! rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
	}

	// Side effect: marks the unused bins as properly unused.
	rd->bins = as_bin_get_all(r, rd, urecord->stack_bins);
	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

	cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);
	cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag);

	int rc = udf_aerospike__execute_updates(urecord);
	if (rc) {
		// Executing the record updates failed - remove the record if no bins
		// ended up in use.
		cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc);
		if (! as_bin_inuse_has(urecord->rd)) {
			udf_aerospike_rec_remove(as, rec);
		}
	}
	return rc;
}
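/*
 * The rv handling above encodes a three-way contract from
 * as_record_get_create(): 1 = newly created, 0 = already existed,
 * negative = failure. A tiny sketch naming those outcomes explicitly
 * (hypothetical names, for illustration only):
 */
typedef enum {
	SKETCH_REC_CREATED,  // rv == 1: fresh create
	SKETCH_REC_EXISTED,  // rv == 0: record already exists (possibly expired)
	SKETCH_REC_FAILED    // rv <  0: open/create failed
} sketch_rec_create_result;

static sketch_rec_create_result sketch_classify_get_create(int rv) {
	if (rv == 1) {
		return SKETCH_REC_CREATED;
	}
	return rv == 0 ? SKETCH_REC_EXISTED : SKETCH_REC_FAILED;
}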
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made
// more wire-protocol-friendly.
int as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key) {
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}
	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Pick the read replica for this partition.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message and fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m) {
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);

	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);
	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, m, dst);

	// Fill out a retransmit structure and insert it into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;

	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ? tr->end_time : pr.start_time + g_config.transaction_max_ns;
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_warning(AS_PROXY, "shash_put failed - dropping fabric message and fd handle");
		as_fabric_msg_put(m); // drop the retransmit reference taken above
		as_fabric_msg_put(m); // drop the reference from as_fabric_msg_get()
		tr->proto_fd_h = pr.fd_h; // give the fd back so the caller can respond
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
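/*
 * The xmit_ms and retry_interval_ms fields of proxy_request drive a periodic
 * retransmit sweep elsewhere in the proxy module. A hedged, self-contained
 * sketch of the per-entry check such a sweep would apply; the struct and
 * helper names are hypothetical, not the server's.
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	uint64_t xmit_ms;           // next time to retransmit
	uint64_t end_time_ms;       // absolute transaction deadline
	uint32_t retry_interval_ms; // spacing between retransmits
} sketch_proxy_timing;

// Returns true if the request is due for retransmission now (and schedules
// the next attempt); false if it is not yet due or has already expired.
static bool sketch_due_for_retransmit(sketch_proxy_timing *pr, uint64_t now_ms) {
	if (now_ms >= pr->end_time_ms) {
		return false; // expired - the sweep should fail it with a timeout
	}
	if (now_ms < pr->xmit_ms) {
		return false; // not due yet
	}
	pr->xmit_ms = now_ms + pr->retry_interval_ms;
	return true;
}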
// Put a batch request on the global batch queue (see
// as_batch_direct_queue_task() above for the thread-pool variant).
int as_batch(as_transaction* tr) {
	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);
	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return -1;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);
	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return -1;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);

	btr.ns = as_namespace_get_bymsgfield(nsfp);
	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return -1;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;
	uint8_t* digest_field_data = dfp->data;

	for (uint i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	cf_atomic_int_incr(&g_config.batch_initiate);
	cf_queue_push(g_batch_queue, &btr);
	return 0;
}
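/*
 * Note that btr is stack-allocated: cf_queue_push() copies the element by
 * value (the queue is presumably created with sizeof(batch_transaction) as
 * its element size), which is why returning right after the push is safe.
 * Below is a self-contained toy illustration of that copy-by-value
 * semantic; the queue here is deliberately simplistic and not the real
 * cf_queue.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define TOY_SLOTS 8
#define TOY_SLOT_SZ 64

typedef struct {
	uint8_t buf[TOY_SLOTS][TOY_SLOT_SZ];
	size_t elem_sz;
	int count;
} toy_queue;

// Copies elem_sz bytes into the queue - the caller's storage can vanish
// immediately afterwards.
static int toy_queue_push(toy_queue *q, const void *elem) {
	if (q->count == TOY_SLOTS || q->elem_sz > TOY_SLOT_SZ) {
		return -1;
	}
	memcpy(q->buf[q->count++], elem, q->elem_sz);
	return 0;
}

int main(void) {
	typedef struct { int trid; uint64_t end_time; } fake_btr;
	toy_queue q = { .elem_sz = sizeof(fake_btr) };

	{
		fake_btr btr = { .trid = 7, .end_time = 12345 };
		assert(toy_queue_push(&q, &btr) == 0);
	} // btr goes out of scope here - the queued copy remains valid

	fake_btr queued;
	memcpy(&queued, q.buf[0], sizeof(queued));
	assert(queued.trid == 7 && queued.end_time == 12345);
	return 0;
}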