void cl_cluster_scan_shutdown(cl_cluster* asc) {
	// Nothing to do if the scan machinery was never (lazily) initialized.
	if (cf_atomic32_get(asc->scan_initialized) == 0 && ! asc->scan_q) {
		return;
	}

	// Queue one "terminate" task (asc == NULL) per worker thread. We do this
	// (instead of using a "running" flag) so the workers can "wait forever"
	// on the dispatch queue, which has minimum impact when the queue is
	// empty. It also means all queued requests get processed on shutdown -
	// each worker drains remaining work, then exits when it pops a sentinel.
	cl_scan_task poison;
	poison.asc = NULL;

	for (int n = 0; n < NUM_SCAN_THREADS; n++) {
		cf_queue_push(asc->scan_q, &poison);
	}

	// Wait for every worker to exit before tearing down the queue.
	for (int n = 0; n < NUM_SCAN_THREADS; n++) {
		pthread_join(asc->scan_threads[n], NULL);
	}

	cf_queue_destroy(asc->scan_q);
	asc->scan_q = NULL;

	cf_atomic32_set(&asc->scan_initialized, 0);
}
static int do_scan_monte(cl_cluster *asc, char *node_name, uint operation_info, uint operation_info2, const char *ns, const char *set, cl_bin *bins, int n_bins, uint8_t scan_pct, citrusleaf_get_many_cb cb, void *udata, cl_scan_parameters *scan_opt) { int rv = -1; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = 0; size_t rd_buf_sz = 0; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); cl_scan_param_field scan_param_field; if (scan_opt) { scan_param_field.scan_pct = scan_pct>100? 100:scan_pct; scan_param_field.byte1 = (scan_opt->priority<<4) | (scan_opt->fail_on_cluster_change<<3); } // we have a single namespace and/or set to get if (cl_compile(operation_info, operation_info2, 0, ns, set, 0, 0, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, 0, NULL, 0, scan_opt ? &scan_param_field : NULL)) { return(rv); } #ifdef DEBUG_VERBOSE dump_buf("sending request to cluster:", wr_buf, wr_buf_sz); #endif int fd; cl_cluster_node *node = 0; // Get an FD from a cluster if (node_name) { node = cl_cluster_node_get_byname(asc,node_name); // grab a reservation if (node) cl_cluster_node_reserve(node, "T+"); } else { node = cl_cluster_node_get_random(asc); } if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, failing"); #endif return(-1); } fd = cl_cluster_node_fd_get(node, false, asc->nbconnect); if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no file descriptors, retrying transaction", node->name); #endif return(-1); } // send it to the cluster - non blocking socket, but we're blocking if (0 != cf_socket_write_forever(fd, wr_buf, wr_buf_sz)) { #ifdef DEBUG cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d", rv, fd, errno); #endif close(fd); return(-1); } cl_proto proto; bool done = false; do { // multiple CL proto per response // Now turn around and read a fine cl_pro - that's the first 8 bytes that has types and lengths if ((rv = cf_socket_read_forever(fd, 
(uint8_t *) &proto, sizeof(cl_proto) ) ) ) { cf_error("network error: errno %d fd %d",rv, fd); close(fd); return(-1); } #ifdef DEBUG_VERBOSE dump_buf("read proto header from cluster", (uint8_t *) &proto, sizeof(cl_proto)); #endif cl_proto_swap(&proto); if (proto.version != CL_PROTO_VERSION) { cf_error("network error: received protocol message of wrong version %d", proto.version); close(fd); return(-1); } if (proto.type != CL_PROTO_TYPE_CL_MSG) { cf_error("network error: received incorrect message version %d", proto.type); close(fd); return(-1); } // second read for the remainder of the message - expect this to cover lots of data, many lines // // if there's no error rd_buf_sz = proto.sz; if (rd_buf_sz > 0) { // cf_debug("message read: size %u",(uint)proto.sz); if (rd_buf_sz > sizeof(rd_stack_buf)) rd_buf = malloc(rd_buf_sz); else rd_buf = rd_stack_buf; if (rd_buf == NULL) { close(fd); return (-1); } if ((rv = cf_socket_read_forever(fd, rd_buf, rd_buf_sz))) { cf_error("network error: errno %d fd %d", rv, fd); if (rd_buf != rd_stack_buf) { free(rd_buf); } close(fd); return(-1); } // this one's a little much: printing the entire body before printing the other bits #ifdef DEBUG_VERBOSE dump_buf("read msg body header (multiple msgs)", rd_buf, rd_buf_sz); #endif } // process all the cl_msg in this proto uint8_t *buf = rd_buf; uint pos = 0; cl_bin stack_bins[STACK_BINS]; cl_bin *bins_local; while (pos < rd_buf_sz) { #ifdef DEBUG_VERBOSE dump_buf("individual message header", buf, sizeof(cl_msg)); #endif uint8_t *buf_start = buf; cl_msg *msg = (cl_msg *) buf; cl_msg_swap_header(msg); buf += sizeof(cl_msg); if (msg->header_sz != sizeof(cl_msg)) { cf_error("received cl msg of unexpected size: expecting %zd found %d, internal error", sizeof(cl_msg),msg->header_sz); close(fd); return(-1); } // parse through the fields cf_digest *keyd = 0; char ns_ret[33] = {0}; char *set_ret = NULL; cl_msg_field *mf = (cl_msg_field *)buf; for (int i=0;i<msg->n_fields;i++) { 
cl_msg_swap_field(mf); if (mf->type == CL_MSG_FIELD_TYPE_KEY) { cf_error("read: found a key - unexpected"); } else if (mf->type == CL_MSG_FIELD_TYPE_DIGEST_RIPE) { keyd = (cf_digest *) mf->data; } else if (mf->type == CL_MSG_FIELD_TYPE_NAMESPACE) { memcpy(ns_ret, mf->data, cl_msg_field_get_value_sz(mf)); ns_ret[ cl_msg_field_get_value_sz(mf) ] = 0; } else if (mf->type == CL_MSG_FIELD_TYPE_SET) { uint32_t set_name_len = cl_msg_field_get_value_sz(mf); set_ret = (char *)malloc(set_name_len + 1); memcpy(set_ret, mf->data, set_name_len); set_ret[ set_name_len ] = '\0'; } mf = cl_msg_field_get_next(mf); } buf = (uint8_t *) mf; #ifdef DEBUG_VERBOSE cf_debug("message header fields: nfields %u nops %u", msg->n_fields, msg->n_ops); #endif if (msg->n_ops > STACK_BINS) { bins_local = malloc(sizeof(cl_bin) * msg->n_ops); } else { bins_local = stack_bins; } if (bins_local == NULL) { if (set_ret) { free(set_ret); } close(fd); return (-1); } // parse through the bins/ops cl_msg_op *op = (cl_msg_op *)buf; for (int i=0;i<msg->n_ops;i++) { cl_msg_swap_op(op); #ifdef DEBUG_VERBOSE cf_debug("op receive: %p size %d op %d ptype %d pversion %d namesz %d", op,op->op_sz, op->op, op->particle_type, op->version, op->name_sz); #endif #ifdef DEBUG_VERBOSE dump_buf("individual op (host order)", (uint8_t *) op, op->op_sz + sizeof(uint32_t)); #endif cl_set_value_particular(op, &bins_local[i]); op = cl_msg_op_get_next(op); } buf = (uint8_t *) op; if (msg->result_code != CL_RESULT_OK) { // Special case - if we scan a set name that doesn't exist on a // node, it will return "not found" - we unify this with the // case where OK is returned and no callbacks were made. [AKG] if (msg->result_code == CL_RESULT_NOTFOUND) { msg->result_code = CL_RESULT_OK; } rv = (int)msg->result_code; done = true; } else if (msg->info3 & CL_MSG_INFO3_LAST) { #ifdef DEBUG cf_debug("received final message"); #endif done = true; } else if ((msg->n_ops) || (operation_info & CL_MSG_INFO1_NOBINDATA)) { // got one good value? 
call it a success! (*cb) ( ns_ret, keyd, set_ret, msg->generation, msg->record_ttl, bins_local, msg->n_ops, false /*islast*/, udata); rv = 0; } // else // cf_debug("received message with no bins, signal of an error"); if (bins_local != stack_bins) { free(bins_local); bins_local = 0; } if (set_ret) { free(set_ret); set_ret = NULL; } // don't have to free object internals. They point into the read buffer, where // a pointer is required pos += buf - buf_start; } if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); rd_buf = 0; } } while ( done == false ); if (wr_buf != wr_stack_buf) { free(wr_buf); wr_buf = 0; } cf_atomic32_set(&node->intervals_unreachable, 0); cl_cluster_node_fd_put(node, fd, false); cl_cluster_node_put(node); node = 0; #ifdef DEBUG_VERBOSE cf_debug("exited loop: rv %d", rv ); #endif return(rv); }
//Same as do_the_full_monte, but only till the command is sent to the node. //Most of the code is duplicated. Bad. int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key, const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata) { cl_async_work *workitem = NULL; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); int progress_timeout_ms; uint64_t deadline_ms; uint64_t starttime, endtime; bool network_error; int fd = -1; int rv = CITRUSLEAF_FAIL_CLIENT; //Assume that this is a failure; // as_msg msg; cf_digest d_ret; cl_cluster_node *node = 0; #if ONEASYNCFD if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) { //cf_error("Async hashtab is full. Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #else //If the async buffer is at the max limit, do not entertain more requests. if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) { //cf_error("Async buffer is full. 
Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #endif //Allocate memory for work item that will be added to the async work list if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) { cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER); } else { workitem = malloc(sizeof(cl_async_work)); if (workitem == NULL) { return CITRUSLEAF_FAIL_CLIENT; } } //Compile the write buffer to be sent to the cluster if (n_values && ( values || operations) ){ cl_compile(info1, info2, 0, ns, set, key, digest, values?*values:NULL, operator, operations?*operations:NULL, *n_values , &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); }else{ cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); } deadline_ms = 0; progress_timeout_ms = 0; if (cl_w_p && cl_w_p->timeout_ms) { deadline_ms = cf_getms() + cl_w_p->timeout_ms; // policy: if asking for a long timeout, give enough time to try twice if (cl_w_p->timeout_ms > 700) { progress_timeout_ms = cl_w_p->timeout_ms / 2; } else { progress_timeout_ms = cl_w_p->timeout_ms; } } else { progress_timeout_ms = g_async_nw_progress_timeout; } //Initialize the async work unit workitem->trid = *trid; workitem->deadline = deadline_ms; workitem->starttime = cf_getms(); workitem->udata = udata; as_msg *msgp; // Hate special cases, but we have to clear the verify bit on delete verify if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) { msgp = (as_msg *)wr_buf; msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY; } if (asc->compression_stat.compression_threshold > 0 && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) { /* Compression is enabled. * Packet size is above threshold. * Compress the data */ uint8_t *compressed_buf = NULL; size_t compressed_buf_sz = 0; // Contstruct packet for compressed data. 
cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz); if (compressed_buf) { // If original packet size is > 16k, cl_compile had allocated memory for it. // Free that memory. // cf_packet_compression will allocate memory for compressed packet if (wr_buf != wr_stack_buf) { free(wr_buf); } // Update stats. citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz); wr_buf = compressed_buf; wr_buf_sz = compressed_buf_sz; //memcpy (wr_buf, compressed_buf, compressed_buf_sz); //wr_buf_sz = compressed_buf_sz; //free (compressed_buf); } //else compression failed, continue with uncompressed packet else { // Set compression stat citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz); } } int try = 0; // retry request based on the write_policy do { network_error = false; try++; #ifdef DEBUG if (try > 1) { cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self()); } #endif // Get an FD from a cluster. First get the probable node for the given digest. node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? 
true : false); if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, retrying"); #endif usleep(10000); //Sleep for 10ms goto Retry; } // Now get the dedicated async FD of this node starttime = cf_getms(); fd = cl_cluster_node_fd_get(node, true); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime)); } if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() ); #endif usleep(1000); goto Retry; } // Send the command to the node starttime = cf_getms(); rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime)); } if (rv != 0) { cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)", rv,fd,errno,(uint64_t)pthread_self()); if (rv != ETIMEDOUT) network_error = true; goto Retry; } goto Ok; Retry: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. As this is a * network error, its is better to wait for some significant * time before retrying. 
*/ sleep(1); //Sleep for 1sec #if ONEASYNCFD //Do not close the FD #else cf_error("async sender: Closing the fd %d because of network error", fd); cf_close(fd); fd = -1; #endif } if (fd != -1) { cf_error("async sender: Closing the fd %d because of retry", fd); cf_close(fd); fd = -1; } if (node) { cl_cluster_node_put(node); node = 0; } if (deadline_ms && (deadline_ms < cf_getms() ) ) { #ifdef DEBUG cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms()); #endif rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } } while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) ); Error: #ifdef DEBUG cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d", (int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), (int)(deadline_ms - cf_getms() ), rv ); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } #if ONEASYNCFD //Do not close the FD #else //If it is a network error, the fd would be closed and set to -1. //So, we reach this place with a valid FD in case of timeout. if (fd != -1) { cf_error("async sender: Closing the fd %d because of timeout", fd); cf_close(fd); } #endif return(rv); Ok: /* * We cannot release the node here as the asyc FD associated * with this node may get closed. We should do it only when * we got back the ack for the async command that we just did. */ //As we sent the command successfully, add it to the async work list workitem->node = node; workitem->fd = fd; //We are storing only the pointer to the workitem #if ONEASYNCFD if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) { //This should always succeed. 
cf_error("Unable to add unique entry into the hash table"); } cf_queue_push(node->asyncwork_q, &workitem); //Also put in the node's q #else cf_queue_push(g_cl_async_q, &workitem); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } rv = CITRUSLEAF_OK; return rv; } int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads) { // int num_threads; if (0 == cf_atomic32_get(g_async_initialized)) { cf_error("Async client not initialized cannot reinit"); return -1; } if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS; } // If number of thread is increased create more threads if (num_receiver_threads > g_async_num_threads) { unsigned int i; for (i = g_async_num_threads; i < num_receiver_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } } else { // else just reset the number the async threads will kill themselves cf_atomic32_set(&g_async_num_threads, num_receiver_threads); } cf_atomic32_set(&g_async_q_szlimit , size_limit); return ( 0 ); } int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn) { int i, num_threads; //Make sure that we do the initialization only once if (1 == cf_atomic32_incr(&g_async_initialized)) { // Start the receiver threads num_threads = num_receiver_threads; if (num_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_threads = MAX_ASYNC_RECEIVER_THREADS; } #if ONEASYNCFD g_async_h_szlimit = size_limit * 3; //Max number of elements in the hash table g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) { cf_error("Failed to initialize the async work hastable"); 
cf_atomic32_decr(&g_async_initialized); return -1; } #else // create work queue g_async_q_szlimit = size_limit; if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to initialize the async work queue"); cf_atomic32_decr(&g_async_initialized); return -1; } for (i=0; i<num_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } g_async_num_threads = num_threads; #endif if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to create memory pool for workitems"); return -1; } g_fail_cb_fn = fail_cb_fn; g_success_cb_fn = success_cb_fn; // Initialize the stats g_async_stats.retries = 0; g_async_stats.dropouts = 0; } return(0); }