static int
as_send(int fd, uint8_t* buffer, uint8_t* end, uint64_t deadline_ms, int timeout_ms)
{
	uint64_t len = end - buffer;
	uint64_t proto = (len - 8) | ((uint64_t)MSG_VERSION << 56) | ((uint64_t)MSG_TYPE << 48);

	*(uint64_t*)buffer = cf_swap_to_be64(proto);

	return cf_socket_write_timeout(fd, buffer, len, deadline_ms, timeout_ms);
}
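/*
 * A minimal caller sketch (not from the original source): build a message in
 * a caller-owned buffer, reserving the first 8 bytes for the proto header
 * that as_send() fills in. The helper name send_example() is hypothetical;
 * the cf_getms()-based deadline mirrors how the surrounding code computes
 * deadlines.
 */
static int
send_example(int fd, const uint8_t* body, size_t body_len, int timeout_ms)
{
	uint8_t buf[1024];

	if (body_len + 8 > sizeof(buf)) {
		return -1;
	}

	// The first 8 bytes are reserved for the proto header written by as_send().
	memcpy(buf + 8, body, body_len);

	uint64_t deadline_ms = cf_getms() + timeout_ms;

	return as_send(fd, buf, buf + 8 + body_len, deadline_ms, timeout_ms);
}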
// Request the info of a particular sockaddr_in.
// Reject the info request if the response length is greater than max_response_length.
// Return 0 on success and -1 on error.
int
citrusleaf_info_host_limit(struct sockaddr_in *sa_in, char *names, char **values,
		int timeout_ms, bool send_asis, uint64_t max_response_length)
{
	int rv = -1;
	int io_rv;
	*values = 0;

	// Deal with the incoming 'names' parameter.
	// Translate interior ';', ':' and ',' in the passed-in names to '\n'.
	uint32_t slen = 0;

	if (names) {
		if (send_asis) {
			slen = strlen(names);
		}
		else {
			char *_t = names;

			while (*_t) {
				slen++;
				if ((*_t == ';') || (*_t == ':') || (*_t == ',')) {
					*_t = '\n';
				}
				_t++;
			}
		}
	}

	// Sometimes people forget / can't add the trailing '\n'. Be nice and add
	// it for them, using a stack-allocated variable so we don't have to clean
	// up - a pain syntactically, but a nice little thing.
	if (names && slen) {
		if (names[slen-1] == '\n') {
			slen = 0;
		}
		else {
			slen++;
			if (slen > 1024) {
				return(-1);
			}
		}
	}

	char names_with_term[slen+1];

	if (slen) {
		strcpy(names_with_term, names);
		names_with_term[slen-1] = '\n';
		names_with_term[slen] = 0;
		names = names_with_term;
	}

	// Actually doing a non-blocking connect.
	int fd = cf_socket_create_and_connect_nb(sa_in);

	if (fd == -1) {
		return -1;
	}

	cl_proto *req;
	uint8_t buf[1024];
	uint buf_sz;

	// An uninitialized buf can lead to junk lastshiptimes values.
	// Initialize buf to 0.
	bzero(buf, 1024);

	if (names) {
		uint sz = strlen(names);
		buf_sz = sz + sizeof(cl_proto);

		if (buf_sz < 1024) {
			req = (cl_proto *) buf;
		}
		else {
			req = (cl_proto *) malloc(buf_sz);
		}

		if (req == NULL) {
			goto Done;
		}

		req->sz = sz;
		memcpy(req->data, names, sz);
	}
	else {
		req = (cl_proto *) buf;
		req->sz = 0;
		buf_sz = sizeof(cl_proto);
		names = "";
	}

	req->version = CL_PROTO_VERSION;
	req->type = CL_PROTO_TYPE_INFO;
	cl_proto_swap(req);

	if (timeout_ms) {
		io_rv = cf_socket_write_timeout(fd, (uint8_t *) req, buf_sz, 0, timeout_ms);
	}
	else {
		io_rv = cf_socket_write_forever(fd, (uint8_t *) req, buf_sz);
	}

	if ((uint8_t *)req != buf) {
		free(req);
	}

	if (io_rv != 0) {
#ifdef DEBUG
		cf_debug("info returned error, rv %d errno %d bufsz %d", io_rv, errno, buf_sz);
#endif
		goto Done;
	}

	cl_proto *rsp = (cl_proto *)buf;

	if (timeout_ms) {
		io_rv = cf_socket_read_timeout(fd, buf, 8, 0, timeout_ms);
	}
	else {
		io_rv = cf_socket_read_forever(fd, buf, 8);
	}

	if (0 != io_rv) {
#ifdef DEBUG
		cf_debug("info socket read failed: rv %d errno %d", io_rv, errno);
#endif
		goto Done;
	}

	cl_proto_swap(rsp);

	if (rsp->sz) {
		size_t read_length = rsp->sz;
		bool limit_reached = false;

		if (max_response_length > 0 && rsp->sz > max_response_length) {
			// Response buffer is too big. Read a few bytes just to see what
			// the buffer contains.
			read_length = 100;
			limit_reached = true;
		}

		uint8_t *v_buf = malloc(read_length + 1);

		if (!v_buf) {
			cf_warn("Info request '%s' failed. Failed to malloc %zu bytes", names, read_length);
			goto Done;
		}

		if (timeout_ms) {
			io_rv = cf_socket_read_timeout(fd, v_buf, read_length, 0, timeout_ms);
		}
		else {
			io_rv = cf_socket_read_forever(fd, v_buf, read_length);
		}

		if (io_rv != 0) {
			free(v_buf);

			if (io_rv != ETIMEDOUT) {
				cf_warn("Info request '%s' failed. Failed to read %zu bytes. Return code %d", names, read_length, io_rv);
			}

			goto Done;
		}

		v_buf[read_length] = 0;

		if (limit_reached) {
			// Response buffer is too big. Log a warning and reject.
			cf_warn("Info request '%s' failed. Response buffer length %lu is excessive. Buffer: %s", names, rsp->sz, v_buf);
			goto Done;
		}

		*values = (char *) v_buf;
	}
	else {
		*values = 0;
	}

	rv = 0;

Done:
	shutdown(fd, SHUT_RDWR);
	close(fd);
	return(rv);
}
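/*
 * A minimal caller sketch (assumed, not from the original source): issue a
 * single info request with a 1-second timeout and a 10 MB response cap, then
 * free the returned value string. The helper name query_node_build_info() is
 * hypothetical; 'names' is a mutable array because the function may rewrite
 * separators in place when send_asis is false.
 */
static int
query_node_build_info(struct sockaddr_in *sa_in)
{
	char names[] = "build\n";
	char *values = NULL;

	if (citrusleaf_info_host_limit(sa_in, names, &values, 1000, true,
			10 * 1024 * 1024) != 0) {
		return -1;
	}

	if (values) {
		// The response is "name\tvalue\n..." - parse it, then free it.
		cf_info("info response: %s", values);
		free(values);
	}

	return 0;
}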
// Same as do_the_full_monte, but only until the command is sent to the node.
// Most of the code is duplicated. Bad.
int
cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns,
		const char *set, const cl_object *key, const cf_digest *digest,
		cl_bin **values, cl_operator operator, cl_operation **operations,
		int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p,
		uint64_t *trid, void *udata)
{
	cl_async_work *workitem = NULL;

	uint8_t wr_stack_buf[STACK_BUF_SZ];
	uint8_t *wr_buf = wr_stack_buf;
	size_t wr_buf_sz = sizeof(wr_stack_buf);

	int progress_timeout_ms;
	uint64_t deadline_ms;
	uint64_t starttime, endtime;
	bool network_error;
	int fd = -1;
	int rv = CITRUSLEAF_FAIL_CLIENT;	// assume that this is a failure

	// as_msg msg;
	cf_digest d_ret;
	cl_cluster_node *node = 0;

#if ONEASYNCFD
	if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) {
		//cf_error("Async hashtab is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#else
	// If the async buffer is at the max limit, do not entertain more requests.
	if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) {
		//cf_error("Async buffer is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#endif

	// Allocate memory for the work item that will be added to the async work list.
	if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) {
		cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER);
	}
	else {
		workitem = malloc(sizeof(cl_async_work));

		if (workitem == NULL) {
			return CITRUSLEAF_FAIL_CLIENT;
		}
	}

	// Compile the write buffer to be sent to the cluster.
	if (n_values && (values || operations)) {
		cl_compile(info1, info2, 0, ns, set, key, digest, values ? *values : NULL,
				operator, operations ? *operations : NULL, *n_values, &wr_buf,
				&wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
	}
	else {
		cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf,
				&wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
	}

	deadline_ms = 0;
	progress_timeout_ms = 0;

	if (cl_w_p && cl_w_p->timeout_ms) {
		deadline_ms = cf_getms() + cl_w_p->timeout_ms;

		// Policy: if asking for a long timeout, give enough time to try twice.
		if (cl_w_p->timeout_ms > 700) {
			progress_timeout_ms = cl_w_p->timeout_ms / 2;
		}
		else {
			progress_timeout_ms = cl_w_p->timeout_ms;
		}
	}
	else {
		progress_timeout_ms = g_async_nw_progress_timeout;
	}

	// Initialize the async work unit.
	workitem->trid = *trid;
	workitem->deadline = deadline_ms;
	workitem->starttime = cf_getms();
	workitem->udata = udata;

	as_msg *msgp;

	// Hate special cases, but we have to clear the verify bit on delete verify.
	if ((info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) {
		msgp = (as_msg *)wr_buf;
		msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY;
	}

	if (asc->compression_stat.compression_threshold > 0 &&
			wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) {
		// Compression is enabled and the packet size is above the threshold -
		// compress the data.
		uint8_t *compressed_buf = NULL;
		size_t compressed_buf_sz = 0;

		// Construct a packet for the compressed data.
		cf_packet_compression(wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz);

		if (compressed_buf) {
			// If the original packet size is > 16K, cl_compile allocated
			// memory for it - free that memory. cf_packet_compression
			// allocates memory for the compressed packet.
			if (wr_buf != wr_stack_buf) {
				free(wr_buf);
			}

			// Update stats.
			citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz);

			wr_buf = compressed_buf;
			wr_buf_sz = compressed_buf_sz;
			//memcpy(wr_buf, compressed_buf, compressed_buf_sz);
			//wr_buf_sz = compressed_buf_sz;
			//free(compressed_buf);
		}
		else {
			// Compression failed - continue with the uncompressed packet.
			// Set the compression stat.
			citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz);
		}
	}

	int try = 0;

	// Retry the request based on the write_policy.
	do {
		network_error = false;
		try++;
#ifdef DEBUG
		if (try > 1) {
			cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self());
		}
#endif

		// Get an FD from the cluster. First get the probable node for the
		// given digest.
		node = cl_cluster_node_get(asc, ns, &d_ret,
				info2 & CL_MSG_INFO2_WRITE ? true : false);

		if (!node) {
#ifdef DEBUG
			cf_debug("warning: no healthy nodes in cluster, retrying");
#endif
			usleep(10000);	// sleep for 10ms
			goto Retry;
		}

		// Now get the dedicated async FD of this node.
		starttime = cf_getms();
		fd = cl_cluster_node_fd_get(node, true);
		endtime = cf_getms();

		if ((endtime - starttime) > 10) {
			cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime));
		}

		if (fd == -1) {
#ifdef DEBUG
			cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",
					node->name, (uint64_t)pthread_self());
#endif
			usleep(1000);
			goto Retry;
		}

		// Send the command to the node.
		starttime = cf_getms();
		rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms);
		endtime = cf_getms();

		if ((endtime - starttime) > 10) {
			cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime));
		}

		if (rv != 0) {
			cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)",
					rv, fd, errno, (uint64_t)pthread_self());

			if (rv != ETIMEDOUT) {
				network_error = true;
			}

			goto Retry;
		}

		goto Ok;

Retry:
		if (network_error == true) {
			// In case of async work (for XDS), it may be extreme to dun a
			// node on a network error. We just clean things up and retry
			// connecting to the remote cluster. The network error may be a
			// transient one. As this is a network error, it is better to wait
			// a significant time before retrying.
			sleep(1);	// sleep for 1 sec
#if ONEASYNCFD
			// Do not close the FD.
#else
			cf_error("async sender: Closing the fd %d because of network error", fd);
			cf_close(fd);
			fd = -1;
#endif
		}

		if (fd != -1) {
			cf_error("async sender: Closing the fd %d because of retry", fd);
			cf_close(fd);
			fd = -1;
		}

		if (node) {
			cl_cluster_node_put(node);
			node = 0;
		}

		if (deadline_ms && (deadline_ms < cf_getms())) {
#ifdef DEBUG
			cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64,
					deadline_ms, cf_getms());
#endif
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		}
	} while ((cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY));

Error:
#ifdef DEBUG
	cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d",
			(int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0),
			(int)(deadline_ms - cf_getms()), rv);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

#if ONEASYNCFD
	// Do not close the FD.
#else
	// On a network error, the fd has been closed and set to -1, so we reach
	// this point with a valid FD only in case of timeout.
	if (fd != -1) {
		cf_error("async sender: Closing the fd %d because of timeout", fd);
		cf_close(fd);
	}
#endif

	// Return the unused work item to the free pool.
	cf_queue_push(g_cl_workitems_freepool_q, &workitem);

	return(rv);

Ok:
	// We cannot release the node here, as the async FD associated with this
	// node may get closed. We should do it only when we get back the ack for
	// the async command that we just did.
	// As we sent the command successfully, add it to the async work list.
	workitem->node = node;
	workitem->fd = fd;

	// We are storing only the pointer to the work item.
#if ONEASYNCFD
	if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) {
		// This should always succeed.
		cf_error("Unable to add unique entry into the hash table");
	}
	cf_queue_push(node->asyncwork_q, &workitem);	// also put it in the node's q
#else
	cf_queue_push(g_cl_async_q, &workitem);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

	rv = CITRUSLEAF_OK;
	return rv;
}

int
citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads)
{
	// int num_threads;

	if (0 == cf_atomic32_get(g_async_initialized)) {
		cf_error("Async client not initialized, cannot reinit");
		return -1;
	}

	if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) {
		// Limit the threads to the max value even if the caller asks for more.
		num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS;
	}

	// If the number of threads is increased, create more threads.
	if (num_receiver_threads > g_async_num_threads) {
		unsigned int i;

		for (i = g_async_num_threads; i < num_receiver_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
	}

	// Reset the count - if it was decreased, the excess async threads will
	// kill themselves.
	cf_atomic32_set(&g_async_num_threads, num_receiver_threads);

	cf_atomic32_set(&g_async_q_szlimit, size_limit);
	return(0);
}

int
citrusleaf_async_init(int size_limit, int num_receiver_threads,
		cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn)
{
	int i, num_threads;

	// Make sure that we do the initialization only once.
	if (1 == cf_atomic32_incr(&g_async_initialized)) {
		// Start the receiver threads.
		num_threads = num_receiver_threads;

		if (num_threads > MAX_ASYNC_RECEIVER_THREADS) {
			// Limit the threads to the max value even if the caller asks for more.
			num_threads = MAX_ASYNC_RECEIVER_THREADS;
		}

#if ONEASYNCFD
		g_async_h_szlimit = size_limit * 3;		// max number of elements in the hash table
		g_async_h_buckets = g_async_h_szlimit / 10;	// number of buckets in the hash table

		if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t),
				sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) {
			cf_error("Failed to initialize the async work hashtable");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}
#else
		// Create the work queue.
		g_async_q_szlimit = size_limit;

		if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to initialize the async work queue");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}

		for (i = 0; i < num_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}

		g_async_num_threads = num_threads;
#endif

		if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to create memory pool for workitems");
			return -1;
		}

		g_fail_cb_fn = fail_cb_fn;
		g_success_cb_fn = success_cb_fn;

		// Initialize the stats.
		g_async_stats.retries = 0;
		g_async_stats.dropouts = 0;
	}

	return(0);
}
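/*
 * A minimal startup sketch (assumed, not from the original source):
 * initialize the async subsystem once at startup, then grow it at runtime
 * with a reinit. The callback signatures below are hypothetical - they only
 * illustrate where cl_async_fail_cb/cl_async_success_cb hook in and must
 * match the real typedefs in the header. example_async_setup() is also a
 * hypothetical name.
 */
static int
example_fail_cb(void *udata, int rv, uint64_t starttime)
{
	cf_error("async transaction failed: rv %d", rv);
	return 0;
}

static int
example_success_cb(void *udata, int rv, uint64_t starttime)
{
	// The node acked the async transaction.
	return 0;
}

static int
example_async_setup(void)
{
	// Allow 4096 queued work items; start with 2 receiver threads.
	if (citrusleaf_async_init(4096, 2, example_fail_cb, example_success_cb) != 0) {
		return -1;
	}

	// Under heavier load, raise the queue limit and the thread count. The
	// thread count is capped at MAX_ASYNC_RECEIVER_THREADS internally.
	return citrusleaf_async_reinit(8192, 4);
}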
// Request the info from the node connected on the given fd.
// Reject the info request if the response length is greater than max_response_length.
// Return 0 on success and -1 on error.
int
citrusleaf_info_host_limit(int fd, char *names, char **values, int timeout_ms,
		bool send_asis, uint64_t max_response_length, bool check_bounds)
{
	uint bb_size = 2048;
	int rv = -1;
	int io_rv;
	*values = 0;

	// Deal with the incoming 'names' parameter.
	// Translate interior ';', ':' and ',' in the passed-in names to '\n'.
	uint32_t slen = 0;

	if (names) {
		if (send_asis) {
			slen = (uint32_t)strlen(names);
		}
		else {
			char *_t = names;

			while (*_t) {
				slen++;
				if ((*_t == ';') || (*_t == ':') || (*_t == ',')) {
					*_t = '\n';
				}
				_t++;
			}
		}
	}

	// Sometimes people forget / can't add the trailing '\n'. Be nice and add
	// it for them, using a stack-allocated variable so we don't have to clean
	// up - a pain syntactically, but a nice little thing.
	if (names && slen) {
		if (names[slen-1] == '\n') {
			slen = 0;
		}
		else {
			slen++;

			// If check_bounds is true, do not allow beyond a certain limit.
			if (check_bounds && (slen > bb_size)) {
				return(-1);
			}
		}
	}

	char names_with_term[slen+1];

	if (slen) {
		strcpy(names_with_term, names);
		names_with_term[slen-1] = '\n';
		names_with_term[slen] = 0;
		names = names_with_term;
	}

	cl_proto *req;
	uint8_t buf[bb_size];
	uint buf_sz;
	bool rmalloced = false;

	if (names) {
		uint sz = (uint)strlen(names);
		buf_sz = sz + sizeof(cl_proto);

		if (buf_sz < bb_size) {
			req = (cl_proto *) buf;
		}
		else {
			req = (cl_proto *) malloc(buf_sz);
			rmalloced = true;
		}

		if (req == NULL) {
			goto Done;
		}

		req->sz = sz;
		memcpy((uint8_t*)req + sizeof(cl_proto), names, sz);
	}
	else {
		req = (cl_proto *) buf;
		req->sz = 0;
		buf_sz = sizeof(cl_proto);
		names = "";
	}

	req->version = CL_PROTO_VERSION;
	req->type = CL_PROTO_TYPE_INFO;
	cl_proto_swap_to_be(req);

	if (timeout_ms) {
		io_rv = cf_socket_write_timeout(fd, (uint8_t *) req, buf_sz, 0, timeout_ms);
	}
	else {
		io_rv = cf_socket_write_forever(fd, (uint8_t *) req, buf_sz);
	}

	if (rmalloced) {
		free(req);
	}

	if (io_rv != 0) {
#ifdef DEBUG
		cf_debug("info returned error, rv %d errno %d bufsz %d", io_rv, errno, buf_sz);
#endif
		goto Done;
	}

	cl_proto *rsp = (cl_proto *)buf;

	if (timeout_ms) {
		io_rv = cf_socket_read_timeout(fd, buf, 8, 0, timeout_ms);
	}
	else {
		io_rv = cf_socket_read_forever(fd, buf, 8);
	}

	if (0 != io_rv) {
#ifdef DEBUG
		cf_debug("info socket read failed: rv %d errno %d", io_rv, errno);
#endif
		goto Done;
	}

	cl_proto_swap_from_be(rsp);

	if (rsp->sz) {
		size_t read_length = rsp->sz;
		bool limit_reached = false;

		if (max_response_length > 0 && rsp->sz > max_response_length) {
			// Response buffer is too big. Read a few bytes just to see what
			// the buffer contains.
			read_length = 100;
			limit_reached = true;
		}

		uint8_t *v_buf = malloc(read_length + 1);

		if (!v_buf) {
			cf_warn("Info request '%s' failed. Failed to malloc %zu bytes", names, read_length);
			goto Done;
		}

		if (timeout_ms) {
			io_rv = cf_socket_read_timeout(fd, v_buf, read_length, 0, timeout_ms);
		}
		else {
			io_rv = cf_socket_read_forever(fd, v_buf, read_length);
		}

		if (io_rv != 0) {
			free(v_buf);

			if (io_rv != ETIMEDOUT) {
				cf_warn("Info request '%s' failed. Failed to read %zu bytes. Return code %d", names, read_length, io_rv);
			}

			goto Done;
		}

		v_buf[read_length] = 0;

		if (limit_reached) {
			// Response buffer is too big. Log a warning and reject.
			cf_warn("Info request '%s' failed. Response buffer length %lu is excessive. Buffer: %s", names, rsp->sz, v_buf);
			goto Done;
		}

		*values = (char *) v_buf;
	}
	else {
		cf_warn("rsp size is 0");
		*values = 0;
	}

	rv = 0;

Done:
	return(rv);
}
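/*
 * A minimal caller sketch for the fd-based variant (assumed, not from the
 * original source). Unlike the sockaddr_in version above, this variant does
 * not create or close the socket, so one connection can be reused across
 * several info requests. cf_socket_create_and_connect_nb() is reused from
 * the earlier function; poll_node_statistics() and the loop bound are
 * illustrative.
 */
static int
poll_node_statistics(struct sockaddr_in *sa_in, int n_polls)
{
	int fd = cf_socket_create_and_connect_nb(sa_in);

	if (fd == -1) {
		return -1;
	}

	for (int i = 0; i < n_polls; i++) {
		char names[] = "statistics\n";
		char *values = NULL;

		if (citrusleaf_info_host_limit(fd, names, &values, 1000, true,
				10 * 1024 * 1024, true) == 0 && values) {
			cf_info("statistics: %s", values);
			free(values);
		}
	}

	shutdown(fd, SHUT_RDWR);
	close(fd);
	return 0;
}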
static uint8_t*
as_node_get_info(as_node* node, const char* names, size_t names_len,
		int timeout_ms, uint8_t* stack_buf)
{
	int fd = node->info_fd;

	// Prepare the write request buffer.
	size_t write_size = sizeof(cl_proto) + names_len;
	cl_proto* proto = (cl_proto*)stack_buf;

	proto->sz = names_len;
	proto->version = CL_PROTO_VERSION;
	proto->type = CL_PROTO_TYPE_INFO;
	cl_proto_swap_to_be(proto);

	memcpy((void*)(stack_buf + sizeof(cl_proto)), (const void*)names, names_len);

	// Write the request. Note that timeout_ms is never 0.
	if (cf_socket_write_timeout(fd, stack_buf, write_size, 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket write", node->name);
		return 0;
	}

	// Reuse the buffer, read the response - the first 8 bytes contain the
	// body size.
	if (cf_socket_read_timeout(fd, stack_buf, sizeof(cl_proto), 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket read header", node->name);
		return 0;
	}

	proto = (cl_proto*)stack_buf;
	cl_proto_swap_from_be(proto);

	// Sanity check the body size.
	if (proto->sz == 0 || proto->sz > 512 * 1024) {
		cf_info("Node %s bad info response size %lu", node->name, proto->sz);
		return 0;
	}

	// Allocate a buffer if the response is bigger than the stack buffer -
	// caller must free it if this call succeeds. Note that proto is
	// overwritten if stack_buf is used, so we save the sz field here.
	size_t proto_sz = proto->sz;
	uint8_t* rbuf = proto_sz >= INFO_STACK_BUF_SIZE ?
			(uint8_t*)cf_malloc(proto_sz + 1) : stack_buf;

	if (! rbuf) {
		cf_error("Node %s failed allocation for info response", node->name);
		return 0;
	}

	// Read the response body.
	if (cf_socket_read_timeout(fd, rbuf, proto_sz, 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket read body", node->name);

		if (rbuf != stack_buf) {
			cf_free(rbuf);
		}

		return 0;
	}

	// Null-terminate the response body and return it.
	rbuf[proto_sz] = 0;
	return rbuf;
}
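/*
 * A minimal caller sketch (assumed, not from the original source): issue an
 * info request against a node and release the response correctly. Whether
 * the result must be freed depends on whether it landed in the stack buffer,
 * which is why the caller keeps stack_buf in scope for the comparison.
 * example_log_node_build() is a hypothetical name.
 */
static void
example_log_node_build(as_node* node)
{
	uint8_t stack_buf[INFO_STACK_BUF_SIZE];
	const char* names = "build\n";

	uint8_t* response = as_node_get_info(node, names, strlen(names), 1000, stack_buf);

	if (! response) {
		return;
	}

	// The response is null-terminated by as_node_get_info().
	cf_info("Node %s info: %s", node->name, (char*)response);

	// Free only if as_node_get_info() spilled to the heap.
	if (response != stack_buf) {
		cf_free(response);
	}
}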