// Authenticate connection and request the info of a particular sockaddr_in. // Return 0 on success. int citrusleaf_info_host_auth(as_cluster* cluster, struct sockaddr_in *sa_in, char *names, char **values, int timeout_ms, bool send_asis, bool check_bounds) { int fd = cf_socket_create_and_connect_nb(sa_in); if (fd == -1) { return CITRUSLEAF_FAIL_UNAVAILABLE; } if (cluster->user) { int status = as_authenticate(fd, cluster->user, cluster->password, timeout_ms); if (status) { cf_error("Authentication failed for %s", cluster->user); cf_close(fd); return status; } } int rv = citrusleaf_info_host_limit(fd, names, values, timeout_ms, send_asis, 0, check_bounds); shutdown(fd, SHUT_RDWR); cf_close(fd); if (rv == 0 && strncmp(*values, "security error", 14) == 0) { cf_error("%s", *values); free(*values); return CITRUSLEAF_NOT_AUTHENTICATED; } return rv; }
int main () { while (FCGI_Accept() >= 0) { printf("Content-Type: application/json\n\n"); char *envinfo = NULL; envinfo = getenv("QUERY_STRING"); int no_qs = 0; if ((envinfo == NULL) || strlen(envinfo) == 0) { cf_error(1); no_qs++; } char *delims = "=&"; char *qval = NULL; int qstr_order = 0; qs_t qs; qval = strtok(envinfo, delims); while (qval != NULL) { qstr_order++; switch (qstr_order) { case 2: qs.lon = atof(qval); break; case 4: qs.lat = atof(qval); break; case 6: qs.dist = atof(qval); break; case 8: qs.px = atoi(qval); break; default: ; } qval = strtok(NULL, delims); } if (qstr_order != 8) { if (!no_qs) { cf_error(1); } else { ; } } else { char *query1 = base_geo_query; /* any number of additional queries */ canvas_fish(query1, qs.lon, qs.lat, qs.dist, qs.px); } } return 0; }
/** * config_parse - parse a configuration * @c: configuration * * config_parse() reads input by calling a hook function pointed to * by @cf_read_hook and parses it according to the configuration * grammar. It also calls all the preconfig and postconfig hooks * before, resp. after parsing. * * Result: 1 if the config has been parsed successfully, 0 if any * error has occurred (such as anybody calling cf_error()) and * the @err_msg field has been set to the error message. */ int config_parse(struct config *c) { DBG("Parsing configuration file `%s'\n", c->file_name); new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) return 0; cf_lex_init(0, c); sysdep_preconfig(c); protos_preconfig(c); rt_preconfig(c); cf_parse(); protos_postconfig(c); if (EMPTY_LIST(c->protos)) cf_error("No protocol is specified in the config file"); #ifdef IPV6 if (!c->router_id) cf_error("Router ID must be configured manually on IPv6 routers"); #endif return 1; }
/*
 * Scan every node of the cluster, one node after another.
 *
 * The node names are fetched with cl_cluster_get_node_names() and
 * citrusleaf_scan_node() is run for each of them in turn. The per-node
 * result code is recorded together with the node name in a
 * cl_node_response appended to the returned vector.
 *
 * Returns NULL when the cluster reports no nodes or the response vector
 * cannot be created. When scan_param requests concurrent node scanning
 * (not supported) an error is logged and the returned vector is empty.
 * NOTE(review): the returned vector is presumably owned by the caller -
 * confirm against the public API contract.
 */
cf_vector *
citrusleaf_scan_all_nodes (cl_cluster *asc, char *ns, char *set, cl_bin *bins, int n_bins, bool nobindata, uint8_t scan_pct,
		citrusleaf_get_many_cb cb, void *udata, cl_scan_parameters *scan_param)
{
	char *node_names = NULL;
	int	n_nodes = 0;
	cl_cluster_get_node_names(asc, &n_nodes, &node_names);

	if (n_nodes == 0) {
		cf_error("citrusleaf scan all nodes: don't have any nodes?");
		// Release whatever the lookup may have handed back; free(NULL) is a no-op.
		free(node_names);
		return NULL;
	}

	cf_vector *rsp_v = cf_vector_create(sizeof(cl_node_response), n_nodes, 0);
	if (rsp_v == NULL) {
		cf_error("citrusleaf scan all nodes: cannot allocate for response array for %d nodes", n_nodes);
		free(node_names);
		return NULL;
	}

	if (scan_param && scan_param->concurrent_nodes) {
		cf_error("citrusleaf scan all nodes: concurrent node scanning not yet supported");
	} else {
		// node_names is a packed array of fixed-width (NODE_NAME_SIZE) entries.
		char *nptr = node_names;
		for (int i=0;i< n_nodes; i++) {
			cl_rv r = citrusleaf_scan_node (asc, nptr, ns, set, bins, n_bins, nobindata, scan_pct,
				cb, udata, scan_param);
			cl_node_response resp_s;
			resp_s.node_response = r;
			memcpy(resp_s.node_name,nptr,NODE_NAME_SIZE);
			cf_vector_append(rsp_v, (void *)&resp_s);
			nptr+=NODE_NAME_SIZE;
		}
	}
	free(node_names);
	return rsp_v;
}
/*
 * Scan a namespace/set, invoking cb for the returned records.
 *
 * The request is a plain read; when nobindata is set the NOBINDATA flag is
 * added as well. The scan is delegated to do_scan_monte() with no node
 * name, a scan percentage of 100 and no scan parameters.
 * A non-zero n_bins is only reported as an error; the scan still proceeds.
 * get_key is accepted but not used here.
 */
extern cl_rv
citrusleaf_scan(cl_cluster *asc, char *ns, char *set, cl_bin *bins, int n_bins, bool get_key, citrusleaf_get_many_cb cb, void *udata, bool nobindata)
{
	if (n_bins != 0) {
		cf_error("citrusleaf get many: does not yet support bin-specific requests");
	}

	uint info = (nobindata == true)
			? (CL_MSG_INFO1_READ | CL_MSG_INFO1_NOBINDATA)
			: CL_MSG_INFO1_READ;

	return do_scan_monte(asc, NULL, info, 0, ns, set, bins, n_bins, 100, cb, udata, NULL);
}
/*
 * Obtain a usable connection to a node.
 *
 * Pooled descriptors are popped from the node's connection queue without
 * waiting and checked with is_connected(); the first one that is still
 * connected is returned through *fd. Stale descriptors are discarded on
 * the way. Once the queue is empty a new connection is created instead.
 *
 * Returns 0 when a pooled connection is reused, the result of
 * as_node_create_connection() when a new one is made, and
 * CITRUSLEAF_FAIL_CLIENT (with *fd set to -1) if the queue misbehaves.
 */
int
as_node_get_connection(as_node* node, int* fd)
{
	cf_queue* pool_q = node->conn_q;

	for (;;) {
		int pop_rv = cf_queue_pop(pool_q, fd, CF_QUEUE_NOWAIT);

		if (pop_rv != CF_QUEUE_OK) {
			if (pop_rv == CF_QUEUE_EMPTY) {
				// We exhausted the queue. Try creating a fresh socket.
				return as_node_create_connection(node, fd);
			}

			cf_error("Bad return value from cf_queue_pop");
			*fd = -1;
			return CITRUSLEAF_FAIL_CLIENT;
		}

		int state = is_connected(*fd);

		if (state == CONNECTED) {
			// It's still good.
			return 0;
		}

		if (state == CONNECTED_BADFD) {
			// Local problem, don't try closing.
			cf_warn("Found bad file descriptor in queue: fd %d", *fd);
			continue;
		}

		// CONNECTED_NOT (the remote end closed it), CONNECTED_ERROR (some
		// other problem) or anything unexpected: drop it and try the next one.
		cf_close(*fd);
	}
}
/*
 * Scan a single, named node of the cluster.
 *
 * Builds the read flags (adding NOBINDATA when nobindata is set), fills in
 * default scan parameters when the caller passed none, and hands the work
 * to do_scan_monte(). A non-zero n_bins is only reported as an error; the
 * scan still proceeds.
 */
extern cl_rv
citrusleaf_scan_node (cl_cluster *asc, char *node_name, char *ns, char *set, cl_bin *bins, int n_bins, bool nobindata, uint8_t scan_pct,
		citrusleaf_get_many_cb cb, void *udata, cl_scan_parameters *scan_param)
{
	cl_scan_parameters fallback_param;

	if (n_bins != 0) {
		cf_error("citrusleaf get many: does not yet support bin-specific requests");
	}

	uint info = (nobindata == true)
			? (CL_MSG_INFO1_READ | CL_MSG_INFO1_NOBINDATA)
			: CL_MSG_INFO1_READ;

	// No parameters supplied: run with the library defaults.
	if (scan_param == NULL) {
		cl_scan_parameters_set_default(&fallback_param);
		scan_param = &fallback_param;
	}

	return do_scan_monte(asc, node_name, info, 0, ns, set, bins, n_bins, scan_pct, cb, udata, scan_param);
}
static int do_scan_monte(cl_cluster *asc, char *node_name, uint operation_info, uint operation_info2, const char *ns, const char *set, cl_bin *bins, int n_bins, uint8_t scan_pct, citrusleaf_get_many_cb cb, void *udata, cl_scan_parameters *scan_opt) { int rv = -1; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = 0; size_t rd_buf_sz = 0; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); cl_scan_param_field scan_param_field; if (scan_opt) { scan_param_field.scan_pct = scan_pct>100? 100:scan_pct; scan_param_field.byte1 = (scan_opt->priority<<4) | (scan_opt->fail_on_cluster_change<<3); } // we have a single namespace and/or set to get if (cl_compile(operation_info, operation_info2, 0, ns, set, 0, 0, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, 0, NULL, 0, scan_opt ? &scan_param_field : NULL)) { return(rv); } #ifdef DEBUG_VERBOSE dump_buf("sending request to cluster:", wr_buf, wr_buf_sz); #endif int fd; cl_cluster_node *node = 0; // Get an FD from a cluster if (node_name) { node = cl_cluster_node_get_byname(asc,node_name); // grab a reservation if (node) cl_cluster_node_reserve(node, "T+"); } else { node = cl_cluster_node_get_random(asc); } if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, failing"); #endif return(-1); } fd = cl_cluster_node_fd_get(node, false, asc->nbconnect); if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no file descriptors, retrying transaction", node->name); #endif return(-1); } // send it to the cluster - non blocking socket, but we're blocking if (0 != cf_socket_write_forever(fd, wr_buf, wr_buf_sz)) { #ifdef DEBUG cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d", rv, fd, errno); #endif close(fd); return(-1); } cl_proto proto; bool done = false; do { // multiple CL proto per response // Now turn around and read a fine cl_pro - that's the first 8 bytes that has types and lengths if ((rv = cf_socket_read_forever(fd, 
(uint8_t *) &proto, sizeof(cl_proto) ) ) ) { cf_error("network error: errno %d fd %d",rv, fd); close(fd); return(-1); } #ifdef DEBUG_VERBOSE dump_buf("read proto header from cluster", (uint8_t *) &proto, sizeof(cl_proto)); #endif cl_proto_swap(&proto); if (proto.version != CL_PROTO_VERSION) { cf_error("network error: received protocol message of wrong version %d", proto.version); close(fd); return(-1); } if (proto.type != CL_PROTO_TYPE_CL_MSG) { cf_error("network error: received incorrect message version %d", proto.type); close(fd); return(-1); } // second read for the remainder of the message - expect this to cover lots of data, many lines // // if there's no error rd_buf_sz = proto.sz; if (rd_buf_sz > 0) { // cf_debug("message read: size %u",(uint)proto.sz); if (rd_buf_sz > sizeof(rd_stack_buf)) rd_buf = malloc(rd_buf_sz); else rd_buf = rd_stack_buf; if (rd_buf == NULL) { close(fd); return (-1); } if ((rv = cf_socket_read_forever(fd, rd_buf, rd_buf_sz))) { cf_error("network error: errno %d fd %d", rv, fd); if (rd_buf != rd_stack_buf) { free(rd_buf); } close(fd); return(-1); } // this one's a little much: printing the entire body before printing the other bits #ifdef DEBUG_VERBOSE dump_buf("read msg body header (multiple msgs)", rd_buf, rd_buf_sz); #endif } // process all the cl_msg in this proto uint8_t *buf = rd_buf; uint pos = 0; cl_bin stack_bins[STACK_BINS]; cl_bin *bins_local; while (pos < rd_buf_sz) { #ifdef DEBUG_VERBOSE dump_buf("individual message header", buf, sizeof(cl_msg)); #endif uint8_t *buf_start = buf; cl_msg *msg = (cl_msg *) buf; cl_msg_swap_header(msg); buf += sizeof(cl_msg); if (msg->header_sz != sizeof(cl_msg)) { cf_error("received cl msg of unexpected size: expecting %zd found %d, internal error", sizeof(cl_msg),msg->header_sz); close(fd); return(-1); } // parse through the fields cf_digest *keyd = 0; char ns_ret[33] = {0}; char *set_ret = NULL; cl_msg_field *mf = (cl_msg_field *)buf; for (int i=0;i<msg->n_fields;i++) { 
cl_msg_swap_field(mf); if (mf->type == CL_MSG_FIELD_TYPE_KEY) { cf_error("read: found a key - unexpected"); } else if (mf->type == CL_MSG_FIELD_TYPE_DIGEST_RIPE) { keyd = (cf_digest *) mf->data; } else if (mf->type == CL_MSG_FIELD_TYPE_NAMESPACE) { memcpy(ns_ret, mf->data, cl_msg_field_get_value_sz(mf)); ns_ret[ cl_msg_field_get_value_sz(mf) ] = 0; } else if (mf->type == CL_MSG_FIELD_TYPE_SET) { uint32_t set_name_len = cl_msg_field_get_value_sz(mf); set_ret = (char *)malloc(set_name_len + 1); memcpy(set_ret, mf->data, set_name_len); set_ret[ set_name_len ] = '\0'; } mf = cl_msg_field_get_next(mf); } buf = (uint8_t *) mf; #ifdef DEBUG_VERBOSE cf_debug("message header fields: nfields %u nops %u", msg->n_fields, msg->n_ops); #endif if (msg->n_ops > STACK_BINS) { bins_local = malloc(sizeof(cl_bin) * msg->n_ops); } else { bins_local = stack_bins; } if (bins_local == NULL) { if (set_ret) { free(set_ret); } close(fd); return (-1); } // parse through the bins/ops cl_msg_op *op = (cl_msg_op *)buf; for (int i=0;i<msg->n_ops;i++) { cl_msg_swap_op(op); #ifdef DEBUG_VERBOSE cf_debug("op receive: %p size %d op %d ptype %d pversion %d namesz %d", op,op->op_sz, op->op, op->particle_type, op->version, op->name_sz); #endif #ifdef DEBUG_VERBOSE dump_buf("individual op (host order)", (uint8_t *) op, op->op_sz + sizeof(uint32_t)); #endif cl_set_value_particular(op, &bins_local[i]); op = cl_msg_op_get_next(op); } buf = (uint8_t *) op; if (msg->result_code != CL_RESULT_OK) { // Special case - if we scan a set name that doesn't exist on a // node, it will return "not found" - we unify this with the // case where OK is returned and no callbacks were made. [AKG] if (msg->result_code == CL_RESULT_NOTFOUND) { msg->result_code = CL_RESULT_OK; } rv = (int)msg->result_code; done = true; } else if (msg->info3 & CL_MSG_INFO3_LAST) { #ifdef DEBUG cf_debug("received final message"); #endif done = true; } else if ((msg->n_ops) || (operation_info & CL_MSG_INFO1_NOBINDATA)) { // got one good value? 
call it a success! (*cb) ( ns_ret, keyd, set_ret, msg->generation, msg->record_ttl, bins_local, msg->n_ops, false /*islast*/, udata); rv = 0; } // else // cf_debug("received message with no bins, signal of an error"); if (bins_local != stack_bins) { free(bins_local); bins_local = 0; } if (set_ret) { free(set_ret); set_ret = NULL; } // don't have to free object internals. They point into the read buffer, where // a pointer is required pos += buf - buf_start; } if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); rd_buf = 0; } } while ( done == false ); if (wr_buf != wr_stack_buf) { free(wr_buf); wr_buf = 0; } cf_atomic32_set(&node->intervals_unreachable, 0); cl_cluster_node_fd_put(node, fd, false); cl_cluster_node_put(node); node = 0; #ifdef DEBUG_VERBOSE cf_debug("exited loop: rv %d", rv ); #endif return(rv); }
/**
 * config_alloc - allocate a new configuration
 * @name: name of the config
 *
 * This function creates new &config structure, attaches a resource
 * pool and a linear memory pool to it and makes it available for
 * further use. Returns a pointer to the structure.
 *
 * The structure itself and the private copy of @name are both carved
 * out of the new linear pool.
 */
struct config *
config_alloc(const byte *name)
{
  pool *p = rp_new(&root_pool, "Config");
  linpool *l = lp_new(p, 4080);
  struct config *c = lp_allocz(l, sizeof(struct config));

  /* Duplication of name string in local linear pool */
  uint nlen = strlen(name) + 1;
  char *ndup = lp_allocu(l, nlen);
  memcpy(ndup, name, nlen);

  c->mrtdump_file = -1; /* Hack, this should be sysdep-specific */
  c->pool = p;
  c->mem = l;
  c->file_name = ndup;
  c->load_time = now;
  /* Default time formats; route and protocol share one, base and log the other */
  c->tf_route = c->tf_proto = (struct timeformat){"%T", "%F", 20*3600};
  c->tf_base = c->tf_log = (struct timeformat){"%F %T", NULL, 0};

  c->gr_wait = DEFAULT_GR_WAIT;

  return c;
}

/**
 * config_parse - parse a configuration
 * @c: configuration
 *
 * config_parse() reads input by calling a hook function pointed to
 * by @cf_read_hook and parses it according to the configuration
 * grammar. It also calls all the preconfig and postconfig hooks
 * before, resp. after parsing.
 *
 * The global parsing context (new_config, cfg_mem) points at @c only
 * for the duration of the call; it is reset to NULL on both the
 * success and the error path.
 *
 * Result: 1 if the config has been parsed successfully, 0 if any
 * error has occurred (such as anybody calling cf_error()) and
 * the @err_msg field has been set to the error message.
 */
int
config_parse(struct config *c)
{
  int done = 0;
  DBG("Parsing configuration file `%s'\n", c->file_name);
  new_config = c;
  cfg_mem = c->mem;
  /* Error landing point: a non-zero return skips to cleanup with done
   * still 0 (presumably reached via longjmp() from cf_error() - confirm) */
  if (setjmp(conf_jmpbuf))
    goto cleanup;

  cf_lex_init(0, c);
  sysdep_preconfig(c);
  protos_preconfig(c);
  rt_preconfig(c);
  roa_preconfig(c);
  cf_parse();
  protos_postconfig(c);
  /* Post-parse sanity checks */
  if (EMPTY_LIST(c->protos))
    cf_error("No protocol is specified in the config file");
#ifdef IPV6
  if (!c->router_id)
    cf_error("Router ID must be configured manually on IPv6 routers");
#endif
  done = 1;

cleanup:
  /* Detach the global parsing context regardless of the outcome */
  new_config = NULL;
  cfg_mem = NULL;
  return done;
}

/**
 * cli_parse - parse a CLI command
 * @c: temporary config structure
 *
 * cli_parse() is similar to config_parse(), but instead of a configuration,
 * it parses a CLI command. See the CLI module for more information.
 *
 * While parsing, @c->fallback points at the global config; it is
 * cleared again, together with the global parsing context, before
 * returning. Result: 1 on success, 0 if the error path was taken.
 */
int
cli_parse(struct config *c)
{
  int done = 0;
  c->fallback = config;
  new_config = c;
  cfg_mem = c->mem;
  /* Same error landing point convention as in config_parse() */
  if (setjmp(conf_jmpbuf))
    goto cleanup;

  cf_lex_init(1, c);
  cf_parse();
  done = 1;

cleanup:
  c->fallback = NULL;
  new_config = NULL;
  cfg_mem = NULL;
  return done;
}
//Same as do_the_full_monte, but only till the command is sent to the node. //Most of the code is duplicated. Bad. int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key, const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata) { cl_async_work *workitem = NULL; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); int progress_timeout_ms; uint64_t deadline_ms; uint64_t starttime, endtime; bool network_error; int fd = -1; int rv = CITRUSLEAF_FAIL_CLIENT; //Assume that this is a failure; // as_msg msg; cf_digest d_ret; cl_cluster_node *node = 0; #if ONEASYNCFD if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) { //cf_error("Async hashtab is full. Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #else //If the async buffer is at the max limit, do not entertain more requests. if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) { //cf_error("Async buffer is full. 
Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #endif //Allocate memory for work item that will be added to the async work list if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) { cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER); } else { workitem = malloc(sizeof(cl_async_work)); if (workitem == NULL) { return CITRUSLEAF_FAIL_CLIENT; } } //Compile the write buffer to be sent to the cluster if (n_values && ( values || operations) ){ cl_compile(info1, info2, 0, ns, set, key, digest, values?*values:NULL, operator, operations?*operations:NULL, *n_values , &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); }else{ cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); } deadline_ms = 0; progress_timeout_ms = 0; if (cl_w_p && cl_w_p->timeout_ms) { deadline_ms = cf_getms() + cl_w_p->timeout_ms; // policy: if asking for a long timeout, give enough time to try twice if (cl_w_p->timeout_ms > 700) { progress_timeout_ms = cl_w_p->timeout_ms / 2; } else { progress_timeout_ms = cl_w_p->timeout_ms; } } else { progress_timeout_ms = g_async_nw_progress_timeout; } //Initialize the async work unit workitem->trid = *trid; workitem->deadline = deadline_ms; workitem->starttime = cf_getms(); workitem->udata = udata; as_msg *msgp; // Hate special cases, but we have to clear the verify bit on delete verify if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) { msgp = (as_msg *)wr_buf; msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY; } if (asc->compression_stat.compression_threshold > 0 && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) { /* Compression is enabled. * Packet size is above threshold. * Compress the data */ uint8_t *compressed_buf = NULL; size_t compressed_buf_sz = 0; // Contstruct packet for compressed data. 
cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz); if (compressed_buf) { // If original packet size is > 16k, cl_compile had allocated memory for it. // Free that memory. // cf_packet_compression will allocate memory for compressed packet if (wr_buf != wr_stack_buf) { free(wr_buf); } // Update stats. citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz); wr_buf = compressed_buf; wr_buf_sz = compressed_buf_sz; //memcpy (wr_buf, compressed_buf, compressed_buf_sz); //wr_buf_sz = compressed_buf_sz; //free (compressed_buf); } //else compression failed, continue with uncompressed packet else { // Set compression stat citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz); } } int try = 0; // retry request based on the write_policy do { network_error = false; try++; #ifdef DEBUG if (try > 1) { cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self()); } #endif // Get an FD from a cluster. First get the probable node for the given digest. node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? 
true : false); if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, retrying"); #endif usleep(10000); //Sleep for 10ms goto Retry; } // Now get the dedicated async FD of this node starttime = cf_getms(); fd = cl_cluster_node_fd_get(node, true); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime)); } if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() ); #endif usleep(1000); goto Retry; } // Send the command to the node starttime = cf_getms(); rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime)); } if (rv != 0) { cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)", rv,fd,errno,(uint64_t)pthread_self()); if (rv != ETIMEDOUT) network_error = true; goto Retry; } goto Ok; Retry: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. As this is a * network error, its is better to wait for some significant * time before retrying. 
*/ sleep(1); //Sleep for 1sec #if ONEASYNCFD //Do not close the FD #else cf_error("async sender: Closing the fd %d because of network error", fd); cf_close(fd); fd = -1; #endif } if (fd != -1) { cf_error("async sender: Closing the fd %d because of retry", fd); cf_close(fd); fd = -1; } if (node) { cl_cluster_node_put(node); node = 0; } if (deadline_ms && (deadline_ms < cf_getms() ) ) { #ifdef DEBUG cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms()); #endif rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } } while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) ); Error: #ifdef DEBUG cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d", (int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), (int)(deadline_ms - cf_getms() ), rv ); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } #if ONEASYNCFD //Do not close the FD #else //If it is a network error, the fd would be closed and set to -1. //So, we reach this place with a valid FD in case of timeout. if (fd != -1) { cf_error("async sender: Closing the fd %d because of timeout", fd); cf_close(fd); } #endif return(rv); Ok: /* * We cannot release the node here as the asyc FD associated * with this node may get closed. We should do it only when * we got back the ack for the async command that we just did. */ //As we sent the command successfully, add it to the async work list workitem->node = node; workitem->fd = fd; //We are storing only the pointer to the workitem #if ONEASYNCFD if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) { //This should always succeed. 
cf_error("Unable to add unique entry into the hash table"); } cf_queue_push(node->asyncwork_q, &workitem); //Also put in the node's q #else cf_queue_push(g_cl_async_q, &workitem); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } rv = CITRUSLEAF_OK; return rv; } int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads) { // int num_threads; if (0 == cf_atomic32_get(g_async_initialized)) { cf_error("Async client not initialized cannot reinit"); return -1; } if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS; } // If number of thread is increased create more threads if (num_receiver_threads > g_async_num_threads) { unsigned int i; for (i = g_async_num_threads; i < num_receiver_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } } else { // else just reset the number the async threads will kill themselves cf_atomic32_set(&g_async_num_threads, num_receiver_threads); } cf_atomic32_set(&g_async_q_szlimit , size_limit); return ( 0 ); } int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn) { int i, num_threads; //Make sure that we do the initialization only once if (1 == cf_atomic32_incr(&g_async_initialized)) { // Start the receiver threads num_threads = num_receiver_threads; if (num_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_threads = MAX_ASYNC_RECEIVER_THREADS; } #if ONEASYNCFD g_async_h_szlimit = size_limit * 3; //Max number of elements in the hash table g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) { cf_error("Failed to initialize the async work hastable"); 
cf_atomic32_decr(&g_async_initialized); return -1; } #else // create work queue g_async_q_szlimit = size_limit; if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to initialize the async work queue"); cf_atomic32_decr(&g_async_initialized); return -1; } for (i=0; i<num_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } g_async_num_threads = num_threads; #endif if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to create memory pool for workitems"); return -1; } g_fail_cb_fn = fail_cb_fn; g_success_cb_fn = success_cb_fn; // Initialize the stats g_async_stats.retries = 0; g_async_stats.dropouts = 0; } return(0); }
static void* async_receiver_fn(void *thdata) { int rv = -1; bool network_error = false; cl_async_work *workitem = NULL; // cl_async_work *tmpworkitem = NULL; as_msg msg; cf_queue *q_to_use = NULL; cl_cluster_node *thisnode = NULL; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = rd_stack_buf; size_t rd_buf_sz = 0; uint64_t acktrid; // uint64_t starttime, endtime; int progress_timeout_ms; unsigned int thread_id = cf_atomic32_incr(&g_thread_count); if (thdata == NULL) { q_to_use = g_cl_async_q; } else { thisnode = (cl_cluster_node *)thdata; q_to_use = thisnode->asyncwork_q; } //Infinite loop which keeps picking work items from the list and try to find the end result while(1) { network_error = false; #if ONEASYNCFD if(thisnode->dunned == true) { do { rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT); if (rv == CF_QUEUE_OK) { cl_cluster_node_put(thisnode); free(workitem); } } while (rv == CF_QUEUE_OK); //We want to delete all the workitems of this node shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode); break; } #endif //This call will block if there is no element in the queue cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER); //TODO: What if the node gets dunned while this pop call is blocked ? 
#if ONEASYNCFD //cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d", // cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab)); #endif // If we have no progress in 50ms, we should move to the next workitem // and revisit this workitem at a later stage progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT; // Read into this fine cl_msg, which is the short header rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms); if (rv) { #if DEBUG cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd); #endif if (rv != ETIMEDOUT) { cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd); network_error = true; goto Error; } else { goto Retry; } } #ifdef DEBUG_VERBOSE dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg)); #endif cl_proto_swap(&msg.proto); cl_msg_swap_header(&msg.m); // second read for the remainder of the message rd_buf_sz = msg.proto.sz - msg.m.header_sz; if (rd_buf_sz > 0) { if (rd_buf_sz > sizeof(rd_stack_buf)) { rd_buf = malloc(rd_buf_sz); if (!rd_buf) { cf_error("malloc fail: trying %zu",rd_buf_sz); rv = -1; goto Error; } } rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms); if (rv) { //We already read some part of the message before but failed to read the //remaining data for whatever reason (network error or timeout). We cannot //reread as we already read partial data. Declare this as error. 
cf_error("Timeout after reading the header but before reading the body"); goto Error; } #ifdef DEBUG_VERBOSE dump_buf("read body from cluster", rd_buf, rd_buf_sz); #endif } rv = CITRUSLEAF_OK; goto Ok; Retry: //We are trying to postpone the reading if (workitem->deadline && workitem->deadline < cf_getms()) { cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64, workitem->deadline, cf_getms()); //cf_error("async receiver: Workitem missed the final deadline"); rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } else { //We have time. Push the element back to the queue to be considered later cf_queue_push(q_to_use, &workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } cf_atomic_int_incr(&g_async_stats.retries); continue; Error: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. */ } #if ONEASYNCFD //Do not close FD #else //We do not know the state of FD. It may have pending data to be read. //We cannot reuse the FD. So, close it to be on safe side. cf_error("async receiver: Closing the fd %d because of error", workitem->fd); cf_close(workitem->fd); workitem->fd = -1; #endif cf_atomic_int_incr(&g_async_stats.dropouts); //Continue down with what we do during an Ok //Inform the caller that there is no response from the server for this workitem. //No response does not mean that the work is not done. The work might be //successfully completed on the server side, we just didnt get response for it. if (g_fail_cb_fn) { g_fail_cb_fn(workitem->udata, rv, workitem->starttime); } Ok: //rd_buf may not be there during an error condition. if (rd_buf && (rv == CITRUSLEAF_OK)) { //As of now, async functionality is there only for put call. //In put call, we do not get anything back other than the trid field. 
//So, just pass variable to get back the trid and ignore others. if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) { rv = CITRUSLEAF_FAIL_UNKNOWN; } else { rv = msg.m.result_code; if (workitem->trid != acktrid) { #if ONEASYNCFD //It is likely that we may get response for a different trid. //Just delete the correct one from the queue //put back the current workitem back in the queue. shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem); cf_queue_delete(q_to_use, &tmpworkitem, true); cf_queue_push(q_to_use, &workitem); //From now on workitem will be the one for which we got ack workitem = tmpworkitem; #endif #ifdef DEBUG cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d", workitem->trid, acktrid, workitem->fd); #endif } } if (g_success_cb_fn) { g_success_cb_fn(workitem->udata, rv, workitem->starttime); } } //Remember to put back the FD into the pool, if it is re-usable. if (workitem->fd != -1) { cl_cluster_node_fd_put(workitem->node, workitem->fd, true); } //Also decrement the reference count for this node cl_cluster_node_put(workitem->node); #if ONEASYNCFD //Delete the item from the global hashtable if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) { #if DEBUG cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid); #endif } #endif //Push it back into the free pool. If the attempt fails, free it. if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) { free(workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } // Kick this thread out if its ID is greater than total if (thread_id > cf_atomic32_get(g_async_num_threads)) { cf_atomic32_decr(&g_thread_count); return NULL; } }//The infnite loop return NULL; }
/*
 * cl_lookup
 * Resolve a hostname to its IPv4 address(es).
 *
 * asc            cluster whose host_addr_map_v may supply an alternate name
 *                to resolve in place of hostname; may be NULL.
 * hostname       name to resolve.
 * port           port in host byte order; stored (via htons) in every
 *                sockaddr_in produced.
 * sockaddr_in_v  vector that receives one struct sockaddr_in per resolved
 *                address through cf_vector_append_unique(). May be NULL when
 *                the caller only wants to know whether the lookup succeeds.
 *
 * Returns 0 on success, -1 on any failure. Any heap scratch buffer allocated
 * for the resolver is released on every exit path.
 */
int cl_lookup(cl_cluster *asc, char *hostname, short port, cf_vector *sockaddr_in_v)
{
	// Scratch space for the re-entrant resolver. It starts on the stack and
	// moves to the heap only if the resolver reports it is too small.
	uint8_t stack_hstbuf[1024];
	size_t hstbuflen = sizeof(stack_hstbuf);
	void *tmphstbuf = stack_hstbuf;
#ifndef OSX
	struct hostent hostbuf;
#endif
	struct hostent *hp = NULL;
	int rv = 0;
	int herr = 0;
	int retry = 0;
	int result = -1;

	// Find if there is an alternate address that should be used for this hostname.
	if (asc && (asc->host_addr_map_v.len > 0)) {
		int addrmapsz = asc->host_addr_map_v.len;
		for (int i = 0; i < addrmapsz; i++) {
			cl_addrmap *map = cf_vector_pointer_get(&asc->host_addr_map_v, i);
			if (map && strcmp(map->orig, hostname) == 0) {
				// Found a mapping for this address. Use the alternate one.
				cf_debug("Using %s instead of %s", map->alt, hostname);
				hostname = map->alt;
				break;
			}
		}
	}

	// Name used in log messages only.
	const char *shown = hostname ? hostname : "NONAME";

	for (;;) {
#ifdef OSX
		// On OSX, gethostbyname is thread safe and there is no '_r' version.
		hp = gethostbyname2(hostname, AF_INET);
		rv = 0;
		herr = (hp == NULL) ? h_errno : 0;
#else
		herr = 0; // never read an indeterminate value if the resolver leaves it untouched
		rv = gethostbyname2_r(hostname, AF_INET, &hostbuf, tmphstbuf, hstbuflen, &hp, &herr);
#endif

		// A too-small buffer is reported through rv together with a NULL
		// result, so it has to be handled before the NULL-result branch;
		// otherwise the buffer would never be grown.
		if (rv == ERANGE) {
			hstbuflen *= 2;
			void *grown = (tmphstbuf == stack_hstbuf)
					? malloc(hstbuflen)
					: realloc(tmphstbuf, hstbuflen);
			if (!grown) {
				// tmphstbuf still refers to the previous buffer, so the
				// common cleanup below releases it correctly.
				cf_error("malloc fail");
				goto cleanup;
			}
			tmphstbuf = grown;
			continue;
		}

		if (hp == NULL) {
			int transient = 0;

			switch (herr) {
			case HOST_NOT_FOUND:
				cf_error("gethostbyname says no host at %s", shown);
				break;
			case NO_ADDRESS:
				cf_error("gethostbyname of %s says invalid address (errno %d)", shown, herr);
				break;
			case NO_RECOVERY:
				cf_error("gethostbyname of %s says form error (errno %d)", shown, herr);
				break;
			case TRY_AGAIN:
				cf_error("gethostbyname of %s returned TRY_AGAIN, try again (rv=%d)", shown, rv);
				transient = 1;
				break;
			default:
				cf_error("gethostbyname of %s returned an unknown error (errno %d)", shown, herr);
				break;
			}

			if (!transient) {
				goto cleanup;
			}
		}
		else if (rv == 0) {
			break; // resolved
		}
		else if (rv == EAGAIN || herr == TRY_AGAIN) {
			cf_error("gethostbyname returned EAGAIN, try again");
		}
		else if (rv == ETIMEDOUT) {
			cf_error("gethostbyname for %s timed out", shown);
			goto cleanup;
		}
		else {
			cf_error("gethostbyname returned an unknown error %d %d (errno %d)", rv, herr, errno);
			goto cleanup;
		}

		// Only transient failures reach this point. Give up after the third
		// one instead of issuing one more lookup whose result is discarded.
		if (++retry > 2) {
			cf_error("gethostbyname of %s - maxmimum retries failed", shown);
			goto cleanup;
		}
	}

#ifdef DEBUG
	cf_debug("host lookup: %s canonical: %s addrtype %d length: %d",
		hostname, hp->h_name, hp->h_addrtype, hp->h_length);
	for (int i = 0; hp->h_aliases[i]; i++) {
		cf_debug("  alias %d: %s", i, hp->h_aliases[i]);
	}
	for (int i = 0; hp->h_addr_list[i]; i++) {
		// todo: print something about the actual address
		uint32_t raw_addr;
		memcpy(&raw_addr, hp->h_addr_list[i], sizeof(raw_addr));
		cf_debug("  address %d: %x", i, raw_addr);
	}
#endif

	if (hp->h_addrtype != AF_INET) {
		cf_error("unknown address type %d", hp->h_addrtype);
		goto cleanup;
	}

	// sockaddr_in_v is passed as NULL by callers that only need to check
	// whether the lookup succeeds; reaching here means it did.
	if (sockaddr_in_v != NULL) {
		for (int i = 0; hp->h_addr_list[i]; i++) {
			struct sockaddr_in addr;
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = hp->h_addrtype;
			// h_addr_list entries are char pointers with no alignment
			// guarantee, so copy the bytes rather than dereference a cast.
			memcpy(&addr.sin_addr.s_addr, hp->h_addr_list[i], sizeof(addr.sin_addr.s_addr));
			addr.sin_port = htons(port);
			cf_vector_append_unique(sockaddr_in_v, &addr);
		}
	}

	result = 0;

cleanup:
	if (tmphstbuf != stack_hstbuf) {
		free(tmphstbuf);
	}
	return result;
}
/* cf_queue_pop
 * Remove the element at the read position of the queue and copy it
 * (q->elementsz bytes) into buf.
 *
 * if ms_wait < 0, wait forever
 * if ms_wait = 0, don't wait at all
 * if ms_wait > 0, wait that number of ms
 *
 * Returns 0 when an element was copied into buf, CF_QUEUE_EMPTY when the
 * queue was still empty once the requested wait was over, and -1 on a bad
 * argument (NULL queue, or any wait other than CF_QUEUE_NOWAIT when built
 * with EXTERNAL_LOCKS).
 */
int cf_queue_pop(cf_queue *q, void *buf, int ms_wait)
{
	if (NULL == q) {
		cf_error("cf_queue_pop: try passing in a queue");
		return(-1);
	}

#ifdef EXTERNAL_LOCKS
	if (ms_wait != CF_QUEUE_NOWAIT) {
		// this implementation won't wait
		cf_error("cf_queue_pop: only nowait supported");
		return(-1);
	}
#endif // EXTERNAL_LOCKS

	QUEUE_LOCK(q);

	// Absolute deadline for the timed wait. Zero-initialised so that it is
	// never read indeterminate, whatever ms_wait is.
	struct timespec tp = { 0 };

	if (ms_wait > 0) {
#ifdef OSX
		// Uses the generic clock from cf_clock.h, which has slightly less
		// resolution than the pure linux version.
		// NOTE(review): this assumes cf_getms() is on the same clock that
		// pthread_cond_timedwait() measures against - confirm.
		uint64_t deadline_ms = cf_getms() + (uint64_t)ms_wait;
		tp.tv_sec = (time_t)(deadline_ms / 1000);
		// The sub-second part must come from the absolute deadline, not
		// from the relative wait.
		tp.tv_nsec = (long)((deadline_ms % 1000) * 1000000);
#else // linux
		clock_gettime(CLOCK_REALTIME, &tp);
		tp.tv_sec += ms_wait / 1000;
		tp.tv_nsec += (ms_wait % 1000) * 1000000;
		// tv_nsec must stay strictly below one second; exactly 1e9 is
		// already out of range and would make the timed wait fail.
		if (tp.tv_nsec >= 1000000000) {
			tp.tv_nsec -= 1000000000;
			tp.tv_sec++;
		}
#endif
	}

	/* Note that we have to use a while() loop: a condition wait may return
	 * although the queue is still empty (more than one waiter can be
	 * awakened, or another consumer may take the element first). */
	if (q->threadsafe) {
#ifdef EXTERNAL_LOCKS
		if (CF_Q_EMPTY(q)) {
			QUEUE_UNLOCK(q);
			return(CF_QUEUE_EMPTY);
		}
#else
		while (CF_Q_EMPTY(q)) {
			if (CF_QUEUE_NOWAIT == ms_wait) {
				pthread_mutex_unlock(&q->LOCK);
				return(CF_QUEUE_EMPTY);
			}
			else if (CF_QUEUE_FOREVER == ms_wait || ms_wait < 0) {
				// Every negative wait means "forever", as documented above.
				pthread_cond_wait(&q->CV, &q->LOCK);
			}
			else {
				// tp is absolute, so waiting again after an early wakeup
				// does not extend the total wait. Give up only when the
				// wait itself reports failure (timeout or error) and the
				// queue is still empty.
				int wait_rv = pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);
				if (wait_rv != 0 && CF_Q_EMPTY(q)) {
					pthread_mutex_unlock(&q->LOCK);
					return(CF_QUEUE_EMPTY);
				}
			}
		}
#endif // EXTERNAL_LOCKS
	}
	else if (CF_Q_EMPTY(q)) {
		// Balance the QUEUE_LOCK above, exactly as the normal exit does.
		QUEUE_UNLOCK(q);
		return(CF_QUEUE_EMPTY);
	}

	memcpy(buf, CF_Q_ELEM_PTR(q, q->read_offset), q->elementsz);
	q->read_offset++;

	// interesting idea - this probably keeps the cache fresher
	// because the queue is fully empty just make it all zero
	if (q->read_offset == q->write_offset) {
		q->read_offset = q->write_offset = 0;
	}

	QUEUE_UNLOCK(q);

	return(0);
}
/**
 * Send an info request on the node's info socket and read back the reply.
 *
 * @param node        node whose info_fd is used for the exchange.
 * @param names       request payload; names_len bytes are sent.
 * @param names_len   number of payload bytes.
 * @param timeout_ms  timeout handed to each socket write/read.
 * @param stack_buf   caller-provided scratch buffer. It is used to build the
 *                    request, to receive the response header, and - when the
 *                    response body is smaller than INFO_STACK_BUF_SIZE - to
 *                    hold the body as well.
 *
 * @return the null-terminated response body, or NULL on failure. The result
 *         is either stack_buf or a cf_malloc'd buffer; when it differs from
 *         stack_buf the caller must free it.
 */
static uint8_t*
as_node_get_info(as_node* node, const char* names, size_t names_len, int timeout_ms, uint8_t* stack_buf)
{
	const int sock = node->info_fd;
	cl_proto* hdr = (cl_proto*)stack_buf;
	const size_t request_len = sizeof(cl_proto) + names_len;

	// Build the request in place: header first, payload right behind it.
	hdr->sz = names_len;
	hdr->version = CL_PROTO_VERSION;
	hdr->type = CL_PROTO_TYPE_INFO;
	cl_proto_swap_to_be(hdr);
	memcpy(stack_buf + sizeof(cl_proto), names, names_len);

	// Send it. Note that timeout_ms is never 0.
	if (cf_socket_write_timeout(sock, stack_buf, request_len, 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket write", node->name);
		return NULL;
	}

	// The same buffer now receives the response header, which carries the
	// body size.
	if (cf_socket_read_timeout(sock, stack_buf, sizeof(cl_proto), 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket read header", node->name);
		return NULL;
	}

	cl_proto_swap_from_be(hdr);

	// Reject empty bodies and anything above 512 KiB.
	if (hdr->sz == 0 || hdr->sz > 512 * 1024) {
		cf_info("Node %s bad info response size %lu", node->name, hdr->sz);
		return NULL;
	}

	// Remember the size now: reading the body into stack_buf overwrites
	// the header it came from.
	const size_t body_len = hdr->sz;

	// Small bodies stay in the caller's buffer; larger ones get a heap
	// buffer with one extra byte for the terminator.
	uint8_t* body = stack_buf;

	if (body_len >= INFO_STACK_BUF_SIZE) {
		body = (uint8_t*)cf_malloc(body_len + 1);
	}

	if (! body) {
		cf_error("Node %s failed allocation for info response", node->name);
		return NULL;
	}

	if (cf_socket_read_timeout(sock, body, body_len, 0, timeout_ms) != 0) {
		cf_debug("Node %s failed info socket read body", node->name);

		if (body != stack_buf) {
			cf_free(body);
		}
		return NULL;
	}

	// Terminate so the caller can treat the body as a C string.
	body[body_len] = 0;
	return body;
}