static int do_scan_monte(cl_cluster *asc, char *node_name, uint operation_info, uint operation_info2, const char *ns, const char *set, cl_bin *bins, int n_bins, uint8_t scan_pct, citrusleaf_get_many_cb cb, void *udata, cl_scan_parameters *scan_opt) { int rv = -1; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = 0; size_t rd_buf_sz = 0; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); cl_scan_param_field scan_param_field; if (scan_opt) { scan_param_field.scan_pct = scan_pct>100? 100:scan_pct; scan_param_field.byte1 = (scan_opt->priority<<4) | (scan_opt->fail_on_cluster_change<<3); } // we have a single namespace and/or set to get if (cl_compile(operation_info, operation_info2, 0, ns, set, 0, 0, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, 0, NULL, 0, scan_opt ? &scan_param_field : NULL)) { return(rv); } #ifdef DEBUG_VERBOSE dump_buf("sending request to cluster:", wr_buf, wr_buf_sz); #endif int fd; cl_cluster_node *node = 0; // Get an FD from a cluster if (node_name) { node = cl_cluster_node_get_byname(asc,node_name); // grab a reservation if (node) cl_cluster_node_reserve(node, "T+"); } else { node = cl_cluster_node_get_random(asc); } if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, failing"); #endif return(-1); } fd = cl_cluster_node_fd_get(node, false, asc->nbconnect); if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no file descriptors, retrying transaction", node->name); #endif return(-1); } // send it to the cluster - non blocking socket, but we're blocking if (0 != cf_socket_write_forever(fd, wr_buf, wr_buf_sz)) { #ifdef DEBUG cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d", rv, fd, errno); #endif close(fd); return(-1); } cl_proto proto; bool done = false; do { // multiple CL proto per response // Now turn around and read a fine cl_pro - that's the first 8 bytes that has types and lengths if ((rv = cf_socket_read_forever(fd, 
(uint8_t *) &proto, sizeof(cl_proto) ) ) ) { cf_error("network error: errno %d fd %d",rv, fd); close(fd); return(-1); } #ifdef DEBUG_VERBOSE dump_buf("read proto header from cluster", (uint8_t *) &proto, sizeof(cl_proto)); #endif cl_proto_swap(&proto); if (proto.version != CL_PROTO_VERSION) { cf_error("network error: received protocol message of wrong version %d", proto.version); close(fd); return(-1); } if (proto.type != CL_PROTO_TYPE_CL_MSG) { cf_error("network error: received incorrect message version %d", proto.type); close(fd); return(-1); } // second read for the remainder of the message - expect this to cover lots of data, many lines // // if there's no error rd_buf_sz = proto.sz; if (rd_buf_sz > 0) { // cf_debug("message read: size %u",(uint)proto.sz); if (rd_buf_sz > sizeof(rd_stack_buf)) rd_buf = malloc(rd_buf_sz); else rd_buf = rd_stack_buf; if (rd_buf == NULL) { close(fd); return (-1); } if ((rv = cf_socket_read_forever(fd, rd_buf, rd_buf_sz))) { cf_error("network error: errno %d fd %d", rv, fd); if (rd_buf != rd_stack_buf) { free(rd_buf); } close(fd); return(-1); } // this one's a little much: printing the entire body before printing the other bits #ifdef DEBUG_VERBOSE dump_buf("read msg body header (multiple msgs)", rd_buf, rd_buf_sz); #endif } // process all the cl_msg in this proto uint8_t *buf = rd_buf; uint pos = 0; cl_bin stack_bins[STACK_BINS]; cl_bin *bins_local; while (pos < rd_buf_sz) { #ifdef DEBUG_VERBOSE dump_buf("individual message header", buf, sizeof(cl_msg)); #endif uint8_t *buf_start = buf; cl_msg *msg = (cl_msg *) buf; cl_msg_swap_header(msg); buf += sizeof(cl_msg); if (msg->header_sz != sizeof(cl_msg)) { cf_error("received cl msg of unexpected size: expecting %zd found %d, internal error", sizeof(cl_msg),msg->header_sz); close(fd); return(-1); } // parse through the fields cf_digest *keyd = 0; char ns_ret[33] = {0}; char *set_ret = NULL; cl_msg_field *mf = (cl_msg_field *)buf; for (int i=0;i<msg->n_fields;i++) { 
cl_msg_swap_field(mf); if (mf->type == CL_MSG_FIELD_TYPE_KEY) { cf_error("read: found a key - unexpected"); } else if (mf->type == CL_MSG_FIELD_TYPE_DIGEST_RIPE) { keyd = (cf_digest *) mf->data; } else if (mf->type == CL_MSG_FIELD_TYPE_NAMESPACE) { memcpy(ns_ret, mf->data, cl_msg_field_get_value_sz(mf)); ns_ret[ cl_msg_field_get_value_sz(mf) ] = 0; } else if (mf->type == CL_MSG_FIELD_TYPE_SET) { uint32_t set_name_len = cl_msg_field_get_value_sz(mf); set_ret = (char *)malloc(set_name_len + 1); memcpy(set_ret, mf->data, set_name_len); set_ret[ set_name_len ] = '\0'; } mf = cl_msg_field_get_next(mf); } buf = (uint8_t *) mf; #ifdef DEBUG_VERBOSE cf_debug("message header fields: nfields %u nops %u", msg->n_fields, msg->n_ops); #endif if (msg->n_ops > STACK_BINS) { bins_local = malloc(sizeof(cl_bin) * msg->n_ops); } else { bins_local = stack_bins; } if (bins_local == NULL) { if (set_ret) { free(set_ret); } close(fd); return (-1); } // parse through the bins/ops cl_msg_op *op = (cl_msg_op *)buf; for (int i=0;i<msg->n_ops;i++) { cl_msg_swap_op(op); #ifdef DEBUG_VERBOSE cf_debug("op receive: %p size %d op %d ptype %d pversion %d namesz %d", op,op->op_sz, op->op, op->particle_type, op->version, op->name_sz); #endif #ifdef DEBUG_VERBOSE dump_buf("individual op (host order)", (uint8_t *) op, op->op_sz + sizeof(uint32_t)); #endif cl_set_value_particular(op, &bins_local[i]); op = cl_msg_op_get_next(op); } buf = (uint8_t *) op; if (msg->result_code != CL_RESULT_OK) { // Special case - if we scan a set name that doesn't exist on a // node, it will return "not found" - we unify this with the // case where OK is returned and no callbacks were made. [AKG] if (msg->result_code == CL_RESULT_NOTFOUND) { msg->result_code = CL_RESULT_OK; } rv = (int)msg->result_code; done = true; } else if (msg->info3 & CL_MSG_INFO3_LAST) { #ifdef DEBUG cf_debug("received final message"); #endif done = true; } else if ((msg->n_ops) || (operation_info & CL_MSG_INFO1_NOBINDATA)) { // got one good value? 
call it a success! (*cb) ( ns_ret, keyd, set_ret, msg->generation, msg->record_ttl, bins_local, msg->n_ops, false /*islast*/, udata); rv = 0; } // else // cf_debug("received message with no bins, signal of an error"); if (bins_local != stack_bins) { free(bins_local); bins_local = 0; } if (set_ret) { free(set_ret); set_ret = NULL; } // don't have to free object internals. They point into the read buffer, where // a pointer is required pos += buf - buf_start; } if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); rd_buf = 0; } } while ( done == false ); if (wr_buf != wr_stack_buf) { free(wr_buf); wr_buf = 0; } cf_atomic32_set(&node->intervals_unreachable, 0); cl_cluster_node_fd_put(node, fd, false); cl_cluster_node_put(node); node = 0; #ifdef DEBUG_VERBOSE cf_debug("exited loop: rv %d", rv ); #endif return(rv); }
/*
 * This is an actual instance of the scan, running on a scan thread.
 * It sends task->scan_buf to the node, then reads from the node fd until it
 * finds the last msg, in the meantime calling task->callback on the returned
 * data. The returned data is a bin of name SUCCESS/FAILURE and the value of
 * the bin is the return value from the udf.
 *
 * Returns CITRUSLEAF_FAIL_CLIENT on any client-side failure (no fd, network
 * error, malformed response, out of memory); in that case the fd is closed
 * instead of being returned to the node. Otherwise returns the result code of
 * the terminating message, or CITRUSLEAF_OK when the stream ended with the
 * LAST marker.
 */
static int
cl_scan_worker_do(cl_cluster_node * node, cl_scan_task * task)
{
	uint8_t     rd_stack_buf[STACK_BUF_SZ] = {0};
	uint8_t *   rd_buf = rd_stack_buf;
	size_t      rd_buf_sz = 0;

	int fd = cl_cluster_node_fd_get(node, false, task->asc->nbconnect);
	if ( fd == -1 ) {
		LOG("[ERROR] cl_scan_worker_do: cannot get fd for node %s ",node->name);
		return CITRUSLEAF_FAIL_CLIENT;
	}

	// send it to the cluster - non blocking socket, but we're blocking
	if (0 != cf_socket_write_forever(fd, (uint8_t *) task->scan_buf, (size_t) task->scan_sz)) {
		close(fd);
		return CITRUSLEAF_FAIL_CLIENT;
	}

	cl_proto  proto;
	int       rc   = CITRUSLEAF_OK;
	bool      done = false;

	do {
		// multiple CL proto per response
		// Now turn around and read a fine cl_proto - that's the first 8 bytes
		// that has types and lengths
		if ( (rc = cf_socket_read_forever(fd, (uint8_t *) &proto, sizeof(cl_proto) ) ) ) {
			LOG("[ERROR] cl_scan_worker_do: network error: errno %d fd %d node name %s\n", rc, fd, node->name);
			goto fail;
		}
		cl_proto_swap(&proto);

		if ( proto.version != CL_PROTO_VERSION) {
			LOG("[ERROR] cl_scan_worker_do: network error: received protocol message of wrong version %d from node %s\n", proto.version, node->name);
			goto fail;
		}

		if ( proto.type != CL_PROTO_TYPE_CL_MSG && proto.type != CL_PROTO_TYPE_CL_MSG_COMPRESSED ) {
			LOG("[ERROR] cl_scan_worker_do: network error: received incorrect message version %d from node %s \n",proto.type, node->name);
			goto fail;
		}

		// second read for the remainder of the message - expect this to cover
		// lots of data, many lines if there's no error
		rd_buf_sz = proto.sz;
		if (rd_buf_sz > 0) {

			if (rd_buf_sz > sizeof(rd_stack_buf)){
				rd_buf = malloc(rd_buf_sz);
			}
			else {
				rd_buf = rd_stack_buf;
			}

			if (rd_buf == NULL) {
				goto fail;
			}

			if ( (rc = cf_socket_read_forever(fd, rd_buf, rd_buf_sz)) ) {
				LOG("[ERROR] cl_scan_worker_do: network error: errno %d fd %d node name %s\n", rc, fd, node->name);
				goto fail;
			}
		}

		// process all the cl_msg in this proto
		uint8_t *   buf = rd_buf;
		uint        pos = 0;
		cl_bin      stack_bins[STACK_BINS];
		cl_bin *    bins;

		while (pos < rd_buf_sz) {

			uint8_t *   buf_start = buf;
			cl_msg *    msg = (cl_msg *) buf;

			cl_msg_swap_header(msg);
			buf += sizeof(cl_msg);

			if ( msg->header_sz != sizeof(cl_msg) ) {
				LOG("[ERROR] cl_scan_worker_do: received cl msg of unexpected size: expecting %zd found %d, internal error\n", sizeof(cl_msg),msg->header_sz);
				goto fail;
			}

			// parse through the fields
			cf_digest       keyd;
			char            ns_ret[33]  = {0};
			char *          set_ret     = NULL;
			cl_msg_field *  mf          = (cl_msg_field *)buf;

			// keyd is copied by value into the response record below, so give
			// it a defined value even if the server sends no digest field.
			memset(&keyd, 0, sizeof(keyd));

			for (int i=0; i < msg->n_fields; i++) {
				cl_msg_swap_field(mf);
				if (mf->type == CL_MSG_FIELD_TYPE_KEY) {
					LOG("[ERROR] cl_scan_worker_do: read: found a key - unexpected\n");
				}
				else if (mf->type == CL_MSG_FIELD_TYPE_DIGEST_RIPE) {
					memcpy(&keyd, mf->data, sizeof(cf_digest));
				}
				else if (mf->type == CL_MSG_FIELD_TYPE_NAMESPACE) {
					// The length comes off the wire: clamp it so an oversized
					// field cannot write past the end of ns_ret.
					size_t ns_len = cl_msg_field_get_value_sz(mf);
					if (ns_len >= sizeof(ns_ret)) {
						ns_len = sizeof(ns_ret) - 1;
					}
					memcpy(ns_ret, mf->data, ns_len);
					ns_ret[ns_len] = 0;
				}
				else if (mf->type == CL_MSG_FIELD_TYPE_SET) {
					uint32_t set_name_len = cl_msg_field_get_value_sz(mf);
					// Drop any earlier copy so a repeated SET field cannot leak.
					free(set_ret);
					set_ret = (char *)malloc(set_name_len + 1);
					if (set_ret == NULL) {
						goto fail;
					}
					memcpy(set_ret, mf->data, set_name_len);
					set_ret[ set_name_len ] = '\0';
				}
				mf = cl_msg_field_get_next(mf);
			}

			buf = (uint8_t *) mf;
			if (msg->n_ops > STACK_BINS) {
				bins = malloc(sizeof(cl_bin) * msg->n_ops);
			}
			else {
				bins = stack_bins;
			}

			if (bins == NULL) {
				if (set_ret) {
					free(set_ret);
				}
				goto fail;
			}

			// parse through the bins/ops
			cl_msg_op * op = (cl_msg_op *) buf;
			for (int i=0;i<msg->n_ops;i++) {
				cl_msg_swap_op(op);

#ifdef DEBUG_VERBOSE
				LOG("[DEBUG] cl_scan_worker_do: op receive: %p size %d op %d ptype %d pversion %d namesz %d \n",
					op,op->op_sz, op->op, op->particle_type, op->version, op->name_sz);
#endif

#ifdef DEBUG_VERBOSE
				dump_buf("individual op (host order)", (uint8_t *) op, op->op_sz + sizeof(uint32_t));
#endif

				cl_set_value_particular(op, &bins[i]);
				op = cl_msg_op_get_next(op);
			}
			buf = (uint8_t *) op;

			if (msg->result_code != CL_RESULT_OK) {
				rc = (int) msg->result_code;
				done = true;
				if (rc == CITRUSLEAF_FAIL_SCAN_ABORT) {
					LOG("[INFO] cl_scan_worker_do: Scan successfully aborted at node [%s]\n", node->name);
				}
			}
			else if (msg->info3 & CL_MSG_INFO3_LAST) {
				if ( cf_debug_enabled() ) {
					LOG("[INFO] cl_scan_worker_do: Received final message from node [%s], scan complete\n", node->name);
				}
				done = true;
			}
			else if ((msg->n_ops || (msg->info1 & CL_MSG_INFO1_NOBINDATA))) {

				cl_scan_response_rec rec;
				cl_scan_response_rec *recp = &rec;

				recp->ns         = strdup(ns_ret);
				recp->keyd       = keyd;
				recp->set        = set_ret;
				recp->generation = msg->generation;
				recp->record_ttl = msg->record_ttl;
				recp->bins       = bins;
				recp->n_bins     = msg->n_ops;
				recp->ismalloc   = false;

				as_rec r;
				as_rec *rp = &r;
				rp = as_rec_init(rp, recp, &scan_response_hooks);

				// NOTE(review): the record is only destroyed when the SUCCESS
				// bin is present and a callback is set. What as_rec_destroy
				// releases (ns / set / bins) is decided by scan_response_hooks,
				// which is not visible here, so the ownership logic is left
				// exactly as it was - confirm nothing leaks on the other path.
				as_val * v = as_rec_get(rp, "SUCCESS");
				if ( v  != NULL && task->callback) {
					// Got a non null value for the response bin,
					// call callback on it and destroy the record
					task->callback(v, task->udata);

					as_rec_destroy(rp);
				}

				rc = CITRUSLEAF_OK;
			}

			// if done free it
			if (done) {
				citrusleaf_bins_free(bins, msg->n_ops);
				if (bins != stack_bins) {
					free(bins);
					bins = 0;
				}

				if (set_ret) {
					free(set_ret);
					set_ret = NULL;
				}
			}

			// don't have to free object internals. They point into the read
			// buffer, where a pointer is required
			pos += buf - buf_start;
		}

		if (rd_buf && (rd_buf != rd_stack_buf))    {
			free(rd_buf);
			rd_buf = 0;
		}

	} while ( done == false );

	cl_cluster_node_fd_put(node, fd, false);

#ifdef DEBUG_VERBOSE
	LOG("[DEBUG] cl_scan_worker_do: exited loop: rc %d\n", rc );
#endif

	return rc;

fail:
	// Common client-side failure exit: release the heap read buffer (if any)
	// and close the fd, since unread response data may still be pending on it.
	if (rd_buf != rd_stack_buf) {
		free(rd_buf);
	}
	close(fd);
	return CITRUSLEAF_FAIL_CLIENT;
}
static void* async_receiver_fn(void *thdata) { int rv = -1; bool network_error = false; cl_async_work *workitem = NULL; // cl_async_work *tmpworkitem = NULL; as_msg msg; cf_queue *q_to_use = NULL; cl_cluster_node *thisnode = NULL; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = rd_stack_buf; size_t rd_buf_sz = 0; uint64_t acktrid; // uint64_t starttime, endtime; int progress_timeout_ms; unsigned int thread_id = cf_atomic32_incr(&g_thread_count); if (thdata == NULL) { q_to_use = g_cl_async_q; } else { thisnode = (cl_cluster_node *)thdata; q_to_use = thisnode->asyncwork_q; } //Infinite loop which keeps picking work items from the list and try to find the end result while(1) { network_error = false; #if ONEASYNCFD if(thisnode->dunned == true) { do { rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT); if (rv == CF_QUEUE_OK) { cl_cluster_node_put(thisnode); free(workitem); } } while (rv == CF_QUEUE_OK); //We want to delete all the workitems of this node shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode); break; } #endif //This call will block if there is no element in the queue cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER); //TODO: What if the node gets dunned while this pop call is blocked ? 
#if ONEASYNCFD //cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d", // cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab)); #endif // If we have no progress in 50ms, we should move to the next workitem // and revisit this workitem at a later stage progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT; // Read into this fine cl_msg, which is the short header rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms); if (rv) { #if DEBUG cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd); #endif if (rv != ETIMEDOUT) { cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd); network_error = true; goto Error; } else { goto Retry; } } #ifdef DEBUG_VERBOSE dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg)); #endif cl_proto_swap(&msg.proto); cl_msg_swap_header(&msg.m); // second read for the remainder of the message rd_buf_sz = msg.proto.sz - msg.m.header_sz; if (rd_buf_sz > 0) { if (rd_buf_sz > sizeof(rd_stack_buf)) { rd_buf = malloc(rd_buf_sz); if (!rd_buf) { cf_error("malloc fail: trying %zu",rd_buf_sz); rv = -1; goto Error; } } rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms); if (rv) { //We already read some part of the message before but failed to read the //remaining data for whatever reason (network error or timeout). We cannot //reread as we already read partial data. Declare this as error. 
cf_error("Timeout after reading the header but before reading the body"); goto Error; } #ifdef DEBUG_VERBOSE dump_buf("read body from cluster", rd_buf, rd_buf_sz); #endif } rv = CITRUSLEAF_OK; goto Ok; Retry: //We are trying to postpone the reading if (workitem->deadline && workitem->deadline < cf_getms()) { cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64, workitem->deadline, cf_getms()); //cf_error("async receiver: Workitem missed the final deadline"); rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } else { //We have time. Push the element back to the queue to be considered later cf_queue_push(q_to_use, &workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } cf_atomic_int_incr(&g_async_stats.retries); continue; Error: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. */ } #if ONEASYNCFD //Do not close FD #else //We do not know the state of FD. It may have pending data to be read. //We cannot reuse the FD. So, close it to be on safe side. cf_error("async receiver: Closing the fd %d because of error", workitem->fd); cf_close(workitem->fd); workitem->fd = -1; #endif cf_atomic_int_incr(&g_async_stats.dropouts); //Continue down with what we do during an Ok //Inform the caller that there is no response from the server for this workitem. //No response does not mean that the work is not done. The work might be //successfully completed on the server side, we just didnt get response for it. if (g_fail_cb_fn) { g_fail_cb_fn(workitem->udata, rv, workitem->starttime); } Ok: //rd_buf may not be there during an error condition. if (rd_buf && (rv == CITRUSLEAF_OK)) { //As of now, async functionality is there only for put call. //In put call, we do not get anything back other than the trid field. 
//So, just pass variable to get back the trid and ignore others. if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) { rv = CITRUSLEAF_FAIL_UNKNOWN; } else { rv = msg.m.result_code; if (workitem->trid != acktrid) { #if ONEASYNCFD //It is likely that we may get response for a different trid. //Just delete the correct one from the queue //put back the current workitem back in the queue. shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem); cf_queue_delete(q_to_use, &tmpworkitem, true); cf_queue_push(q_to_use, &workitem); //From now on workitem will be the one for which we got ack workitem = tmpworkitem; #endif #ifdef DEBUG cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d", workitem->trid, acktrid, workitem->fd); #endif } } if (g_success_cb_fn) { g_success_cb_fn(workitem->udata, rv, workitem->starttime); } } //Remember to put back the FD into the pool, if it is re-usable. if (workitem->fd != -1) { cl_cluster_node_fd_put(workitem->node, workitem->fd, true); } //Also decrement the reference count for this node cl_cluster_node_put(workitem->node); #if ONEASYNCFD //Delete the item from the global hashtable if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) { #if DEBUG cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid); #endif } #endif //Push it back into the free pool. If the attempt fails, free it. if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) { free(workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } // Kick this thread out if its ID is greater than total if (thread_id > cf_atomic32_get(g_async_num_threads)) { cf_atomic32_decr(&g_thread_count); return NULL; } }//The infnite loop return NULL; }