// Looked up the message in the store. Time to send the response value back to // the requester. The CF_BYTEARRAY is handed off in this case. If you want to // keep a reference, then keep the reference yourself. int as_proxy_send_response(cf_node dst, msg *m, uint32_t result_code, uint32_t generation, uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count, as_namespace *ns, uint64_t trid, const char *setname) { uint32_t tid; msg_get_uint32(m, PROXY_FIELD_TID, &tid); #ifdef DEBUG cf_debug(AS_PROXY, "proxy send response: message %p bytearray %p tid %d", m, result_code, tid); #endif msg_reset(m); msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE); msg_set_uint32(m, PROXY_FIELD_TID, tid); size_t msg_sz = 0; cl_msg * msgp = as_msg_make_response_msg(result_code, generation, void_time, ops, bins, bin_count, ns, 0, &msg_sz, trid, setname); msg_set_buf(m, PROXY_FIELD_AS_PROTO, (byte *) msgp, msg_sz, MSG_SET_HANDOFF_MALLOC); int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM); if (rv != 0) { cf_debug(AS_PROXY, "sending proxy response: fabric send err %d, catch you on the retry", rv); as_fabric_msg_put(m); } return 0; } // end as_proxy_send_response()
// Send a response, constructed from a dyn-buf, back to the proxy requester.
// The buffer is handed off to the fabric msg when it's heap-allocated,
// copied when it lives on the caller's stack.
//
// Always returns 0 - a fabric send failure is non-fatal because the
// requester's retransmit logic will retry the proxy request.
int
as_proxy_send_ops_response(cf_node dst, msg *m, cf_dyn_buf *db)
{
	// Preserve the transaction id across msg_reset() so the requester can
	// match this response to its pending proxy request.
	uint32_t tid;
	msg_get_uint32(m, PROXY_FIELD_TID, &tid);

#ifdef DEBUG
	// Fixed: previously referenced result_code, which doesn't exist in this
	// function - a compile error whenever DEBUG is defined.
	cf_debug(AS_PROXY, "proxy send ops response: message %p tid %u", m, tid);
#endif

	msg_reset(m);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);

	uint8_t *msgp = db->buf;
	size_t msg_sz = db->used_sz;

	if (db->is_stack) {
		// Stack buffer - fabric must take its own copy.
		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_COPY);
	}
	else {
		// Heap buffer - transfer ownership to the fabric msg.
		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz,
				MSG_SET_HANDOFF_MALLOC);
		db->buf = NULL; // the fabric owns the buffer now
	}

	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);

	if (rv != 0) {
		// Fabric didn't take ownership of m - drop our reference.
		cf_debug(AS_PROXY, "sending proxy response: fabric send err %d, catch you on the retry", rv);
		as_fabric_msg_put(m);
	}

	return 0;
} // end as_proxy_send_ops_response()
// For LDTs only: void repl_write_ldt_make_message(msg* m, as_transaction* tr, uint8_t** p_pickled_buf, size_t pickled_sz, as_rec_props* p_pickled_rec_props, bool is_subrec) { as_namespace* ns = tr->rsv.ns; msg_set_uint32(m, RW_FIELD_OP, RW_OP_WRITE); msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name), MSG_SET_COPY); msg_set_uint32(m, RW_FIELD_NS_ID, ns->id); msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest), MSG_SET_COPY); msg_set_uint64(m, RW_FIELD_CLUSTER_KEY, tr->rsv.cluster_key); msg_set_uint32(m, RW_FIELD_GENERATION, tr->generation); msg_set_uint32(m, RW_FIELD_VOID_TIME, tr->void_time); msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time); // TODO - do we really get here if ldt_enabled is false? if (ns->ldt_enabled && ! is_subrec) { msg_set_buf(m, RW_FIELD_VINFOSET, (uint8_t*)&tr->rsv.p->version_info, sizeof(as_partition_vinfo), MSG_SET_COPY); if (tr->rsv.p->current_outgoing_ldt_version != 0) { msg_set_uint64(m, RW_FIELD_LDT_VERSION, tr->rsv.p->current_outgoing_ldt_version); } } uint32_t info = pack_info_bits(tr, true); if (*p_pickled_buf) { bool is_sub; bool is_parent; as_ldt_get_property(p_pickled_rec_props, &is_parent, &is_sub); info |= pack_ldt_info_bits(tr, is_parent, is_sub); msg_set_buf(m, RW_FIELD_RECORD, (void*)*p_pickled_buf, pickled_sz, MSG_SET_HANDOFF_MALLOC); *p_pickled_buf = NULL; if (p_pickled_rec_props && p_pickled_rec_props->p_data) { msg_set_buf(m, RW_FIELD_REC_PROPS, p_pickled_rec_props->p_data, p_pickled_rec_props->size, MSG_SET_HANDOFF_MALLOC); as_rec_props_clear(p_pickled_rec_props); } } else { msg_set_buf(m, RW_FIELD_AS_MSG, (void*)tr->msgp, as_proto_size_get(&tr->msgp->proto), MSG_SET_COPY); info |= pack_ldt_info_bits(tr, false, is_subrec); } msg_set_uint32(m, RW_FIELD_INFO, info); }
// For LDTs only: void send_multiop_ack(cf_node node, msg* m, uint32_t result) { msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID); msg_set_uint32(m, RW_FIELD_OP, RW_OP_MULTI_ACK); msg_set_uint32(m, RW_FIELD_RESULT, result); if (as_fabric_send(node, m, AS_FABRIC_PRIORITY_MEDIUM) != AS_FABRIC_SUCCESS) { as_fabric_msg_put(m); } }
// Send a redirection message - consumes the message. int as_proxy_send_redirect(cf_node dst, msg *m, cf_node rdst) { int rv; uint32_t tid; msg_get_uint32(m, PROXY_FIELD_TID, &tid); msg_reset(m); msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REDIRECT); msg_set_uint32(m, PROXY_FIELD_TID, tid); msg_set_uint64(m, PROXY_FIELD_REDIRECT, rdst); if (0 != (rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM))) { cf_debug(AS_PROXY, "sending redirection failed: fabric send error %d", rv); as_fabric_msg_put(m); } return 0; } // end as_proxy_send_redirect()
/*
 * The work horse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure op hangs from the write hash as long
 * as it is not applied or failed. Any attempt to perform next operation has to
 * hang behind it unless it is finished. Also operation is assigned a timestamp
 * so that there is some protection in case the op arrives out of order, or the
 * same op comes back again. That would be a duplicate op ...
 *
 * Received a op message - I'm a winner duplicate on this partition. Perform the
 * UDF op and replicate to all the nodes in the replica list. We only replicate
 * the subrecord if the partition is in subrecord migration phase. If not, ship
 * both subrecord and record. In case partition is read replica on this node, do
 * the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 * 'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when op is shipped to the winner node.
 *
 * 1. Assert that current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to replica set. Once the write is done it sends the op ack.
 *
 *    TODO: How do you handle retransmits?
 *    TODO: How do you handle partition reservation? Is it something special.
 *    TODO: How to send along with replication request? Same infra should be
 *          used by normal replication as well.
 *
 * There won't be any deadlock because the requests are triggered from the
 * write. Get down to the udf apply code. Replicate to replica set and then
 * make sure the response is sent back to the originating node. This node has
 * to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 */
// Handle the response to a shipped op. Returns 0 on success, -1 if there is
// no write request hanging from the proxy request. On return, the shipped-op
// entry is removed from the global write hash and the pr->wr reference is
// released. *free_msg is cleared only on the forwarding (non-originating)
// path, where m is consumed by the fabric send.
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	int rv = -1;
	write_request *wr = pr->wr;

	if (!wr) {
		return -1;
	}

	// Shipped-op responses carry a write request, never a client fd.
	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.
	// 2. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.

	// Non-originating node - forward the response to the proxy originator.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);

		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d). : ", transaction_id);

		if (0 != (rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}

		// m was consumed (or put) above - tell the caller not to free it.
		// NOTE(review): this is also set on the send-failure path after
		// as_fabric_msg_put(m) - looks like a possible double-put; confirm
		// against the caller's handling of *free_msg.
		*free_msg = false;
	}
	// Originating node - write the response straight back to the client.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);

		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				// Extract the wire-format response from the fabric msg.
				as_proto *proto;
				size_t proto_sz;

				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **)&proto, &proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");
				}

				// Blocking-ish send loop - spin with usleep(1) on
				// EWOULDBLOCK, bail and shut down the socket on any other
				// error or on a zero-byte send.
				size_t pos = 0;

				while (pos < proto_sz) {
					rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

					if (rv > 0) {
						pos += rv;
					}
					else if (rv < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d " "sz %d pos %d rv %d errno %d", wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}
						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}

				cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
			}

			// Done with the client connection - release our file handle.
			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		}
		else {
			// this may be NULL if the request has already timedout and the wr
			// proto_fd_h will be cleaned up by then
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is node where internal scan or
			// query UDF is initiated where it happens so that there is
			// migration is going on and the request get routed to the remote
			// node which is winning node. This request may need the req_cb to
			// be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);

				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}

		pthread_mutex_unlock(&wr->lock);
	}

	// This node is shipOp initiator. Remove it from the Global hash.
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	// Release the reference the proxy request held on the write request.
	WR_RELEASE(pr->wr);
	pr->wr = NULL;

	return 0;
}
// Ship an op to the winner node dst for duplicate resolution. Takes ownership
// of wr->msgp (handed off to the fabric msg) and reserves an extra reference
// on wr for the retransmit hash entry. Returns 0 on success, -1 on failure.
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);

	if (!m) {
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *)&wr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	// Hand ownership of the client message to the fabric msg.
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *)wr->msgp,
			as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0; // fabric owns it now - destructor must not free it

	// Mark the request as a shipped op.
	uint32_t info = 0;
	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m); // the hash entry holds its own reference on m

	proxy_request pr;

	pr.start_time = wr->start_time;
	pr.end_time = (wr->end_time != 0) ?
			wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr); // the hash entry holds its own reference on wr
	pr.wr = wr;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.fd_h = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		// Fixed: previously leaked both msg references and the wr
		// reservation taken above. Undo them before bailing.
		WR_RELEASE(pr.wr);
		as_fabric_msg_put(m); // reference from msg_incr_ref()
		as_fabric_msg_put(m); // reference from as_fabric_msg_get()
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);

	if (rv != 0) {
		// Non-fatal - the retransmit hash entry will drive a retry.
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d",
				*(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
//
// Divert the transaction to node dst (or, if dst is 0, to the read replica
// for the key's partition). Takes ownership of tr->msgp and tr->proto_fd_h,
// parking them in the retransmit hash entry. Returns 0 on success, -1 on
// failure.
int
as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns,
		uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}

	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Get the list of replicas.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *)&tr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);

	// Batch transactions share msgp across sub-transactions, so it must be
	// copied; otherwise ownership is handed off to the fabric msg.
	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *)tr->msgp,
			as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);
	tr->msgp = 0; // fabric owns (or copied) it - caller must not free it

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64,
			m, dst);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m); // the hash entry holds its own reference on m

	proxy_request pr;

	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ?
			tr->end_time : pr.start_time + g_config.transaction_max_ns;
	// The hash entry takes over the client connection for the eventual reply.
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		// NOTE(review): this path leaks both msg references (from
		// as_fabric_msg_get and msg_incr_ref) and abandons pr.fd_h - the
		// client connection is never answered or released. Needs the
		// cleanup the log line admits is missing; confirm intended
		// ownership with the caller before fixing.
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		// Non-fatal - the retransmit hash entry will drive a retry.
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
// Build (or rebuild, on retransmit) the fabric msg for a replica write in
// rw->dest_msg. Pickled record data and rec-props, when present, are handed
// off to the fabric msg and the rw fields cleared so the rw destructor won't
// free them. Returns false only if a fabric msg could not be allocated.
bool
repl_write_make_message(rw_request* rw, as_transaction* tr)
{
	// Reuse an existing msg on retransmit; otherwise allocate a fresh one.
	if (rw->dest_msg) {
		msg_reset(rw->dest_msg);
	}
	else if (! (rw->dest_msg = as_fabric_msg_get(M_TYPE_RW))) {
		return false;
	}

	as_namespace* ns = tr->rsv.ns;
	msg* m = rw->dest_msg;

	// Common routing/identity fields for a replica write.
	msg_set_uint32(m, RW_FIELD_OP, rw->is_multiop ? RW_OP_MULTI : RW_OP_WRITE);
	msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name),
			MSG_SET_COPY);
	msg_set_uint32(m, RW_FIELD_NS_ID, ns->id);
	msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	msg_set_uint64(m, RW_FIELD_CLUSTER_KEY, tr->rsv.cluster_key);
	msg_set_uint32(m, RW_FIELD_TID, rw->tid);

	// Record metadata the replica needs to apply the write.
	msg_set_uint32(m, RW_FIELD_GENERATION, tr->generation);
	msg_set_uint32(m, RW_FIELD_VOID_TIME, tr->void_time);
	msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time);

	// TODO - do we really intend to send this if the record is non-LDT?
	if (ns->ldt_enabled) {
		msg_set_buf(m, RW_FIELD_VINFOSET, (uint8_t*)&tr->rsv.p->version_info,
				sizeof(as_partition_vinfo), MSG_SET_COPY);

		if (tr->rsv.p->current_outgoing_ldt_version != 0) {
			msg_set_uint64(m, RW_FIELD_LDT_VERSION,
					tr->rsv.p->current_outgoing_ldt_version);
		}
	}

	// Multi-op (LDT) path - ship the whole batch buffer and return early.
	if (rw->is_multiop) {
		msg_set_uint32(m, RW_FIELD_INFO, RW_INFO_LDT);
		msg_set_buf(m, RW_FIELD_MULTIOP, (void*)rw->pickled_buf, rw->pickled_sz,
				MSG_SET_HANDOFF_MALLOC);
		// Make sure destructor doesn't free this.
		rw->pickled_buf = NULL;
		return true;
	}

	uint32_t info = pack_info_bits(tr, rw->has_udf);

	if (rw->pickled_buf) {
		// Replica writes.
		bool is_sub;
		bool is_parent;

		as_ldt_get_property(&rw->pickled_rec_props, &is_parent, &is_sub);
		info |= pack_ldt_info_bits(tr, is_parent, is_sub);

		msg_set_buf(m, RW_FIELD_RECORD, (void*)rw->pickled_buf, rw->pickled_sz,
				MSG_SET_HANDOFF_MALLOC);
		// Make sure destructor doesn't free this.
		rw->pickled_buf = NULL;

		if (rw->pickled_rec_props.p_data) {
			msg_set_buf(m, RW_FIELD_REC_PROPS, rw->pickled_rec_props.p_data,
					rw->pickled_rec_props.size, MSG_SET_HANDOFF_MALLOC);
			// Make sure destructor doesn't free the data.
			as_rec_props_clear(&rw->pickled_rec_props);
		}
	}
	else {
		// Replica deletes.
		msg_set_buf(m, RW_FIELD_AS_MSG, (void*)tr->msgp,
				as_proto_size_get(&tr->msgp->proto), MSG_SET_COPY);
		info |= pack_ldt_info_bits(tr, false, false);
	}

	msg_set_uint32(m, RW_FIELD_INFO, info);

	return true;
}