/*
 * Main routine to replicate the chunks of LDT objects. The LDT directory rec
 * is not replicated using this function. This function is called for each chunk
 * that got updated as part of the single LDT operation. Note that in a single
 * LDT operation, there can be only few chunks that change. i.e chunks in one
 * path of the tree structure.
 *
 * Assumption:
 * 1. All records should have been closed.
 * 2. Pickled buf for all the record and subrecord which needs shipping should
 *    have been filled.
 *
 * Function:
 *
 * 1. Walk through each sub record and use its pickled buf to create
 *    RW_OP_WRITE. Pack it in the buffer and push it into the RW_MULTI_OP
 *    packet.
 * 2. This function packs entire pickled buf into the message that is one extra
 *    allocation into the multi-op over the fabric. The message hangs from the
 *    wr for the parent record for the retransmit.
 *
 * Returns 0 when nothing needs shipping or on success; negative on error:
 *  -1 allocation failure, -2 empty pickled buf on a sub-record write,
 *  -3 fabric msg exhaustion.
 *
 * NOTE(review): the `goto Out` targets a cleanup label that is below this
 * span (not visible here) — presumably it puts back the fabric msgs in m[].
 */
int ldt_record_pickle(ldt_record *lrecord,
				  uint8_t ** pickled_buf, size_t * pickled_sz, uint32_t * pickled_void_time)
{
	cf_detail(AS_LDT, "Enter: MULTI_OP: Packing LDT record");

	// The head (parent) record carries the transaction shared by all sub-ops.
	udf_record *h_urecord = as_rec_source(lrecord->h_urec);
	as_transaction *h_tr = h_urecord->tr;

	// Do an early check if we need to replicate to other nodes. In cases like
	// single-replica or single-node we don't need to do any replication.
	cf_node dest_nodes_tmp[AS_CLUSTER_SZ];
	memset(dest_nodes_tmp, 0, sizeof(dest_nodes_tmp));
	int listsz = as_partition_getreplica_readall(h_tr->rsv.ns, h_tr->rsv.pid,
			dest_nodes_tmp);
	if (listsz == 0) {
		return 0;
	}

	// No pickled buf on the head record means the parent op was a delete.
	bool is_delete = (h_urecord->pickled_buf) ? false : true;
	int ret = 0;
	int ops = 0;

	// TODO: change hard coded 7 to meaningful constant.
	msg *m[7];
	memset(m, 0, 7 * sizeof(msg *));

	if (is_delete) {
		*pickled_buf = 0;
		*pickled_sz = 0;
	}
	else {
		size_t sz = 0;      // running total of packed op-msg bytes
		size_t buflen = 0;

		m[ops] = as_fabric_msg_get(M_TYPE_RW);
		if (!m[ops]) {
			ret = -3;
			goto Out;
		}

		// Pack the head (parent) record write as the first op.
		if (!is_delete && h_urecord->pickled_buf) {
			cf_detail(AS_LDT, "MULTI_OP: Packing LDT Head Record");
			rw_msg_setup(m[ops], h_tr, &h_tr->keyd, &h_urecord->pickled_buf,
					h_urecord->pickled_sz, h_urecord->pickled_void_time,
					&h_urecord->pickled_rec_props, RW_OP_WRITE,
					h_urecord->ldt_rectype_bits, true);
			// Size-only pass: a NULL buf makes msg_fillbuf report the
			// serialized length in buflen without writing anything.
			buflen = 0;
			msg_fillbuf(m[ops], NULL, &buflen);
			sz += buflen;
			ops++;
		}

		// This macro is a for-loop thru the SR list and a test for valid SR entry
		FOR_EACH_SUBRECORD(i, lrecord) {
			udf_record *c_urecord = &lrecord->chunk[i].c_urecord;
			is_delete = (c_urecord->pickled_buf) ? false : true;
			as_transaction *c_tr = c_urecord->tr;

			// A sub-record write must carry a non-empty pickled buf.
			// NOTE(review): the warning passes a pointer for %"PRIx64"
			// (format/argument mismatch) — should dereference the digest
			// word; also %d for size_t* / bool. Confirm and fix format.
			if ( ((!c_urecord->pickled_buf) || (c_urecord->pickled_sz <= 0)) && !is_delete ) {
				cf_warning(AS_RW, "Got an empty pickled buf while trying to "
						" replicate record with digest %"PRIx64" %p, %d, %d",
						(uint64_t *)&c_tr->keyd, pickled_buf, pickled_sz, is_delete);
				ret = -2;
				goto Out;
			}

			// if pickled_buf is there then it is a write operation
			if (!is_delete && c_urecord->pickled_buf) {
				cf_detail(AS_LDT, "MULTI_OP: Packing LDT SUB Record");
				m[ops] = as_fabric_msg_get(M_TYPE_RW);
				if (!m[ops]) {
					ret = -3;
					goto Out;
				}
				rw_msg_setup(m[ops], c_tr, &c_tr->keyd, &c_urecord->pickled_buf,
						c_urecord->pickled_sz, c_urecord->pickled_void_time,
						&c_urecord->pickled_rec_props, RW_OP_WRITE,
						c_urecord->ldt_rectype_bits, true);
				// Size-only pass, as for the head record above.
				buflen = 0;
				msg_fillbuf(m[ops], NULL, &buflen);
				sz += buflen;
				ops++;
			}
		}

		if (sz) {
			uint8_t *buf = cf_malloc(sz);
			if (!buf) {
				// NOTE(review): this zeroes the pointer *parameter* itself,
				// not the caller's size — looks like a bug; presumably it
				// should be `*pickled_sz = 0;`. Confirm against callers.
				pickled_sz = 0;
				*pickled_buf = NULL;
				ret = -1;
				goto Out;
			}

			// Hand the combined buffer back to the caller.
			*pickled_buf = buf;
			*pickled_sz = sz;

			// Second pass: serialize each op msg back-to-back into buf.
			// On entry to msg_fillbuf, sz is the space remaining; on return
			// it holds the bytes actually written for that op.
			int rsz = sz;
			sz = 0;
			for (int i = 0; i < ops; i++) {
				sz = rsz - sz;
				ret = msg_fillbuf(m[i], buf, &sz);
				buf += sz;
			}
			*pickled_void_time = 0;
		}
	}
// Ship a hot-key "shipped op" write request to the winner node 'dst'.
//
// Packs wr->msgp into a PROXY_OP_REQUEST fabric message (taking ownership of
// the proto buffer via MSG_SET_HANDOFF_MALLOC), registers a retransmit entry
// keyed by tid in g_proxy_hash, and sends it over the fabric.
//
// Returns 0 on success — including fabric send failure, which the retransmit
// machinery recovers from — and -1 when no fabric msg is available or the
// retransmit entry can't be stored.
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);

	if (!m) {
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	// Hand off the proto buffer — the fabric msg now owns wr->msgp's memory
	// (still readable below until m is destroyed).
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp,
			as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0;

	// If it is shipped op.
	uint32_t info = 0;

	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);

	// Zero-initialize the whole entry — unlike as_proxy_divert() this path
	// doesn't set every field (e.g. pr.ns), and readers of the hash entry
	// must not see indeterminate values.
	proxy_request pr;
	memset(&pr, 0, sizeof(pr));

	pr.start_time = wr->start_time;
	pr.end_time = (wr->end_time != 0) ?
			wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr);
	pr.wr = wr;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.fd_h = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		// Release both msg refs — the one from as_fabric_msg_get() and the
		// one taken by msg_incr_ref() for the hash entry — so the fabric
		// msg isn't leaked.
		as_fabric_msg_put(m);
		as_fabric_msg_put(m);
		// NOTE(review): the wr reservation taken via cf_rc_reserve() above
		// is still leaked on this path — release it via the project's
		// write_request release mechanism.
		return -1;
	}

	// Send to the remote node. On failure, just drop the send's ref — the
	// retransmit entry inserted above will retry.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);

	if (rv != 0) {
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d",
				*(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
// Build (or rebuild) the fabric message for a replica write in rw->dest_msg.
//
// For multi-op (LDT) requests the whole pickled multi-op blob is handed off;
// otherwise either the pickled record (replica write) or the original client
// msg (replica delete) is attached. Buffers handed off to the msg are cleared
// on rw so its destructor won't free them.
//
// Returns false only when a fabric msg can't be allocated.
bool
repl_write_make_message(rw_request* rw, as_transaction* tr)
{
	msg* out = rw->dest_msg;

	if (out != NULL) {
		// Reuse the existing message.
		msg_reset(out);
	}
	else {
		out = as_fabric_msg_get(M_TYPE_RW);

		if (out == NULL) {
			return false;
		}

		rw->dest_msg = out;
	}

	as_namespace* nsp = tr->rsv.ns;

	msg_set_uint32(out, RW_FIELD_OP,
			rw->is_multiop ? RW_OP_MULTI : RW_OP_WRITE);
	msg_set_buf(out, RW_FIELD_NAMESPACE, (uint8_t*)nsp->name,
			strlen(nsp->name), MSG_SET_COPY);
	msg_set_uint32(out, RW_FIELD_NS_ID, nsp->id);
	msg_set_buf(out, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	msg_set_uint64(out, RW_FIELD_CLUSTER_KEY, tr->rsv.cluster_key);
	msg_set_uint32(out, RW_FIELD_TID, rw->tid);
	msg_set_uint32(out, RW_FIELD_GENERATION, tr->generation);
	msg_set_uint32(out, RW_FIELD_VOID_TIME, tr->void_time);
	msg_set_uint64(out, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time);

	// TODO - do we really intend to send this if the record is non-LDT?
	if (nsp->ldt_enabled) {
		msg_set_buf(out, RW_FIELD_VINFOSET,
				(uint8_t*)&tr->rsv.p->version_info,
				sizeof(as_partition_vinfo), MSG_SET_COPY);

		uint64_t out_ldt_version = tr->rsv.p->current_outgoing_ldt_version;

		if (out_ldt_version != 0) {
			msg_set_uint64(out, RW_FIELD_LDT_VERSION, out_ldt_version);
		}
	}

	if (rw->is_multiop) {
		msg_set_uint32(out, RW_FIELD_INFO, RW_INFO_LDT);
		msg_set_buf(out, RW_FIELD_MULTIOP, (void*)rw->pickled_buf,
				rw->pickled_sz, MSG_SET_HANDOFF_MALLOC);

		// Ownership moved to the msg — keep the destructor's hands off it.
		rw->pickled_buf = NULL;

		return true;
	}

	uint32_t info_bits = pack_info_bits(tr, rw->has_udf);

	if (rw->pickled_buf == NULL) {
		// Replica deletes — ship the original client message.
		msg_set_buf(out, RW_FIELD_AS_MSG, (void*)tr->msgp,
				as_proto_size_get(&tr->msgp->proto), MSG_SET_COPY);
		info_bits |= pack_ldt_info_bits(tr, false, false);
	}
	else {
		// Replica writes — ship the pickled record.
		bool ldt_parent;
		bool ldt_sub;

		as_ldt_get_property(&rw->pickled_rec_props, &ldt_parent, &ldt_sub);
		info_bits |= pack_ldt_info_bits(tr, ldt_parent, ldt_sub);

		msg_set_buf(out, RW_FIELD_RECORD, (void*)rw->pickled_buf,
				rw->pickled_sz, MSG_SET_HANDOFF_MALLOC);

		// Ownership moved to the msg — keep the destructor's hands off it.
		rw->pickled_buf = NULL;

		if (rw->pickled_rec_props.p_data) {
			msg_set_buf(out, RW_FIELD_REC_PROPS,
					rw->pickled_rec_props.p_data, rw->pickled_rec_props.size,
					MSG_SET_HANDOFF_MALLOC);
			// Ownership moved — clear so the destructor won't free the data.
			as_rec_props_clear(&rw->pickled_rec_props);
		}
	}

	msg_set_uint32(out, RW_FIELD_INFO, info_bits);

	return true;
}
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
//
// Builds a PROXY_OP_REQUEST fabric msg from tr, transfers ownership of the
// client proto buffer and file-descriptor handle into the retransmit entry,
// and sends to dst (resolved to the read replica when dst == 0).
//
// Returns 0 on success — including fabric send failure, recovered by the
// retransmit machinery — and -1 when no fabric msg is available or the
// retransmit entry can't be stored.
int as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}

	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Get the list of replicas.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);
	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	// Batch transactions share msgp across sub-transactions, so it must be
	// copied; otherwise hand ownership of the proto buffer to the msg.
	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);
	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64,
			m, dst);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ? tr->end_time : pr.start_time + g_config.transaction_max_ns;
	// The retransmit entry takes ownership of the client connection handle.
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	// NOTE(review): on this failure path both msg refs (the get and the
	// incr_ref), the handed-off proto buffer, and pr.fd_h are leaked —
	// matching the "need cleanup code" admission. Needs project release
	// calls for the msg and the fd handle.
	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node. On failure, drop the send's ref — the
	// retransmit entry inserted above will retry.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
// For LDTs only: void repl_write_handle_multiop(cf_node node, msg* m) { uint8_t* ns_name; size_t ns_name_len; if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len); if (! ns) { cf_warning(AS_RW, "handle_multiop: invalid namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } cf_digest* keyd; size_t sz; if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no digest"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } // Note - there should be an RW_FIELD_INFO with LDT bit set, but not // bothering to get it here since we never use it. uint8_t* pickled_buf; size_t pickled_sz; if (msg_get_buf(m, RW_FIELD_MULTIOP, (uint8_t**)&pickled_buf, &pickled_sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no buffer"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_partition_reservation rsv; as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL); if (rsv.state == AS_PARTITION_STATE_ABSENT) { as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH); return; } ldt_prole_info linfo; memset(&linfo, 1, sizeof(ldt_prole_info)); int offset = 0; while (true) { const uint8_t* buf = (const uint8_t*)(pickled_buf + offset); size_t sz = pickled_sz - offset; if (sz == 0) { break; } uint32_t op_msg_len = 0; msg_type op_msg_type = 0; if (msg_get_initial(&op_msg_len, &op_msg_type, buf, sz) != 0 || op_msg_type != M_TYPE_RW) { cf_warning(AS_RW, "handle_multiop: peek multiop msg failed"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } msg* op_msg = as_fabric_msg_get(op_msg_type); if (! 
op_msg) { cf_warning(AS_RW, "handle_multiop: can't get fabric msg"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } if (msg_parse(op_msg, buf, sz) != 0) { cf_warning(AS_RW, "handle_multiop: can't parse multiop msg"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } offset += op_msg_len; if (! handle_multiop_subop(node, op_msg, &rsv, &linfo)) { cf_warning(AS_RW, "handle_multiop: write_process_new failed"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_fabric_msg_put(op_msg); } as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_OK); }