// For LDTs only: bool ldt_get_info(ldt_prole_info* linfo, msg* m, as_partition_reservation* rsv) { as_partition_vinfo* source_vinfo; size_t vinfo_sz; if (msg_get_buf(m, RW_FIELD_VINFOSET, (uint8_t**)&source_vinfo, &vinfo_sz, MSG_GET_DIRECT) != 0) { return false; } linfo->replication_partition_version_match = as_partition_vinfo_same(source_vinfo, &rsv->p->version_info); linfo->ldt_source_version = 0; linfo->ldt_source_version_set = false; if (msg_get_uint64(m, RW_FIELD_LDT_VERSION, &linfo->ldt_source_version) == 0) { linfo->ldt_source_version_set = true; } linfo->ldt_prole_version = 0; linfo->ldt_prole_version_set = false; return true; }
// Incoming messages start here. // - Could get a request that we need to service. // - Could get a response to one of our requests - need to find the request and // send the real response to the remote end. int proxy_msg_fn(cf_node id, msg *m, void *udata) { int rv; if (cf_rc_count((void*)m) == 0) { cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m); return -1; } uint32_t op = 99999; msg_get_uint32(m, PROXY_FIELD_OP, &op); uint32_t transaction_id = 0; msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id); cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id); switch (op) { case PROXY_OP_REQUEST: { cf_atomic_int_incr(&g_config.proxy_action); #ifdef DEBUG cf_debug(AS_PROXY, "Proxy_msg: received request"); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy msg"); #endif #endif cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "proxy msg function: no digest, problem"); as_fabric_msg_put(m); return 0; } cl_msg *msgp; size_t as_msg_sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) { cf_info(AS_PROXY, "proxy msg function: no as msg, problem"); as_fabric_msg_put(m); return 0; } uint64_t cluster_key = 0; if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) { cf_info(AS_PROXY, "proxy msg function: no cluster key, problem"); as_fabric_msg_put(m); return 0; } // This is allowed to fail - this is a new field, and gets defaulted // to 0 if it doesn't exist. uint32_t timeout_ms = 0; msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms); // cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms); // Put the as_msg on the normal queue for processing. // INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.incoming_cluster_key = cluster_key; tr.end_time = (timeout_ms != 0) ? 
((uint64_t)timeout_ms * 1000000) + tr.start_time : 0; tr.proxy_node = id; tr.proxy_msg = m; // Check here if this is shipped op. uint32_t info = 0; msg_get_uint32(m, PROXY_FIELD_INFO, &info); if (info & PROXY_INFO_SHIPPED_OP) { tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP; cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received"); } else { cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid); } MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); } break; case PROXY_OP_RESPONSE: { #ifdef DEBUG // Got the response from the actual endpoint. cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy response"); #endif #endif // Look up the element. proxy_request pr; bool free_msg = true; if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) { // Found the element (sometimes we get two acks so it's OK for // an ack to not find the transaction). if (pr.wr) { as_proxy_shipop_response_hdlr(m, &pr, &free_msg); } else { as_proto *proto; size_t proto_sz; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "msg get buf failed!"); } #ifdef DEBUG_VERBOSE cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd); for (size_t _i = 0; _i < proto_sz; _i++) { fprintf(stderr, " %x", ((byte *)proto)[_i]); if (_i % 16 == 15) { fprintf(stderr, "\n"); } } #endif #ifdef EXTRA_CHECKS as_proto proto_copy = *proto; as_proto_swap(&proto_copy); if (proto_copy.sz + 8 != proto_sz) { cf_info(AS_PROXY, "BONE BONE BONE!!!"); cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz); } #endif // Write to the file descriptor. 
cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd); cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0"); if (pr.batch_shared) { cf_digest* digest; size_t digest_sz = 0; if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) { as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz); as_proxy_set_stat_counters(0); } else { cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id); as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN); as_proxy_set_stat_counters(-1); } cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); } else { size_t pos = 0; while (pos < proto_sz) { rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL); if (rv > 0) { pos += rv; } else if (rv < 0) { if (errno != EWOULDBLOCK) { // Common message when a client aborts. cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } usleep(1); // yield } else { cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } } as_proxy_set_stat_counters(0); SendFin: cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); // Return the fabric message or the direct file descriptor - // after write and complete. 
pr.fd_h->t_inprogress = false; AS_RELEASE_FILE_HANDLE(pr.fd_h); pr.fd_h = 0; } as_fabric_msg_put(pr.fab_msg); pr.fab_msg = 0; } } else { cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id); as_proxy_set_stat_counters(-1); } if (free_msg) { as_fabric_msg_put(m); } } break; case PROXY_OP_REDIRECT: { // Sometimes the destination we proxied a request to isn't able to // satisfy it (for example, their copy of the partition in question // might be desync). cf_node new_dst = 0; msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst); cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst); // Look in the proxy retransmit hash for the tid. proxy_request *pr; pthread_mutex_t *pr_lock; int r = 0; if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) { cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id); as_fabric_msg_put(m); return -1; } if (g_config.self_node == new_dst) { // Although we don't know we're the final destination, undo the // proxy-nature and put back on the main queue. Dangerous, as it // leaves open the possibility of a looping message. cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } cl_msg *msgp; sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } // Put the as_msg on the normal queue for processing. 
// INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.start_time = pr->start_time; // start time tr.end_time = pr->end_time; tr.proto_fd_h = pr->fd_h; tr.batch_shared = pr->batch_shared; tr.batch_index = pr->batch_index; MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); as_fabric_msg_put(pr->fab_msg); shash_delete_lockfree(g_proxy_hash, &transaction_id); } else { // Change the destination, update the retransmit time. pr->dest = new_dst; pr->xmit_ms = cf_getms() + 1; // Send it. msg_incr_ref(pr->fab_msg); if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) { cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv); as_fabric_msg_put(pr->fab_msg); } } pthread_mutex_unlock(pr_lock); } as_fabric_msg_put(m); break; default: cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op); msg_dump(m, "proxy received unknown msg"); as_fabric_msg_put(m); break; } // end switch return 0; } // end proxy_msg_fn()
// For LDTs only: bool handle_multiop_subop(cf_node node, msg* m, as_partition_reservation* rsv, ldt_prole_info* linfo) { cf_digest* keyd; size_t sz; if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no digest"); return true; } uint32_t info; if (msg_get_uint32(m, RW_FIELD_INFO, &info) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no info"); return true; } if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(linfo, m, rsv)) { cf_warning(AS_RW, "handle_multiop_subop: no ldt info"); return false; // Will not continue! This is the only case that stops the loop. } if (! ldt_get_prole_version(rsv, keyd, linfo, info, NULL, false)) { // If parent cannot be due to incoming migration it's ok - continue and // allow subrecords to be replicated. return true; } // TODO - can we get here if ldt_enabled is false? if (rsv->ns->ldt_enabled) { ldt_set_prole_subrec_version(info, linfo, keyd); } cl_msg* msgp; size_t msgp_sz; uint8_t* pickled_buf; size_t pickled_sz; if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz, MSG_GET_DIRECT) == 0) { delete_replica(rsv, keyd, (info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0, (info & RW_INFO_NSUP_DELETE) != 0, as_msg_is_xdr(&msgp->msg), node); } else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf, &pickled_sz, MSG_GET_DIRECT) == 0) { as_generation generation; if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no generation"); return true; } uint32_t void_time; if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no void-time"); return true; } uint64_t last_update_time = 0; // Optional - older versions won't send it. 
msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time); as_rec_props rec_props; size_t rec_props_size = 0; msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size, MSG_GET_DIRECT); rec_props.size = (uint32_t)rec_props_size; write_replica(rsv, keyd, pickled_buf, pickled_sz, &rec_props, generation, void_time, last_update_time, node, info, linfo); } else { cf_warning(AS_RW, "handle_multiop_subop: no msg or pickle"); } return true; }
// Handle a replica write message m received from node.
//
// The message is either a delete (it carries RW_FIELD_AS_MSG) or a pickled
// record write (it carries RW_FIELD_RECORD). An ack carrying a result code is
// sent back to node on every path, and the partition reservation - once taken -
// is released before that ack is sent.
void
repl_write_handle_op(cf_node node, msg* m)
{
	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "repl_write_handle_op: invalid namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no digest");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	// From here on, every exit path must release the reservation.
	as_partition_reservation rsv;
	as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

	if (rsv.state == AS_PARTITION_STATE_ABSENT) {
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
		return;
	}

	// The info field is optional - defaults to 0.
	uint32_t info = 0;
	msg_get_uint32(m, RW_FIELD_INFO, &info);

	ldt_prole_info linfo;

	if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(&linfo, m, &rsv)) {
		cf_warning(AS_RW, "repl_write_handle_op: bad ldt info");
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cl_msg* msgp;
	size_t msgp_sz;
	uint8_t* pickled_buf;
	size_t pickled_sz;

	uint32_t result;

	if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz,
			MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Delete Operation  <><><><><><>

		// TODO - does this really need to be here? Just to fill linfo?
		if (! ldt_get_prole_version(&rsv, keyd, &linfo, info, NULL, false)) {
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_OK); // ???
			return;
		}

		result = delete_replica(&rsv, keyd,
				(info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0,
				(info & RW_INFO_NSUP_DELETE) != 0,
				as_msg_is_xdr(&msgp->msg),
				node);
	}
	else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf,
			&pickled_sz, MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Write Pickle  <><><><><><>

		as_generation generation;

		if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no generation");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint32_t void_time;

		if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no void-time");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint64_t last_update_time = 0;

		// Optional - older versions won't send it.
		msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time);

		as_rec_props rec_props;
		size_t rec_props_size = 0;

		// The record properties are optional. If they're absent, make sure
		// write_replica() sees a well-defined empty value rather than an
		// uninitialized pointer.
		if (msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data,
				&rec_props_size, MSG_GET_DIRECT) != 0) {
			rec_props.p_data = NULL;
			rec_props_size = 0;
		}

		rec_props.size = (uint32_t)rec_props_size;

		result = write_replica(&rsv, keyd, pickled_buf, pickled_sz, &rec_props,
				generation, void_time, last_update_time, node, info, &linfo);
	}
	else {
		cf_warning(AS_RW, "repl_write_handle_op: no msg or pickle");
		result = AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_partition_release(&rsv);
	send_repl_write_ack(node, m, result);
}