// For LDTs only: bool ldt_get_info(ldt_prole_info* linfo, msg* m, as_partition_reservation* rsv) { as_partition_vinfo* source_vinfo; size_t vinfo_sz; if (msg_get_buf(m, RW_FIELD_VINFOSET, (uint8_t**)&source_vinfo, &vinfo_sz, MSG_GET_DIRECT) != 0) { return false; } linfo->replication_partition_version_match = as_partition_vinfo_same(source_vinfo, &rsv->p->version_info); linfo->ldt_source_version = 0; linfo->ldt_source_version_set = false; if (msg_get_uint64(m, RW_FIELD_LDT_VERSION, &linfo->ldt_source_version) == 0) { linfo->ldt_source_version_set = true; } linfo->ldt_prole_version = 0; linfo->ldt_prole_version_set = false; return true; }
// Incoming messages start here. // - Could get a request that we need to service. // - Could get a response to one of our requests - need to find the request and // send the real response to the remote end. int proxy_msg_fn(cf_node id, msg *m, void *udata) { int rv; if (cf_rc_count((void*)m) == 0) { cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m); return -1; } uint32_t op = 99999; msg_get_uint32(m, PROXY_FIELD_OP, &op); uint32_t transaction_id = 0; msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id); cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id); switch (op) { case PROXY_OP_REQUEST: { cf_atomic_int_incr(&g_config.proxy_action); #ifdef DEBUG cf_debug(AS_PROXY, "Proxy_msg: received request"); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy msg"); #endif #endif cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "proxy msg function: no digest, problem"); as_fabric_msg_put(m); return 0; } cl_msg *msgp; size_t as_msg_sz = 0; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) { cf_info(AS_PROXY, "proxy msg function: no as msg, problem"); as_fabric_msg_put(m); return 0; } uint64_t cluster_key = 0; if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) { cf_info(AS_PROXY, "proxy msg function: no cluster key, problem"); as_fabric_msg_put(m); return 0; } // This is allowed to fail - this is a new field, and gets defaulted // to 0 if it doesn't exist. uint32_t timeout_ms = 0; msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms); // cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms); // Put the as_msg on the normal queue for processing. // INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.incoming_cluster_key = cluster_key; tr.end_time = (timeout_ms != 0) ? 
((uint64_t)timeout_ms * 1000000) + tr.start_time : 0; tr.proxy_node = id; tr.proxy_msg = m; // Check here if this is shipped op. uint32_t info = 0; msg_get_uint32(m, PROXY_FIELD_INFO, &info); if (info & PROXY_INFO_SHIPPED_OP) { tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP; cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received"); } else { cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid); } MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); } break; case PROXY_OP_RESPONSE: { #ifdef DEBUG // Got the response from the actual endpoint. cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id); #ifdef DEBUG_VERBOSE msg_dump(m, "incoming proxy response"); #endif #endif // Look up the element. proxy_request pr; bool free_msg = true; if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) { // Found the element (sometimes we get two acks so it's OK for // an ack to not find the transaction). if (pr.wr) { as_proxy_shipop_response_hdlr(m, &pr, &free_msg); } else { as_proto *proto; size_t proto_sz; if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) { cf_info(AS_PROXY, "msg get buf failed!"); } #ifdef DEBUG_VERBOSE cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd); for (size_t _i = 0; _i < proto_sz; _i++) { fprintf(stderr, " %x", ((byte *)proto)[_i]); if (_i % 16 == 15) { fprintf(stderr, "\n"); } } #endif #ifdef EXTRA_CHECKS as_proto proto_copy = *proto; as_proto_swap(&proto_copy); if (proto_copy.sz + 8 != proto_sz) { cf_info(AS_PROXY, "BONE BONE BONE!!!"); cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz); } #endif // Write to the file descriptor. 
cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd); cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0"); if (pr.batch_shared) { cf_digest* digest; size_t digest_sz = 0; if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) { as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz); as_proxy_set_stat_counters(0); } else { cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id); as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN); as_proxy_set_stat_counters(-1); } cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); } else { size_t pos = 0; while (pos < proto_sz) { rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL); if (rv > 0) { pos += rv; } else if (rv < 0) { if (errno != EWOULDBLOCK) { // Common message when a client aborts. cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } usleep(1); // yield } else { cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos); shutdown(pr.fd_h->fd, SHUT_RDWR); as_proxy_set_stat_counters(-1); goto SendFin; } } as_proxy_set_stat_counters(0); SendFin: cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time); // Return the fabric message or the direct file descriptor - // after write and complete. 
pr.fd_h->t_inprogress = false; AS_RELEASE_FILE_HANDLE(pr.fd_h); pr.fd_h = 0; } as_fabric_msg_put(pr.fab_msg); pr.fab_msg = 0; } } else { cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id); as_proxy_set_stat_counters(-1); } if (free_msg) { as_fabric_msg_put(m); } } break; case PROXY_OP_REDIRECT: { // Sometimes the destination we proxied a request to isn't able to // satisfy it (for example, their copy of the partition in question // might be desync). cf_node new_dst = 0; msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst); cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst); // Look in the proxy retransmit hash for the tid. proxy_request *pr; pthread_mutex_t *pr_lock; int r = 0; if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) { cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id); as_fabric_msg_put(m); return -1; } if (g_config.self_node == new_dst) { // Although we don't know we're the final destination, undo the // proxy-nature and put back on the main queue. Dangerous, as it // leaves open the possibility of a looping message. cf_digest *key; size_t sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } cl_msg *msgp; sz = 0; if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) { cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem"); pthread_mutex_unlock(pr_lock); as_fabric_msg_put(m); return -1; } // Put the as_msg on the normal queue for processing. 
// INIT_TR as_transaction tr; as_transaction_init(&tr, key, msgp); tr.start_time = pr->start_time; // start time tr.end_time = pr->end_time; tr.proto_fd_h = pr->fd_h; tr.batch_shared = pr->batch_shared; tr.batch_index = pr->batch_index; MICROBENCHMARK_RESET(); thr_tsvc_enqueue(&tr); as_fabric_msg_put(pr->fab_msg); shash_delete_lockfree(g_proxy_hash, &transaction_id); } else { // Change the destination, update the retransmit time. pr->dest = new_dst; pr->xmit_ms = cf_getms() + 1; // Send it. msg_incr_ref(pr->fab_msg); if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) { cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv); as_fabric_msg_put(pr->fab_msg); } } pthread_mutex_unlock(pr_lock); } as_fabric_msg_put(m); break; default: cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op); msg_dump(m, "proxy received unknown msg"); as_fabric_msg_put(m); break; } // end switch return 0; } // end proxy_msg_fn()
/*
 * The work horse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure the op hangs from the write hash as
 * long as it is not applied or failed. Any attempt to perform the next
 * operation has to hang behind it until it is finished. Also the operation is
 * assigned a timestamp so that there is some protection in case the op arrives
 * out of order, or the same op comes back again. That would be a duplicate
 * op ...
 *
 * Received an op message - I'm a winner duplicate on this partition. Perform
 * the UDF op and replicate to all the nodes in the replica list. We only
 * replicate the subrecord if the partition is in subrecord migration phase. If
 * not, ship both subrecord and record. In case the partition is a read replica
 * on this node, do the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 * 'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when an op is shipped to the winner node.
 *
 * 1. Assert that the current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to the replica set. Once the write is done it sends the op ack.
 *
 * TODO: How do you handle retransmits?
 * TODO: How do you handle partition reservation? Is it something special.
 * TODO: How to send along with replication request? Same infra should be
 *       used by normal replication as well.
 *
 * There won't be any deadlock because the requests are triggered from the
 * write. Get down to the udf apply code. Replicate to the replica set and then
 * make sure the response is sent back to the originating node. This node has
 * to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 *
 * NOTE(review): the text above reads like a description of the overall ship-op
 * flow (including the winner-node side) rather than of this function alone -
 * confirm and relocate if so. What this function itself does:
 *
 * Handles the response to a shipped op whose write_request hangs off pr.
 * - If the write request carries a proxy_msg, the response m is re-tagged
 *   with that message's tid and forwarded to wr->proxy_node. *free_msg is set
 *   to false, since m has been sent (or released here on send failure).
 * - Otherwise the AS_PROTO payload of m is written to the client socket in
 *   wr->proto_fd_h (if there still is one) and that handle is released. If
 *   there is no handle, udf_rw_complete() is called when
 *   udf_rw_needcomplete_wr() says so.
 * In both cases the write request is then removed from the global write hash
 * and released.
 *
 * Returns -1 if pr has no write request, otherwise 0.
 */
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	int rv = -1;

	write_request *wr = pr->wr;
	if (!wr) {
		return -1;
	}
	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.
	// 2. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.

	// Case 2: Non-originating node.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d). : ", transaction_id);
		if (0 != (rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}
		// Either way m is no longer the caller's to release.
		*free_msg = false;
	}
	// Case 1: Originating node.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);
		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto = NULL;
				size_t proto_sz = 0;
				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
					// Nothing deliverable - never send from an unset buffer.
					// Close the connection like any other failed write so the
					// client is not left waiting.
					cf_info(AS_PROXY, "msg get buf failed!");
					shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
				}
				else {
					// Push the whole response out, retrying on EWOULDBLOCK.
					size_t pos = 0;
					while (pos < proto_sz) {
						rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
						if (rv > 0) {
							pos += rv;
						}
						else if (rv < 0) {
							if (errno != EWOULDBLOCK) {
								// Common message when a client aborts.
								cf_debug(AS_PROTO, "protocol proxy write fail: fd %d "
										"sz %zu pos %zu rv %d errno %d",
										wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
								shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
								break;
							}
							usleep(1); // yield
						}
						else {
							cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %zu pos %zu ",
									wr->proto_fd_h->fd, proto_sz, pos);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}
					}
					cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
				}
			}
			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		}
		else {
			// This may be NULL if the request has already timed out - the wr
			// proto_fd_h will have been cleaned up by then.
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is the node where an internal
			// scan or query UDF was initiated, and it so happens that a
			// migration is going on and the request got routed to the remote
			// (winning) node. This request may need the req_cb to be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);
				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}
		pthread_mutex_unlock(&wr->lock);
	}

	// This node is shipOp initiator. Remove it from the Global
	// hash
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);
	WR_RELEASE(pr->wr);
	pr->wr = NULL;
	return 0;
}
// For LDTs only: bool handle_multiop_subop(cf_node node, msg* m, as_partition_reservation* rsv, ldt_prole_info* linfo) { cf_digest* keyd; size_t sz; if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no digest"); return true; } uint32_t info; if (msg_get_uint32(m, RW_FIELD_INFO, &info) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no info"); return true; } if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(linfo, m, rsv)) { cf_warning(AS_RW, "handle_multiop_subop: no ldt info"); return false; // Will not continue! This is the only case that stops the loop. } if (! ldt_get_prole_version(rsv, keyd, linfo, info, NULL, false)) { // If parent cannot be due to incoming migration it's ok - continue and // allow subrecords to be replicated. return true; } // TODO - can we get here if ldt_enabled is false? if (rsv->ns->ldt_enabled) { ldt_set_prole_subrec_version(info, linfo, keyd); } cl_msg* msgp; size_t msgp_sz; uint8_t* pickled_buf; size_t pickled_sz; if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz, MSG_GET_DIRECT) == 0) { delete_replica(rsv, keyd, (info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0, (info & RW_INFO_NSUP_DELETE) != 0, as_msg_is_xdr(&msgp->msg), node); } else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf, &pickled_sz, MSG_GET_DIRECT) == 0) { as_generation generation; if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no generation"); return true; } uint32_t void_time; if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) { cf_warning(AS_RW, "handle_multiop_subop: no void-time"); return true; } uint64_t last_update_time = 0; // Optional - older versions won't send it. 
msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time); as_rec_props rec_props; size_t rec_props_size = 0; msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size, MSG_GET_DIRECT); rec_props.size = (uint32_t)rec_props_size; write_replica(rsv, keyd, pickled_buf, pickled_sz, &rec_props, generation, void_time, last_update_time, node, info, linfo); } else { cf_warning(AS_RW, "handle_multiop_subop: no msg or pickle"); } return true; }
// For LDTs only: void repl_write_handle_multiop(cf_node node, msg* m) { uint8_t* ns_name; size_t ns_name_len; if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len); if (! ns) { cf_warning(AS_RW, "handle_multiop: invalid namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } cf_digest* keyd; size_t sz; if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no digest"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } // Note - there should be an RW_FIELD_INFO with LDT bit set, but not // bothering to get it here since we never use it. uint8_t* pickled_buf; size_t pickled_sz; if (msg_get_buf(m, RW_FIELD_MULTIOP, (uint8_t**)&pickled_buf, &pickled_sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no buffer"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_partition_reservation rsv; as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL); if (rsv.state == AS_PARTITION_STATE_ABSENT) { as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH); return; } ldt_prole_info linfo; memset(&linfo, 1, sizeof(ldt_prole_info)); int offset = 0; while (true) { const uint8_t* buf = (const uint8_t*)(pickled_buf + offset); size_t sz = pickled_sz - offset; if (sz == 0) { break; } uint32_t op_msg_len = 0; msg_type op_msg_type = 0; if (msg_get_initial(&op_msg_len, &op_msg_type, buf, sz) != 0 || op_msg_type != M_TYPE_RW) { cf_warning(AS_RW, "handle_multiop: peek multiop msg failed"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } msg* op_msg = as_fabric_msg_get(op_msg_type); if (! 
op_msg) { cf_warning(AS_RW, "handle_multiop: can't get fabric msg"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } if (msg_parse(op_msg, buf, sz) != 0) { cf_warning(AS_RW, "handle_multiop: can't parse multiop msg"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } offset += op_msg_len; if (! handle_multiop_subop(node, op_msg, &rsv, &linfo)) { cf_warning(AS_RW, "handle_multiop: write_process_new failed"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_fabric_msg_put(op_msg); } as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_OK); }
// Handle a replica-write acknowledgment (m) received from node.
//
// The ack is matched to its rw_request by namespace id + digest, and then by
// tid. The sending node is marked complete; once every destination node has
// acked, the request's repl_write_cb is invoked (unless the client is answered
// on master completion) and the request is removed from the hash.
//
// Acks that are malformed, stale, duplicated, from an unexpected node, or that
// lost a race with the timeout are dropped. The message m is always released.
void
repl_write_handle_ack(cf_node node, msg* m)
{
	uint32_t ns_id;

	if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) {
		cf_warning(AS_RW, "repl-write ack: no ns-id");
		as_fabric_msg_put(m);
		return;
	}

	cf_digest* keyd;
	size_t keyd_sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &keyd_sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl-write ack: no digest");
		as_fabric_msg_put(m);
		return;
	}

	uint32_t tid;

	if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) {
		cf_warning(AS_RW, "repl-write ack: no tid");
		as_fabric_msg_put(m);
		return;
	}

	// TODO - result_code is currently ignored! What should we do with it?
	// Note - CLUSTER_KEY_MISMATCH not special, can't re-queue transaction.
	uint32_t result_code;

	if (msg_get_uint32(m, RW_FIELD_RESULT, &result_code) != 0) {
		cf_warning(AS_RW, "repl-write ack: no result_code");
		as_fabric_msg_put(m);
		return;
	}

	rw_request_hkey hkey = { ns_id, *keyd };
	rw_request* rw = rw_request_hash_get(&hkey);

	if (rw == NULL) {
		// Extra ack - the rw_request is already gone.
		as_fabric_msg_put(m);
		return;
	}

	// Set only when this ack completes the request, in which case the request
	// is also removed from the hash on the way out.
	bool request_finished = false;
	int node_ix = 0;

	pthread_mutex_lock(&rw->lock);

	if (rw->tid != tid) {
		// Extra ack - this rw_request belongs to a newer transaction for the
		// same digest.
		goto Unlock;
	}

	// Locate the (first) destination slot belonging to the acking node.
	while (node_ix < rw->n_dest_nodes && rw->dest_nodes[node_ix] != node) {
		node_ix++;
	}

	if (node_ix == rw->n_dest_nodes) {
		cf_warning(AS_RW, "repl-write ack: from non-dest node %lx", node);
		goto Unlock;
	}

	if (rw->dest_complete[node_ix]) {
		// Extra ack for this replica write.
		goto Unlock;
	}

	rw->dest_complete[node_ix] = true;

	for (int k = 0; k < rw->n_dest_nodes; k++) {
		if (! rw->dest_complete[k]) {
			// Still waiting to hear from some destination.
			goto Unlock;
		}
	}

	if (! (rw->from.any || rw->origin == FROM_NSUP ||
			rw->respond_client_on_master_completion)) {
		// Lost race against timeout in retransmit thread.
		goto Unlock;
	}

	if (! rw->respond_client_on_master_completion) {
		rw->repl_write_cb(rw);
	}

	request_finished = true;

Unlock:
	pthread_mutex_unlock(&rw->lock);

	if (request_finished) {
		rw_request_hash_delete(&hkey, rw);
	}

	rw_request_release(rw);
	as_fabric_msg_put(m);
}
// Handle an incoming replica-write operation (m) from node.
//
// Depending on which field is present, the operation is applied as a delete
// (RW_FIELD_AS_MSG) or as a pickled-record write (RW_FIELD_RECORD) against the
// partition that the digest maps to.
//
// An ack is always sent via send_repl_write_ack():
// - AS_PROTO_RESULT_FAIL_UNKNOWN if a required field is missing, the
//   namespace is unknown, or LDT info is flagged but unreadable.
// - AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH if the partition is absent.
// - AS_PROTO_RESULT_OK if, for a delete, ldt_get_prole_version() fails.
// - Otherwise the result of delete_replica() / write_replica().
void
repl_write_handle_op(cf_node node, msg* m)
{
	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "repl_write_handle_op: invalid namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no digest");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_partition_reservation rsv;

	as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

	if (rsv.state == AS_PARTITION_STATE_ABSENT) {
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
		return;
	}

	// Optional - stays 0 if the field is not sent.
	uint32_t info = 0;

	msg_get_uint32(m, RW_FIELD_INFO, &info);

	// ldt_get_info() only fills this in when the LDT bit is set, yet it is
	// passed on below regardless - so give it a defined (all-zero) state.
	ldt_prole_info linfo;

	memset(&linfo, 0, sizeof(linfo));

	if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(&linfo, m, &rsv)) {
		cf_warning(AS_RW, "repl_write_handle_op: bad ldt info");
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cl_msg* msgp;
	size_t msgp_sz;

	uint8_t* pickled_buf;
	size_t pickled_sz;

	uint32_t result;

	if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz,
			MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Delete Operation  <><><><><><>

		// TODO - does this really need to be here? Just to fill linfo?
		if (! ldt_get_prole_version(&rsv, keyd, &linfo, info, NULL, false)) {
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_OK); // ???
			return;
		}

		result = delete_replica(&rsv, keyd,
				(info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0,
				(info & RW_INFO_NSUP_DELETE) != 0,
				as_msg_is_xdr(&msgp->msg),
				node);
	}
	else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf,
			&pickled_sz, MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Write Pickle  <><><><><><>

		as_generation generation;

		if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no generation");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint32_t void_time;

		if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no void-time");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint64_t last_update_time = 0;

		// Optional - older versions won't send it.
		msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time);

		// Record properties are optional too. Start from an empty, well-defined
		// value so an absent field never leaves p_data indeterminate.
		as_rec_props rec_props;
		size_t rec_props_size = 0;

		rec_props.p_data = NULL;

		msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size,
				MSG_GET_DIRECT);
		rec_props.size = (uint32_t)rec_props_size;

		result = write_replica(&rsv, keyd, pickled_buf, pickled_sz, &rec_props,
				generation, void_time, last_update_time, node, info, &linfo);
	}
	else {
		cf_warning(AS_RW, "repl_write_handle_op: no msg or pickle");
		result = AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_partition_release(&rsv);
	send_repl_write_ack(node, m, result);
}