void * as_netio_th(void *q_to_wait_on) { cf_queue * q = (cf_queue*)q_to_wait_on; while (true) { as_netio io; if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) { cf_crash(AS_PROTO, "Failed to pop from IO worker queue."); } if (io.slow) { usleep(g_config.proto_slow_netio_sleep_ms * 1000); } if (as_netio_send(&io, g_netio_slow_queue, false) != AS_NETIO_CONTINUE) { AS_RELEASE_FILE_HANDLE(io.fd_h); cf_buf_builder_free(io.bb_r); }; } }
// Release memory for batch transaction. static void batch_transaction_done(batch_transaction* btr) { if (btr->fd_h) { AS_RELEASE_FILE_HANDLE(btr->fd_h); btr->fd_h = 0; } if (btr->digests) { cf_free(btr->digests); btr->digests = 0; } if (btr->binlist) { cf_vector_destroy(btr->binlist); btr->binlist = 0; } }
// Build a response message and write it fully to the client socket.
//
// Builds the reply in a stack buffer when it fits (MSG_STACK_BUFFER_SZ);
// as_msg_make_response_msg may heap-allocate otherwise, in which case the
// buffer is freed before return. Retries on EWOULDBLOCK; shuts the socket
// down on any other write failure.
//
// written_sz - optional out-param; set to the message size on success.
//
// Returns 0 on success, -1 on failure. Always clears t_inprogress and
// releases fd_h.
//
// Fix: the inner send-loop previously declared its own 'int rv', shadowing
// the outer one - the 'rv = -1; goto Exit;' error paths assigned the inner
// variable, so failures returned 0. The loop now uses a distinct name.
int
as_msg_send_reply(as_file_handle *fd_h, uint32_t result_code,
		uint32_t generation, uint32_t void_time, as_msg_op **ops, as_bin **bins,
		uint16_t bin_count, as_namespace *ns, uint *written_sz, uint64_t trid,
		const char *setname)
{
	int rv = 0;

	// Most cases are small messages - try to stack alloc if we can.
	byte fb[MSG_STACK_BUFFER_SZ];
	size_t msg_sz = sizeof(fb);

	uint8_t *msgp = (uint8_t *) as_msg_make_response_msg(result_code,
			generation, void_time, ops, bins, bin_count, ns, (cl_msg *)fb,
			&msg_sz, trid, setname);

	if (! msgp) {
		return -1;
	}

	if (fd_h->fd == 0) {
		cf_warning(AS_PROTO, "write to fd 0 internal error");
		cf_crash(AS_PROTO, "send reply: can't write to fd 0");
	}

	size_t pos = 0;

	while (pos < msg_sz) {
		// Distinct name - must not shadow the function's return value.
		int sent = send(fd_h->fd, msgp + pos, msg_sz - pos, MSG_NOSIGNAL);

		if (sent > 0) {
			pos += sent;
		}
		else if (sent < 0) {
			if (errno != EWOULDBLOCK) {
				// Common message when a client aborts.
				cf_warning(AS_PROTO, "protocol write fail: fd %d sz %zd pos %zd rv %d errno %d", fd_h->fd, msg_sz, pos, sent, errno);
				shutdown(fd_h->fd, SHUT_RDWR);
				rv = -1;
				goto Exit;
			}

			usleep(1); // yield
		}
		else {
			// Fixed format string: msg_sz/pos are size_t, use %zu.
			cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %zu pos %zu ", fd_h->fd, msg_sz, pos);
			shutdown(fd_h->fd, SHUT_RDWR);
			rv = -1;
			goto Exit;
		}
	}

	// Good for stats as a higher layer.
	if (written_sz) {
		*written_sz = msg_sz;
	}

Exit:
	// Free only if the message didn't fit the stack buffer.
	if ((uint8_t *)msgp != fb) {
		cf_free(msgp);
	}

	fd_h->t_inprogress = false;
	AS_RELEASE_FILE_HANDLE(fd_h);

	return rv;
}
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
//
// Fabric message handler for the proxy module. Dispatches on PROXY_FIELD_OP:
//   PROXY_OP_REQUEST  - a peer proxied a client request to us; unpack the
//                       digest/proto and enqueue a normal transaction.
//   PROXY_OP_RESPONSE - a peer answered a request we proxied; look up the
//                       pending proxy_request by tid and deliver the response.
//   PROXY_OP_REDIRECT - the peer can't service our proxied request; retarget
//                       the pending retransmit entry, or re-enqueue locally if
//                       we are the new destination.
// Ownership: m is put (released) on every path except the REQUEST success
// path, where it is handed to the transaction via tr.proxy_msg, and the
// shipped-op response path, which may clear free_msg.
int proxy_msg_fn(cf_node id, msg *m, void *udata) {
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m);
		return -1;
	}

	// 99999 is a deliberately-invalid sentinel in case the OP field is absent.
	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;

			// Digest is read in place (MSG_GET_DIRECT) - it stays valid
			// because m is held by the transaction (tr.proxy_msg) below.
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			cl_msg *msgp;
			size_t as_msg_sz = 0;

			// Proto body is copied (MSG_GET_COPY_MALLOC) - the transaction
			// takes ownership of msgp.
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;

			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);

			tr.incoming_cluster_key = cluster_key;
			// timeout_ms (milliseconds) converted to the end_time clock's
			// units; 0 means no deadline.
			tr.end_time = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node = id;
			tr.proxy_msg = m; // ownership of m passes to the transaction

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);

			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			}
			else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif
			// Look up the element.
			proxy_request pr;
			bool free_msg = true;

			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).
				if (pr.wr) {
					// Shipped-op response - handled separately; it decides
					// whether m is still ours to free (via free_msg).
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				}
				else {
					as_proto *proto;
					size_t proto_sz;

					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
					}

#ifdef DEBUG_VERBOSE
					cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);

					// Hex-dump the outgoing proto, 16 bytes per line.
					for (size_t _i = 0; _i < proto_sz; _i++) {
						fprintf(stderr, " %x", ((byte *)proto)[_i]);

						if (_i % 16 == 15) {
							fprintf(stderr, "\n");
						}
					}
#endif

#ifdef EXTRA_CHECKS
					// Sanity-check the proto header length against the buffer.
					as_proto proto_copy = *proto;
					as_proto_swap(&proto_copy);

					if (proto_copy.sz + 8 != proto_sz) {
						cf_info(AS_PROXY, "BONE BONE BONE!!!");
						cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
					}
#endif

					// Write to the file descriptor.
					cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
					cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

					if (pr.batch_shared) {
						// Response belongs to a batch - hand it to the batch
						// framework instead of writing the socket directly.
						cf_digest* digest;
						size_t digest_sz = 0;

						if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}

						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						// Plain proxied transaction - relay the raw proto
						// bytes straight to the client socket, retrying on
						// EWOULDBLOCK.
						size_t pos = 0;

						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}

								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}

						as_proxy_set_stat_counters(0);

SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file
						// descriptor - after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}

					// Done with the original outbound fabric message held by
					// the proxy_request.
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid.
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;

			// Takes the entry's value lock - must unlock on every exit path.
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {
				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.
				cf_digest *key;
				size_t sz = 0;

				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				cl_msg *msgp;
				sz = 0;

				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);

				// Preserve the original transaction's timing and client
				// linkage from the pending proxy_request.
				tr.start_time = pr->start_time; // start time
				tr.end_time = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				as_fabric_msg_put(pr->fab_msg);

				// Safe while holding pr_lock - lockfree variant avoids
				// re-taking the entry lock.
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it.
				msg_incr_ref(pr->fab_msg); // fabric consumes one ref on send
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;

		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
/*
 * The work horse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure op hangs from the write hash as long
 * as it is not applied or failed. Any attempt to perform next operation has to
 * hang behind it unless it is finished. Also operation is assigned a timestamp
 * so that there is some protection in case the op arrives out of order, or the
 * same op comes back again. That would be a duplicate op ...
 *
 * Received a op message - I'm a winner duplicate on this partition. Perform the
 * UDF op and replicate to all the nodes in the replica list. We only replicate
 * the subrecord if the partition is in subrecord migration phase. If not, ship
 * both subrecord and record. In case partition is read replica on this node, do
 * the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 * 'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when op is shipped to the winner node.
 *
 * 1. Assert that current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to replica set. Once the write is done it sends the op ack.
 *
 * TODO: How do you handle retransmits?
 * TODO: How do you handle partition reservation? Is it something special.
 * TODO: How to send along with replication request? Same infra should be
 *       used by normal replication as well.
 *
 * There won't be any deadlock because the requests are triggered from the
 * write. Get down to the udf apply code. Replicate to replica set and then
 * make sure the response is sent back to the originating node. This node has
 * to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 *
 * Returns 0 once handled, -1 if pr carries no write_request. Sets *free_msg
 * to false when ownership of m is handed to (or put by) the fabric layer.
 */
int as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg) {
	int rv = -1;
	write_request *wr = pr->wr;

	if (!wr) {
		return -1;
	}

	// Shipped-op responses never carry a direct client fd on the
	// proxy_request - the client linkage lives on the write_request.
	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.
	// 2. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.

	// Case 2: Non-originating node - the request reached us as a regular
	// proxy, so forward the response back to the node that proxied it here.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d). : ", transaction_id);

		if (0 != (rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}

		// Either fabric owns m now, or we already put it - caller must not
		// free it again.
		*free_msg = false;
	}
	// Case 1: Originating node - write the response straight to the client.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);

		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto;
				size_t proto_sz;

				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");
				}

				// Relay the raw proto to the client socket, retrying on
				// EWOULDBLOCK; any hard failure shuts the socket down.
				size_t pos = 0;

				while (pos < proto_sz) {
					rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

					if (rv > 0) {
						pos += rv;
					}
					else if (rv < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d " "sz %d pos %d rv %d errno %d", wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}

						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}

				cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
			}

			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		}
		else {
			// this may be NULL if the request has already timedout and the wr
			// proto_fd_h will be cleaned up by then
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is node where internal scan or
			// query UDF is initiated where it happens so that there is
			// migration is going on and the request get routed to the remote
			// node which is winning node. This request may need the req_cb to
			// be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);

				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}

		pthread_mutex_unlock(&wr->lock);
	}

	// This node is shipOp initiator. Remove it from the Global hash.
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	WR_RELEASE(pr->wr);
	pr->wr = NULL;

	return 0;
}