void
as_transaction_error(as_transaction* tr, uint32_t error_code)
{
	if (tr->proto_fd_h) {
		if (tr->batch_shared) {
			as_batch_add_error(tr->batch_shared, tr->batch_index, error_code);
			// Clear this transaction's msgp so calling code does not free it.
			tr->msgp = 0;
		}
		else {
			as_msg_send_reply(tr->proto_fd_h, error_code, 0, 0, NULL, NULL, 0,
					NULL, NULL, as_transaction_trid(tr), NULL);
			tr->proto_fd_h = 0;
			MICROBENCHMARK_HIST_INSERT_P(error_hist);
			cf_atomic_int_incr(&g_config.err_tsvc_requests);

			if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) {
				cf_atomic_int_incr(&g_config.err_tsvc_requests_timeout);
			}
		}
	}
	else if (tr->proxy_msg) {
		as_proxy_send_response(tr->proxy_node, tr->proxy_msg, error_code, 0, 0,
				NULL, NULL, 0, NULL, as_transaction_trid(tr), NULL);
		tr->proxy_msg = NULL;
	}
	else if (tr->udata.req_udata) {
		if (udf_rw_needcomplete(tr)) {
			udf_rw_complete(tr, error_code, __FILE__, __LINE__);
		}
	}
}
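// Illustrative sketch (not part of the original source): a hypothetical call
// site showing the ownership contract as_transaction_error() is written
// around. The helper name reject_transaction() and the error code chosen are
// assumptions; the point is that batch sub-transactions have their msgp
// cleared by as_transaction_error() itself, so the caller frees msgp only if
// it is still set afterwards.
static inline void
reject_transaction(as_transaction* tr)
{
	as_transaction_error(tr, AS_PROTO_RESULT_FAIL_UNKNOWN);

	// For a batch sub-transaction the error was added to the batch parent and
	// tr->msgp was zeroed above, so this free is skipped and the batch parent
	// keeps ownership of the buffer.
	if (tr->msgp) {
		cf_free(tr->msgp);
		tr->msgp = 0;
	}
}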
/*
 * The workhorse function that processes the acknowledgment for the duplicate
 * op. It is received after the intended node has finished performing the op.
 * On success the op has been performed and replicated; on failure the op has
 * not been performed anywhere.
 *
 * Retransmits are handled by keeping the op hanging from the write hash until
 * it has either been applied or has failed. Any attempt to perform the next
 * operation has to queue behind it until it finishes. The operation is also
 * assigned a timestamp, which gives some protection if the op arrives out of
 * order or the same op comes back again - i.e. a duplicate op.
 *
 * Received an op message - I'm the winner duplicate on this partition.
 * Perform the UDF op and replicate to all the nodes in the replica list. We
 * only replicate the subrecord if the partition is in the subrecord migration
 * phase; otherwise, ship both subrecord and record. If the partition is a
 * read replica on this node, do the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed - it differentiates between the 'duplicate' phase and
 * the 'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when the op is shipped to the winner node.
 *
 * 1. Assert that the current node is indeed the winner node.
 * 2. Assert that the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction
 *    that takes a partition reservation, applies the write and replicates it
 *    to the replica set. Once the write is done it sends the op ack.
 *
 * TODO: How do we handle retransmits?
 * TODO: How do we handle partition reservation? Is it something special?
 * TODO: How do we send along with the replication request? The same
 *       infrastructure should be used by normal replication as well.
 *
 * There won't be any deadlock because the requests are triggered from the
 * write. Get down to the UDF apply code, replicate to the replica set, and
 * then make sure the response is sent back to the originating node. This node
 * has to make sure the replication actually did succeed.
 *
 * The response code needs to register the callback function.
 */
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	int rv = -1;
	write_request *wr = pr->wr;

	if (!wr) {
		return -1;
	}

	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING,
			"fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (the node at
	// which duplicate resolution was triggered). It could be:
	// 1. The originating node [where the request was sent from the client] -
	//    in that case send the response back to the client directly.
	// 2. A non-originating node [where the request arrived as a regular
	//    proxy] - in that case send the response back to the proxy
	//    originating node.

	// Case 2: Non-originating node.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING proxy tid.
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);
		cf_detail_digest(AS_PROXY, &wr->keyd,
				"SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &wr->keyd,
				"SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d) : ",
				transaction_id);

		if (0 != (rv = as_fabric_send(wr->proxy_node, m,
				AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd,
					"SHIPPED_OP NON-ORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}

		*free_msg = false;
	}
	// Case 1: Originating node.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);

		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd,
						"SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto = NULL;
				size_t proto_sz = 0;

				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto,
						&proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");
				}

				size_t pos = 0;

				while (pos < proto_sz) {
					rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos),
							proto_sz - pos, MSG_NOSIGNAL);

					if (rv > 0) {
						pos += rv;
					}
					else if (rv < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d "
									"sz %zu pos %zu rv %d errno %d",
									wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}
						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: "
								"fd %d sz %zu pos %zu",
								wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}

				cf_detail_digest(AS_PROXY, &wr->keyd,
						"SHIPPED_OP ORIG Response Sent to Client");
			}

			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		}
		else {
			// proto_fd_h may be NULL if the request has already timed out -
			// the wr's proto_fd_h will have been cleaned up by then.
			cf_detail_digest(AS_PROXY, &wr->keyd,
					"SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is the node where an internal
			// scan or query UDF was initiated while a migration was going on,
			// so the request got routed to the remote (winner) node. Such a
			// request may need the req_cb to be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);

				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}

		pthread_mutex_unlock(&wr->lock);
	}

	// This node is the shipop initiator - remove the entry from the global
	// write hash.
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	WR_RELEASE(pr->wr);
	pr->wr = NULL;

	return 0;
}
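// Illustrative sketch (not part of the original source): the partial-write
// loop in the originating-node branch above, factored into a standalone
// helper so the pattern is easier to follow. Assumptions: the socket may be
// non-blocking, so EWOULDBLOCK/EAGAIN means "yield briefly and retry"; any
// other error, or an unexpected zero return, is treated as a dead peer. The
// function name send_all() is hypothetical. The caller is expected to
// shutdown() the socket on failure, as the handler above does.
#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

static int
send_all(int fd, const uint8_t* buf, size_t sz)
{
	size_t pos = 0;

	while (pos < sz) {
		ssize_t rv = send(fd, buf + pos, sz - pos, MSG_NOSIGNAL);

		if (rv > 0) {
			pos += (size_t)rv;
		}
		else if (rv < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)) {
			usleep(1); // socket buffer full - yield and retry
		}
		else {
			return -1; // hard error or zero return - peer is gone
		}
	}

	return 0; // entire buffer sent
}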