// Handle a single replica-write fabric message 'm' received from 'node'.
//
// The message is dispatched on which payload field it carries:
//   - RW_FIELD_AS_MSG -> the record is removed via delete_replica().
//   - RW_FIELD_RECORD -> the pickled record is applied via write_replica().
//   - neither         -> the op fails with AS_PROTO_RESULT_FAIL_UNKNOWN.
//
// Every path ends by calling send_repl_write_ack(node, m, <result>), and every
// path that has taken a partition reservation releases it before acking.
void
repl_write_handle_op(cf_node node, msg* m)
{
    uint8_t* ns_name;
    size_t ns_name_len;

    if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
            MSG_GET_DIRECT) != 0) {
        cf_warning(AS_RW, "repl_write_handle_op: no namespace");
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
        return;
    }

    as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

    if (! ns) {
        cf_warning(AS_RW, "repl_write_handle_op: invalid namespace");
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
        return;
    }

    cf_digest* keyd;
    size_t keyd_sz;

    if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &keyd_sz,
            MSG_GET_DIRECT) != 0) {
        cf_warning(AS_RW, "repl_write_handle_op: no digest");
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
        return;
    }

    // The buffer is read below as a complete cf_digest (*keyd is passed by
    // value), so a shorter field would be read out of bounds.
    if (keyd_sz < sizeof(cf_digest)) {
        cf_warning(AS_RW, "repl_write_handle_op: bad digest size");
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
        return;
    }

    as_partition_reservation rsv;

    as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

    // From here on, every exit must release 'rsv'.

    if (rsv.state == AS_PARTITION_STATE_ABSENT) {
        as_partition_release(&rsv);
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
        return;
    }

    // Optional field - 'info' stays 0 if the message doesn't carry it.
    uint32_t info = 0;

    msg_get_uint32(m, RW_FIELD_INFO, &info);

    // NOTE(review): 'linfo' is only filled here when the LDT bit is set; it is
    // still handed to ldt_get_prole_version() / write_replica() below. It is
    // presumably ignored or filled by them otherwise - confirm.
    ldt_prole_info linfo;

    if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(&linfo, m, &rsv)) {
        cf_warning(AS_RW, "repl_write_handle_op: bad ldt info");
        as_partition_release(&rsv);
        send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
        return;
    }

    cl_msg* msgp;
    size_t msgp_sz;
    uint8_t* pickled_buf;
    size_t pickled_sz;

    uint32_t result;

    if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz,
            MSG_GET_DIRECT) == 0) {
        // <><><><><><>  Delete Operation  <><><><><><>

        // TODO - does this really need to be here? Just to fill linfo?
        if (! ldt_get_prole_version(&rsv, keyd, &linfo, info, NULL, false)) {
            as_partition_release(&rsv);
            // NOTE(review): acking OK on this failure was already marked as
            // questionable ("???") by the original author.
            send_repl_write_ack(node, m, AS_PROTO_RESULT_OK);
            return;
        }

        result = delete_replica(&rsv, keyd,
                (info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0,
                (info & RW_INFO_NSUP_DELETE) != 0,
                as_msg_is_xdr(&msgp->msg),
                node);
    }
    else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf,
            &pickled_sz, MSG_GET_DIRECT) == 0) {
        // <><><><><><>  Write Pickle  <><><><><><>

        as_generation generation;

        if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) {
            cf_warning(AS_RW, "repl_write_handle_op: no generation");
            as_partition_release(&rsv);
            send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
            return;
        }

        uint32_t void_time;

        if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) {
            cf_warning(AS_RW, "repl_write_handle_op: no void-time");
            as_partition_release(&rsv);
            send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
            return;
        }

        uint64_t last_update_time = 0;

        // Optional - older versions won't send it.
        msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time);

        // Record properties are optional and the getter's result is ignored,
        // so start from a well-defined empty value rather than handing an
        // indeterminate pointer to write_replica() when the field is absent.
        as_rec_props rec_props;
        size_t rec_props_size = 0;

        rec_props.p_data = NULL;

        msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size,
                MSG_GET_DIRECT);
        rec_props.size = (uint32_t)rec_props_size;

        result = write_replica(&rsv, keyd, pickled_buf, pickled_sz, &rec_props,
                generation, void_time, last_update_time, node, info, &linfo);
    }
    else {
        cf_warning(AS_RW, "repl_write_handle_op: no msg or pickle");
        result = AS_PROTO_RESULT_FAIL_UNKNOWN;
    }

    as_partition_release(&rsv);
    send_repl_write_ack(node, m, result);
}
// For LDTs only: void repl_write_handle_multiop(cf_node node, msg* m) { uint8_t* ns_name; size_t ns_name_len; if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len); if (! ns) { cf_warning(AS_RW, "handle_multiop: invalid namespace"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } cf_digest* keyd; size_t sz; if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no digest"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } // Note - there should be an RW_FIELD_INFO with LDT bit set, but not // bothering to get it here since we never use it. uint8_t* pickled_buf; size_t pickled_sz; if (msg_get_buf(m, RW_FIELD_MULTIOP, (uint8_t**)&pickled_buf, &pickled_sz, MSG_GET_DIRECT) != 0) { cf_warning(AS_RW, "handle_multiop: no buffer"); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_partition_reservation rsv; as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL); if (rsv.state == AS_PARTITION_STATE_ABSENT) { as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH); return; } ldt_prole_info linfo; memset(&linfo, 1, sizeof(ldt_prole_info)); int offset = 0; while (true) { const uint8_t* buf = (const uint8_t*)(pickled_buf + offset); size_t sz = pickled_sz - offset; if (sz == 0) { break; } uint32_t op_msg_len = 0; msg_type op_msg_type = 0; if (msg_get_initial(&op_msg_len, &op_msg_type, buf, sz) != 0 || op_msg_type != M_TYPE_RW) { cf_warning(AS_RW, "handle_multiop: peek multiop msg failed"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } msg* op_msg = as_fabric_msg_get(op_msg_type); if (! 
op_msg) { cf_warning(AS_RW, "handle_multiop: can't get fabric msg"); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } if (msg_parse(op_msg, buf, sz) != 0) { cf_warning(AS_RW, "handle_multiop: can't parse multiop msg"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } offset += op_msg_len; if (! handle_multiop_subop(node, op_msg, &rsv, &linfo)) { cf_warning(AS_RW, "handle_multiop: write_process_new failed"); as_fabric_msg_put(op_msg); as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN); return; } as_fabric_msg_put(op_msg); } as_partition_release(&rsv); send_multiop_ack(node, m, AS_PROTO_RESULT_OK); }
// Build response to batch request. static void batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r) { as_namespace* ns = btr->ns; batch_digests *bmds = btr->digests; bool get_data = btr->get_data; uint32_t yield_count = 0; for (int i = 0; i < bmds->n_digests; i++) { batch_digest *bmd = &bmds->digest[i]; if (bmd->done == false) { // try to get the key as_partition_reservation rsv; AS_PARTITION_RESERVATION_INIT(rsv); cf_node other_node = 0; uint64_t cluster_key; if (! *bb_r) { *bb_r = cf_buf_builder_create_size(1024 * 4); } int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key); if (rv == 0) { cf_atomic_int_incr(&g_config.batch_tree_count); as_index_ref r_ref; r_ref.skip_lock = false; int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns); if (rec_rv == 0) { as_index *r = r_ref.r; // Check to see this isn't an expired record waiting to die. if (r->void_time && r->void_time < as_record_void_time_get()) { as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); } else { // Make sure it's brought in from storage if necessary. as_storage_rd rd; if (get_data) { as_storage_record_open(ns, r, &rd, &r->key); rd.n_bins = as_bin_get_n_bins(r, &rd); } // Note: this array must stay in scope until the // response for this record has been built, since in the // get data w/ record on device case, it's copied by // reference directly into the record descriptor. as_bin stack_bins[!get_data || rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; if (get_data) { // Figure out which bins you want - for now, all. rd.bins = as_bin_get_all(r, &rd, stack_bins); rd.n_bins = as_bin_inuse_count(&rd); } as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist); if (get_data) { as_storage_record_close(r, &rd); } } as_record_done(&r_ref, ns); } else { // TODO - what about empty records? 
cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd); as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); } bmd->done = true; as_partition_release(&rsv); cf_atomic_int_decr(&g_config.batch_tree_count); } else { cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv); as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); if (other_node != 0) { bmd->node = other_node; cf_debug(AS_BATCH, "other_node is: %p.", other_node); } else { cf_debug(AS_BATCH, "other_node is NULL."); } } yield_count++; if (yield_count % g_config.batch_priority == 0) { usleep(1); } } } }