/*
 * Function: Open the storage record that backs the passed in UDF record and
 *           set up the record descriptor (bin count, bins, key) along with
 *           the storage-open flag.
 *
 * Parameters:
 *      urecord : UDF record
 *
 * Return value :
 *       0 on success
 *      -1 if the record's bin count exceeds UDF_RECORD_BIN_ULIMIT
 *      otherwise the non-zero value returned by as_storage_record_open()
 *
 * Callers:
 *      udf_record_open
 *
 * Note: Other than the bin count limit, nothing is checked here, so the
 *       caller has to make sure that all protections are taken and all
 *       checks are done.
 *
 * Side effect:
 *      starting_memory_bytes is captured from the opened record
 *      UDF_RECORD_FLAG_STORAGE_OPEN will be set
 *      bins will be opened
 *      for sub-records, the lrecord's subrec_io counter is incremented
 */
int
udf_storage_record_open(udf_record *urecord)
{
    cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:");

    as_transaction *tr = urecord->tr;
    as_index *r = urecord->r_ref->r;
    as_storage_rd *rd = urecord->rd;

    int open_rv = as_storage_record_open(tr->rsv.ns, r, rd, &r->key);

    if (open_rv != 0) {
        cf_warning(AS_UDF, "Could not open record !! %d", open_rv);
        return open_rv;
    }

    rd->n_bins = as_bin_get_n_bins(r, rd);

    if (rd->n_bins > UDF_RECORD_BIN_ULIMIT) {
        cf_warning(AS_UDF, "record has too many bins (%d) for UDF processing", rd->n_bins);
        // The record was opened above - close it before bailing out.
        as_storage_record_close(r, rd);
        return -1;
    }

    // Multi-bin records that are not held in memory are loaded into
    // urecord->stack_bins, so size the descriptor to that array's capacity.
    if (! (tr->rsv.ns->storage_data_in_memory || tr->rsv.ns->single_bin)) {
        rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
    }

    rd->bins = as_bin_get_all(r, rd, urecord->stack_bins);
    urecord->starting_memory_bytes = as_storage_record_get_n_bytes_memory(rd);
    as_storage_record_get_key(rd);

    urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

    if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
        urecord->lrecord->subrec_io++;
    }

    cf_detail_digest(AS_UDF, &tr->keyd, "Storage Open: Rec(%p) flag(%x) Digest:", urecord, urecord->flag );

    if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
        as_ldt_subrec_storage_validate(rd, "Reading");
    }

    return 0;
}
// If called for data-not-in-memory, this may read record from drive! // TODO - rename as as_record_... and move to record.c? void record_delete_adjust_sindex(as_record* r, as_namespace* ns) { if (! record_has_sindex(r, ns)) { return; } as_storage_rd rd; as_storage_record_open(ns, r, &rd); as_storage_rd_load_n_bins(&rd); as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins]; as_storage_rd_load_bins(&rd, stack_bins); remove_from_sindex(ns, as_index_get_set_name(r, ns), &r->keyd, rd.bins, rd.n_bins); as_storage_record_close(&rd); }
// Delete the local replica copy of a record.
//
// The record is looked up by digest in the reservation's tree (sub_tree when
// is_subrec is set). For data-in-memory namespaces the record is opened so
// delete_adjust_sindex() can run on it. The record is then removed from the
// index and, if xdr_must_ship_delete() says so, the delete is passed to
// xdr_write().
//
// Returns AS_PROTO_RESULT_FAIL_NOTFOUND if the record is not in the tree,
// AS_PROTO_RESULT_OK otherwise.
int
delete_replica(as_partition_reservation* rsv, cf_digest* keyd, bool is_subrec,
        bool is_nsup_delete, bool is_xdr_op, cf_node master)
{
    // Shortcut pointers & flags.
    as_namespace* ns = rsv->ns;
    as_index_tree* tree = rsv->tree;

    if (is_subrec) {
        tree = rsv->sub_tree;
    }

    as_index_ref r_ref;
    r_ref.skip_lock = false;

    int get_rv = as_record_get(tree, keyd, &r_ref, ns);

    if (get_rv != 0) {
        return AS_PROTO_RESULT_FAIL_NOTFOUND;
    }

    as_record* r = r_ref.r;

    if (ns->storage_data_in_memory) {
        as_storage_rd rd;

        as_storage_record_open(ns, r, &rd, keyd);
        delete_adjust_sindex(&rd);
        as_storage_record_close(r, &rd);
    }

    // Save the set-ID and generation for XDR - they are read from the record
    // before it is deleted from the index and released.
    uint16_t generation = r->generation;
    uint16_t set_id = as_index_get_set_id(r);

    as_index_delete(tree, keyd);
    as_record_done(&r_ref, ns);

    if (! xdr_must_ship_delete(ns, is_nsup_delete, is_xdr_op)) {
        return AS_PROTO_RESULT_OK;
    }

    xdr_write(ns, *keyd, generation, master, true, set_id, NULL);

    return AS_PROTO_RESULT_OK;
}
// Service a read transaction against the local copy of the record.
//
// Looks the record up by tr->keyd in the reserved partition's tree, treats
// doomed records and tombstones as not-found, optionally verifies the key sent
// in the message, and then builds a response from either all bins
// (AS_MSG_INFO1_GET_ALL) or the bins named by the message's read ops.
//
// Returns:
//   TRANS_DONE_SUCCESS - a response was produced.
//   TRANS_DONE_ERROR   - read_local_done() was called with an error code.
//   TRANS_IN_PROGRESS / TRANS_WAITING - repl_state_check() returned a non-zero
//       value other than -3 (1 maps to TRANS_IN_PROGRESS); no response is
//       sent to the origin in that case.
transaction_status
read_local(as_transaction* tr)
{
    as_msg* m = &tr->msgp->msg;
    as_namespace* ns = tr->rsv.ns;

    as_index_ref r_ref;

    if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
        // Nothing was obtained - so pass NULL for both r_ref and rd.
        read_local_done(tr, NULL, NULL, AS_ERR_NOT_FOUND);
        return TRANS_DONE_ERROR;
    }

    as_record* r = r_ref.r;

    // Check if it's an expired or truncated record.
    if (as_record_is_doomed(r, ns)) {
        read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
        return TRANS_DONE_ERROR;
    }

    int result = repl_state_check(r, tr);

    if (result != 0) {
        if (result == -3) {
            read_local_done(tr, &r_ref, NULL, AS_ERR_UNAVAILABLE);
            return TRANS_DONE_ERROR;
        }

        // No response sent to origin.
        as_record_done(&r_ref, ns);
        return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
    }

    // Check if it's a tombstone.
    if (! as_record_is_live(r)) {
        read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
        return TRANS_DONE_ERROR;
    }

    // From here on the storage record is open - every exit passes &rd to
    // read_local_done() or closes rd explicitly.
    as_storage_rd rd;

    as_storage_record_open(ns, r, &rd);

    // If configuration permits, allow reads to use page cache.
    rd.read_page_cache = ns->storage_read_page_cache;

    // Check the key if required.
    // Note - for data-not-in-memory "exists" ops, key check is expensive!
    if (as_transaction_has_key(tr) &&
            as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
        read_local_done(tr, &r_ref, &rd, AS_ERR_KEY_MISMATCH);
        return TRANS_DONE_ERROR;
    }

    // No bins requested - only record metadata is copied into the transaction.
    if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
        tr->generation = r->generation;
        tr->void_time = r->void_time;
        tr->last_update_time = r->last_update_time;

        read_local_done(tr, &r_ref, &rd, AS_OK);
        return TRANS_DONE_SUCCESS;
    }

    // Loader failures are negative; the negated value is passed on as the
    // result code.
    if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
        cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
        read_local_done(tr, &r_ref, &rd, -result);
        return TRANS_DONE_ERROR;
    }

    // Stack space for bins is only requested when data is not in memory.
    as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];

    if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
        cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
        read_local_done(tr, &r_ref, &rd, -result);
        return TRANS_DONE_ERROR;
    }

    if (! as_bin_inuse_has(&rd)) {
        cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
        read_local_done(tr, &r_ref, &rd, AS_ERR_UNKNOWN);
        return TRANS_DONE_ERROR;
    }

    // Upper bound on the number of response entries - every bin of the record
    // for get-all, otherwise one per op in the message.
    // NOTE(review): when not get-all and m->n_ops is 0, the arrays below are
    // declared with zero length before the n_ops == 0 check further down.
    uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ?
            rd.n_bins : m->n_ops;

    // ops[i] and response_bins[i] are parallel - the op and the bin (possibly
    // NULL) for the i-th response entry.
    as_msg_op* ops[bin_count];
    as_msg_op** p_ops = ops;
    as_bin* response_bins[bin_count];
    uint16_t n_bins = 0;

    // Bins holding CDT read results - destroyed via destroy_stack_bins() on
    // every exit after this point.
    as_bin result_bins[bin_count];
    uint32_t n_result_bins = 0;

    if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
        // Get-all - no ops accompany the response bins.
        p_ops = NULL;
        n_bins = rd.n_bins;
        as_bin_get_all_p(&rd, response_bins);
    }
    else {
        if (m->n_ops == 0) {
            cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
            read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
            return TRANS_DONE_ERROR;
        }

        // With respond-all-ops, ops that yield no bin still get a response
        // entry, with a NULL bin.
        bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

        as_msg_op* op = 0;
        int n = 0;

        while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
            if (op->op == AS_MSG_OP_READ) {
                as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

                if (b || respond_all_ops) {
                    ops[n_bins] = op;
                    response_bins[n_bins++] = b;
                }
            }
            else if (op->op == AS_MSG_OP_CDT_READ) {
                as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

                if (b) {
                    // Tentatively use the next result bin - it is only counted
                    // (n_result_bins++) if the CDT read leaves it in use.
                    as_bin* rb = &result_bins[n_result_bins];

                    as_bin_set_empty(rb);

                    if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
                        cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
                        destroy_stack_bins(result_bins, n_result_bins);
                        read_local_done(tr, &r_ref, &rd, -result);
                        return TRANS_DONE_ERROR;
                    }

                    if (as_bin_inuse(rb)) {
                        n_result_bins++;
                        ops[n_bins] = op;
                        response_bins[n_bins++] = rb;
                    }
                    else if (respond_all_ops) {
                        ops[n_bins] = op;
                        response_bins[n_bins++] = NULL;
                    }
                }
                else if (respond_all_ops) {
                    ops[n_bins] = op;
                    response_bins[n_bins++] = NULL;
                }
            }
            else {
                // Only plain reads and CDT reads are accepted here.
                cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
                destroy_stack_bins(result_bins, n_result_bins);
                read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
                return TRANS_DONE_ERROR;
            }
        }
    }

    cf_dyn_buf_define_size(db, 16 * 1024);

    if (tr->origin != FROM_BATCH) {
        // Build the response into db now; it is sent further down, after the
        // record has been closed and released.
        db.used_sz = db.alloc_sz;
        db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code,
                r->generation, r->void_time, p_ops, response_bins, n_bins, ns,
                (cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));

        db.is_stack = db.buf == dyn_bufdb;
        // Note - not bothering to correct alloc_sz if buf was allocated.
    }
    else {
        tr->generation = r->generation;
        tr->void_time = r->void_time;
        tr->last_update_time = r->last_update_time;

        // Since as_batch_add_result() constructs response directly in shared
        // buffer to avoid extra copies, can't use db.
        send_read_response(tr, p_ops, response_bins, n_bins, NULL);
    }

    destroy_stack_bins(result_bins, n_result_bins);
    as_storage_record_close(&rd);
    as_record_done(&r_ref, ns);

    // Now that we're not under the record lock, send the message we just built.
    if (db.used_sz != 0) {
        send_read_response(tr, NULL, NULL, 0, &db);

        cf_dyn_buf_free(&db);
        tr->from.proto_fd_h = NULL;
    }

    return TRANS_DONE_SUCCESS;
}
// If remote record is better than local record, replace local with remote. int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write) { as_namespace *ns = rr->rsv->ns; if (! as_storage_has_space(ns)) { cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name); return AS_ERR_OUT_OF_SPACE; } CF_ALLOC_SET_NS_ARENA(ns); as_index_tree *tree = rr->rsv->tree; as_index_ref r_ref; int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns); if (rv < 0) { return AS_ERR_OUT_OF_SPACE; } bool is_create = rv == 1; as_index *r = r_ref.r; int result; conflict_resolution_pol policy = ns->conflict_resolution_policy; if (is_repl_write) { bool from_replica; if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src, &from_replica)) != AS_OK) { record_replace_failed(rr, &r_ref, NULL, is_create); return result; } repl_write_init_repl_state(rr, from_replica); policy = repl_write_conflict_resolution_policy(ns); } if (! is_create && record_replace_check(r, ns) < 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return AS_ERR_FORBIDDEN; } // If local record is better, no-op or fail. if (! is_create && (result = as_record_resolve_conflict(policy, r->generation, r->last_update_time, (uint16_t)rr->generation, rr->last_update_time)) <= 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return result == 0 ? AS_ERR_RECORD_EXISTS : AS_ERR_GENERATION; } // else - remote winner - apply it. // If creating record, write set-ID into index. if (is_create) { if (rr->set_name && (result = as_index_set_set_w_len(r, ns, rr->set_name, rr->set_name_len, false)) < 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return -result; } r->last_update_time = rr->last_update_time; // Don't write record if it would be truncated. if (as_truncate_record_is_truncated(r, ns)) { record_replace_failed(rr, &r_ref, NULL, is_create); return AS_OK; } } // else - not bothering to check that sets match. 
as_storage_rd rd; if (is_create) { as_storage_record_create(ns, r, &rd); } else { as_storage_record_open(ns, r, &rd); } // TODO - old pickle - remove condition in "six months". if (rr->is_old_pickle) { // Prepare to store set name, if there is one. rd.set_name = rr->set_name; rd.set_name_len = rr->set_name_len; } else { rd.pickle = rr->pickle; rd.pickle_sz = rr->pickle_sz; rd.orig_pickle_sz = as_flat_orig_pickle_size(rr, rd.pickle_sz); } // Note - deal with key after reading existing record (if such), in case // we're dropping the key. // Split according to configuration to replace local record. bool is_delete = false; if (ns->storage_data_in_memory) { if (ns->single_bin) { result = record_apply_dim_single_bin(rr, &rd, &is_delete); } else { result = record_apply_dim(rr, &rd, skip_sindex, &is_delete); } } else { if (ns->single_bin) { result = record_apply_ssd_single_bin(rr, &rd, &is_delete); } else { result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete); } } if (result != 0) { record_replace_failed(rr, &r_ref, &rd, is_create); return result; } uint16_t set_id = as_index_get_set_id(r); // save for XDR write record_replaced(r, rr); as_storage_record_close(&rd); as_record_done(&r_ref, ns); if (do_xdr_write) { xdr_write_replica(rr, is_delete, set_id); } return AS_OK; }
// Apply a replica write to the local copy of a record.
//
// Gets or creates the record for keyd in the reservation's tree (sub_tree for
// LDT sub-records and ESRs when the namespace has LDT enabled), replaces its
// contents from pickled_buf, stamps generation / void_time / last_update_time,
// writes it to storage, and passes the write on to XDR when applicable.
//
// Returns:
//   AS_PROTO_RESULT_OK on success.
//   AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE if as_storage_has_space() is
//       false.
//   AS_PROTO_RESULT_FAIL_UNKNOWN if as_record_get_create(),
//       ldt_get_prole_version() or as_record_unpickle_replace() fails.
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
        uint8_t* pickled_buf, size_t pickled_sz,
        const as_rec_props* p_rec_props, as_generation generation,
        uint32_t void_time, uint64_t last_update_time, cf_node master,
        uint32_t info, ldt_prole_info* linfo)
{
    as_namespace* ns = rsv->ns;

    if (! as_storage_has_space(rsv->ns)) {
        cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
        return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
    }

    as_index_tree* tree = rsv->tree;
    bool is_subrec = false;
    bool is_ldt_parent = false;

    // The info flags select the sub-record tree or mark an LDT parent - only
    // consulted when LDT is enabled on the namespace.
    if (ns->ldt_enabled) {
        if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
            tree = rsv->sub_tree;
            is_subrec = true;
        }
        else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
            is_ldt_parent = true;
        }
    }

    as_index_ref r_ref;
    r_ref.skip_lock = false;

    int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

    if (rv < 0) {
        cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
        return AS_PROTO_RESULT_FAIL_UNKNOWN;
    }

    as_record* r = r_ref.r;
    as_storage_rd rd;
    bool is_create = false;

    // rv == 1 means the index entry was newly created.
    if (rv == 1) {
        as_storage_record_create(ns, r, &rd, keyd);
        is_create = true;
    }
    else {
        as_storage_record_open(ns, r, &rd, keyd);
    }

    bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

    // The flag is set unless a secondary index was touched or this is an LDT
    // parent.
    rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
    rd.n_bins = as_bin_get_n_bins(r, &rd);

    // TODO - we really need an inline utility for this!
    // The first two bytes of the pickle are read as a network-order uint16_t
    // and used as the incoming bin count.
    uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

    // For multi-bin data-not-in-memory, make room for the larger of the
    // existing and incoming bin counts.
    if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
            newbins > rd.n_bins) {
        rd.n_bins = newbins;
    }

    // Stack space for bins is only requested when data is not in memory.
    as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

    rd.bins = as_bin_get_all(r, &rd, stack_bins);

    uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
            0 : as_record_buf_get_stack_particles_sz(pickled_buf);
    uint8_t stack_particles[stack_particles_sz + 256];
    uint8_t* p_stack_particles = stack_particles;
    // + 256 for LDT control bin, to hold version.

    // On failure, undo the index entry if it was created by this call, then
    // close and release the record.
    if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
        if (is_create) {
            as_index_delete(tree, keyd);
        }

        as_storage_record_close(r, &rd);
        as_record_done(&r_ref, ns);

        return AS_PROTO_RESULT_FAIL_UNKNOWN;
    }

    // Memory size of the existing record, taken before it is replaced - stays
    // 0 for a newly created record.
    uint64_t memory_bytes = 0;

    if (! is_create) {
        memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
    }

    as_record_set_properties(&rd, p_rec_props);

    if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
            &p_stack_particles, has_sindex) != 0) {
        if (is_create) {
            as_index_delete(tree, keyd);
        }

        as_storage_record_close(r, &rd);
        as_record_done(&r_ref, ns);

        return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
    }

    r->generation = generation;
    r->void_time = void_time;
    r->last_update_time = last_update_time;

    as_storage_record_adjust_mem_stats(&rd, memory_bytes);

    uint64_t version_to_set = 0;
    bool set_version = false;

    // For LDT parents, choose which version (if any) to stamp: the prole
    // version when the partition versions match and a prole version is set,
    // the source version when the partition versions do not match.
    if (is_ldt_parent) {
        if (linfo->replication_partition_version_match &&
                linfo->ldt_prole_version_set) {
            version_to_set = linfo->ldt_prole_version;
            set_version = true;
        }
        else if (! linfo->replication_partition_version_match) {
            version_to_set = linfo->ldt_source_version;
            set_version = true;
        }
    }

    if (set_version) {
        int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
                p_stack_particles, __FILE__, __LINE__);

        if (ldt_rv < 0) {
            cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
            // TODO - roll back.
        }
    }

    bool is_delete = false;

    if (! as_bin_inuse_has(&rd)) {
        // A master write that deletes a record by deleting (all) bins sends a
        // binless pickle that ends up here.
        is_delete = true;
        as_index_delete(tree, keyd);
    }

    as_storage_record_write(r, &rd);
    as_storage_record_close(r, &rd);

    // Read the set-ID while the record reference is still held.
    uint16_t set_id = as_index_get_set_id(r);

    as_record_done(&r_ref, ns);

    // Don't send an XDR delete if it's disallowed.
    if (is_delete && ! is_xdr_delete_shipping_enabled()) {
        // TODO - should we also not ship if there was no record here before?
        return AS_PROTO_RESULT_OK;
    }

    // Do XDR write if the write is a non-XDR write or forwarding is enabled.
    if ((info & RW_INFO_XDR) == 0 ||
            is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
        xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
    }

    return AS_PROTO_RESULT_OK;
}
// Build response to batch request. static void batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r) { as_namespace* ns = btr->ns; batch_digests *bmds = btr->digests; bool get_data = btr->get_data; uint32_t yield_count = 0; for (int i = 0; i < bmds->n_digests; i++) { batch_digest *bmd = &bmds->digest[i]; if (bmd->done == false) { // try to get the key as_partition_reservation rsv; AS_PARTITION_RESERVATION_INIT(rsv); cf_node other_node = 0; uint64_t cluster_key; if (! *bb_r) { *bb_r = cf_buf_builder_create_size(1024 * 4); } int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key); if (rv == 0) { cf_atomic_int_incr(&g_config.batch_tree_count); as_index_ref r_ref; r_ref.skip_lock = false; int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns); if (rec_rv == 0) { as_index *r = r_ref.r; // Check to see this isn't an expired record waiting to die. if (r->void_time && r->void_time < as_record_void_time_get()) { as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); } else { // Make sure it's brought in from storage if necessary. as_storage_rd rd; if (get_data) { as_storage_record_open(ns, r, &rd, &r->key); rd.n_bins = as_bin_get_n_bins(r, &rd); } // Note: this array must stay in scope until the // response for this record has been built, since in the // get data w/ record on device case, it's copied by // reference directly into the record descriptor. as_bin stack_bins[!get_data || rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; if (get_data) { // Figure out which bins you want - for now, all. rd.bins = as_bin_get_all(r, &rd, stack_bins); rd.n_bins = as_bin_inuse_count(&rd); } as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist); if (get_data) { as_storage_record_close(r, &rd); } } as_record_done(&r_ref, ns); } else { // TODO - what about empty records? 
cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd); as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); } bmd->done = true; as_partition_release(&rsv); cf_atomic_int_decr(&g_config.batch_tree_count); } else { cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv); as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name); if (other_node != 0) { bmd->node = other_node; cf_debug(AS_BATCH, "other_node is: %p.", other_node); } else { cf_debug(AS_BATCH, "other_node is NULL."); } } yield_count++; if (yield_count % g_config.batch_priority == 0) { usleep(1); } } } }