/** * aerospike::create(record) * Function: udf_aerospike_rec_create * * Parameters: * as - as_aerospike * rec - as_rec * * Return Values: * 1 if record is being read or on a create, it already exists * o/w return value of udf_aerospike__execute_updates * * Description: * Create a new record in local storage. * The record will only be created if it does not exist. * This assumes the record has a digest that is valid for local storage. * * Synchronization : object lock acquired by the transaction thread executing UDF. * Partition reservation takes place just before the transaction starts executing * ( look for as_partition_reserve_udf in thr_tsvc.c ) * * Callers: * lua interfacing function, mod_lua_aerospike_rec_create * The return value of udf_aerospike_rec_create is pushed on to the lua stack * * Notes: * The 'read' and 'exists' flag of udf_record are set to true. */ static int udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) { int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); if (ret) { return ret; } udf_record * urecord = (udf_record *) as_rec_source(rec); // make sure record isn't already successfully read if (urecord->flag & UDF_RECORD_FLAG_OPEN) { cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists"); return 1; } as_transaction *tr = urecord->tr; as_index_ref *r_ref = urecord->r_ref; as_storage_rd *rd = urecord->rd; as_index_tree *tree = tr->rsv.tree; if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) { tree = tr->rsv.sub_tree; } // make sure we got the record as a create int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns); cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord", (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : ""); // rv 0 means record exists, 1 means create, < 0 means fail // TODO: Verify correct result codes. if (rv == 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2"); as_record_done(r_ref, tr->rsv.ns); bzero(r_ref, sizeof(as_index_ref)); return 1; } else if (rv < 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv); return rv; } // Associates the set name with the storage rec and index if(tr->msgp) { // Set the set name to index and close record if the setting the set name // is not successful int rv_set = as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg); if (rv_set != 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname"); as_record_done(r_ref, tr->rsv.ns); // TODO bzero is expensive. Switch to use flag. bzero(r_ref, sizeof(as_index_ref)); return 4; } } urecord->flag |= UDF_RECORD_FLAG_OPEN; cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); as_index *r = r_ref->r; // open up storage as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r, urecord->rd, &urecord->tr->keyd); cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p", urecord->r_ref->r, urecord->rd); // if multibin storage, we will use urecord->stack_bins, so set the size appropriately if ( ! rd->ns->storage_data_in_memory && ! rd->ns->single_bin ) { rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); } // side effect: will set the unused bins to properly unused rd->bins = as_bin_get_all(r, rd, urecord->stack_bins); urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN; // If the message has a key, apply it to the record. as_msg_field* f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_KEY); if (f) { rd->key_size = as_msg_field_get_value_sz(f); rd->key = f->data; } cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag); int rc = udf_aerospike__execute_updates(urecord); if(rc) { // Creating the udf record failed, destroy the as_record if (!as_bin_inuse_has(urecord->rd)) { udf_aerospike_rec_remove(as, rec); } } return rc; }
// If remote record is better than local record, replace local with remote. int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write) { as_namespace *ns = rr->rsv->ns; if (! as_storage_has_space(ns)) { cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name); return AS_ERR_OUT_OF_SPACE; } CF_ALLOC_SET_NS_ARENA(ns); as_index_tree *tree = rr->rsv->tree; as_index_ref r_ref; int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns); if (rv < 0) { return AS_ERR_OUT_OF_SPACE; } bool is_create = rv == 1; as_index *r = r_ref.r; int result; conflict_resolution_pol policy = ns->conflict_resolution_policy; if (is_repl_write) { bool from_replica; if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src, &from_replica)) != AS_OK) { record_replace_failed(rr, &r_ref, NULL, is_create); return result; } repl_write_init_repl_state(rr, from_replica); policy = repl_write_conflict_resolution_policy(ns); } if (! is_create && record_replace_check(r, ns) < 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return AS_ERR_FORBIDDEN; } // If local record is better, no-op or fail. if (! is_create && (result = as_record_resolve_conflict(policy, r->generation, r->last_update_time, (uint16_t)rr->generation, rr->last_update_time)) <= 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return result == 0 ? AS_ERR_RECORD_EXISTS : AS_ERR_GENERATION; } // else - remote winner - apply it. // If creating record, write set-ID into index. if (is_create) { if (rr->set_name && (result = as_index_set_set_w_len(r, ns, rr->set_name, rr->set_name_len, false)) < 0) { record_replace_failed(rr, &r_ref, NULL, is_create); return -result; } r->last_update_time = rr->last_update_time; // Don't write record if it would be truncated. if (as_truncate_record_is_truncated(r, ns)) { record_replace_failed(rr, &r_ref, NULL, is_create); return AS_OK; } } // else - not bothering to check that sets match. as_storage_rd rd; if (is_create) { as_storage_record_create(ns, r, &rd); } else { as_storage_record_open(ns, r, &rd); } // TODO - old pickle - remove condition in "six months". if (rr->is_old_pickle) { // Prepare to store set name, if there is one. rd.set_name = rr->set_name; rd.set_name_len = rr->set_name_len; } else { rd.pickle = rr->pickle; rd.pickle_sz = rr->pickle_sz; rd.orig_pickle_sz = as_flat_orig_pickle_size(rr, rd.pickle_sz); } // Note - deal with key after reading existing record (if such), in case // we're dropping the key. // Split according to configuration to replace local record. bool is_delete = false; if (ns->storage_data_in_memory) { if (ns->single_bin) { result = record_apply_dim_single_bin(rr, &rd, &is_delete); } else { result = record_apply_dim(rr, &rd, skip_sindex, &is_delete); } } else { if (ns->single_bin) { result = record_apply_ssd_single_bin(rr, &rd, &is_delete); } else { result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete); } } if (result != 0) { record_replace_failed(rr, &r_ref, &rd, is_create); return result; } uint16_t set_id = as_index_get_set_id(r); // save for XDR write record_replaced(r, rr); as_storage_record_close(&rd); as_record_done(&r_ref, ns); if (do_xdr_write) { xdr_write_replica(rr, is_delete, set_id); } return AS_OK; }
/** * aerospike::create(record) * Function: udf_aerospike_rec_create * * Parameters: * as - as_aerospike * rec - as_rec * * Return Values: * 1 if record is being read or on a create, it already exists * o/w return value of udf_aerospike__execute_updates * * Description: * Create a new record in local storage. * The record will only be created if it does not exist. * This assumes the record has a digest that is valid for local storage. * * Synchronization : object lock acquired by the transaction thread executing UDF. * Partition reservation takes place just before the transaction starts executing * ( look for as_partition_reserve_udf in thr_tsvc.c ) * * Callers: * lua interfacing function, mod_lua_aerospike_rec_create * The return value of udf_aerospike_rec_create is pushed on to the lua stack * * Notes: * The 'read' and 'exists' flag of udf_record are set to true. */ static int udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) { int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); if (ret) { return ret; } udf_record * urecord = (udf_record *) as_rec_source(rec); // make sure record isn't already successfully read if (urecord->flag & UDF_RECORD_FLAG_OPEN) { cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists"); return 1; } as_transaction *tr = urecord->tr; as_index_ref *r_ref = urecord->r_ref; as_storage_rd *rd = urecord->rd; as_index_tree *tree = tr->rsv.tree; bool is_subrec = false; if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) { tree = tr->rsv.sub_tree; is_subrec = true; } // make sure we got the record as a create bool is_create = false; int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns, is_subrec); cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord", (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : ""); // rv 0 means record exists, 1 means create, < 0 means fail // TODO: Verify correct result codes. if (rv == 1) { is_create = true; } else if (rv == 0) { // If it's an expired record, pretend it's a fresh create. if (as_record_is_expired(r_ref->r)) { as_record_destroy(r_ref->r, tr->rsv.ns); as_record_initialize(r_ref, tr->rsv.ns); cf_atomic_int_incr(&tr->rsv.ns->n_objects); is_create = true; } else { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2"); as_record_done(r_ref, tr->rsv.ns); // DO NOT change it has special meaning for caller return 1; } } else if (rv < 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv); return rv; } // Associates the set name with the storage rec and index if (tr->msgp) { // Set the set name to index and close record if the setting the set name // is not successful int rv_set = as_transaction_has_set(tr) ? as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0; if (rv_set != 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname"); if (is_create) { as_index_delete(tree, &tr->keyd); } as_record_done(r_ref, tr->rsv.ns); return 4; } } urecord->flag |= UDF_RECORD_FLAG_OPEN; cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); as_index *r = r_ref->r; // open up storage as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r, urecord->rd, &urecord->tr->keyd); cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p", urecord->r_ref->r, urecord->rd); // If the message has a key, apply it to the record. if (! get_msg_key(tr, rd)) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key"); if (is_create) { as_index_delete(tree, &tr->keyd); } as_record_done(r_ref, tr->rsv.ns); urecord->flag &= ~UDF_RECORD_FLAG_OPEN; return 4; } // if multibin storage, we will use urecord->stack_bins, so set the size appropriately if ( ! rd->ns->storage_data_in_memory && ! rd->ns->single_bin ) { rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); } // side effect: will set the unused bins to properly unused rd->bins = as_bin_get_all(r, rd, urecord->stack_bins); urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN; cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag); int rc = udf_aerospike__execute_updates(urecord); if (rc) { // Creating the udf record failed, destroy the as_record cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc); if (!as_bin_inuse_has(urecord->rd)) { udf_aerospike_rec_remove(as, rec); } } return rc; }
int write_replica(as_partition_reservation* rsv, cf_digest* keyd, uint8_t* pickled_buf, size_t pickled_sz, const as_rec_props* p_rec_props, as_generation generation, uint32_t void_time, uint64_t last_update_time, cf_node master, uint32_t info, ldt_prole_info* linfo) { as_namespace* ns = rsv->ns; if (! as_storage_has_space(rsv->ns)) { cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name); return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE; } as_index_tree* tree = rsv->tree; bool is_subrec = false; bool is_ldt_parent = false; if (ns->ldt_enabled) { if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) { tree = rsv->sub_tree; is_subrec = true; } else if ((info & RW_INFO_LDT_PARENTREC) != 0) { is_ldt_parent = true; } } as_index_ref r_ref; r_ref.skip_lock = false; int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec); if (rv < 0) { cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name); return AS_PROTO_RESULT_FAIL_UNKNOWN; } as_record* r = r_ref.r; as_storage_rd rd; bool is_create = false; if (rv == 1) { as_storage_record_create(ns, r, &rd, keyd); is_create = true; } else { as_storage_record_open(ns, r, &rd, keyd); } bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0; rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent; rd.n_bins = as_bin_get_n_bins(r, &rd); // TODO - we really need an inline utility for this! uint16_t newbins = ntohs(*(uint16_t*)pickled_buf); if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin && newbins > rd.n_bins) { rd.n_bins = newbins; } as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; rd.bins = as_bin_get_all(r, &rd, stack_bins); uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ? 0 : as_record_buf_get_stack_particles_sz(pickled_buf); uint8_t stack_particles[stack_particles_sz + 256]; uint8_t* p_stack_particles = stack_particles; // + 256 for LDT control bin, to hold version. if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) { if (is_create) { as_index_delete(tree, keyd); } as_storage_record_close(r, &rd); as_record_done(&r_ref, ns); return AS_PROTO_RESULT_FAIL_UNKNOWN; } uint64_t memory_bytes = 0; if (! is_create) { memory_bytes = as_storage_record_get_n_bytes_memory(&rd); } as_record_set_properties(&rd, p_rec_props); if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz, &p_stack_particles, has_sindex) != 0) { if (is_create) { as_index_delete(tree, keyd); } as_storage_record_close(r, &rd); as_record_done(&r_ref, ns); return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity? } r->generation = generation; r->void_time = void_time; r->last_update_time = last_update_time; as_storage_record_adjust_mem_stats(&rd, memory_bytes); uint64_t version_to_set = 0; bool set_version = false; if (is_ldt_parent) { if (linfo->replication_partition_version_match && linfo->ldt_prole_version_set) { version_to_set = linfo->ldt_prole_version; set_version = true; } else if (! linfo->replication_partition_version_match) { version_to_set = linfo->ldt_source_version; set_version = true; } } if (set_version) { int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set, p_stack_particles, __FILE__, __LINE__); if (ldt_rv < 0) { cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv); // TODO - roll back. } } bool is_delete = false; if (! as_bin_inuse_has(&rd)) { // A master write that deletes a record by deleting (all) bins sends a // binless pickle that ends up here. is_delete = true; as_index_delete(tree, keyd); } as_storage_record_write(r, &rd); as_storage_record_close(r, &rd); uint16_t set_id = as_index_get_set_id(r); as_record_done(&r_ref, ns); // Don't send an XDR delete if it's disallowed. if (is_delete && ! is_xdr_delete_shipping_enabled()) { // TODO - should we also not ship if there was no record here before? return AS_PROTO_RESULT_OK; } // Do XDR write if the write is a non-XDR write or forwarding is enabled. if ((info & RW_INFO_XDR) == 0 || is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) { xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL); } return AS_PROTO_RESULT_OK; }