// Write the record described by rd to storage.
//
// Asserts (via cf_assert) that rd has at least one bin in use - a binless
// pickle is not expected here. Returns the result of as_storage_record_write()
// unchanged.
int
as_record_write_from_pickle(as_storage_rd* rd)
{
	cf_assert(as_bin_inuse_has(rd), AS_RECORD, "unexpected binless pickle");

	int write_result = as_storage_record_write(rd);

	return write_result;
}
// Look up a bin by name, where the name is given as a (name, namesz) buffer.
//
// Single-bin namespaces: returns rd->bins if any bin is in use, else NULL.
// Otherwise the name is translated to an id via as_bin_get_id_w_len(), and the
// record's bins are scanned for that id. The scan stops at the first bin that
// is not in use. Returns NULL if the id lookup fails or no bin matches.
as_bin *
as_bin_get_from_buf(as_storage_rd *rd, byte *name, size_t namesz)
{
	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return rd->bins;
		}

		return NULL;
	}

	uint32_t wanted_id;

	if (! as_bin_get_id_w_len(rd->ns, name, namesz, &wanted_id)) {
		return NULL;
	}

	uint16_t slot = 0;

	while (slot < rd->n_bins) {
		as_bin *candidate = &rd->bins[slot];

		if (! as_bin_inuse(candidate)) {
			break;
		}

		if ((uint32_t)candidate->id == wanted_id) {
			return candidate;
		}

		slot++;
	}

	return NULL;
}
// Find the index within rd->bins of the bin whose name is given as a
// (name, namesz) buffer.
//
// Single-bin namespaces: returns 0 if any bin is in use, else -1.
// Otherwise the name is translated to an id via as_bin_get_id_from_name_buf(),
// and the record's bins are scanned for that id. The scan stops at the first
// bin that is not in use. Returns -1 if the id lookup fails or no bin matches.
int32_t
as_bin_get_index(as_storage_rd *rd, byte *name, size_t namesz)
{
	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return 0;
		}

		return -1;
	}

	uint32_t wanted_id;

	if (! as_bin_get_id_from_name_buf(rd->ns, name, namesz, &wanted_id)) {
		return -1;
	}

	uint16_t slot = 0;

	while (slot < rd->n_bins) {
		as_bin *candidate = &rd->bins[slot];

		if (! as_bin_inuse(candidate)) {
			break;
		}

		if ((uint32_t)candidate->id == wanted_id) {
			return (int32_t)slot;
		}

		slot++;
	}

	return -1;
}
// Find the index within rd->bins of the bin with the given null-terminated
// name.
//
// Single-bin namespaces: returns 0 if any bin is in use, else -1.
// Otherwise the name is looked up in the namespace's bin-name vmap to get its
// id, and the record's bins are scanned for that id. The scan stops at the
// first bin that is not in use. Returns -1 if the name is not in the vmap or
// no bin matches.
int32_t
as_bin_get_index(as_storage_rd *rd, const char *name)
{
	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return 0;
		}

		return -1;
	}

	uint32_t wanted_id;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &wanted_id) !=
			CF_VMAPX_OK) {
		return -1;
	}

	uint16_t slot = 0;

	while (slot < rd->n_bins) {
		as_bin *candidate = &rd->bins[slot];

		if (! as_bin_inuse(candidate)) {
			break;
		}

		if ((uint32_t)candidate->id == wanted_id) {
			return (int32_t)slot;
		}

		slot++;
	}

	return -1;
}
// True only if urecord is not a sub-record, is flagged open, and its storage
// rd has no bin in use. The checks are made in that order, so
// as_bin_inuse_has() is only consulted for open, non-sub records.
static inline bool
udf_zero_bins_left(udf_record *urecord)
{
	if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
		return false;
	}

	if (! (urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		return false;
	}

	if (as_bin_inuse_has(urecord->rd)) {
		return false;
	}

	return true;
}
// Does not check bin name length. // Checks bin name quota - use appropriately. as_bin * as_bin_get_or_create(as_storage_rd *rd, const char *name) { if (rd->ns->single_bin) { if (! as_bin_inuse_has(rd)) { as_bin_init_nameless(rd->bins); } return rd->bins; } uint32_t id = (uint32_t)-1; uint16_t i; as_bin *b; if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &id) == CF_VMAPX_OK) { for (i = 0; i < rd->n_bins; i++) { b = &rd->bins[i]; if (! as_bin_inuse(b)) { break; } if ((uint32_t)b->id == id) { return b; } } } else { if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, name); return NULL; } i = as_bin_inuse_count(rd); } if (i >= rd->n_bins) { cf_crash(AS_BIN, "ran out of allocated bins in rd"); } b = &rd->bins[i]; if (id == (uint32_t)-1) { as_bin_init(rd->ns, b, name); } else { as_bin_init_nameless(b); b->id = (uint16_t)id; } return b; }
// Look up a bin by its null-terminated name.
//
// Single-bin namespaces: returns rd->bins if any bin is in use, else NULL.
// Otherwise the name is looked up in the namespace's bin-name vmap; if it is
// not there NULL is returned, else the result of as_bin_get_by_id() for the
// id found.
as_bin *
as_bin_get(as_storage_rd *rd, const char *name)
{
	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return rd->bins;
		}

		return NULL;
	}

	uint32_t name_id;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &name_id) ==
			CF_VMAPX_OK) {
		return as_bin_get_by_id(rd, name_id);
	}

	return NULL;
}
// Look up a bin by (name, namesz) buffer, registering the name in the
// namespace's bin-name vmap if it is not already there.
//
// Outputs:
//  *p_reserved - set true up front; set false only if the name was not in the
//                vmap and could not be added (quota full, or
//                cf_vmapx_put_unique() returned something other than
//                CF_VMAPX_OK / CF_VMAPX_ERR_NAME_EXISTS).
//  *p_idx      - passed to the vmap lookup / insert calls to receive the
//                name's id. Not touched for single-bin namespaces.
//
// Returns the matching in-use bin, or NULL if there is none. In particular
// NULL is always returned when the name was not already in the vmap. For
// single-bin namespaces returns rd->bins if any bin is in use, else NULL.
as_bin *
as_bin_get_and_reserve_name(as_storage_rd *rd, byte *name, size_t namesz,
		bool *p_reserved, uint32_t *p_idx)
{
	*p_reserved = true;

	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return rd->bins;
		}

		return NULL;
	}

	// The vmap calls used here take a null-terminated name - make a copy.
	char zname[namesz + 1];

	memcpy(zname, name, namesz);
	zname[namesz] = 0;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, zname, p_idx) ==
			CF_VMAPX_OK) {
		uint16_t slot = 0;

		while (slot < rd->n_bins) {
			as_bin *candidate = &rd->bins[slot];

			if (! as_bin_inuse(candidate)) {
				break;
			}

			if ((uint32_t)candidate->id == *p_idx) {
				return candidate;
			}

			slot++;
		}

		return NULL;
	}

	// Name is not registered yet - try to reserve it. Either way there is no
	// bin to return.

	if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) {
		cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, zname);
		*p_reserved = false;
		return NULL;
	}

	cf_vmapx_err result = cf_vmapx_put_unique(rd->ns->p_bin_name_vmap, zname,
			p_idx);

	if (result != CF_VMAPX_OK && result != CF_VMAPX_ERR_NAME_EXISTS) {
		cf_warning(AS_BIN, "{%s} can't add new bin name %s, vmap err %d", rd->ns->name, zname, result);
		*p_reserved = false;
	}

	return NULL;
}
// Service a read transaction from the local record.
//
// Looks up the record for tr->keyd, and (unless the record is missing, doomed,
// a tombstone, fails the replication-state or key checks, or only metadata was
// requested) loads its bins, resolves the requested bin ops, and builds and
// sends the response.
//
// Returns:
//  TRANS_DONE_SUCCESS - a success response was handed off.
//  TRANS_DONE_ERROR   - read_local_done() was called with an error code.
//  TRANS_IN_PROGRESS / TRANS_WAITING - repl_state_check() returned non-zero
//      (other than -3); no response was sent to the origin.
transaction_status
read_local(as_transaction* tr)
{
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	as_index_ref r_ref;

	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
		read_local_done(tr, NULL, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_record* r = r_ref.r;

	// Check if it's an expired or truncated record.
	if (as_record_is_doomed(r, ns)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	int result = repl_state_check(r, tr);

	if (result != 0) {
		// -3 is reported to the origin as unavailable.
		if (result == -3) {
			read_local_done(tr, &r_ref, NULL, AS_ERR_UNAVAILABLE);
			return TRANS_DONE_ERROR;
		}

		// No response sent to origin.
		as_record_done(&r_ref, ns);
		return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
	}

	// Check if it's a tombstone.
	if (! as_record_is_live(r)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_storage_rd rd;

	as_storage_record_open(ns, r, &rd);

	// If configuration permits, allow reads to use page cache.
	rd.read_page_cache = ns->storage_read_page_cache;

	// Check the key if required.
	// Note - for data-not-in-memory "exists" ops, key check is expensive!
	if (as_transaction_has_key(tr) &&
			as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
		read_local_done(tr, &r_ref, &rd, AS_ERR_KEY_MISMATCH);
		return TRANS_DONE_ERROR;
	}

	// Metadata-only request - copy the record metadata to the transaction and
	// finish without loading any bins.
	if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		read_local_done(tr, &r_ref, &rd, AS_OK);
		return TRANS_DONE_SUCCESS;
	}

	// The load calls return a negative value on failure - its negation is used
	// as the result code.
	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	// Stack space for the bins is only sized for data-not-in-memory
	// namespaces.
	as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];

	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	if (! as_bin_inuse_has(&rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
		read_local_done(tr, &r_ref, &rd, AS_ERR_UNKNOWN);
		return TRANS_DONE_ERROR;
	}

	// Upper bound on the number of response entries - all bins for "get all",
	// otherwise one per op in the message.
	uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ?
			rd.n_bins : m->n_ops;

	as_msg_op* ops[bin_count];
	as_msg_op** p_ops = ops;
	as_bin* response_bins[bin_count];
	uint16_t n_bins = 0;

	// Holds bins produced by CDT read ops - these are passed to
	// destroy_stack_bins() on every exit path below.
	as_bin result_bins[bin_count];
	uint32_t n_result_bins = 0;

	if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
		p_ops = NULL;
		n_bins = rd.n_bins;
		as_bin_get_all_p(&rd, response_bins);
	}
	else {
		if (m->n_ops == 0) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
			read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
			return TRANS_DONE_ERROR;
		}

		// With respond_all_ops, an op with no resulting bin still gets a
		// response entry, with a NULL bin.
		bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

		as_msg_op* op = 0;
		int n = 0;

		while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
			if (op->op == AS_MSG_OP_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b || respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = b;
				}
			}
			else if (op->op == AS_MSG_OP_CDT_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b) {
					as_bin* rb = &result_bins[n_result_bins];

					as_bin_set_empty(rb);

					if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
						cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
						destroy_stack_bins(result_bins, n_result_bins);
						read_local_done(tr, &r_ref, &rd, -result);
						return TRANS_DONE_ERROR;
					}

					// The result slot is only consumed if the CDT read
					// left a bin in use.
					if (as_bin_inuse(rb)) {
						n_result_bins++;
						ops[n_bins] = op;
						response_bins[n_bins++] = rb;
					}
					else if (respond_all_ops) {
						ops[n_bins] = op;
						response_bins[n_bins++] = NULL;
					}
				}
				else if (respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = NULL;
				}
			}
			else {
				cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
				destroy_stack_bins(result_bins, n_result_bins);
				read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
				return TRANS_DONE_ERROR;
			}
		}
	}

	cf_dyn_buf_define_size(db, 16 * 1024);

	if (tr->origin != FROM_BATCH) {
		// Build the response into db now, send it after the record is
		// released (below).
		db.used_sz = db.alloc_sz;
		db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code,
				r->generation, r->void_time, p_ops, response_bins, n_bins, ns,
				(cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));

		db.is_stack = db.buf == dyn_bufdb;
		// Note - not bothering to correct alloc_sz if buf was allocated.
	}
	else {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		// Since as_batch_add_result() constructs response directly in shared
		// buffer to avoid extra copies, can't use db.
		send_read_response(tr, p_ops, response_bins, n_bins, NULL);
	}

	destroy_stack_bins(result_bins, n_result_bins);

	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// Now that we're not under the record lock, send the message we just built.
	// NOTE(review): relies on db.used_sz being 0 in the batch case, i.e. on
	// cf_dyn_buf_define_size() initializing it to 0 - confirm.
	if (db.used_sz != 0) {
		send_read_response(tr, NULL, NULL, 0, &db);

		cf_dyn_buf_free(&db);
		tr->from.proto_fd_h = NULL;
	}

	return TRANS_DONE_SUCCESS;
}
/** * aerospike::create(record) * Function: udf_aerospike_rec_create * * Parameters: * as - as_aerospike * rec - as_rec * * Return Values: * 1 if record is being read or on a create, it already exists * o/w return value of udf_aerospike__execute_updates * * Description: * Create a new record in local storage. * The record will only be created if it does not exist. * This assumes the record has a digest that is valid for local storage. * * Synchronization : object lock acquired by the transaction thread executing UDF. * Partition reservation takes place just before the transaction starts executing * ( look for as_partition_reserve_udf in thr_tsvc.c ) * * Callers: * lua interfacing function, mod_lua_aerospike_rec_create * The return value of udf_aerospike_rec_create is pushed on to the lua stack * * Notes: * The 'read' and 'exists' flag of udf_record are set to true. */ static int udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) { int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); if (ret) { return ret; } udf_record * urecord = (udf_record *) as_rec_source(rec); // make sure record isn't already successfully read if (urecord->flag & UDF_RECORD_FLAG_OPEN) { cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists"); return 1; } as_transaction *tr = urecord->tr; as_index_ref *r_ref = urecord->r_ref; as_storage_rd *rd = urecord->rd; as_index_tree *tree = tr->rsv.tree; if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) { tree = tr->rsv.sub_tree; } // make sure we got the record as a create int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns); cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord", (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : ""); // rv 0 means record exists, 1 means create, < 0 means fail // TODO: Verify correct result codes. 
if (rv == 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2"); as_record_done(r_ref, tr->rsv.ns); bzero(r_ref, sizeof(as_index_ref)); return 1; } else if (rv < 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv); return rv; } // Associates the set name with the storage rec and index if(tr->msgp) { // Set the set name to index and close record if the setting the set name // is not successful int rv_set = as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg); if (rv_set != 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname"); as_record_done(r_ref, tr->rsv.ns); // TODO bzero is expensive. Switch to use flag. bzero(r_ref, sizeof(as_index_ref)); return 4; } } urecord->flag |= UDF_RECORD_FLAG_OPEN; cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); as_index *r = r_ref->r; // open up storage as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r, urecord->rd, &urecord->tr->keyd); cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p", urecord->r_ref->r, urecord->rd); // if multibin storage, we will use urecord->stack_bins, so set the size appropriately if ( ! rd->ns->storage_data_in_memory && ! rd->ns->single_bin ) { rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); } // side effect: will set the unused bins to properly unused rd->bins = as_bin_get_all(r, rd, urecord->stack_bins); urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN; // If the message has a key, apply it to the record. 
as_msg_field* f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_KEY); if (f) { rd->key_size = as_msg_field_get_value_sz(f); rd->key = f->data; } cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag); int rc = udf_aerospike__execute_updates(urecord); if(rc) { // Creating the udf record failed, destroy the as_record if (!as_bin_inuse_has(urecord->rd)) { udf_aerospike_rec_remove(as, rec); } } return rc; }
// Does not check bin name length. // Checks bin name quota and bin-level policy - use appropriately. as_bin * as_bin_get_or_create_from_buf(as_storage_rd *rd, byte *name, size_t namesz, bool create_only, bool replace_only, int *p_result) { if (rd->ns->single_bin) { if (! as_bin_inuse_has(rd)) { as_bin_init_nameless(rd->bins); } // Ignored bin-level policy - single-bin needs only record-level policy. return rd->bins; } uint32_t id = (uint32_t)-1; uint16_t i; as_bin *b; if (cf_vmapx_get_index_w_len(rd->ns->p_bin_name_vmap, (const char *)name, namesz, &id) == CF_VMAPX_OK) { for (i = 0; i < rd->n_bins; i++) { b = &rd->bins[i]; if (! as_bin_inuse(b)) { break; } if ((uint32_t)b->id == id) { if (as_bin_is_hidden(b)) { cf_warning(AS_BIN, "cannot manipulate hidden bin directly"); *p_result = AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; return NULL; } if (create_only) { *p_result = AS_PROTO_RESULT_FAIL_BIN_EXISTS; return NULL; } return b; } } } else { if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { char zname[namesz + 1]; memcpy(zname, name, namesz); zname[namesz] = 0; cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, zname); *p_result = AS_PROTO_RESULT_FAIL_BIN_NAME; return NULL; } i = as_bin_inuse_count(rd); } if (replace_only) { *p_result = AS_PROTO_RESULT_FAIL_BIN_NOT_FOUND; return NULL; } if (i >= rd->n_bins) { cf_crash(AS_BIN, "ran out of allocated bins in rd"); } b = &rd->bins[i]; if (id == (uint32_t)-1) { as_bin_init_w_len(rd->ns, b, name, namesz); } else { as_bin_init_nameless(b); b->id = (uint16_t)id; } return b; }
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. Record is closed
 *		2. urecord_op is updated to delete in case there is no bin left in it.
 *		3. record->pickled_buf is populated before the record is close in case
 *		   it was write operation
 *		4. UDF updates cache is cleared
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 *				set_id           - set id passed to xdr_write(); overwritten
 *								   with the record's own set id if the record
 *								   is still flagged open
 */
void
udf_rw_post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd  *rd   = urecord->rd;
	as_transaction *tr   = urecord->tr;
	as_index_ref   *r_ref = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	urecord->pickled_void_time     = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	// TODO: optimize not to allocate buffer if it is single
	// node cluster. No remote to send data to
	// Check if UDF has updates.
	if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
		// Check if the record is not deleted after an update
		if ( urecord->flag & UDF_RECORD_FLAG_OPEN) {
			*urecord_op = UDF_OPTYPE_WRITE;
			udf_xdr_ship_op = true;
		}
		else {
			// If the record has updates and it is not open,
			// and if it pre-existed it's an update followed by a delete.
			if ( urecord->flag & UDF_RECORD_FLAG_PREEXISTS) {
				*urecord_op = UDF_OPTYPE_DELETE;
				udf_xdr_ship_op = true;
			}
			// If the record did not pre-exist and is updated
			// and it is not open, then it is create followed by
			// delete essentially no_op.
			else {
				*urecord_op = UDF_OPTYPE_NONE;
			}
		}
	} else if ((urecord->flag & UDF_RECORD_FLAG_PREEXISTS)
			   && !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		// No updates, but a pre-existing record is no longer open - treated
		// as a delete.
		*urecord_op  = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else {
		*urecord_op  = UDF_OPTYPE_READ;
	}

	// NOTE(review): the first %p argument is &urecord (address of the local
	// pointer), not urecord itself - confirm that is intended.
	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", &urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (!(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD)
			&& urecord->flag & UDF_RECORD_FLAG_OPEN
			&& !as_bin_inuse_has(rd)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		// NOTE(review): rd->keyd is passed for a PRIx64 conversion here,
		// whereas other log lines cast a digest via *(uint64_t *)& - confirm
		// the argument type matches the format.
		cf_detail(AS_UDF, "Committing Changes %"PRIx64" n_bins %d", rd->keyd, as_bin_get_n_bins(r_ref->r, rd));

		// Stack buffer for the record properties - handed to the storage rd
		// only if there are any properties.
		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		// Populates the urecord's pickled_* fields (passed as out-parameters).
		write_local_post_processing(tr, tr->rsv.ns, NULL, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_void_time,
			&urecord->pickled_rec_props, true/*increment_generation*/,
			NULL, r_ref->r, rd, urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	// Close the record for all the cases
	udf_record_close(urecord, false);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op == true) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			// NOTE(review): tr->keyd is passed directly for a PRIx64
			// conversion in the two log lines below - confirm the argument
			// type matches the format.
			cf_detail(AS_UDF, "UDF write shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF delete shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}

	// Replication happens when the main record replicates
	if (urecord->particle_data) {
		cf_free(urecord->particle_data);
		urecord->particle_data = 0;
	}
	udf_record_cache_free(urecord);
}
/** * aerospike::create(record) * Function: udf_aerospike_rec_create * * Parameters: * as - as_aerospike * rec - as_rec * * Return Values: * 1 if record is being read or on a create, it already exists * o/w return value of udf_aerospike__execute_updates * * Description: * Create a new record in local storage. * The record will only be created if it does not exist. * This assumes the record has a digest that is valid for local storage. * * Synchronization : object lock acquired by the transaction thread executing UDF. * Partition reservation takes place just before the transaction starts executing * ( look for as_partition_reserve_udf in thr_tsvc.c ) * * Callers: * lua interfacing function, mod_lua_aerospike_rec_create * The return value of udf_aerospike_rec_create is pushed on to the lua stack * * Notes: * The 'read' and 'exists' flag of udf_record are set to true. */ static int udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec) { int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__); if (ret) { return ret; } udf_record * urecord = (udf_record *) as_rec_source(rec); // make sure record isn't already successfully read if (urecord->flag & UDF_RECORD_FLAG_OPEN) { cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists"); return 1; } as_transaction *tr = urecord->tr; as_index_ref *r_ref = urecord->r_ref; as_storage_rd *rd = urecord->rd; as_index_tree *tree = tr->rsv.tree; bool is_subrec = false; if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) { tree = tr->rsv.sub_tree; is_subrec = true; } // make sure we got the record as a create bool is_create = false; int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns, is_subrec); cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord", (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : ""); // rv 0 means record exists, 1 means create, < 0 means fail // TODO: Verify correct result codes. 
if (rv == 1) { is_create = true; } else if (rv == 0) { // If it's an expired record, pretend it's a fresh create. if (as_record_is_expired(r_ref->r)) { as_record_destroy(r_ref->r, tr->rsv.ns); as_record_initialize(r_ref, tr->rsv.ns); cf_atomic_int_incr(&tr->rsv.ns->n_objects); is_create = true; } else { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2"); as_record_done(r_ref, tr->rsv.ns); // DO NOT change it has special meaning for caller return 1; } } else if (rv < 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv); return rv; } // Associates the set name with the storage rec and index if (tr->msgp) { // Set the set name to index and close record if the setting the set name // is not successful int rv_set = as_transaction_has_set(tr) ? as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0; if (rv_set != 0) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname"); if (is_create) { as_index_delete(tree, &tr->keyd); } as_record_done(r_ref, tr->rsv.ns); return 4; } } urecord->flag |= UDF_RECORD_FLAG_OPEN; cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); as_index *r = r_ref->r; // open up storage as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r, urecord->rd, &urecord->tr->keyd); cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p", urecord->r_ref->r, urecord->rd); // If the message has a key, apply it to the record. if (! get_msg_key(tr, rd)) { cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key"); if (is_create) { as_index_delete(tree, &tr->keyd); } as_record_done(r_ref, tr->rsv.ns); urecord->flag &= ~UDF_RECORD_FLAG_OPEN; return 4; } // if multibin storage, we will use urecord->stack_bins, so set the size appropriately if ( ! rd->ns->storage_data_in_memory && ! 
rd->ns->single_bin ) { rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); } // side effect: will set the unused bins to properly unused rd->bins = as_bin_get_all(r, rd, urecord->stack_bins); urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN; cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd); cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag); int rc = udf_aerospike__execute_updates(urecord); if (rc) { // Creating the udf record failed, destroy the as_record cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc); if (!as_bin_inuse_has(urecord->rd)) { udf_aerospike_rec_remove(as, rec); } } return rc; }
// Apply a replicated (pickled) record write on this node.
//
// Gets or creates the record for keyd in the reservation's tree (the sub-tree
// for LDT sub-records / ESRs when LDT is enabled), replaces its bins from
// pickled_buf, stamps it with the supplied generation, void-time and
// last-update-time, writes it to storage, and optionally hands the write to
// XDR. A pickle that leaves no bin in use is treated as a delete - the record
// is removed from the tree.
//
// Returns:
//  AS_PROTO_RESULT_OK on success.
//  AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE if storage has no space.
//  AS_PROTO_RESULT_FAIL_UNKNOWN if the record can't be obtained, the LDT
//      prole version check fails, or unpickling fails. On the latter two, a
//      record created by this call is deleted from the tree again.
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	// LDT sub-records and ESRs go in the sub-tree; LDT parents are noted for
	// the version handling further down.
	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	// rv 1 is handled as a newly created record, anything else as existing.
	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// TODO - we really need an inline utility for this!
	// The first two bytes of the pickle are read as a network-order bin count.
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	// For multi-bin data-not-in-memory namespaces, make room for the larger
	// of the current and incoming bin counts.
	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		// Undo a create done by this call before bailing out.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	// For an existing record, note the memory size before replacing its
	// contents - passed to as_storage_record_adjust_mem_stats() below.
	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		// Undo a create done by this call before bailing out.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	// For LDT parents, choose which version (if any) to stamp on the record:
	// the prole version when partition versions match and a prole version is
	// set, the source version when partition versions don't match.
	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	// Note - the storage write is done in the delete case too.
	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	// Grab the set id for XDR before releasing the record.
	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}