// If remote record is better than local record, replace local with remote.
//
// Used on the replication path: 'rr' is the incoming remote record, already
// reserved against a partition ('rr->rsv'). On every failure path the
// record_replace_failed() helper is responsible for undoing the index
// create/open (note it gets a NULL rd before the storage record is opened,
// and &rd after).
//
// Returns AS_OK on success (including the benign "record is truncated" case),
// or an AS_ERR_* code.
int
as_record_replace_if_better(as_remote_record *rr, bool is_repl_write,
		bool skip_sindex, bool do_xdr_write)
{
	as_namespace *ns = rr->rsv->ns;

	// Refuse the write outright if the namespace's drives are full.
	if (! as_storage_has_space(ns)) {
		cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name);
		return AS_ERR_OUT_OF_SPACE;
	}

	CF_ALLOC_SET_NS_ARENA(ns);

	as_index_tree *tree = rr->rsv->tree;

	as_index_ref r_ref;
	int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns);

	// Note - get/create failure is reported as out-of-space here, whatever
	// the underlying cause.
	if (rv < 0) {
		return AS_ERR_OUT_OF_SPACE;
	}

	bool is_create = rv == 1; // rv 1 means the index element was newly created
	as_index *r = r_ref.r;

	int result;
	conflict_resolution_pol policy = ns->conflict_resolution_policy;

	if (is_repl_write) {
		bool from_replica;

		// Reject replica writes from nodes that aren't a valid source.
		if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src,
				&from_replica)) != AS_OK) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return result;
		}

		repl_write_init_repl_state(rr, from_replica);
		// Replica writes may use a different conflict resolution policy.
		policy = repl_write_conflict_resolution_policy(ns);
	}

	if (! is_create && record_replace_check(r, ns) < 0) {
		record_replace_failed(rr, &r_ref, NULL, is_create);
		return AS_ERR_FORBIDDEN;
	}

	// If local record is better, no-op or fail. (resolve_conflict <= 0 means
	// the local record wins - 0 is "equal", negative is "local is newer".)
	if (! is_create && (result = as_record_resolve_conflict(policy,
			r->generation, r->last_update_time, (uint16_t)rr->generation,
			rr->last_update_time)) <= 0) {
		record_replace_failed(rr, &r_ref, NULL, is_create);
		return result == 0 ? AS_ERR_RECORD_EXISTS : AS_ERR_GENERATION;
	}
	// else - remote winner - apply it.

	// If creating record, write set-ID into index.
	if (is_create) {
		if (rr->set_name && (result = as_index_set_set_w_len(r, ns,
				rr->set_name, rr->set_name_len, false)) < 0) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return -result;
		}

		// Set last-update-time before the truncate check, which needs it.
		r->last_update_time = rr->last_update_time;

		// Don't write record if it would be truncated. (Returns AS_OK - the
		// no-op is not an error from the caller's point of view.)
		if (as_truncate_record_is_truncated(r, ns)) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return AS_OK;
		}
	}
	// else - not bothering to check that sets match.

	as_storage_rd rd;

	if (is_create) {
		as_storage_record_create(ns, r, &rd);
	}
	else {
		as_storage_record_open(ns, r, &rd);
	}

	// TODO - old pickle - remove condition in "six months".
	if (rr->is_old_pickle) {
		// Prepare to store set name, if there is one.
		rd.set_name = rr->set_name;
		rd.set_name_len = rr->set_name_len;
	}
	else {
		rd.pickle = rr->pickle;
		rd.pickle_sz = rr->pickle_sz;
		rd.orig_pickle_sz = as_flat_orig_pickle_size(rr, rd.pickle_sz);
	}

	// Note - deal with key after reading existing record (if such), in case
	// we're dropping the key.

	// Split according to configuration to replace local record - four
	// combinations of data-in-memory x single-bin.
	bool is_delete = false;

	if (ns->storage_data_in_memory) {
		if (ns->single_bin) {
			result = record_apply_dim_single_bin(rr, &rd, &is_delete);
		}
		else {
			result = record_apply_dim(rr, &rd, skip_sindex, &is_delete);
		}
	}
	else {
		if (ns->single_bin) {
			result = record_apply_ssd_single_bin(rr, &rd, &is_delete);
		}
		else {
			result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete);
		}
	}

	if (result != 0) {
		// Note - rd is open here, so it's passed to the failure helper.
		record_replace_failed(rr, &r_ref, &rd, is_create);
		return result;
	}

	uint16_t set_id = as_index_get_set_id(r); // save for XDR write

	record_replaced(r, rr);

	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// XDR write happens after the record lock is released.
	if (do_xdr_write) {
		xdr_write_replica(rr, is_delete, set_id);
	}

	return AS_OK;
}
/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 * 		rec -- udf_record to be updated
 *
 * Return Values:
 * 		 0 success
 * 		-1 failure
 *
 * Description:
 * 		This function applies all the updates atomically. That is,
 * 		if one of the bin update/delete/create fails, the entire function
 * 		will fail. If the nth update fails, all the n-1 updates are rolled
 * 		back to their initial values.
 *
 * Special Notes:
 * 		i. The basic checks of bin name being too long or if there is enough
 * 		space on the disk for the bin values is done before allocating space
 * 		for any of the bins.
 *
 * 		ii. If one of the updates to be rolled back is a bin creation,
 * 		udf_aerospike_delbin is called. This will not free up the bin metadata.
 * 		So there will be a small memory mismatch b/w replica (which did not get
 * 		the record at all and hence no memory is accounted) and the master.
 * 		To avoid such cases, we are doing checks upfront.
 *
 * Callers:
 * 		udf_aerospike__execute_updates
 * 		In this function, if udf_aerospike__apply_update_atomic fails, the
 * 		record is not committed to the storage. On success, record is closed
 * 		which commits to the storage and reopened for the next set of udf
 * 		updates. The return value from udf_aerospike__apply_update_atomic is
 * 		passed on to the callers of this function.
 */
int
udf_aerospike__apply_update_atomic(udf_record *urecord)
{
	int rc = 0;
	int failmax = 0;
	int new_bins = 0; // how many new bins have to be created in this update
	as_storage_rd *rd = urecord->rd;
	as_namespace *ns = rd->ns;
	bool has_sindex = as_sindex_ns_has_sindex(ns);
	// FIX: track whether we actually took the sindex read lock. The original
	// code could 'goto Rollback' (e.g. on the bin-limit check) before
	// SINDEX_GRLOCK() was taken, and Rollback would then SINDEX_GUNLOCK() a
	// lock that was never acquired.
	bool sindex_locked = false;
	bool is_record_dirty = false;
	bool is_record_flag_dirty = false;
	uint8_t old_index_flags = as_index_get_flags(rd->r);
	uint8_t new_index_flags = 0;

	// This will iterate over all the updates and apply them to storage.
	// The items will remain, and be used as cache values. If an error
	// occurred during setbin(), we rollback all the operations and return
	// failure.
	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

	// Loop twice - the first pass only counts how many new bins need to be
	// created, so the limit/space checks can be done up front, before any
	// bin is actually touched.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		if (urecord->updates[i].dirty) {
			char *k = urecord->updates[i].name;

			if (k != NULL) {
				if (! as_bin_get(rd, k)) {
					new_bins++;
				}
			}
		}
	}

	// Free bins - total bins not in use in the record.
	// Delta bins - new bins that need to be created.
	int inuse_bins = as_bin_inuse_count(rd);
	int free_bins = rd->n_bins - inuse_bins;
	int delta_bins = new_bins - free_bins;

	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
			rd->n_bins, as_bin_inuse_count(urecord->rd), free_bins, new_bins,
			delta_bins);

	// Check bin usage limit. Note - reached before the sindex lock is taken,
	// so Rollback must not unlock (failmax is still 0 - nothing to undo).
	if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) ||
			(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
		cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
				(int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
				(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "",
				new_bins);
		goto Rollback;
	}

	// Allocate space for all the new bins that need to be created beforehand.
	if (delta_bins > 0 && rd->ns->storage_data_in_memory &&
			! rd->ns->single_bin) {
		as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
	}

	if (! rd->ns->storage_data_in_memory && ! urecord->particle_data) {
		// 256 as upper bound on the LDT control bin, we may write version
		// below - leave it at the end for its use.
		urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size + 256);
		urecord->cur_particle_data = urecord->particle_data;
		urecord->end_particle_data = urecord->particle_data +
				rd->ns->storage_write_block_size;
	}

	if (has_sindex) {
		SINDEX_GRLOCK();
		sindex_locked = true;
	}

	// In second iteration apply updates.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		urecord->updates[i].oldvalue = NULL;
		urecord->updates[i].washidden = false;

		if (urecord->updates[i].dirty && rc == 0) {
			char *k = urecord->updates[i].name;
			as_val *v = urecord->updates[i].value;
			bool h = urecord->updates[i].ishidden;

			if (k != NULL) {
				if (v == NULL || v->type == AS_NIL) {
					// If the value is NIL, then do a delete.
					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					// Only case delete fails is if bin is not found - that is
					// as good as delete. Ignore return code !!
					udf_aerospike_delbin(urecord, k);

					if (urecord->dirty != NULL) {
						xdr_fill_dirty_bins(urecord->dirty);
					}
				}
				else {
					// Otherwise, it is a set.
					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);

					if (rc) {
						// This failed update's own cached oldvalue must not
						// be "rolled back" - destroy it here and roll back
						// only the preceding i updates.
						if (urecord->updates[i].oldvalue) {
							as_val_destroy(urecord->updates[i].oldvalue);
							urecord->updates[i].oldvalue = NULL;
						}

						failmax = i;
						goto Rollback;
					}

					if (urecord->dirty != NULL) {
						xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
					}
				}
			}

			is_record_dirty = true;
		}
	}

	if (urecord->ldt_rectype_bit_update) {
		if (urecord->ldt_rectype_bit_update < 0) {
			// ldt_rectype_bit_update is negative in case we want to reset
			// the bits.
			uint8_t rectype_bits = urecord->ldt_rectype_bit_update * -1;
			new_index_flags = old_index_flags & ~rectype_bits;
		}
		else {
			new_index_flags = old_index_flags | urecord->ldt_rectype_bit_update;
		}

		if (new_index_flags != old_index_flags) {
			as_index_clear_flags(rd->r, old_index_flags);
			as_index_set_flags(rd->r, new_index_flags);
			is_record_flag_dirty = true;
			cf_detail_digest(AS_RW, &urecord->tr->keyd, "Setting index flags from %d to %d new flag %d",
					old_index_flags, new_index_flags, as_index_get_flags(rd->r));
		}
	}

	{
		// This is _NOT_ for writing to the storage but for simply performing
		// sizing calculation. If we know the upper bounds of size of
		// rec_props, we could avoid this work and check with that much
		// correction ...
		//
		// See
		//  - udf_rw_post_processing for building rec_props for replication
		//  - udf_record_close for building rec_props for writing it to storage
		size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];

		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		// Version is set in the end after record size check. Setting version
		// won't change the size of the record. And if it were before size
		// check then this setting of version as well needs to be backed out.
		// TODO: Add backout logic - would work till very first create call of
		// LDT ends up crossing over record boundary.
		if (rd->ns->ldt_enabled && as_ldt_record_is_parent(rd->r)) {
			int rv = as_ldt_parent_storage_set_version(rd,
					urecord->lrecord->version, urecord->end_particle_data,
					__FILE__, __LINE__);

			if (rv < 0) {
				cf_warning(AS_LDT, "udf_aerospike__apply_update_atomic: Internal Error "
						" [Failed to set the version on storage rv=%d]... Fail", rv);
				goto Rollback;
			}

			// TODO - if size check below fails, won't write to device -
			// different behavior than write_to_device flag - OK?
			is_record_dirty = true;
		}

		if (! as_storage_record_size_and_check(rd)) {
			cf_warning(AS_UDF, "record failed storage size check, will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
			cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! as_storage_has_space(rd->ns)) {
			cf_warning(AS_UDF, "drives full, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
			cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}
	}

	if (has_sindex) {
		SINDEX_GUNLOCK();
		sindex_locked = false;
	}

	// If there were updates, do miscellaneous successful commit tasks.
	if (is_record_dirty || is_record_flag_dirty ||
			(urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
		urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES; // will write to storage
	}

	urecord->ldt_rectype_bit_update = 0;

	// Clean up oldvalue cache and reset dirty. All the changes made here have
	// made it to the particle buffer. Nothing will now be backed out.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		udf_record_bin *bin = &urecord->updates[i];

		if (bin->oldvalue != NULL) {
			as_val_destroy(bin->oldvalue);
			bin->oldvalue = NULL;
		}

		bin->dirty = false;
	}

	return rc;

Rollback:
	cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);

	// Undo the first 'failmax' updates by re-applying each cached oldvalue.
	for (int i = 0; i < failmax; i++) {
		if (urecord->updates[i].dirty) {
			char *k = urecord->updates[i].name;
			// Pick the oldvalue for rollback.
			as_val *v = urecord->updates[i].oldvalue;
			bool h = urecord->updates[i].washidden;

			if (k != NULL) {
				if (v == NULL || v->type == AS_NIL) {
					// If the value is NIL, then do a delete.
					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
					rc = udf_aerospike_delbin(urecord, k);
				}
				else {
					// Otherwise, it is a set.
					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);

					if (rc) {
						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
					}
				}
			}

			if (v) {
				as_val_destroy(v);
				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
			}
		}
	}

	if (is_record_dirty && urecord->dirty != NULL) {
		xdr_clear_dirty_bins(urecord->dirty);
	}

	if (is_record_flag_dirty) {
		as_index_clear_flags(rd->r, new_index_flags);
		as_index_set_flags(rd->r, old_index_flags);
		is_record_flag_dirty = false;
	}

	urecord->ldt_rectype_bit_update = 0;

	// Only unlock if this rollback path was reached after the lock was
	// actually taken (see sindex_locked above).
	if (sindex_locked) {
		SINDEX_GUNLOCK();
	}

	// Reset the flat size in case the stuff is backed out !!! It should not
	// fail in the backout code ...
	if (! as_storage_record_size_and_check(rd)) {
		cf_warning(AS_LDT, "Does not fit even after rollback... it is trouble");
	}

	// Do not clean up the cache in case of failure.
	return -1;
}
int write_replica(as_partition_reservation* rsv, cf_digest* keyd, uint8_t* pickled_buf, size_t pickled_sz, const as_rec_props* p_rec_props, as_generation generation, uint32_t void_time, uint64_t last_update_time, cf_node master, uint32_t info, ldt_prole_info* linfo) { as_namespace* ns = rsv->ns; if (! as_storage_has_space(rsv->ns)) { cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name); return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE; } as_index_tree* tree = rsv->tree; bool is_subrec = false; bool is_ldt_parent = false; if (ns->ldt_enabled) { if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) { tree = rsv->sub_tree; is_subrec = true; } else if ((info & RW_INFO_LDT_PARENTREC) != 0) { is_ldt_parent = true; } } as_index_ref r_ref; r_ref.skip_lock = false; int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec); if (rv < 0) { cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name); return AS_PROTO_RESULT_FAIL_UNKNOWN; } as_record* r = r_ref.r; as_storage_rd rd; bool is_create = false; if (rv == 1) { as_storage_record_create(ns, r, &rd, keyd); is_create = true; } else { as_storage_record_open(ns, r, &rd, keyd); } bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0; rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent; rd.n_bins = as_bin_get_n_bins(r, &rd); // TODO - we really need an inline utility for this! uint16_t newbins = ntohs(*(uint16_t*)pickled_buf); if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin && newbins > rd.n_bins) { rd.n_bins = newbins; } as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; rd.bins = as_bin_get_all(r, &rd, stack_bins); uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ? 0 : as_record_buf_get_stack_particles_sz(pickled_buf); uint8_t stack_particles[stack_particles_sz + 256]; uint8_t* p_stack_particles = stack_particles; // + 256 for LDT control bin, to hold version. if (! 
ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) { if (is_create) { as_index_delete(tree, keyd); } as_storage_record_close(r, &rd); as_record_done(&r_ref, ns); return AS_PROTO_RESULT_FAIL_UNKNOWN; } uint64_t memory_bytes = 0; if (! is_create) { memory_bytes = as_storage_record_get_n_bytes_memory(&rd); } as_record_set_properties(&rd, p_rec_props); if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz, &p_stack_particles, has_sindex) != 0) { if (is_create) { as_index_delete(tree, keyd); } as_storage_record_close(r, &rd); as_record_done(&r_ref, ns); return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity? } r->generation = generation; r->void_time = void_time; r->last_update_time = last_update_time; as_storage_record_adjust_mem_stats(&rd, memory_bytes); uint64_t version_to_set = 0; bool set_version = false; if (is_ldt_parent) { if (linfo->replication_partition_version_match && linfo->ldt_prole_version_set) { version_to_set = linfo->ldt_prole_version; set_version = true; } else if (! linfo->replication_partition_version_match) { version_to_set = linfo->ldt_source_version; set_version = true; } } if (set_version) { int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set, p_stack_particles, __FILE__, __LINE__); if (ldt_rv < 0) { cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv); // TODO - roll back. } } bool is_delete = false; if (! as_bin_inuse_has(&rd)) { // A master write that deletes a record by deleting (all) bins sends a // binless pickle that ends up here. is_delete = true; as_index_delete(tree, keyd); } as_storage_record_write(r, &rd); as_storage_record_close(r, &rd); uint16_t set_id = as_index_get_set_id(r); as_record_done(&r_ref, ns); // Don't send an XDR delete if it's disallowed. if (is_delete && ! is_xdr_delete_shipping_enabled()) { // TODO - should we also not ship if there was no record here before? 
return AS_PROTO_RESULT_OK; } // Do XDR write if the write is a non-XDR write or forwarding is enabled. if ((info & RW_INFO_XDR) == 0 || is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) { xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL); } return AS_PROTO_RESULT_OK; }