/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 * 		urecord -- udf_record whose cached updates are to be applied
 *
 * Return Values:
 * 		 0  success
 * 		-1  failure (a bin set failed and the applied updates were rolled back)
 *
 * Description:
 * 		Applies all dirty cached updates atomically. If the nth update fails,
 * 		updates 0..n are restored to the values read from storage just before
 * 		each of them was applied.
 *
 * Special Notes:
 * 		i.  For data-in-memory, multi-bin namespaces, space for all bins that
 * 		    will be newly created is allocated up front, before any update is
 * 		    applied.
 *
 * 		ii. Rolling back a bin creation is done with udf_aerospike_delbin.
 *
 * 		iii. Every saved old value is destroyed exactly once here and its slot
 * 		    is reset to NULL, so no dangling pointer is left in the update
 * 		    cache (the cache is deliberately kept alive on failure).
 *
 * Callers:
 * 		udf_aerospike__execute_updates
 * 		The return value of this function is passed on to its callers.
 */
int
udf_aerospike__apply_update_atomic(udf_record *urecord)
{
	int rc = 0;
	int failindex = 0;
	int new_bins = 0;	// How many new bins have to be created in this update
	as_storage_rd * rd = urecord->rd;

	// This will iterate over all the updates and apply them to storage.
	// The items will remain, and be used as cache values. If an error
	// occurs during setbin(), all operations applied so far are rolled
	// back and failure is returned.
	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

	// Loop twice: the first pass only sizes the work, the second pass applies
	// it, so that anything going wrong can be rolled back. Deletes will
	// generally go through successfully.

	// In first iteration, just calculate how many new bins need to be created
	for (int i = 0; i < urecord->nupdates; i++) {
		if ( urecord->updates[i].dirty ) {
			char * k = urecord->updates[i].name;
			if ( k != NULL ) {
				if ( !as_bin_get(rd, (uint8_t *)k, strlen(k)) ) {
					new_bins++;
				}
			}
		}
	}

	// Free bins - total bins not in use in the record
	// Delta bins - new bins that need to be created
	int free_bins = urecord->rd->n_bins - as_bin_inuse_count(urecord->rd);
	int delta_bins = new_bins - free_bins;
	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
			urecord->rd->n_bins, as_bin_inuse_count(urecord->rd), free_bins, new_bins, delta_bins);

	// Allocate space for all the new bins that need to be created beforehand
	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
	}

	bool has_sindex = as_sindex_ns_has_sindex(rd->ns);
	if (has_sindex) {
		SINDEX_GRLOCK();
	}

	// In second iteration apply updates.
	for (int i = 0; i < urecord->nupdates; i++) {
		if ( urecord->updates[i].dirty && rc == 0) {

			char * k = urecord->updates[i].name;
			as_val * v = urecord->updates[i].value;
			bool h = urecord->updates[i].ishidden;

			urecord->updates[i].oldvalue = NULL;
			urecord->updates[i].washidden = false;

			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					// Only case delete fails if bin is not found that is
					// as good as delete. Ignore return code !!
					udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					rc = udf_aerospike_setbin(urecord, k, v, h);
					if (rc) {
						failindex = i;
						goto Rollback;
					}
				}
			}
		}
	}

	if (has_sindex) {
		SINDEX_GUNLOCK();
	}

	// Release the saved old values; they are only needed for rollback.
	for (int i = 0; i < urecord->nupdates; i++) {
		if ((urecord->updates[i].dirty)
				&& (urecord->updates[i].oldvalue)) {
			as_val_destroy(urecord->updates[i].oldvalue);
			// Do not leave a dangling pointer behind in the update cache.
			urecord->updates[i].oldvalue = NULL;
			cf_debug(AS_UDF, "REGULAR as_val_destroy()");
		}
	}

	// Commit successful do miscellaneous task
	// Set updated flag to true
	urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES;

	// Set up record to be flushed to storage
	urecord->rd->write_to_device = true;

	// Before committing to storage set the rec_type_bits ..
	cf_detail(AS_RW, "TO INDEX              Digest=%"PRIx64" bits %d %p",
			*(uint64_t *)&urecord->tr->keyd.digest[8], urecord->ldt_rectype_bits, urecord);
	as_index_set_flags(rd->r, urecord->ldt_rectype_bits);

	// Clean up the cache and start from 0 updates again.
	udf_record_cache_free(urecord);

	return rc;

Rollback:
	cf_debug(AS_UDF, "Rollback Called: FailIndex(%d)", failindex);
	// Restore every update up to and including the one that failed.
	for (int i = 0; i <= failindex; i++) {
		if (urecord->updates[i].dirty) {
			char * k = urecord->updates[i].name;
			// Pick the oldvalue for rollback
			as_val * v = urecord->updates[i].oldvalue;
			bool h = urecord->updates[i].washidden;
			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
					rc = udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
					rc = udf_aerospike_setbin(urecord, k, v, h);
					if (rc) {
						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
					}
				}
			}
			if (v) {
				as_val_destroy(v);
				// The cache outlives this call on failure; clear the slot so
				// the destroyed value can never be reused or freed again.
				urecord->updates[i].oldvalue = NULL;
				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
			}
		}
	}

	if (has_sindex) {
		SINDEX_GUNLOCK();
	}

	// Do not clean up the cache in case of failure
	return -1;
}
/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 * 		urecord -- udf_record whose cached updates are to be applied
 *
 * Return Values:
 * 		 0  success
 * 		-1  failure (applied updates were rolled back)
 *
 * Description:
 * 		Applies all dirty cached updates atomically. If any bin set fails, or
 * 		if one of the post-update checks fails (LDT version set, storage size,
 * 		stop-writes, drive space, ttl validity), the updates applied so far are
 * 		restored to the values read from storage just before each of them was
 * 		applied, and any index flag change is reverted.
 *
 * Special Notes:
 * 		i.  The UDF bin-count limit is checked before any bin is touched and
 * 		    before the sindex lock is taken.
 *
 * 		ii. For data-in-memory, multi-bin namespaces, space for all bins that
 * 		    will be newly created is allocated up front.
 *
 * 		iii. Rolling back a bin creation is done with udf_aerospike_delbin.
 *
 * 		iv. Every saved old value is destroyed exactly once here and its slot
 * 		    is reset to NULL, so no dangling pointer is left in the update
 * 		    cache (the cache is deliberately kept alive on failure).
 *
 * Callers:
 * 		udf_aerospike__execute_updates
 * 		The return value of this function is passed on to its callers.
 */
int
udf_aerospike__apply_update_atomic(udf_record *urecord)
{
	int rc = 0;
	int failmax = 0;	// updates [0, failmax) must be backed out on failure
	int new_bins = 0;	// How many new bins have to be created in this update
	as_storage_rd * rd = urecord->rd;
	as_namespace * ns = rd->ns;
	bool has_sindex = as_sindex_ns_has_sindex(ns);
	bool sindex_locked = false;	// true only while this function holds the sindex lock
	bool is_record_dirty = false;
	bool is_record_flag_dirty = false;
	uint8_t old_index_flags = as_index_get_flags(rd->r);
	uint8_t new_index_flags = 0;

	// This will iterate over all the updates and apply them to storage.
	// The items will remain, and be used as cache values. If an error
	// occurs during setbin(), all operations applied so far are rolled
	// back and failure is returned.
	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

	// Loop twice: the first pass only sizes the work, the second pass applies
	// it, so that anything going wrong can be rolled back. Deletes will
	// generally go through successfully.

	// In first iteration, just calculate how many new bins need to be created
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		if ( urecord->updates[i].dirty ) {
			char * k = urecord->updates[i].name;
			if ( k != NULL ) {
				if ( !as_bin_get(rd, k) ) {
					new_bins++;
				}
			}
		}
	}

	// Free bins - total bins not in use in the record
	// Delta bins - new bins that need to be created
	int inuse_bins = as_bin_inuse_count(rd);
	int free_bins = rd->n_bins - inuse_bins;
	int delta_bins = new_bins - free_bins;
	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
			rd->n_bins, inuse_bins, free_bins, new_bins, delta_bins);

	// Check bin usage limit. Nothing has been applied and the sindex lock is
	// not held yet, so Rollback has nothing to undo and must not unlock.
	if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) || (urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
		cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
				(int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
				(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "", new_bins);
		goto Rollback;
	}

	// Allocate space for all the new bins that need to be created beforehand
	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
	}

	if (!rd->ns->storage_data_in_memory && !urecord->particle_data) {
		// 256 as upper bound on the LDT control bin, we may write version below
		// leave it at the end for its use
		urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size + 256);
		urecord->cur_particle_data = urecord->particle_data;
		urecord->end_particle_data = urecord->particle_data + rd->ns->storage_write_block_size;
	}

	if (has_sindex) {
		SINDEX_GRLOCK();
		sindex_locked = true;
	}

	// In second iteration apply updates.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		urecord->updates[i].oldvalue = NULL;
		urecord->updates[i].washidden = false;
		if ( urecord->updates[i].dirty && rc == 0) {

			char * k = urecord->updates[i].name;
			as_val * v = urecord->updates[i].value;
			bool h = urecord->updates[i].ishidden;

			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					// Only case delete fails if bin is not found that is
					// as good as delete. Ignore return code !!
					udf_aerospike_delbin(urecord, k);

					if (urecord->dirty != NULL) {
						xdr_fill_dirty_bins(urecord->dirty);
					}
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);
					if (rc) {
						// Update i itself is excluded from rollback, so its
						// saved old value is released right here.
						if (urecord->updates[i].oldvalue) {
							as_val_destroy(urecord->updates[i].oldvalue);
							urecord->updates[i].oldvalue = NULL;
						}
						failmax = (int)i;
						goto Rollback;
					}

					if (urecord->dirty != NULL) {
						xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
					}
				}
			}

			is_record_dirty = true;
		}
	}

	if (urecord->ldt_rectype_bit_update) {
		if (urecord->ldt_rectype_bit_update < 0) {
			// ldt_rectype_bit_update is negative in case we want to reset the bits
			uint8_t rectype_bits = urecord->ldt_rectype_bit_update * -1;
			new_index_flags = old_index_flags & ~rectype_bits;
		}
		else {
			new_index_flags = old_index_flags | urecord->ldt_rectype_bit_update;
		}

		if (new_index_flags != old_index_flags) {
			as_index_clear_flags(rd->r, old_index_flags);
			as_index_set_flags(rd->r, new_index_flags);
			is_record_flag_dirty = true;
			cf_detail_digest(AS_RW, &urecord->tr->keyd, "Setting index flags from %d to %d new flag %d", old_index_flags, new_index_flags, as_index_get_flags(rd->r));
		}
	}

	{
		// This is _NOT_ for writing to the storage but for simply performing sizing
		// calculation. If we know the upper bounds of size of rec_props.. we could
		// avoid this work and check with that much correction ...
		//
		// See
		//  - udf_rw_post_processing for building rec_props for replication
		//  - udf_record_close for building rec_props for writing it to storage
		size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
		// A variable length array must have a size greater than zero, so keep
		// at least one byte even when there are no rec_props.
		uint8_t rec_props_data[rec_props_data_size > 0 ? rec_props_data_size : 1];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		// Version is set in the end after record size check. Setting version won't change the size of
		// the record. And if it were before size check then this setting of version as well needs to
		// be backed out.
		// TODO: Add backout logic would work till very first create call of LDT end up crossing over
		// record boundary
		if (rd->ns->ldt_enabled && as_ldt_record_is_parent(rd->r)) {
			int rv = as_ldt_parent_storage_set_version(rd, urecord->lrecord->version, urecord->end_particle_data, __FILE__, __LINE__);
			if (rv < 0) {
				cf_warning(AS_LDT, "udf_aerospike__apply_update_atomic: Internal Error "
							" [Failed to set the version on storage rv=%d]... Fail",rv);
				// All bin updates have been applied by now; back every one of
				// them out, exactly like the other post-update checks below.
				failmax = (int)urecord->nupdates;
				goto Rollback;
			}
			// TODO - if size check below fails, won't write to device -
			// different behavior than write_to_device flag - OK?
			is_record_dirty = true;
		}

		if (! as_storage_record_size_and_check(rd)) {
			cf_warning(AS_UDF, "record failed storage size check, will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
			cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! as_storage_has_space(rd->ns)) {
			cf_warning(AS_UDF, "drives full, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
			cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}
	}

	if (sindex_locked) {
		SINDEX_GUNLOCK();
		sindex_locked = false;
	}

	// If there were updates do miscellaneous successful commit
	// tasks
	if (is_record_dirty
			|| is_record_flag_dirty
			|| (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
		urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES; // will write to storage
	}
	urecord->ldt_rectype_bit_update = 0;

	// Clean up oldvalue cache and reset dirty. All the changes made
	// here have been made to the particle buffer. Nothing will now be
	// backed out.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		udf_record_bin * bin = &urecord->updates[i];
		if (bin->oldvalue != NULL ) {
			as_val_destroy(bin->oldvalue);
			bin->oldvalue = NULL;
		}
		bin->dirty = false;
	}
	return rc;

Rollback:
	cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);
	for (int i = 0; i < failmax; i++) {
		if (urecord->updates[i].dirty) {
			char * k = urecord->updates[i].name;
			// Pick the oldvalue for rollback
			as_val * v = urecord->updates[i].oldvalue;
			bool h = urecord->updates[i].washidden;
			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
					rc = udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);
					if (rc) {
						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
					}
				}
			}
			if (v) {
				as_val_destroy(v);
				// The cache outlives this call on failure; clear the slot so
				// the destroyed value can never be reused or freed again.
				urecord->updates[i].oldvalue = NULL;
				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
			}
		}
	}

	if (is_record_dirty && urecord->dirty != NULL) {
		xdr_clear_dirty_bins(urecord->dirty);
	}

	if (is_record_flag_dirty) {
		as_index_clear_flags(rd->r, new_index_flags);
		as_index_set_flags(rd->r, old_index_flags);
		is_record_flag_dirty = false;
	}
	urecord->ldt_rectype_bit_update = 0;

	// Only release the sindex lock if this call actually acquired it; the
	// bin-limit failure above reaches this point without holding it.
	if (sindex_locked) {
		SINDEX_GUNLOCK();
		sindex_locked = false;
	}

	// Reset the flat size in case the stuff is backed out !!! it should not
	// fail in the backout code ...
	if (! as_storage_record_size_and_check(rd)) {
		cf_warning(AS_LDT, "Does not fit even after rollback... it is trouble");
	}

	// Do not clean up the cache in case of failure
	return -1;
}
/*
 * Internal Function: udf_aerospike_delbin
 *
 * Parameters:
 * 		urecord - udf_record to be manipulated
 * 		bname   - name of the bin to be deleted
 *
 * Return value:
 * 		 0  the bin was looked up by name and processed
 * 		-1  bname is NULL or empty, bname is AS_ID_BIN_SZ characters or longer,
 * 		    or no bin with that name exists in the record
 *
 * Description:
 * 		Removes the named bin from the record via as_bin_destroy. When the
 * 		namespace has secondary indexes, the sindex delete entries for the bin
 * 		are collected under the sindex read lock and applied after the lock is
 * 		released; the transaction is flagged as having touched the sindex when
 * 		at least one entry was produced.
 *
 * Callers:
 * 		udf_aerospike__apply_update_atomic
 *
 * Side Notes:
 * 		The result of the sindex update is not checked here.
 */
static int
udf_aerospike_delbin(udf_record * urecord, const char * bname)
{
	// A missing or empty name can never identify a bin.
	if (bname == NULL || bname[0] == '\0') {
		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters [No bin name supplied]... Fail");
		return -1;
	}

	as_storage_rd *rd = urecord->rd;
	as_transaction *tr = urecord->tr;

	// Over-long names cannot be looked up.
	if (strlen(bname) >= AS_ID_BIN_SZ) {
		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters [bin name(%s) too big]... Fail", bname);
		return -1;
	}

	// The bin has to exist to be deleted.
	as_bin * bin = as_bin_get(rd, bname);
	if (bin == NULL) {
		cf_debug(AS_UDF, "udf_aerospike_delbin: Invalid Operation [Bin name(%s) not found of delete]... Fail", bname);
		return -1;
	}

	const char * set_name = as_index_get_set_name(rd->r, rd->ns);
	bool has_sindex = as_sindex_ns_has_sindex(rd->ns);

	if (has_sindex) {
		SINDEX_GRLOCK();
	}

	SINDEX_BINS_SETUP(sbins, rd->ns->sindex_cnt);
	as_sindex * si_arr[rd->ns->sindex_cnt];
	int si_arr_index = 0;
	int sbins_populated = 0;

	// Gather the sindex entries to delete while holding the lock.
	if (has_sindex) {
		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(rd->ns, set_name, bin->id, &si_arr[si_arr_index]);
		sbins_populated += as_sindex_sbins_from_bin(rd->ns, set_name, bin, sbins, AS_SINDEX_OP_DELETE);
		SINDEX_GUNLOCK();
	}

	int32_t bin_index = as_bin_get_index(rd, bname);

	if (bin_index == -1) {
		cf_warning(AS_UDF, "udf_aerospike_delbin: Internal Error [Deleting non-existing bin %s]... Fail", bname);
	}
	else {
		// Apply the sindex deletes first, then drop the bin itself.
		if (has_sindex && sbins_populated > 0) {
			tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED;
			as_sindex_update_by_sbin(rd->ns, as_index_get_set_name(rd->r, rd->ns), sbins, sbins_populated, &rd->keyd);
		}
		as_bin_destroy(rd, bin_index);
	}

	// Release whatever the sindex lookup handed out.
	if (has_sindex) {
		as_sindex_sbin_freeall(sbins, sbins_populated);
		as_sindex_release_arr(si_arr, si_arr_index);
	}

	return 0;
}
/*
 * Internal function: udf_aerospike_setbin
 *
 * Parameters:
 * 		urecord   -- udf_record to be manipulated
 * 		offset    -- offset of the udf bin in the updates array
 * 		bname     -- name of the bin to be set
 * 		val       -- value to store in the bin
 * 		is_hidden -- whether the bin is to be marked hidden (list/map only)
 *
 * Return value:
 * 		 0  success
 * 		-1  bname is NULL or empty, or the bin could not be found or created
 * 		-3  val maps to the NULL particle type, or is_hidden was requested for
 * 		    a value that is neither a list nor a map
 * 		-4  the particle could not be replaced (data in memory) or no particle
 * 		    buffer space was available (data not in memory)
 *
 * Description:
 * 		Gets or creates the named bin and stores val in it. For data-in-memory
 * 		namespaces the bin particle is replaced directly; otherwise the
 * 		particle is built in a buffer obtained for the update at 'offset'.
 * 		When the namespace has secondary indexes, delete entries for the old
 * 		bin value and insert entries for the new one are collected under the
 * 		sindex read lock and applied after the lock is released.
 *
 * Callers:
 * 		udf_aerospike__apply_update_atomic
 *
 * Side Notes:
 * 		The result of the sindex update is not checked here.
 *
 * TODO make sure anything goes into setbin only if the bin value is
 *      changed
 */
static int
udf_aerospike_setbin(udf_record * urecord, int offset, const char * bname, const as_val * val, bool is_hidden)
{
	if (!bname || !bname[0]) {
		cf_warning(AS_UDF, "udf_aerospike_setbin: Invalid Parameters: [No bin name supplied]... Fail");
		return -1;
	}

	if (as_particle_type_from_asval(val) == AS_PARTICLE_TYPE_NULL) {
		cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] called with unusable as_val", bname);
		return -3;
	}

	// Only lists and maps may be hidden.
	uint8_t type = as_val_type(val);
	if (is_hidden && type != AS_MAP && type != AS_LIST) {
		cf_warning(AS_UDF, "udf_aerospike_setbin: Invalid Operation [Hidden %d type Not allowed]... Fail", type);
		return -3;
	}

	as_storage_rd * rd = urecord->rd;
	as_transaction *tr = urecord->tr;

	as_bin * bin = as_bin_get_or_create(rd, bname);
	if (bin == NULL) {
		cf_warning(AS_UDF, "udf_aerospike_setbin: Internal Error [Bin %s not found.. Possibly ran out of bins]... Fail", bname);
		return -1;
	}

	bool has_sindex = as_sindex_ns_has_sindex(rd->ns);
	if (has_sindex) {
		SINDEX_GRLOCK();
	}

	// Room for both the delete entries (old value) and insert entries (new value).
	SINDEX_BINS_SETUP(sbins, 2 * rd->ns->sindex_cnt);
	as_sindex * si_arr[2 * rd->ns->sindex_cnt];
	int sbins_populated = 0;
	int si_arr_index = 0;
	const char * set_name = as_index_get_set_name(rd->r, rd->ns);

	// Capture the sindex entries of the current bin value before overwriting it.
	if (has_sindex) {
		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(rd->ns, set_name, bin->id, &si_arr[si_arr_index]);
		sbins_populated += as_sindex_sbins_from_bin(rd->ns, set_name, bin, &sbins[sbins_populated], AS_SINDEX_OP_DELETE);
	}

	// we know we are doing an update now, make sure there is particle data,
	// set to be 1 wblock size now @TODO!
	int ret = 0;

	cf_detail(AS_UDF, "udf_setbin: bin %s type %d ", bname, type );

	if (! rd->ns->storage_data_in_memory) {
		// Build the particle in buffer space belonging to this update.
		uint32_t size = as_particle_size_from_asval(val);
		uint8_t *particle_buf = udf__aerospike_get_particle_buf(urecord, &urecord->updates[offset], size);

		if (particle_buf == NULL) {
			cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to get space for particle size %u", bname, size);
			ret = -4;
		}
		else {
			as_bin_particle_stack_from_asval(bin, particle_buf, val);
		}
	}
	else if (as_bin_particle_replace_from_asval(bin, val) != 0) {
		cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to replace particle", bname);
		ret = -4;
	}

	// Mark the freshly written particle hidden when asked to.
	if (ret == 0 && is_hidden) {
		switch (type) {
		case AS_LIST:
			as_bin_particle_list_set_hidden(bin);
			break;
		case AS_MAP:
			as_bin_particle_map_set_hidden(bin);
			break;
		default:
			break;
		}
	}

	// Without secondary indexes there is nothing left to do.
	if (! has_sindex) {
		return ret;
	}

	// The write failed: drop the lock and whatever was collected so far.
	if (ret != 0) {
		SINDEX_GUNLOCK();
		if (sbins_populated > 0) {
			as_sindex_sbin_freeall(sbins, sbins_populated);
		}
		as_sindex_release_arr(si_arr, si_arr_index);
		return ret;
	}

	// Collect the sindex entries for the new bin value, then update sindex
	// outside the lock.
	si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(rd->ns, set_name, bin->id, &si_arr[si_arr_index]);
	sbins_populated += as_sindex_sbins_from_bin(rd->ns, set_name, bin, &sbins[sbins_populated], AS_SINDEX_OP_INSERT);
	SINDEX_GUNLOCK();

	if (sbins_populated > 0) {
		tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED;
		as_sindex_update_by_sbin(rd->ns, as_index_get_set_name(rd->r, rd->ns), sbins, sbins_populated, &rd->keyd);
		as_sindex_sbin_freeall(sbins, sbins_populated);
	}
	as_sindex_release_arr(si_arr, si_arr_index);

	return ret;
} // end udf_aerospike_setbin()