int ldt_aerospike_crec_close(const as_aerospike * as, const as_rec *crec_p) { cf_detail(AS_LDT, "[ENTER] as(%p) subrec(%p)", as, crec_p ); if (!as || !crec_p) { cf_warning(AS_LDT, "ldt_aerospike_crec_close: Invalid Parameters [as=%p, subrecord=%p]... Fail", as, crec_p); return 2; } // Close of the record is only allowed if the user has not updated // it. Other wise it is a group commit udf_record *c_urecord = (udf_record *)as_rec_source(crec_p); if (!c_urecord) { cf_warning(AS_LDT, "ldt_aerospike_crec_close: Internal Error [Malformed Sub Record]... Fail"); return -1; } ldt_record *lrecord = (ldt_record *)c_urecord->lrecord; if (!lrecord) { cf_warning(AS_LDT, "ldt_aerospike_crec_close: Internal Error [Invalid Head Record Reference in Sub Record]... Fail"); return -1; } ldt_slot *lslotp = slot_lookup_by_urec(lrecord, crec_p); if (!lslotp) { cf_warning(AS_LDT, "ldt_aerospike_crec_close: Invalid Operation [Sub Record close called for the record which is not open]... Fail"); return -1; } cf_detail(AS_LDT, "ldt_aerospike_crec_close"); if (c_urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) { cf_debug(AS_LDT, "Cannot close record with update ... it needs group commit"); return -2; } udf_record_close(c_urecord); udf_record_cache_free(c_urecord); slot_destroy(lslotp, lrecord); c_urecord->flag &= ~UDF_RECORD_FLAG_ISVALID; return 0; }
/*
 * Function: udf_record_close
 *
 * Closes the storage record behind a udf record and drops the per-record
 * UDF state:
 *   - if the record is flagged open: closes the storage record, hands the
 *     index reference to as_record_done() and clears the OPEN flag
 *   - frees particle_data, if any
 *   - frees the UDF update cache
 *
 * Parameters:
 * 		urecord : UDF record being operated on
 *
 * Return value : Nothing
 *
 * Callers:
 * 		query_agg_istream_read
 * 		ldt_aerospike_crec_close
 * 		as_query__agg
 * 		udf_record_destroy
 */
void
udf_record_close(udf_record *urecord)
{
	as_transaction *tr = urecord->tr;

	cf_debug_digest(AS_UDF, &tr->keyd, "[ENTER] Closing record key:");

	bool is_open = (urecord->flag & UDF_RECORD_FLAG_OPEN) != 0;

	if (is_open) {
		// Grab the index reference before the storage record is closed.
		as_index_ref *index_ref = urecord->r_ref;

		cf_detail(AS_UDF, "Closing %sRecord",
				(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : "");

		udf_storage_record_close(urecord);
		as_record_done(index_ref, tr->rsv.ns);
		urecord->flag &= ~UDF_RECORD_FLAG_OPEN;

		cf_detail_digest(AS_UDF, &urecord->tr->keyd,
				"Storage Close:: Rec(%p) Flag(%x) Digest:", urecord, urecord->flag );
	}

	// Replication happens when the main record replicates
	if (urecord->particle_data != NULL) {
		cf_free(urecord->particle_data);
		urecord->particle_data = NULL;
	}

	udf_record_cache_free(urecord);
}
/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 * 		urecord -- udf_record to be updated
 *
 * Return Values:
 * 		 0  success
 * 		-1  failure
 *
 * Description:
 * 		This function applies all the cached updates atomically. That is,
 * 		if one of the bin set operations fails, the entire function fails:
 * 		every update up to and including the failing one is rolled back to
 * 		its previous value and -1 is returned.
 *
 * 		On success the record is flagged as having updates, marked to be
 * 		written to device, the ldt_rectype_bits are set on the index and
 * 		the update cache is freed. On failure the update cache is kept.
 *
 * 		In both cases the saved old values are destroyed and the oldvalue
 * 		pointers in the cache are reset to NULL, so no dangling pointer is
 * 		left behind (this matters on failure, where the cache survives).
 *
 * Special Notes:
 * 		i. Space for all bins that need to be created is allocated up
 * 		front (data-in-memory, multi-bin namespaces only), before any bin
 * 		is touched.
 *
 * 		ii. If one of the updates to be rolled back is a bin creation,
 * 		udf_aerospike_delbin is called. This will not free up the bin
 * 		metadata. So there will be a small memory mismatch b/w replica
 * 		(which did not get the record at all and hence no memory is
 * 		accounted) and the master. To avoid such cases, checks are done
 * 		up front.
 *
 * Callers:
 * 		udf_aerospike__execute_updates
 * 		In this function, if udf_aerospike__apply_update_atomic fails, the
 * 		record is not committed to the storage. On success, record is
 * 		closed which commits to the storage and reopened for the next set
 * 		of udf updates. The return value from
 * 		udf_aerospike__apply_update_atomic is passed on to the callers of
 * 		this function.
 */
int
udf_aerospike__apply_update_atomic(udf_record *urecord)
{
	int rc = 0;
	int failindex = 0;
	int new_bins = 0;	// How many new bins have to be created in this update
	as_storage_rd * rd = urecord->rd;

	// This will iterate over all the updates and apply them to storage.
	// The items will remain, and be used as cache values. If an error
	// occurred during setbin(), we roll back all the operations applied
	// so far and return failure.
	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

	// Loop twice to make sure the updates are performed first so in case
	// something goes wrong it can be rolled back. The deletes will go
	// through successfully generally.

	// In first iteration, just calculate how many new bins need to be created
	for(int i = 0; i < urecord->nupdates; i++ ) {
		if ( urecord->updates[i].dirty ) {
			char * k = urecord->updates[i].name;
			if ( k != NULL ) {
				if ( !as_bin_get(rd, (uint8_t *)k, strlen(k)) ) {
					new_bins++;
				}
			}
		}
	}

	// Free bins - total bins not in use in the record
	// Delta bins - new bins that need to be created
	int free_bins  = urecord->rd->n_bins - as_bin_inuse_count(urecord->rd);
	int delta_bins = new_bins - free_bins;
	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
			urecord->rd->n_bins, as_bin_inuse_count(urecord->rd), free_bins, new_bins, delta_bins);

	// Allocate space for all the new bins that need to be created beforehand
	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
	}

	bool has_sindex = as_sindex_ns_has_sindex(rd->ns);
	if (has_sindex) {
		SINDEX_GRLOCK();
	}

	// In second iteration apply updates.
	for(int i = 0; i < urecord->nupdates; i++ ) {
		if ( urecord->updates[i].dirty && rc == 0) {

			char *   k = urecord->updates[i].name;
			as_val * v = urecord->updates[i].value;
			bool     h = urecord->updates[i].ishidden;

			// Start from a clean slate; the old value is captured below
			// only for entries that actually name a bin.
			urecord->updates[i].oldvalue  = NULL;
			urecord->updates[i].washidden = false;

			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
					urecord->updates[i].oldvalue  = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					// Only case delete fails if bin is not found that is
					// as good as delete. Ignore return code !!
					udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
					urecord->updates[i].oldvalue  = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					rc = udf_aerospike_setbin(urecord, k, v, h);
					if (rc) {
						failindex = i;
						goto Rollback;
					}
				}
			}
		}
	}

	if (has_sindex) {
		SINDEX_GUNLOCK();
	}

	// All updates went through: the saved old values are no longer needed.
	for(int i = 0; i < urecord->nupdates; i++ ) {
		if ((urecord->updates[i].dirty)
				&& (urecord->updates[i].oldvalue)) {
			as_val_destroy(urecord->updates[i].oldvalue);
			// Do not leave a dangling pointer in the cache entry.
			urecord->updates[i].oldvalue = NULL;
			cf_debug(AS_UDF, "REGULAR as_val_destroy()");
		}
	}

	// Commit successful do miscellaneous task
	// Set updated flag to true
	urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES;

	// Set up record to be flushed to storage
	urecord->rd->write_to_device = true;

	// Before committing to storage set the rec_type_bits ..
	cf_detail(AS_RW, "TO INDEX              Digest=%"PRIx64" bits %d %p",
			*(uint64_t *)&urecord->tr->keyd.digest[8], urecord->ldt_rectype_bits, urecord);
	as_index_set_flags(rd->r, urecord->ldt_rectype_bits);

	// Clean up cache and start from 0 update again. Once the changes made
	// here are flushed from the write buffer to storage they will never
	// be backed out.
	udf_record_cache_free(urecord);
	return rc;

Rollback:
	cf_debug(AS_UDF, "Rollback Called: FailIndex(%d)", failindex);
	for(int i = 0; i <= failindex; i++) {
		if (urecord->updates[i].dirty) {
			char *   k = urecord->updates[i].name;
			// Pick the oldvalue for rollback
			as_val * v = urecord->updates[i].oldvalue;
			bool     h = urecord->updates[i].washidden;
			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
					rc = udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
					rc = udf_aerospike_setbin(urecord, k, v, h);
					if (rc) {
						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
					}
				}
			}
			if (v) {
				as_val_destroy(v);
				// The cache is kept on failure, so make sure it does not
				// keep pointing at the value that was just destroyed.
				urecord->updates[i].oldvalue = NULL;
				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
			}
		}
	}

	if (has_sindex) {
		SINDEX_GUNLOCK();
	}

	// Do not clean up the cache in case of failure
	return -1;
}
/*
 * Internal Function: udf_rw_post_processing
 *
 * Does the post processing for the UDF record after the UDF execution:
 *   1. Derives the operation type (*urecord_op) from the record's
 *      HAS_UPDATES / OPEN / PREEXISTS flags.
 *   2. If the record (not a sub record) is open but has no bin in use,
 *      it is deleted from the index tree and the operation becomes a
 *      delete.
 *   3. For a write, the pickled_* fields are populated through
 *      write_local_post_processing() before the record is closed, and
 *      the index's KEY_STORED flag is brought in line with rd->key.
 *   4. The record is closed. udf_record_close() also frees particle_data
 *      and clears the UDF updates cache.
 *   5. The write / delete is shipped to XDR after the record is closed.
 *
 * Returns: Nothing
 *
 * Parameters:
 * 		urecord          - UDF record to operate on
 * 		urecord_op (out) - Populated with the optype
 * 		set_id           - set id handed to xdr_write(); replaced with the
 * 		                   index's set id if the record is still open
 */
void
udf_rw_post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd  *rd    = urecord->rd;
	as_transaction *tr    = urecord->tr;
	as_index_ref   *r_ref = urecord->r_ref;

	// INIT
	urecord->pickled_buf       = NULL;
	urecord->pickled_sz        = 0;
	urecord->pickled_void_time = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	// TODO: optimize not to allocate buffer if it is single
	// node cluster. No remote to send data to

	// Check if UDF has updates.
	if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
		// Check if the record is not deleted after an update
		if ( urecord->flag & UDF_RECORD_FLAG_OPEN) {
			*urecord_op = UDF_OPTYPE_WRITE;
			udf_xdr_ship_op = true;
		}
		else {
			// If the record has updates and it is not open,
			// and if it pre-existed it's an update followed by a delete.
			if ( urecord->flag & UDF_RECORD_FLAG_PREEXISTS) {
				*urecord_op = UDF_OPTYPE_DELETE;
				udf_xdr_ship_op = true;
			}
			// If the record did not pre-exist and is updated
			// and it is not open, then it is create followed by
			// delete essentially no_op.
			else {
				*urecord_op = UDF_OPTYPE_NONE;
			}
		}
	}
	else if ((urecord->flag & UDF_RECORD_FLAG_PREEXISTS)
			&& !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		*urecord_op = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	}
	else {
		*urecord_op = UDF_OPTYPE_READ;
	}

	// Log the record pointer itself, not the address of the local parameter.
	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (!(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD)
			&& urecord->flag & UDF_RECORD_FLAG_OPEN
			&& !as_bin_inuse_has(rd)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	}
	else if (*urecord_op == UDF_OPTYPE_WRITE) {
		// keyd is a digest struct and cannot be passed for a PRIx64
		// conversion; print 8 bytes of the digest like the other digest
		// logs do.
		cf_detail(AS_UDF, "Committing Changes %"PRIx64" n_bins %d",
				*(uint64_t *)&tr->keyd.digest[8], as_bin_get_n_bins(r_ref->r, rd));

		size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
		// A variable length array must have a size greater than zero, so
		// reserve at least one byte even when there are no rec props.
		uint8_t rec_props_data[rec_props_data_size > 0 ? rec_props_data_size : 1];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_local_post_processing(tr, tr->rsv.ns, NULL, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_void_time,
			&urecord->pickled_rec_props, true/*increment_generation*/,
			NULL, r_ref->r, rd, urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}

	// Close the record for all the cases. udf_record_close() takes only the
	// record; it also frees particle_data and the UDF updates cache, so no
	// further cleanup is needed here.
	udf_record_close(urecord);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op == true) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF write shipping for key %" PRIx64,
					*(uint64_t *)&tr->keyd.digest[8]);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		}
		else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF delete shipping for key %" PRIx64,
					*(uint64_t *)&tr->keyd.digest[8]);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}
}