void cl_cluster_scan_shutdown(cl_cluster* asc) {
    // Check whether we ever (lazily) initialized scan machinery.
    if (cf_atomic32_get(asc->scan_initialized) == 0 && ! asc->scan_q) {
        return;
    }

    // This tells the worker threads to stop. We do this (instead of using a
    // "running" flag) to allow the workers to "wait forever" on processing the
    // work dispatch queue, which has minimum impact when the queue is empty.
    // This also means all queued requests get processed when shutting down.
    for (int i = 0; i < NUM_SCAN_THREADS; i++) {
        cl_scan_task task;
        task.asc = NULL;
        cf_queue_push(asc->scan_q, &task);
    }

    for (int i = 0; i < NUM_SCAN_THREADS; i++) {
        pthread_join(asc->scan_threads[i], NULL);
    }

    cf_queue_destroy(asc->scan_q);
    asc->scan_q = NULL;
    cf_atomic32_set(&asc->scan_initialized, 0);
}
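/*
 * Worker-side counterpart (illustrative sketch only - scan_worker_fn and its
 * task handling are assumptions, not code from this file). It shows the shape
 * the shutdown above relies on: workers block forever on scan_q and treat a
 * task with asc == NULL as the signal to exit, after processing any real
 * tasks queued ahead of the sentinel.
 *
 *	static void* scan_worker_fn(void* udata) {
 *		cl_cluster* asc = (cl_cluster*)udata;
 *
 *		while (true) {
 *			cl_scan_task task;
 *
 *			cf_queue_pop(asc->scan_q, &task, CF_QUEUE_FOREVER);
 *
 *			if (! task.asc) {
 *				break;	// shutdown sentinel from cl_cluster_scan_shutdown()
 *			}
 *
 *			// ... execute the scan task ...
 *		}
 *
 *		return NULL;
 *	}
 */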
static void append_set_props(as_set *p_set, cf_dyn_buf *db) {
    cf_dyn_buf_append_string(db, "n_objects=");
    cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->num_elements));
    cf_dyn_buf_append_char(db, ':');

    cf_dyn_buf_append_string(db, "n-bytes-memory=");
    cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_bytes_memory));
    cf_dyn_buf_append_char(db, ':');

    cf_dyn_buf_append_string(db, "stop-writes-count=");
    cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->stop_writes_count));
    cf_dyn_buf_append_char(db, ':');

    cf_dyn_buf_append_string(db, "set-enable-xdr=");
    if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_TRUE) {
        cf_dyn_buf_append_string(db, "true");
    }
    else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_FALSE) {
        cf_dyn_buf_append_string(db, "false");
    }
    else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_DEFAULT) {
        cf_dyn_buf_append_string(db, "use-default");
    }
    else {
        cf_dyn_buf_append_uint32(db, cf_atomic32_get(p_set->enable_xdr));
    }
    cf_dyn_buf_append_char(db, ':');

    cf_dyn_buf_append_string(db, "disable-eviction=");
    if (IS_SET_EVICTION_DISABLED(p_set)) {
        cf_dyn_buf_append_string(db, "true");
    }
    else {
        cf_dyn_buf_append_string(db, "false");
    }
    cf_dyn_buf_append_char(db, ':');

    cf_dyn_buf_append_string(db, "set-delete=");
    if (IS_SET_DELETED(p_set)) {
        cf_dyn_buf_append_string(db, "true");
    }
    else {
        cf_dyn_buf_append_string(db, "false");
    }
    cf_dyn_buf_append_char(db, ';');
}
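// For illustration, with made-up values, the fragment appended above looks
// like this (colon-separated key=value pairs, ';' terminating the set's entry):
//
//   n_objects=1000:n-bytes-memory=524288:stop-writes-count=0:
//   set-enable-xdr=use-default:disable-eviction=false:set-delete=false;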
static void* async_receiver_fn(void *thdata) {
    int rv = -1;
    bool network_error = false;
    cl_async_work *workitem = NULL;
#if ONEASYNCFD
    cl_async_work *tmpworkitem = NULL;
#endif
    as_msg msg;
    cf_queue *q_to_use = NULL;
    cl_cluster_node *thisnode = NULL;

    uint8_t rd_stack_buf[STACK_BUF_SZ];
    uint8_t *rd_buf = rd_stack_buf;
    size_t rd_buf_sz = 0;

    uint64_t acktrid;
    // uint64_t starttime, endtime;
    int progress_timeout_ms;
    unsigned int thread_id = cf_atomic32_incr(&g_thread_count);

    if (thdata == NULL) {
        q_to_use = g_cl_async_q;
    }
    else {
        thisnode = (cl_cluster_node *)thdata;
        q_to_use = thisnode->asyncwork_q;
    }

    // Infinite loop that keeps picking work items from the queue and tries to
    // read their results off the network.
    while (1) {
        network_error = false;

#if ONEASYNCFD
        if (thisnode->dunned == true) {
            do {
                rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);
                if (rv == CF_QUEUE_OK) {
                    cl_cluster_node_put(thisnode);
                    free(workitem);
                }
            } while (rv == CF_QUEUE_OK);

            // We want to delete all the workitems of this node.
            shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
            break;
        }
#endif

        // This call will block if there is no element in the queue.
        cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
        // TODO: What if the node gets dunned while this pop call is blocked?

#if ONEASYNCFD
        //cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
        //        cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

        // If we make no progress in 50ms, move on to the next workitem and
        // revisit this workitem at a later stage.
        progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

        // Read the as_msg, which is the short header.
        rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg),
                workitem->deadline, progress_timeout_ms);
        if (rv) {
#if DEBUG
            cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d",
                    rv, workitem->fd);
#endif
            if (rv != ETIMEDOUT) {
                cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",
                        rv, workitem->fd);
                network_error = true;
                goto Error;
            }
            else {
                goto Retry;
            }
        }

#ifdef DEBUG_VERBOSE
        dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif

        cl_proto_swap(&msg.proto);
        cl_msg_swap_header(&msg.m);

        // Second read, for the remainder of the message.
        rd_buf_sz = msg.proto.sz - msg.m.header_sz;
        if (rd_buf_sz > 0) {
            if (rd_buf_sz > sizeof(rd_stack_buf)) {
                rd_buf = malloc(rd_buf_sz);
                if (! rd_buf) {
                    cf_error("malloc fail: trying %zu", rd_buf_sz);
                    rv = -1;
                    goto Error;
                }
            }

            rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz,
                    workitem->deadline, progress_timeout_ms);
            if (rv) {
                // We already read part of the message but failed to read the
                // remaining data (network error or timeout). We cannot re-read,
                // as we already consumed partial data. Declare this an error.
                cf_error("Timeout after reading the header but before reading the body");
                goto Error;
            }

#ifdef DEBUG_VERBOSE
            dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif
        }

        rv = CITRUSLEAF_OK;
        goto Ok;

Retry:
        // We are trying to postpone the reading.
        if (workitem->deadline && workitem->deadline < cf_getms()) {
            cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
                    workitem->deadline, cf_getms());
            //cf_error("async receiver: Workitem missed the final deadline");
            rv = CITRUSLEAF_FAIL_TIMEOUT;
            goto Error;
        }
        else {
            // We have time. Push the element back onto the queue to be
            // considered later.
            cf_queue_push(q_to_use, &workitem);
        }

        // If we allocated memory in this loop, release it.
        if (rd_buf && (rd_buf != rd_stack_buf)) {
            free(rd_buf);
            // Reset to the stack buffer so the next iteration doesn't reuse a
            // dangling pointer.
            rd_buf = rd_stack_buf;
        }

        cf_atomic_int_incr(&g_async_stats.retries);
        continue;

Error:
        if (network_error == true) {
            /*
             * In case of async work (for XDS), it may be extreme to dun a
             * node on a network error. We just clean things up and retry
             * connecting to the remote cluster. The network error may be a
             * transient one.
             */
        }

#if ONEASYNCFD
        // Do not close the FD.
#else
        // We do not know the state of the FD. It may have pending data to be
        // read, so we cannot reuse it. Close it to be on the safe side.
        cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
        cf_close(workitem->fd);
        workitem->fd = -1;
#endif

        cf_atomic_int_incr(&g_async_stats.dropouts);
        // Continue down with what we do during an Ok.

        // Inform the caller that there is no response from the server for this
        // workitem. No response does not mean the work is not done - it might
        // have completed successfully on the server side; we just didn't get
        // the response for it.
        if (g_fail_cb_fn) {
            g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
        }

Ok:
        // rd_buf may not be there during an error condition.
        if (rd_buf && (rv == CITRUSLEAF_OK)) {
            // As of now, async functionality exists only for the put call.
            // In a put call, we do not get anything back other than the trid
            // field. So, just pass a variable to get back the trid and ignore
            // the others.
            if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
                rv = CITRUSLEAF_FAIL_UNKNOWN;
            }
            else {
                rv = msg.m.result_code;
                if (workitem->trid != acktrid) {
#if ONEASYNCFD
                    // It is likely that we may get the response for a
                    // different trid. Just delete the correct one from the
                    // queue and put the current workitem back in the queue.
                    shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
                    cf_queue_delete(q_to_use, &tmpworkitem, true);
                    cf_queue_push(q_to_use, &workitem);
                    // From now on, workitem is the one for which we got the ack.
                    workitem = tmpworkitem;
#endif
#ifdef DEBUG
                    cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
                            workitem->trid, acktrid, workitem->fd);
#endif
                }
            }

            if (g_success_cb_fn) {
                g_success_cb_fn(workitem->udata, rv, workitem->starttime);
            }
        }

        // Remember to put the FD back into the pool, if it is reusable.
        if (workitem->fd != -1) {
            cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
        }
        // Also decrement the reference count for this node.
        cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
        // Delete the item from the global hashtable.
        if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) {
#if DEBUG
            cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable",
                    workitem->trid);
#endif
        }
#endif

        // Push it back into the free pool. If the attempt fails, free it.
        if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
            free(workitem);
        }

        // If we allocated memory in this loop, release it.
        if (rd_buf && (rd_buf != rd_stack_buf)) {
            free(rd_buf);
            // As above - don't leave rd_buf dangling for the next iteration.
            rd_buf = rd_stack_buf;
        }

        // Kick this thread out if its ID is greater than the configured total.
        if (thread_id > cf_atomic32_get(g_async_num_threads)) {
            cf_atomic32_decr(&g_thread_count);
            return NULL;
        }
    } // the infinite loop

    return NULL;
}
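/*
 * Hedged sketch of caller-side callbacks, mirroring how g_fail_cb_fn and
 * g_success_cb_fn are invoked above with (udata, result code, start time).
 * The function names here are hypothetical; the real typedefs
 * (cl_async_fail_cb / cl_async_success_cb) live in the client header.
 *
 *	static void my_fail_cb(void *udata, int rv, uint64_t starttime) {
 *		// rv is e.g. CITRUSLEAF_FAIL_TIMEOUT. Note the work may still have
 *		// completed server-side - only the response was lost.
 *		cf_error("async op failed: rv %d elapsed %"PRIu64" ms",
 *				rv, cf_getms() - starttime);
 *	}
 *
 *	static void my_success_cb(void *udata, int rv, uint64_t starttime) {
 *		// rv is the server result code (CITRUSLEAF_OK on success).
 *	}
 */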
// Same as do_the_full_monte, but only up to the point where the command is
// sent to the node. Most of the code is duplicated. Bad.
int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns,
        const char *set, const cl_object *key, const cf_digest *digest,
        cl_bin **values, cl_operator operator, cl_operation **operations,
        int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p,
        uint64_t *trid, void *udata)
{
    cl_async_work *workitem = NULL;

    uint8_t wr_stack_buf[STACK_BUF_SZ];
    uint8_t *wr_buf = wr_stack_buf;
    size_t wr_buf_sz = sizeof(wr_stack_buf);

    int progress_timeout_ms;
    uint64_t deadline_ms;
    uint64_t starttime, endtime;
    bool network_error;
    int fd = -1;
    int rv = CITRUSLEAF_FAIL_CLIENT;    // Assume that this is a failure.

    // as_msg msg;
    cf_digest d_ret;
    cl_cluster_node *node = 0;

#if ONEASYNCFD
    if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) {
        //cf_error("Async hashtab is full. Cannot insert any more elements");
        return CITRUSLEAF_FAIL_ASYNCQ_FULL;
    }
#else
    // If the async buffer is at the max limit, do not entertain more requests.
    if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) {
        //cf_error("Async buffer is full. Cannot insert any more elements");
        return CITRUSLEAF_FAIL_ASYNCQ_FULL;
    }
#endif

    // Allocate memory for the work item that will be added to the async work
    // list.
    if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) {
        cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER);
    }
    else {
        workitem = malloc(sizeof(cl_async_work));
        if (workitem == NULL) {
            return CITRUSLEAF_FAIL_CLIENT;
        }
    }

    // Compile the write buffer to be sent to the cluster.
    if (n_values && (values || operations)) {
        cl_compile(info1, info2, 0, ns, set, key, digest, values ? *values : NULL,
                operator, operations ? *operations : NULL, *n_values, &wr_buf,
                &wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
    }
    else {
        cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf,
                &wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
    }

    deadline_ms = 0;
    progress_timeout_ms = 0;
    if (cl_w_p && cl_w_p->timeout_ms) {
        deadline_ms = cf_getms() + cl_w_p->timeout_ms;
        // Policy: if asking for a long timeout, give enough time to try twice.
        if (cl_w_p->timeout_ms > 700) {
            progress_timeout_ms = cl_w_p->timeout_ms / 2;
        }
        else {
            progress_timeout_ms = cl_w_p->timeout_ms;
        }
    }
    else {
        progress_timeout_ms = g_async_nw_progress_timeout;
    }

    // Initialize the async work unit.
    workitem->trid = *trid;
    workitem->deadline = deadline_ms;
    workitem->starttime = cf_getms();
    workitem->udata = udata;

    as_msg *msgp;
    // Hate special cases, but we have to clear the verify bit on delete verify.
    if ((info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) {
        msgp = (as_msg *)wr_buf;
        msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY;
    }

    if (asc->compression_stat.compression_threshold > 0 &&
            wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) {
        // Compression is enabled and the packet size is above the threshold -
        // compress the data.
        uint8_t *compressed_buf = NULL;
        size_t compressed_buf_sz = 0;

        // Construct the packet for compressed data.
        cf_packet_compression(wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz);
        if (compressed_buf) {
            // If the original packet size is > 16k, cl_compile had allocated
            // memory for it - free that memory. cf_packet_compression
            // allocates memory for the compressed packet.
            if (wr_buf != wr_stack_buf) {
                free(wr_buf);
            }
            // Update stats.
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz);
            wr_buf = compressed_buf;
            wr_buf_sz = compressed_buf_sz;
            //memcpy (wr_buf, compressed_buf, compressed_buf_sz);
            //wr_buf_sz = compressed_buf_sz;
            //free (compressed_buf);
        }
        else {
            // Compression failed - continue with the uncompressed packet.
            // Set compression stat.
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz);
        }
    }

    int try = 0;

    // Retry the request based on the write_policy.
    do {
        network_error = false;
        try++;
#ifdef DEBUG
        if (try > 1) {
            cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self());
        }
#endif

        // Get an FD from the cluster. First get the probable node for the
        // given digest.
        node = cl_cluster_node_get(asc, ns, &d_ret,
                info2 & CL_MSG_INFO2_WRITE ? true : false);
        if (! node) {
#ifdef DEBUG
            cf_debug("warning: no healthy nodes in cluster, retrying");
#endif
            usleep(10000);  // Sleep for 10ms.
            goto Retry;
        }

        // Now get the dedicated async FD of this node.
        starttime = cf_getms();
        fd = cl_cluster_node_fd_get(node, true);
        endtime = cf_getms();
        if ((endtime - starttime) > 10) {
            cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime));
        }
        if (fd == -1) {
#ifdef DEBUG
            cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",
                    node->name, (uint64_t)pthread_self());
#endif
            usleep(1000);
            goto Retry;
        }

        // Send the command to the node.
        starttime = cf_getms();
        rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms);
        endtime = cf_getms();
        if ((endtime - starttime) > 10) {
            cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime));
        }
        if (rv != 0) {
            cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)",
                    rv, fd, errno, (uint64_t)pthread_self());
            if (rv != ETIMEDOUT) {
                network_error = true;
            }
            goto Retry;
        }
        goto Ok;

Retry:
        if (network_error == true) {
            /*
             * In case of async work (for XDS), it may be extreme to dun a
             * node on a network error. We just clean things up and retry
             * connecting to the remote cluster. The network error may be a
             * transient one. As this is a network error, it is better to wait
             * a significant time before retrying.
             */
            sleep(1);   // Sleep for 1 sec.
#if ONEASYNCFD
            // Do not close the FD.
#else
            cf_error("async sender: Closing the fd %d because of network error", fd);
            cf_close(fd);
            fd = -1;
#endif
        }

        if (fd != -1) {
            cf_error("async sender: Closing the fd %d because of retry", fd);
            cf_close(fd);
            fd = -1;
        }

        if (node) {
            cl_cluster_node_put(node);
            node = 0;
        }

        if (deadline_ms && (deadline_ms < cf_getms())) {
#ifdef DEBUG
            cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64,
                    deadline_ms, cf_getms());
#endif
            rv = CITRUSLEAF_FAIL_TIMEOUT;
            goto Error;
        }
    } while ((cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY));

Error:
#ifdef DEBUG
    cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d",
            (int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0),
            (int)(deadline_ms - cf_getms()), rv);
#endif

    if (wr_buf != wr_stack_buf) {
        free(wr_buf);
    }

#if ONEASYNCFD
    // Do not close the FD.
#else
    // On a network error, the fd is already closed and set to -1. So we reach
    // this place with a valid FD only in case of timeout.
    if (fd != -1) {
        cf_error("async sender: Closing the fd %d because of timeout", fd);
        cf_close(fd);
    }
#endif

    return rv;

Ok:
    /*
     * We cannot release the node here, as the async FD associated with this
     * node may get closed. We should do it only when we get back the ack for
     * the async command that we just sent.
     */

    // As we sent the command successfully, add it to the async work list.
    workitem->node = node;
    workitem->fd = fd;

    // We are storing only the pointer to the workitem.
#if ONEASYNCFD
    if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) {
        // This should always succeed.
        cf_error("Unable to add unique entry into the hash table");
    }
    cf_queue_push(node->asyncwork_q, &workitem);    // Also put in the node's q.
#else
    cf_queue_push(g_cl_async_q, &workitem);
#endif

    if (wr_buf != wr_stack_buf) {
        free(wr_buf);
    }

    rv = CITRUSLEAF_OK;
    return rv;
}

int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads) {
    // int num_threads;

    if (0 == cf_atomic32_get(g_async_initialized)) {
        cf_error("Async client not initialized - cannot reinit");
        return -1;
    }

    if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) {
        // Limit the thread count to the max value even if the caller asks for
        // more.
        num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS;
    }

    // If the number of threads is increased, create the additional threads.
    if (num_receiver_threads > cf_atomic32_get(g_async_num_threads)) {
        for (unsigned int i = cf_atomic32_get(g_async_num_threads);
                i < num_receiver_threads; i++) {
            pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
        }
    }

    // Reset the count. When shrinking, the extra async threads will kill
    // themselves once they notice their IDs exceed the new count; when
    // growing, the new threads need the raised count so they don't
    // immediately exit.
    cf_atomic32_set(&g_async_num_threads, num_receiver_threads);

    cf_atomic32_set(&g_async_q_szlimit, size_limit);
    return 0;
}

int citrusleaf_async_init(int size_limit, int num_receiver_threads,
        cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn) {
    int i, num_threads;

    // Make sure that we do the initialization only once.
    if (1 == cf_atomic32_incr(&g_async_initialized)) {
        // Start the receiver threads.
        num_threads = num_receiver_threads;
        if (num_threads > MAX_ASYNC_RECEIVER_THREADS) {
            // Limit the thread count to the max value even if the caller asks
            // for more.
            num_threads = MAX_ASYNC_RECEIVER_THREADS;
        }

#if ONEASYNCFD
        g_async_h_szlimit = size_limit * 3;         // Max number of elements in the hash table.
        g_async_h_buckets = g_async_h_szlimit / 10; // Number of buckets in the hash table.

        if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t),
                sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) {
            cf_error("Failed to initialize the async work hashtable");
            cf_atomic32_decr(&g_async_initialized);
            return -1;
        }
#else
        // Create the work queue.
        g_async_q_szlimit = size_limit;
        if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
            cf_error("Failed to initialize the async work queue");
            cf_atomic32_decr(&g_async_initialized);
            return -1;
        }

        for (i = 0; i < num_threads; i++) {
            pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
        }
        g_async_num_threads = num_threads;
#endif

        if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
            cf_error("Failed to create memory pool for workitems");
            return -1;
        }

        g_fail_cb_fn = fail_cb_fn;
        g_success_cb_fn = success_cb_fn;

        // Initialize the stats.
        g_async_stats.retries = 0;
        g_async_stats.dropouts = 0;
    }

    return 0;
}
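/*
 * Typical initialization sequence (illustrative only - the queue size and
 * thread counts are arbitrary, and my_fail_cb/my_success_cb are the
 * hypothetical callbacks sketched after async_receiver_fn above):
 *
 *	citrusleaf_async_init(4096, 4, my_fail_cb, my_success_cb);
 *	// ...
 *	// Later, shrink the receiver pool. The extra threads exit on their own,
 *	// because async_receiver_fn compares its thread_id against
 *	// g_async_num_threads at the bottom of its loop.
 *	citrusleaf_async_reinit(4096, 2);
 */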
/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 *		urecord -- udf_record to be updated
 *
 * Return Values:
 *		 0 success
 *		-1 failure
 *
 * Description:
 *		This function applies all the updates atomically. That is, if one of
 *		the bin update/delete/create operations fails, the entire function
 *		fails. If the nth update fails, all n-1 preceding updates are rolled
 *		back to their initial values.
 *
 * Special Notes:
 *		i. The basic checks - bin name too long, enough space on disk for the
 *		bin values - are done before allocating space for any of the bins.
 *
 *		ii. If one of the updates to be rolled back is a bin creation,
 *		udf_aerospike_delbin is called. This does not free the bin metadata,
 *		so a small memory accounting mismatch between the replica (which did
 *		not get the record at all, hence accounts no memory) and the master
 *		would be seen. To avoid such cases, we do the checks upfront.
 *
 * Callers:
 *		udf_aerospike__execute_updates
 *		In that function, if udf_aerospike__apply_update_atomic fails, the
 *		record is not committed to storage. On success, the record is closed,
 *		which commits it to storage, and reopened for the next set of udf
 *		updates. The return value from udf_aerospike__apply_update_atomic is
 *		passed on to the callers of that function.
 */
int udf_aerospike__apply_update_atomic(udf_record *urecord) {
    int rc = 0;
    int failmax = 0;
    int new_bins = 0;   // How many new bins have to be created in this update.
    as_storage_rd *rd = urecord->rd;
    as_namespace *ns = rd->ns;
    bool has_sindex = as_sindex_ns_has_sindex(ns);
    bool is_record_dirty = false;
    bool is_record_flag_dirty = false;
    uint8_t old_index_flags = as_index_get_flags(rd->r);
    uint8_t new_index_flags = 0;

    // This will iterate over all the updates and apply them to storage. The
    // items will remain, and be used as cache values. If an error occurs
    // during setbin(), we roll back all the operations applied so far and
    // return failure.
    cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

    // Loop twice, so that the updates are counted first and, if something
    // goes wrong while applying them, they can be rolled back. (The deletes
    // generally go through successfully.)

    // In the first iteration, just calculate how many new bins need to be
    // created.
    for (uint32_t i = 0; i < urecord->nupdates; i++) {
        if (urecord->updates[i].dirty) {
            char *k = urecord->updates[i].name;
            if (k != NULL) {
                if (! as_bin_get(rd, k)) {
                    new_bins++;
                }
            }
        }
    }

    // Free bins - total bins not in use in the record.
    // Delta bins - new bins that need to be created.
    int inuse_bins = as_bin_inuse_count(rd);
    int free_bins = rd->n_bins - inuse_bins;
    int delta_bins = new_bins - free_bins;
    cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d, New bins %d, Delta bins %d",
            rd->n_bins, inuse_bins, free_bins, new_bins, delta_bins);

    // Check the bin usage limit.
    if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) ||
            (urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
        cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
                (int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
                (urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "", new_bins);
        goto Rollback;
    }

    // Allocate space beforehand for all the new bins that need to be created.
    if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
        as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
    }
    if (! rd->ns->storage_data_in_memory && ! urecord->particle_data) {
        // 256 as an upper bound on the LDT control bin - we may write the
        // version below. Leave it at the end for its use.
        urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size + 256);
        urecord->cur_particle_data = urecord->particle_data;
        urecord->end_particle_data = urecord->particle_data + rd->ns->storage_write_block_size;
    }

    if (has_sindex) {
        SINDEX_GRLOCK();
    }

    // In the second iteration, apply the updates.
    for (uint32_t i = 0; i < urecord->nupdates; i++) {
        urecord->updates[i].oldvalue = NULL;
        urecord->updates[i].washidden = false;
        if (urecord->updates[i].dirty && rc == 0) {
            char *k = urecord->updates[i].name;
            as_val *v = urecord->updates[i].value;
            bool h = urecord->updates[i].ishidden;

            if (k != NULL) {
                if (v == NULL || v->type == AS_NIL) {
                    // If the value is NIL, then do a delete.
                    cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
                    urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
                    urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
                    // The only case where delete fails is when the bin is not
                    // found, which is as good as a delete. Ignore the return
                    // code!
                    udf_aerospike_delbin(urecord, k);

                    if (urecord->dirty != NULL) {
                        xdr_fill_dirty_bins(urecord->dirty);
                    }
                }
                else {
                    // Otherwise, it is a set.
                    cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
                    urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
                    urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
                    rc = udf_aerospike_setbin(urecord, i, k, v, h);
                    if (rc) {
                        if (urecord->updates[i].oldvalue) {
                            as_val_destroy(urecord->updates[i].oldvalue);
                            urecord->updates[i].oldvalue = NULL;
                        }
                        failmax = i;
                        goto Rollback;
                    }

                    if (urecord->dirty != NULL) {
                        xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
                    }
                }
            }

            is_record_dirty = true;
        }
    }

    if (urecord->ldt_rectype_bit_update) {
        if (urecord->ldt_rectype_bit_update < 0) {
            // ldt_rectype_bit_update is negative when we want to reset the
            // bits.
            uint8_t rectype_bits = urecord->ldt_rectype_bit_update * -1;
            new_index_flags = old_index_flags & ~rectype_bits;
        }
        else {
            new_index_flags = old_index_flags | urecord->ldt_rectype_bit_update;
        }

        if (new_index_flags != old_index_flags) {
            as_index_clear_flags(rd->r, old_index_flags);
            as_index_set_flags(rd->r, new_index_flags);
            is_record_flag_dirty = true;
            cf_detail_digest(AS_RW, &urecord->tr->keyd, "Setting index flags from %d to %d new flag %d",
                    old_index_flags, new_index_flags, as_index_get_flags(rd->r));
        }
    }

    {
        // This is _NOT_ for writing to storage, but simply for performing a
        // sizing calculation. If we knew the upper bound of the rec_props
        // size, we could avoid this work and check with that much correction.
        //
        // See
        //  - udf_rw_post_processing for building rec_props for replication
        //  - udf_record_close for building rec_props for writing to storage
        size_t rec_props_data_size = as_storage_record_rec_props_size(rd);
        uint8_t rec_props_data[rec_props_data_size];
        if (rec_props_data_size > 0) {
            as_storage_record_set_rec_props(rd, rec_props_data);
        }

        // The version is set at the end, after the record size check. Setting
        // the version won't change the size of the record. And if it were set
        // before the size check, it too would need to be backed out.
        // TODO: Add backout logic - this works until the very first LDT
        // create call ends up crossing the record boundary.
        if (rd->ns->ldt_enabled && as_ldt_record_is_parent(rd->r)) {
            int rv = as_ldt_parent_storage_set_version(rd, urecord->lrecord->version,
                    urecord->end_particle_data, __FILE__, __LINE__);
            if (rv < 0) {
                cf_warning(AS_LDT, "udf_aerospike__apply_update_atomic: Internal Error "
                        "[Failed to set the version on storage rv=%d]... Fail", rv);
                goto Rollback;
            }
            // TODO - if the size check below fails, we won't write to the
            // device - different behavior than the write_to_device flag - OK?
            is_record_dirty = true;
        }

        if (! as_storage_record_size_and_check(rd)) {
            cf_warning(AS_UDF, "record failed storage size check, will not be updated");
            failmax = (int)urecord->nupdates;
            goto Rollback;
        }

        if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
            cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
            failmax = (int)urecord->nupdates;
            goto Rollback;
        }

        if (! as_storage_has_space(rd->ns)) {
            cf_warning(AS_UDF, "drives full, record will not be updated");
            failmax = (int)urecord->nupdates;
            goto Rollback;
        }

        if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
            cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
            failmax = (int)urecord->nupdates;
            goto Rollback;
        }
    }

    if (has_sindex) {
        SINDEX_GUNLOCK();
    }

    // If there were updates, do miscellaneous successful-commit tasks.
    if (is_record_dirty || is_record_flag_dirty ||
            (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
        urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES;   // Will write to storage.
    }

    urecord->ldt_rectype_bit_update = 0;

    // Clean up the oldvalue cache and reset dirty. All the changes made here
    // have been made to the particle buffer - nothing will be backed out now.
    for (uint32_t i = 0; i < urecord->nupdates; i++) {
        udf_record_bin *bin = &urecord->updates[i];
        if (bin->oldvalue != NULL) {
            as_val_destroy(bin->oldvalue);
            bin->oldvalue = NULL;
        }
        bin->dirty = false;
    }

    return rc;

Rollback:
    cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);
    for (int i = 0; i < failmax; i++) {
        if (urecord->updates[i].dirty) {
            char *k = urecord->updates[i].name;
            // Pick the oldvalue for rollback.
            as_val *v = urecord->updates[i].oldvalue;
            bool h = urecord->updates[i].washidden;

            if (k != NULL) {
                if (v == NULL || v->type == AS_NIL) {
                    // If the value is NIL, then do a delete.
                    cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
                    rc = udf_aerospike_delbin(urecord, k);
                }
                else {
                    // Otherwise, it is a set.
                    cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
                    rc = udf_aerospike_setbin(urecord, i, k, v, h);
                    if (rc) {
                        cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
                    }
                }
            }

            if (v) {
                as_val_destroy(v);
                cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
            }
        }
    }

    if (is_record_dirty && urecord->dirty != NULL) {
        xdr_clear_dirty_bins(urecord->dirty);
    }

    if (is_record_flag_dirty) {
        as_index_clear_flags(rd->r, new_index_flags);
        as_index_set_flags(rd->r, old_index_flags);
        is_record_flag_dirty = false;
    }

    urecord->ldt_rectype_bit_update = 0;

    if (has_sindex) {
        SINDEX_GUNLOCK();
    }

    // Re-check the flat size after the backout - this should not fail in the
    // backout code.
    if (! as_storage_record_size_and_check(rd)) {
        cf_warning(AS_LDT, "Does not fit even after rollback... this is trouble");
    }

    // Do not clean up the cache in case of failure.
    return -1;
}
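/*
 * The control flow above reduces to a classic two-phase apply-or-rollback
 * pattern (schematic sketch, not code from this file):
 *
 *	count new bins and check limits;       // phase 1: sizing/limit checks only
 *	for (i = 0; i < nupdates; i++) {       // phase 2: apply
 *		save oldvalue[i];
 *		if (apply(update[i]) != 0) { failmax = i; goto Rollback; }
 *	}
 *	post-checks (size, stop-writes, disk space, ttl);  // or Rollback with
 *	return 0;                                          // failmax = nupdates
 *
 * Rollback:
 *	for (i = 0; i < failmax; i++)          // undo only what was applied
 *		restore oldvalue[i];
 *	return -1;
 */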