// TODO - old pickle - remove in "six months". int old_record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete) { as_namespace* ns = rr->rsv->ns; as_record* r = rd->r; uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle); if (n_new_bins > 1) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins); return AS_ERR_UNKNOWN; } as_bin stack_bin = { { 0 } }; rd->n_bins = 1; rd->bins = &stack_bin; // Fill the new bin and particle. cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); int result; if (n_new_bins == 1 && (result = unpickle_bins(rr, rd, &particles_llb)) != 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name); cf_ll_buf_free(&particles_llb); return result; } // Apply changes to metadata in as_index needed for and writing. index_metadata old_metadata; update_index_metadata(rr, &old_metadata, r); // Prepare to store or drop key, as determined by message. rd->key = rr->key; rd->key_size = rr->key_size; // Write the record to storage. if ((result = as_record_write_from_pickle(rd)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); unwind_index_metadata(&old_metadata, r); cf_ll_buf_free(&particles_llb); return -result; } // Now ok to store or drop key, as determined by message. as_record_finalize_key(r, ns, rd->key, rd->key_size); *is_delete = n_new_bins == 0; cf_ll_buf_free(&particles_llb); return AS_OK; }
// Apply a remote record to a single-bin record by writing its pickle through
// as_record_write_from_pickle(). Old-format pickles are delegated to
// old_record_apply_ssd_single_bin().
//
// Returns AS_OK on success, AS_ERR_UNKNOWN if the remote record carries more
// than one bin, or the negated write result on write failure (index metadata
// is unwound first). On success *is_delete is true when rr has no bins.
int
record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_ssd_single_bin(rr, rd, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* rec = rd->r;
	uint16_t incoming_bins = rr->n_bins;

	if (incoming_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, incoming_bins);
		return AS_ERR_UNKNOWN;
	}

	// Not used for flattening, but tells later code whether bins are in use.
	rd->n_bins = incoming_bins;

	// Update the as_index metadata, keeping the old values for unwinding.
	index_metadata saved_metadata;

	update_index_metadata(rr, &saved_metadata, rec);

	int rc = as_record_write_from_pickle(rd);

	if (rc >= 0) {
		// Write succeeded - ok to store or drop the key now.
		as_record_finalize_key(rec, ns, rr->key, rr->key_size);

		*is_delete = (incoming_bins == 0);

		return AS_OK;
	}

	cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
	unwind_index_metadata(&saved_metadata, rec);

	return -rc;
}
// Reconcile the key sent in the client message (if any) with the key stored
// for the record (if any).
//
// Returns 0 when there is nothing to complain about, AS_ERR_UNKNOWN if a
// stored key cannot be fetched, AS_ERR_KEY_MISMATCH if the client key does not
// match the stored key, or AS_ERR_UNSUPPORTED_FEATURE if get_msg_key() fails.
int
handle_msg_key(as_transaction* tr, as_storage_rd* rd)
{
	as_namespace* ns = tr->rsv.ns;

	if (rd->r->key_stored != 1) {
		// No key stored for this record - store one if the client sent it.
		// A key that arrives without a digest comes from an old client and is
		// not a cue to store the key. (Remove the digest check when we're sure
		// all old C clients are gone.)
		//
		// For data-in-memory, the key is not allocated, and
		// AS_INDEX_FLAG_KEY_STORED is not set, until the point of no return.
		if (as_transaction_has_digest(tr) && ! get_msg_key(tr, rd)) {
			return AS_ERR_UNSUPPORTED_FEATURE;
		}

		return 0;
	}

	// A key is stored for this record - make sure it gets rewritten. This
	// forces a device read for non-data-in-memory even if must_fetch_data is
	// false. As there is no advantage to using the loaded block afterwards in
	// that case, subsequent code is left as-is.
	if (! as_storage_record_get_key(rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} can't get stored key ", ns->name);
		return AS_ERR_UNKNOWN;
	}

	// If the client sent a key, it must agree with the stored one.
	if (as_transaction_has_key(tr) && ! check_msg_key(&tr->msgp->msg, rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} key mismatch ", ns->name);
		return AS_ERR_KEY_MISMATCH;
	}

	return 0;
}
/**
 * Send failure notification of general UDF execution, but check for special
 * LDT errors and return specific Wire Protocol error codes for these cases:
 * (1) Record not found (2)
 * (2) LDT Collection item not found (125)
 *
 * All other errors get the generic 100 (UDF FAIL) code.
 *
 * For the two special cases an "empty" response (no failure bin) is produced:
 * if db is non-NULL the response message is built and stashed in db to be sent
 * later, otherwise it is sent right away via single_transaction_response().
 *
 * Returns 0 after handling a special case, -1 if the response message could
 * not be built, otherwise whatever process_failure() returns.
 */
static inline int
process_udf_failure(udf_call *call, const as_string *s, cf_dyn_buf *db)
{
	char *err_text = as_string_tostring(s);
	size_t err_len = as_string_len((as_string *)s); // TODO - make as_string_len() take const
	long ldt_code = ldt_get_error_code(err_text, err_len);

	bool is_special_ldt_error = ldt_code != 0 &&
			(ldt_code == AS_PROTO_RESULT_FAIL_NOTFOUND ||
			 ldt_code == AS_PROTO_RESULT_FAIL_COLLECTION_ITEM_NOT_FOUND);

	if (! is_special_ldt_error) {
		// Generic path - report a UDF execution failure with a failure bin.
		cf_debug(AS_UDF, "Non-special LDT or General UDF Error(%s)", (char *) err_text);
		call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION;
		return process_failure(call, as_string_toval(s), db);
	}

	as_transaction *tr = call->tr;

	tr->result_code = (uint8_t)ldt_code;

	// Send an "empty" response, with no failure bin.
	if (! db) {
		single_transaction_response(tr, tr->rsv.ns, NULL/*ops*/, NULL /*bin*/, 0 /*nbins*/, 0, 0, NULL, NULL);
		return 0;
	}

	size_t response_sz = 0;
	uint8_t *response = (uint8_t *)as_msg_make_response_msg(
			tr->result_code, 0, 0, NULL, NULL, 0, tr->rsv.ns, NULL,
			&response_sz, as_transaction_trid(tr), NULL);

	if (! response) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} LDT UDF failed to make response msg ", tr->rsv.ns->name);
		return -1;
	}

	// Stash the message, to be sent later.
	db->buf = response;
	db->is_stack = false;
	db->alloc_sz = response_sz;
	db->used_sz = response_sz;

	return 0;
}
// Create a new sub-record for the given LDT record.
//
// A free slot is taken from lrecord, then up to
// LDT_SUBRECORD_RANDOMIZER_MAX_RETRIES digests are tried: the head record's
// digest is randomized, stamped with lrecord->version, and handed to
// as_aerospike_rec_create(). A return of 1 from that call (record already
// exists) triggers another attempt with a fresh random digest; any other
// non-zero value aborts immediately.
//
// Returns the slot's record (with an extra reference taken via
// as_val_reserve()) on success, or NULL on failure, in which case the slot is
// destroyed. Calls cf_crash() if no free slot is available.
as_rec *
crec_create(ldt_record *lrecord)
{
	// Generate Key Digest
	udf_record *h_urecord = (udf_record *) as_rec_source(lrecord->h_urec);
	cf_digest keyd = h_urecord->r_ref->r->key;
	as_namespace *ns = h_urecord->tr->rsv.ns;

	ldt_slot *lslotp = slot_lookup_free(lrecord, "crec_create");

	if (!lslotp) {
		cf_crash(AS_LDT, "Allocation error !!!");
	}

	slot_init(lslotp, lrecord);

	// Number of create attempts actually made. Counted explicitly so that the
	// failure message below is accurate - a post-incremented loop condition
	// would leave the counter one too high when all attempts are used up.
	int n_attempts = 0;

	while (n_attempts < LDT_SUBRECORD_RANDOMIZER_MAX_RETRIES) {
		n_attempts++;

		as_ldt_digest_randomizer(&keyd);
		as_ldt_subdigest_setversion(&keyd, lrecord->version);
		slot_setup_digest(lslotp, &keyd);

		int rv = as_aerospike_rec_create(lrecord->as, lslotp->c_urec_p);

		// rv == 0 if successful
		// rv == 1 if record is already found - retry
		// otherwise failure
		if (rv == 0) {
			cf_detail_digest(AS_LDT, &keyd, "Crec Create:Ptr(%p) Digest: version %ld", lslotp->c_urec_p, lrecord->version);
			as_val_reserve(lslotp->c_urec_p);
			return lslotp->c_urec_p;
		}

		if (rv != 1) {
			cf_warning(AS_LDT, "crec_create: LDT Sub-Record Create Error [rv=%d]... Fail", rv);
			break;
		}

		// Digest collision - count it and try another random digest.
		cf_atomic64_incr(&ns->lstats.ldt_randomizer_retry);
	}

	slot_destroy(lslotp, lrecord);
	cf_warning_digest(AS_LDT, &keyd, "ldt_aerospike_crec_create : Create failed after %d retries", n_attempts);

	return NULL;
}
// Service a read transaction from the local copy of the record.
//
// Outcomes visible in this function:
// - TRANS_DONE_ERROR - record not found, doomed, not live, key mismatch, bin
//   load failure, bad/missing ops, or repl_state_check() returned -3. In each
//   of these cases read_local_done() is called with the error code.
// - TRANS_IN_PROGRESS / TRANS_WAITING - repl_state_check() returned 1 / some
//   other non-zero value; the record is released and no response is sent.
// - TRANS_DONE_SUCCESS - a response was produced (or, for GET_NO_BINS,
//   read_local_done() was called with AS_OK).
//
// NOTE(review): the statement order matters here - the stack arrays are sized
// from values loaded by earlier calls, and the response built under the record
// lock is only sent after as_record_done().
transaction_status
read_local(as_transaction* tr)
{
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	as_index_ref r_ref;

	// Non-zero means the record could not be obtained - report not-found.
	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
		read_local_done(tr, NULL, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_record* r = r_ref.r;

	// Check if it's an expired or truncated record.
	if (as_record_is_doomed(r, ns)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	int result = repl_state_check(r, tr);

	if (result != 0) {
		// -3 is the only result that yields an error response here.
		if (result == -3) {
			read_local_done(tr, &r_ref, NULL, AS_ERR_UNAVAILABLE);
			return TRANS_DONE_ERROR;
		}

		// No response sent to origin.
		as_record_done(&r_ref, ns);
		return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
	}

	// Check if it's a tombstone.
	if (! as_record_is_live(r)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_storage_rd rd;

	// From here on, every exit passes &rd to read_local_done() or closes it
	// explicitly with as_storage_record_close().
	as_storage_record_open(ns, r, &rd);

	// If configuration permits, allow reads to use page cache.
	rd.read_page_cache = ns->storage_read_page_cache;

	// Check the key if required.
	// Note - for data-not-in-memory "exists" ops, key check is expensive!
	if (as_transaction_has_key(tr) && as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
		read_local_done(tr, &r_ref, &rd, AS_ERR_KEY_MISMATCH);
		return TRANS_DONE_ERROR;
	}

	// Metadata-only read - copy metadata to the transaction, skip the bins.
	if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		read_local_done(tr, &r_ref, &rd, AS_OK);
		return TRANS_DONE_SUCCESS;
	}

	// Negative results are negated to form the error code passed on.
	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	// Stack space for bins is only reserved when data is not in memory; the
	// array length is 0 otherwise.
	as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];

	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	if (! as_bin_inuse_has(&rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
		read_local_done(tr, &r_ref, &rd, AS_ERR_UNKNOWN);
		return TRANS_DONE_ERROR;
	}

	// Upper bound on response entries: all bins for GET_ALL, otherwise one per
	// op in the message.
	uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ? rd.n_bins : m->n_ops;

	as_msg_op* ops[bin_count];
	as_msg_op** p_ops = ops;
	as_bin* response_bins[bin_count];
	uint16_t n_bins = 0;

	// Holds results produced by CDT read ops; destroyed with
	// destroy_stack_bins() on every exit path after this point that has
	// started filling it.
	as_bin result_bins[bin_count];
	uint32_t n_result_bins = 0;

	if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
		// Respond with every bin; no per-op array is passed on.
		p_ops = NULL;
		n_bins = rd.n_bins;
		as_bin_get_all_p(&rd, response_bins);
	}
	else {
		if (m->n_ops == 0) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
			read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
			return TRANS_DONE_ERROR;
		}

		// With RESPOND_ALL_OPS, an op with nothing to return still gets a
		// response entry, with a NULL bin.
		bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

		as_msg_op* op = 0;
		int n = 0;

		while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
			if (op->op == AS_MSG_OP_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b || respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = b;
				}
			}
			else if (op->op == AS_MSG_OP_CDT_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b) {
					// Next free result slot; only counted as used (via
					// n_result_bins++) if the CDT read leaves it in use.
					as_bin* rb = &result_bins[n_result_bins];
					as_bin_set_empty(rb);

					if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
						cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
						destroy_stack_bins(result_bins, n_result_bins);
						read_local_done(tr, &r_ref, &rd, -result);
						return TRANS_DONE_ERROR;
					}

					if (as_bin_inuse(rb)) {
						n_result_bins++;
						ops[n_bins] = op;
						response_bins[n_bins++] = rb;
					}
					else if (respond_all_ops) {
						ops[n_bins] = op;
						response_bins[n_bins++] = NULL;
					}
				}
				else if (respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = NULL;
				}
			}
			else {
				// Only READ and CDT_READ ops are accepted here.
				cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
				destroy_stack_bins(result_bins, n_result_bins);
				read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
				return TRANS_DONE_ERROR;
			}
		}
	}

	// Defines db along with dyn_bufdb, which is referenced below.
	// NOTE(review): presumably dyn_bufdb is the stack buffer backing db and
	// db.used_sz starts at 0 - confirm against the macro definition.
	cf_dyn_buf_define_size(db, 16 * 1024);

	if (tr->origin != FROM_BATCH) {
		// used_sz is passed by address to the response builder, which is
		// also given dyn_bufdb to build into.
		db.used_sz = db.alloc_sz;
		db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code, r->generation, r->void_time, p_ops, response_bins, n_bins, ns, (cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));

		// If the builder returned a different buffer, it is not the stack one.
		db.is_stack = db.buf == dyn_bufdb;
		// Note - not bothering to correct alloc_sz if buf was allocated.
	}
	else {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		// Since as_batch_add_result() constructs response directly in shared
		// buffer to avoid extra copies, can't use db.
		send_read_response(tr, p_ops, response_bins, n_bins, NULL);
	}

	destroy_stack_bins(result_bins, n_result_bins);
	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// Now that we're not under the record lock, send the message we just built.
	if (db.used_sz != 0) {
		send_read_response(tr, NULL, NULL, 0, &db);

		cf_dyn_buf_free(&db);
		tr->from.proto_fd_h = NULL;
	}

	return TRANS_DONE_SUCCESS;
}
/*
 * The work horse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure op hangs from the write hash as long
 * as it is not applied or failed. Any attempt to perform next operation has to
 * hang behind it unless it is finished. Also operation is assigned a timestamp
 * so that there is some protection in case the op arrives out of order, or the
 * same op comes back again. That would be a duplicate op ...
 *
 * Received a op message - I'm a winner duplicate on this partition. Perform the
 * UDF op and replicate to all the nodes in the replica list. We only replicate
 * the subrecord if the partition is in subrecord migration phase. If not, ship
 * both subrecord and record. In case partition is read replica on this node, do
 * the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 * 'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when op is shipped to the winner node.
 *
 * 1. Assert that current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to replica set. Once the write is done it sends the op ack.
 *
 * TODO: How do you handle retransmits?
 * TODO: How do you handle partition reservation? Is it something special.
 * TODO: How to send along with replication request? Same infra should be
 *       used by normal replication as well.
 *
 * There won't be any deadlock because the requests are triggered from the
 * write. Get down to the udf apply code. Replicate to replica set and then
 * make sure the response is sent back to the originating node. This node has
 * to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 *
 * What this handler does:
 * - If pr has no write request attached, returns -1 and touches nothing.
 * - If the write request arrived via proxy (wr->proxy_msg set), the response
 *   message m is re-stamped with the original proxy tid and forwarded to
 *   wr->proxy_node; *free_msg is set false.
 * - Otherwise the AS_PROTO payload of m is written to the client socket under
 *   wr->lock and the file handle is released; if there is no client handle,
 *   a pending UDF request is completed when udf_rw_needcomplete_wr() says so.
 * - Finally the write request is removed from the global write hash and
 *   released, and 0 is returned.
 */
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	write_request *wr = pr->wr;

	if (!wr) {
		return -1;
	}

	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.
	// 2. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.

	// Case 1: Non-originating node.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;

		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);

		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%u). : ", transaction_id);

		int rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM);

		if (rv != 0) {
			// Send failed - release the message ourselves.
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}

		// Either way, the caller must not free m.
		*free_msg = false;
	}
	// Case 2: Originating node.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");

		pthread_mutex_lock(&wr->lock);

		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto = NULL;
				size_t proto_sz = 0;

				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");

					// Nothing valid to send - force the loop below to be
					// skipped instead of reading indeterminate values.
					proto = NULL;
					proto_sz = 0;
				}

				size_t pos = 0;

				while (pos < proto_sz) {
					ssize_t n_sent = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);

					if (n_sent > 0) {
						pos += (size_t)n_sent;
					}
					else if (n_sent < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d "
									"sz %zu pos %zu rv %zd errno %d",
									wr->proto_fd_h->fd, proto_sz, pos, n_sent, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}

						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %zu pos %zu ", wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}

				// Only claim success if the whole response went out.
				if (proto_sz != 0 && pos == proto_sz) {
					cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
				}
			}

			// Done with the client connection for this transaction.
			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = NULL;
		}
		else {
			// this may be NULL if the request has already timedout and the wr
			// proto_fd_h will be cleaned up by then
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is node where internal scan or
			// query UDF is initiated where it happens so that there is
			// migration is going on and the request get routed to the remote
			// node which is winning node. This request may need the req_cb to
			// be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;

				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);

				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = NULL;
				}
			}
		}

		pthread_mutex_unlock(&wr->lock);
	}

	// This node is shipOp initiator. Remove it from the Global hash.
	global_keyd gk;

	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	WR_RELEASE(pr->wr);
	pr->wr = NULL;

	return 0;
}
// TODO - old pickle - remove in "six months". int old_record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete) { as_namespace* ns = rr->rsv->ns; as_record* r = rd->r; // Set rd->n_bins! as_storage_rd_load_n_bins(rd); // Set rd->bins! as_storage_rd_load_bins(rd, NULL); // For memory accounting, note current usage. uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd); // Keep old bins intact for sindex adjustment and unwinding. uint16_t n_old_bins = rd->n_bins; as_bin* old_bins = rd->bins; uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle); as_bin new_bins[n_new_bins]; memset(new_bins, 0, sizeof(new_bins)); rd->n_bins = n_new_bins; rd->bins = new_bins; // Fill the new bins and particles. int result = unpickle_bins(rr, rd, NULL); if (result != 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name); destroy_stack_bins(new_bins, n_new_bins); return result; } // Apply changes to metadata in as_index needed for and writing. index_metadata old_metadata; update_index_metadata(rr, &old_metadata, r); // Prepare to store or drop key, as determined by message. rd->key = rr->key; rd->key_size = rr->key_size; // Write the record to storage. if ((result = as_record_write_from_pickle(rd)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); unwind_index_metadata(&old_metadata, r); destroy_stack_bins(new_bins, n_new_bins); return -result; } // Success - adjust sindex, looking at old and new bins. if (! (skip_sindex && next_generation(r->generation, (uint16_t)rr->generation, ns)) && record_has_sindex(r, ns)) { write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd, old_bins, n_old_bins, new_bins, n_new_bins); } // Cleanup - destroy relevant bins, can't unwind after. destroy_stack_bins(old_bins, n_old_bins); // Fill out new_bin_space. 
as_bin_space* new_bin_space = NULL; if (n_new_bins != 0) { new_bin_space = (as_bin_space*) cf_malloc_ns(sizeof(as_bin_space) + sizeof(new_bins)); new_bin_space->n_bins = rd->n_bins; memcpy((void*)new_bin_space->bins, new_bins, sizeof(new_bins)); } // Swizzle the index element's as_bin_space pointer. as_bin_space* old_bin_space = as_index_get_bin_space(r); if (old_bin_space) { cf_free(old_bin_space); } as_index_set_bin_space(r, new_bin_space); // Now ok to store or drop key, as determined by message. as_record_finalize_key(r, ns, rd->key, rd->key_size); as_storage_record_adjust_mem_stats(rd, memory_bytes); *is_delete = n_new_bins == 0; return AS_OK; }
// TODO - old pickle - remove in "six months". int old_record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete) { as_namespace* ns = rr->rsv->ns; as_record* r = rd->r; rd->n_bins = 1; // Set rd->bins! as_storage_rd_load_bins(rd, NULL); // For memory accounting, note current usage. uint64_t memory_bytes = 0; // TODO - as_storage_record_get_n_bytes_memory() could check bins in use. if (as_bin_inuse(rd->bins)) { memory_bytes = as_storage_record_get_n_bytes_memory(rd); } uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle); if (n_new_bins > 1) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins); return AS_ERR_UNKNOWN; } // Keep old bin intact for unwinding, clear record bin for incoming. as_bin old_bin; as_single_bin_copy(&old_bin, rd->bins); as_bin_set_empty(rd->bins); int result; // Fill the new bins and particles. if (n_new_bins == 1 && (result = unpickle_bins(rr, rd, NULL)) != 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name); unwind_dim_single_bin(&old_bin, rd->bins); return result; } // Apply changes to metadata in as_index needed for and writing. index_metadata old_metadata; update_index_metadata(rr, &old_metadata, r); // Write the record to storage. if ((result = as_record_write_from_pickle(rd)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); unwind_index_metadata(&old_metadata, r); unwind_dim_single_bin(&old_bin, rd->bins); return -result; } // Cleanup - destroy old bin, can't unwind after. as_bin_particle_destroy(&old_bin, true); as_storage_record_adjust_mem_stats(rd, memory_bytes); *is_delete = n_new_bins == 0; return AS_OK; }
// Apply a remote record by writing its pickle through
// as_record_write_from_pickle(). Old-format pickles are delegated to
// old_record_apply_ssd().
//
// Old and new bins are only materialised (on the stack, via alloca) when the
// secondary index has to be adjusted.
//
// Returns AS_OK on success, or the negated result of whichever step failed
// (loading n-bins, loading bins, unpacking remote bins, or the write). On
// success *is_delete is true when rr has no bins.
int
record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_ssd(rr, rd, skip_sindex, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* rec = rd->r;

	bool adjust_sindex =
			(! skip_sindex ||
			 ! next_generation(rec->generation, (uint16_t)rr->generation, ns)) &&
			record_has_sindex(rec, ns);

	int rc;

	uint16_t old_count = 0;
	as_bin *old_bins = NULL;

	uint16_t new_count = rr->n_bins;
	as_bin *new_bins = NULL;

	if (adjust_sindex) {
		// TODO - separate function?
		rc = as_storage_rd_load_n_bins(rd);

		if (rc < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name);
			return -rc;
		}

		old_count = rd->n_bins;
		old_bins = alloca(old_count * sizeof(as_bin));

		rc = as_storage_rd_load_bins(rd, old_bins);

		if (rc < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name);
			return -rc;
		}

		// Won't use to flatten.
		rd->bins = NULL;

		if (new_count != 0) {
			size_t new_bins_sz = new_count * sizeof(as_bin);

			new_bins = alloca(new_bins_sz);
			memset(new_bins, 0, new_bins_sz);

			rc = as_flat_unpack_remote_bins(rr, new_bins);

			if (rc != 0) {
				cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
				return -rc;
			}
		}
	}

	// Won't use to flatten, but needed to know if bins are in use.
	rd->n_bins = new_count;

	// Update the as_index metadata, keeping the old values for unwinding.
	index_metadata saved_metadata;

	update_index_metadata(rr, &saved_metadata, rec);

	rc = as_record_write_from_pickle(rd);

	if (rc < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&saved_metadata, rec);
		return -rc;
	}

	// Success - adjust sindex, looking at old and new bins.
	if (adjust_sindex) {
		write_sindex_update(ns, as_index_get_set_name(rec, ns), rr->keyd,
				old_bins, old_count, new_bins, new_count);
	}

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(rec, ns, rr->key, rr->key_size);

	*is_delete = (new_count == 0);

	return AS_OK;
}
// Apply a remote record to a single-bin record, operating directly on the bin
// that as_storage_rd_load_bins() makes rd->bins point to. Old-format pickles
// are delegated to old_record_apply_dim_single_bin().
//
// Returns AS_OK on success; AS_ERR_UNKNOWN if rr carries more than one bin; or
// the negated result of as_flat_unpack_remote_bins() /
// as_record_write_from_pickle() on failure, after restoring the previous bin
// (and, for a write failure, the index metadata). On success *is_delete is
// true when rr has no bins.
int
record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_dim_single_bin(rr, rd, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* rec = rd->r;

	rd->n_bins = 1;

	// Set rd->bins!
	as_storage_rd_load_bins(rd, NULL);

	// Current memory usage, for the stats adjustment at the end.
	// TODO - as_storage_record_get_n_bytes_memory() could check bins in use.
	uint64_t prev_memory_bytes = as_bin_inuse(rd->bins) ?
			as_storage_record_get_n_bytes_memory(rd) : 0;

	uint16_t incoming_bins = rr->n_bins;

	if (incoming_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, incoming_bins);
		return AS_ERR_UNKNOWN;
	}

	// Keep old bin for unwinding.
	as_bin saved_bin;

	as_single_bin_copy(&saved_bin, rd->bins);

	// No stack new bin - simpler to operate directly on bin embedded in index.
	as_bin_set_empty(rd->bins);

	int rc;

	// Only unpack when there is actually a bin in the remote record.
	if (incoming_bins == 1) {
		rc = as_flat_unpack_remote_bins(rr, rd->bins);

		if (rc != 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
			unwind_dim_single_bin(&saved_bin, rd->bins);
			return -rc;
		}
	}

	// Won't use to flatten, but needed to know if bins are in use. Amazingly,
	// rd->n_bins 0 ok adjusting memory stats. Also, rd->bins already filled.
	rd->n_bins = incoming_bins;

	// Update the as_index metadata, keeping the old values for unwinding.
	index_metadata saved_metadata;

	update_index_metadata(rr, &saved_metadata, rec);

	rc = as_record_write_from_pickle(rd);

	if (rc < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&saved_metadata, rec);
		unwind_dim_single_bin(&saved_bin, rd->bins);
		return -rc;
	}

	// Point of no return - the previous bin's particle is destroyed.
	as_bin_particle_destroy(&saved_bin, true);

	as_storage_record_adjust_mem_stats(rd, prev_memory_bytes);

	*is_delete = (incoming_bins == 0);

	return AS_OK;
}
// TODO - old pickle - remove in "six months". int old_record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete) { as_namespace* ns = rr->rsv->ns; as_record* r = rd->r; bool has_sindex = ! (skip_sindex && next_generation(r->generation, (uint16_t)rr->generation, ns)) && record_has_sindex(r, ns); uint16_t n_old_bins = 0; int result; if (has_sindex) { // Set rd->n_bins! if ((result = as_storage_rd_load_n_bins(rd)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name); return -result; } n_old_bins = rd->n_bins; } as_bin old_bins[n_old_bins]; if (has_sindex) { // Set rd->bins! if ((result = as_storage_rd_load_bins(rd, old_bins)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name); return -result; } } // Stack space for resulting record's bins. uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle); as_bin new_bins[n_new_bins]; memset(new_bins, 0, sizeof(new_bins)); rd->n_bins = n_new_bins; rd->bins = new_bins; // Fill the new bins and particles. cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); if ((result = unpickle_bins(rr, rd, &particles_llb)) != 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name); cf_ll_buf_free(&particles_llb); return result; } // Apply changes to metadata in as_index needed for and writing. index_metadata old_metadata; update_index_metadata(rr, &old_metadata, r); // Prepare to store or drop key, as determined by message. rd->key = rr->key; rd->key_size = rr->key_size; // Write the record to storage. if ((result = as_record_write_from_pickle(rd)) < 0) { cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); unwind_index_metadata(&old_metadata, r); cf_ll_buf_free(&particles_llb); return -result; } // Success - adjust sindex, looking at old and new bins. 
if (has_sindex) { write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd, old_bins, n_old_bins, new_bins, n_new_bins); } // Now ok to store or drop key, as determined by message. as_record_finalize_key(r, ns, rd->key, rd->key_size); *is_delete = n_new_bins == 0; cf_ll_buf_free(&particles_llb); return AS_OK; }
// Apply a pickled record to this node's copy of the partition.
//
// The record identified by keyd is fetched or created in the appropriate index
// tree, replaced from pickled_buf via as_record_unpickle_replace(), stamped
// with the supplied generation / void_time / last_update_time, written to
// storage, and finally handed to xdr_write() unless XDR rules say otherwise.
//
// Returns:
// - AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE if as_storage_has_space()
//   says there is no space.
// - AS_PROTO_RESULT_FAIL_UNKNOWN if the record can't be fetched/created, if
//   ldt_get_prole_version() fails, or if unpickling fails. In the latter two
//   cases a record created by this call is deleted from the tree again.
// - AS_PROTO_RESULT_OK otherwise.
//
// NOTE(review): statement order matters - the stack arrays below are sized
// from values computed just before them.
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	// With LDT enabled, the info flags select the sub-record tree or mark the
	// record as an LDT parent.
	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	// rv < 0 is failure; rv == 1 is treated below as "record was created".
	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	// The on-device record is ignored unless a sindex was touched or this is
	// an LDT parent.
	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// The pickle starts with a 16-bit bin count in network byte order.
	// TODO - we really need an inline utility for this!
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	// For multi-bin, data-not-in-memory namespaces, make room for the larger
	// of the current and incoming bin counts.
	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin && newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	// Stack space for bins is only reserved when data is not in memory; the
	// array length is 0 otherwise.
	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ? 0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		// Undo a creation made by this call before bailing out.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	// Memory usage before the replace, for the stats adjustment below. A
	// freshly created record starts from 0.
	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz, &p_stack_particles, has_sindex) != 0) {
		// Undo a creation made by this call before bailing out.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	// Stamp the record with the metadata supplied by the caller.
	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	// For an LDT parent, pick the version to store: the prole version when the
	// partition versions match and a prole version is set, the source version
	// when the partition versions don't match, otherwise none.
	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match && linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set, p_stack_particles, __FILE__, __LINE__);

		// Failure is only logged - execution continues.
		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	// Read the set-id before the record reference is given up.
	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 || is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}
/* Workhorse function to send response back to the client after UDF execution.
 *
 * Assumption: The call should be setup properly pointing to the tr.
 *
 * Special Handling: If it is background udf job do not send any
 * response to client
 *
 * The value is turned into a single bin named bin_name. If db is non-NULL the
 * response message is built and stashed in db to be sent later; otherwise the
 * response is sent right away via single_transaction_response().
 *
 * Returns 0 on success (including the background no-op case), -1 if the
 * particle buffer or the response message could not be allocated.
 */
int
process_response(udf_call *call, const char *bin_name, const as_val *val,
		cf_dyn_buf *db)
{
	// NO response if background UDF
	if (call->def->type == AS_UDF_OP_BACKGROUND) {
		return 0;
	}

	// Note - this function quietly handles a null val. The response call will
	// be given a bin with a name but not 'in use', and it does the right thing.
	as_bin response_bin;
	as_bin *bin = &response_bin;

	uint32_t particle_size = as_particle_size_from_asval(val);

	// Particles up to this size are built on the stack, larger ones on the
	// heap.
	static const size_t MAX_STACK_SIZE = 32 * 1024;

	bool use_heap = particle_size > MAX_STACK_SIZE;

	uint8_t stack_particle[use_heap ? 0 : particle_size];
	uint8_t *particle_buf = stack_particle;

	if (use_heap) {
		particle_buf = (uint8_t *)cf_malloc(particle_size);

		if (! particle_buf) {
			cf_warning(AS_UDF, "failed alloc for particle size %u", particle_size);
			return -1;
		}
	}

	as_transaction *tr = call->tr;
	as_namespace *ns = tr->rsv.ns;

	as_bin_init(ns, bin, bin_name);
	as_bin_particle_stack_from_asval(bin, particle_buf, val);

	int rv = 0;

	if (! db) {
		single_transaction_response(tr, ns, NULL, &bin, 1, tr->generation, tr->void_time, NULL, NULL);
	}
	else {
		size_t response_sz = 0;
		uint8_t *response = (uint8_t *)as_msg_make_response_msg(
				tr->result_code, tr->generation, tr->void_time, NULL, &bin, 1,
				ns, NULL, &response_sz, as_transaction_trid(tr), NULL);

		if (response) {
			// Stash the message, to be sent later.
			db->buf = response;
			db->is_stack = false;
			db->alloc_sz = response_sz;
			db->used_sz = response_sz;
		}
		else {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} UDF failed to make response msg ", ns->name);
			rv = -1;
		}
	}

	// Single cleanup point for the heap-allocated particle buffer.
	if (particle_buf != stack_particle) {
		cf_free(particle_buf);
	}

	return rv;
}