//------------------------------------------------
// Runs in every thread of every read queue, pops
// readreq objects, does the read and reports the
// read transaction duration.
//
static void* run_reads(void* pv_req_queue) {
    cf_queue* p_req_queue = (cf_queue*)pv_req_queue;
    readreq* p_readreq;

    while (g_running) {
        if (cf_queue_pop(p_req_queue, (void*)&p_readreq, 100) != CF_QUEUE_OK) {
            continue;
        }

        if (g_use_valloc) {
            uint8_t* p_buffer = cf_valloc(p_readreq->size);

            if (p_buffer) {
                read_and_report(p_readreq, p_buffer);
                free(p_buffer);
            }
            else {
                fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
            }
        }
        else {
            // Over-allocate so the buffer can be aligned to a 4 KiB boundary.
            uint8_t stack_buffer[p_readreq->size + 4096];
            uint8_t* p_buffer = align_4096(stack_buffer);

            read_and_report(p_readreq, p_buffer);
        }

        free(p_readreq);
        cf_atomic_int_decr(&g_read_reqs_queued);
    }

    return (0);
}
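// align_4096() is used above but not defined in this excerpt. A minimal
// sketch of what it plausibly does - bump a stack address up to the next
// 4 KiB boundary so direct I/O sees an aligned buffer (the real helper may
// differ; needs <stdint.h> for uintptr_t):
static inline uint8_t* align_4096(uint8_t* stack_buffer) {
    // Round up to the next multiple of 4096. The caller over-allocates by
    // 4096 bytes, so the aligned pointer always stays within the buffer.
    return (uint8_t*)(((uintptr_t)stack_buffer + 4095) & ~(uintptr_t)4095);
}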
/* Process a read when it returns from aio_read. */
static void process_read(as_async_info_t *info) {
    if (!g_running) {
        return;
    }

    cf_atomic_int_decr(&g_read_reqs_queued);

    uint64_t stop_time = cf_getms();
    fd_put(info->p_readreq.p_device, info->fd);

    if (stop_time != -1) {
        histogram_insert_data_point(g_p_raw_read_histogram,
                safe_delta_ms(info->raw_start_time, stop_time));
        histogram_insert_data_point(g_p_read_histogram,
                safe_delta_ms(info->p_readreq.start_time, stop_time));
        histogram_insert_data_point(
                info->p_readreq.p_device->p_raw_read_histogram,
                safe_delta_ms(info->raw_start_time, stop_time));
    }

    if (g_use_valloc && info->p_buffer) {
        free(info->p_buffer);
    }

    // Recycle this info struct - push its address back on the free queue.
    uintptr_t temp = (uintptr_t)info;
    cf_queue_push(async_info_queue, (void*)&temp);
}
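// The final push above recycles the as_async_info_t slot: the queue stores
// pointer-sized values by copy, not the structs themselves. A hedged sketch
// of the matching pop side, assuming async_info_queue was created with an
// element size of sizeof(uintptr_t) (the real allocation loop may differ):
static as_async_info_t* async_info_get(void) {
    uintptr_t temp;

    // cf_queue_pop() copies one pointer-sized element into temp; block
    // until a recycled slot is available.
    if (cf_queue_pop(async_info_queue, (void*)&temp, CF_QUEUE_FOREVER) !=
            CF_QUEUE_OK) {
        return NULL;
    }

    return (as_async_info_t*)temp;
}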
/*
 * Create a tree "stub" for the storage-has-index case.
 * Returns: 1 = new
 *          0 = success (found)
 *         -1 = fail
 */
int as_index_ref_initialize(as_index_tree *tree, cf_digest *key,
        as_index_ref *index_ref, bool create_p, as_namespace *ns) {
    /* Allocate memory for the new node and set the node parameters */
    cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);

    if (0 == n_h) {
        // cf_debug(AS_INDEX," malloc failed ");
        return -1;
    }

    as_index *n = RESOLVE_H(n_h);
    n->key = *key;
    n->rc = 1;
    n->left_h = n->right_h = tree->sentinel_h;
    n->color = AS_RED;
    n->parent_h = tree->sentinel_h;

    if (AS_STORAGE_ENGINE_KV == ns->storage_type) {
        // Careful here - this is now unsigned.
        n->storage_key.kv.file_id = STORAGE_INVALID_FILE_ID;
    }
    else {
        cf_crash(AS_INDEX, "non-KV storage type ns %s key %p", ns->name, key);
    }

    index_ref->r = n;
    index_ref->r_h = n_h;

    if (!index_ref->skip_lock) {
        olock_vlock(g_config.record_locks, key, &(index_ref->olock));
        cf_atomic_int_incr(&g_config.global_record_lock_count);
    }

    as_index_reserve(n);
    cf_atomic_int_add(&g_config.global_record_ref_count, 2);

    int rv = !as_storage_record_exists(ns, key);

    // Unlock if not found and we're not creating it.
    if (rv && !create_p) {
        if (!index_ref->skip_lock) {
            pthread_mutex_unlock(index_ref->olock);
            cf_atomic_int_decr(&g_config.global_record_lock_count);
        }

        as_index_release(n);
        cf_atomic_int_decr(&g_config.global_record_ref_count);
        cf_arenax_free(tree->arena, n_h);

        index_ref->r = 0;
        index_ref->r_h = 0;
    }

    return rv;
}
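// A hedged usage sketch for the function above. Note that skip_lock is read
// inside as_index_ref_initialize(), so callers must set it before the call
// (this wrapper is illustrative, not from the source):
static int get_or_stub(as_index_tree *tree, cf_digest *key, as_namespace *ns) {
    as_index_ref ref;
    ref.skip_lock = false;

    int rv = as_index_ref_initialize(tree, key, &ref, true, ns);

    if (rv == -1) {
        return -1; // arena allocation failed
    }

    // rv == 1: no record in storage - we hold a fresh, locked, reserved stub.
    // rv == 0: record exists - ref.r is locked and reserved for us.
    // ... use ref.r, then release the olock and the reference when done.
    return rv;
}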
void as_index_tree_purge(as_index_tree *tree, as_index *r, cf_arenax_handle r_h) {
    // Don't purge the sentinel.
    if (r_h == tree->sentinel_h) {
        return;
    }

    as_index_tree_purge(tree, RESOLVE_H(r->left_h), r->left_h);
    as_index_tree_purge(tree, RESOLVE_H(r->right_h), r->right_h);

    if (0 == as_index_release(r)) {
        if (tree->destructor) {
            tree->destructor(r, tree->destructor_udata);
        }

        cf_arenax_free(tree->arena, r_h);
    }

    cf_atomic_int_decr(&g_config.global_record_ref_count);
}
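// The destructor called above is a per-tree callback. Judging only by the
// call site, it takes the record and an opaque udata pointer and returns
// nothing. An illustrative (hypothetical) example:
static void example_record_destructor(as_index *r, void *udata) {
    // Free whatever the record owns (storage references, bin data, etc.)
    // here - cf_arenax_free() reclaims the arena slot right after this
    // callback returns.
    (void)r;
    (void)udata;
}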
// If there's an element with specified digest in the tree, delete it.
//
// Returns:
//      0 - found and deleted
//     -1 - not found
// TODO - nobody cares about the return value, make it void?
int as_index_delete(as_index_tree *tree, cf_digest *keyd) {
    as_index *r;
    cf_arenax_handle r_h;
    bool retry;

    // Save parents as we search for the specified element (or its successor).
    // (131 slots - presumably a root stub, plus a worst-case red-black path
    // of 2 * 64 levels, plus the successor's child.)
    as_index_ele eles[(64 * 2) + 3];
    as_index_ele *ele;

    do {
        ele = eles;

        pthread_mutex_lock(&tree->lock);

        ele->parent = NULL; // we'll never look this far up
        ele->me_h = tree->root_h;
        ele->me = tree->root;

        r_h = tree->root->left_h;
        r = RESOLVE_H(r_h);

        while (r_h != tree->sentinel_h) {
            ele++;
            ele->parent = ele - 1;
            ele->me_h = r_h;
            ele->me = r;

            int cmp = cf_digest_compare(keyd, &r->key);

            if (cmp == 0) {
                break; // found, we'll be deleting it
            }

            r_h = cmp > 0 ? r->left_h : r->right_h;
            r = RESOLVE_H(r_h);
        }

        if (r_h == tree->sentinel_h) {
            pthread_mutex_unlock(&tree->lock);
            return -1; // not found, nothing to delete
        }

        // We found the tree element, so we'll be deleting it.

        retry = false;

        if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
            // The tree is being reduced - could take long, unlock so reads
            // and overwrites aren't blocked.
            pthread_mutex_unlock(&tree->lock);

            // Wait until the tree reduce is done...
            pthread_mutex_lock(&tree->reduce_lock);
            pthread_mutex_unlock(&tree->reduce_lock);

            // ... and start over - we unlocked, so the tree may have changed.
            retry = true;
        }
    } while (retry);

    // Delete the element.

    // Snapshot the element to delete, r. (Already have r_h and r shortcuts.)
    as_index_ele *r_e = ele;

    if (r->left_h != tree->sentinel_h && r->right_h != tree->sentinel_h) {
        // Search down for a "successor"...
        ele++;
        ele->parent = ele - 1;
        ele->me_h = r->right_h;
        ele->me = RESOLVE_H(ele->me_h);

        while (ele->me->left_h != tree->sentinel_h) {
            ele++;
            ele->parent = ele - 1;
            ele->me_h = ele->parent->me->left_h;
            ele->me = RESOLVE_H(ele->me_h);
        }
    }
    // else ele is left at r, i.e. s == r

    // Snapshot the successor, s. (Note - s could be r.)
    as_index_ele *s_e = ele;
    cf_arenax_handle s_h = s_e->me_h;
    as_index *s = s_e->me;

    // Get the appropriate child of s. (Note - child could be sentinel.)
    ele++;

    if (s->left_h == tree->sentinel_h) {
        ele->me_h = s->right_h;
    }
    else {
        ele->me_h = s->left_h;
    }

    ele->me = RESOLVE_H(ele->me_h);

    // Cut s (remember, it could be r) out of the tree.
    ele->parent = s_e->parent;

    if (s_h == s_e->parent->me->left_h) {
        s_e->parent->me->left_h = ele->me_h;
    }
    else {
        s_e->parent->me->right_h = ele->me_h;
    }

    // Rebalance at ele if necessary. (Note - if r != s, r is in the tree,
    // and its parent may change during rebalancing.)
    if (s->color == AS_BLACK) {
        as_index_delete_rebalance(tree, ele);
    }

    if (s != r) {
        // s was a successor distinct from r, put it in r's place in the tree.
        s->left_h = r->left_h;
        s->right_h = r->right_h;
        s->color = r->color;

        if (r_h == r_e->parent->me->left_h) {
            r_e->parent->me->left_h = s_h;
        }
        else {
            r_e->parent->me->right_h = s_h;
        }
    }

    // We may now destroy r, which is no longer in the tree.
    if (0 == as_index_release(r)) {
        if (tree->destructor) {
            tree->destructor(r, tree->destructor_udata);
        }

        cf_arenax_free(tree->arena, r_h);
    }

    cf_atomic_int_decr(&g_config.global_record_ref_count);
    tree->elements--;

    pthread_mutex_unlock(&tree->reduce_lock);
    pthread_mutex_unlock(&tree->lock);

    return 0;
}
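// cf_digest_compare() drives the descent above. A plausible minimal version,
// assuming cf_digest wraps a fixed-size byte array (the real implementation
// may differ; needs <string.h> for memcmp):
static inline int cf_digest_compare(const cf_digest *d1, const cf_digest *d2) {
    // Byte-wise order is all the tree needs - any total order works, as
    // long as inserts, lookups, and deletes use the same comparison.
    return memcmp(d1->digest, d2->digest, sizeof(d1->digest));
}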
// Build response to batch request.
static void batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r) {
    as_namespace* ns = btr->ns;
    batch_digests *bmds = btr->digests;
    bool get_data = btr->get_data;
    uint32_t yield_count = 0;

    for (int i = 0; i < bmds->n_digests; i++) {
        batch_digest *bmd = &bmds->digest[i];

        if (bmd->done == false) {
            // try to get the key
            as_partition_reservation rsv;
            AS_PARTITION_RESERVATION_INIT(rsv);
            cf_node other_node = 0;
            uint64_t cluster_key;

            if (! *bb_r) {
                *bb_r = cf_buf_builder_create_size(1024 * 4);
            }

            int rv = as_partition_reserve_read(ns,
                    as_partition_getid(bmd->keyd), &rsv, &other_node,
                    &cluster_key);

            if (rv == 0) {
                cf_atomic_int_incr(&g_config.batch_tree_count);

                as_index_ref r_ref;
                r_ref.skip_lock = false;

                int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns);

                if (rec_rv == 0) {
                    as_index *r = r_ref.r;

                    // Check that this isn't an expired record waiting to die.
                    if (r->void_time &&
                            r->void_time < as_record_void_time_get()) {
                        as_msg_make_error_response_bufbuilder(&bmd->keyd,
                                AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
                    }
                    else {
                        // Make sure it's brought in from storage if necessary.
                        as_storage_rd rd;

                        if (get_data) {
                            as_storage_record_open(ns, r, &rd, &r->key);
                            rd.n_bins = as_bin_get_n_bins(r, &rd);
                        }

                        // Note: this array must stay in scope until the
                        // response for this record has been built, since in
                        // the get-data-with-record-on-device case, it's
                        // copied by reference directly into the record
                        // descriptor.
                        as_bin stack_bins[!get_data ||
                                rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

                        if (get_data) {
                            // Figure out which bins you want - for now, all.
                            rd.bins = as_bin_get_all(r, &rd, stack_bins);
                            rd.n_bins = as_bin_inuse_count(&rd);
                        }

                        as_msg_make_response_bufbuilder(r,
                                (get_data ? &rd : NULL), bb_r, !get_data,
                                (get_data ? NULL : ns->name), true, false,
                                btr->binlist);

                        if (get_data) {
                            as_storage_record_close(r, &rd);
                        }
                    }

                    as_record_done(&r_ref, ns);
                }
                else {
                    // TODO - what about empty records?
                    cf_debug(AS_BATCH, "batch_build_response: as_record_get "
                            "returned %d : key %"PRIx64, rec_rv,
                            *(uint64_t *)&bmd->keyd);
                    as_msg_make_error_response_bufbuilder(&bmd->keyd,
                            AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
                }

                bmd->done = true;

                as_partition_release(&rsv);
                cf_atomic_int_decr(&g_config.batch_tree_count);
            }
            else {
                cf_debug(AS_BATCH, "batch_build_response: partition reserve "
                        "read failed: rv %d", rv);

                as_msg_make_error_response_bufbuilder(&bmd->keyd,
                        AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);

                if (other_node != 0) {
                    bmd->node = other_node;
                    // cf_node is an integer node ID, not a pointer - print
                    // it as a 64-bit hex value rather than with %p.
                    cf_debug(AS_BATCH, "other_node is: %"PRIx64".",
                            other_node);
                }
                else {
                    cf_debug(AS_BATCH, "other_node is NULL.");
                }
            }

            yield_count++;

            if (yield_count % g_config.batch_priority == 0) {
                usleep(1);
            }
        }
    }
}
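// The yield logic above sleeps once every g_config.batch_priority digests so
// a big batch doesn't monopolize a service thread. The same pattern in
// isolation (illustrative, not from the source; needs <unistd.h> for usleep):
static void process_all(int n_items) {
    uint32_t yield_count = 0;

    for (int i = 0; i < n_items; i++) {
        // ... handle one item ...

        // Yield periodically; a smaller divisor yields more often. The
        // divisor must be non-zero, or the modulo faults.
        if (++yield_count % 8 == 0) { // 8 stands in for the priority knob
            usleep(1);
        }
    }
}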