/*
 * Sends the io object's buffer to the network. If the send cannot complete,
 * the io object is queued up to be picked up by the asynchronous queueing
 * thread.
 *
 * vtable:
 *
 * start_cb: Callback to the module before the real IO is started.
 *           It returns the status:
 *           AS_NETIO_OK:  Everything is ok, go ahead with the IO.
 *           AS_NETIO_ERR: There was an issue like abort/err/timeout etc.
 *
 * finish_cb: Callback to the module with the status code of the IO call.
 *            AS_NETIO_OK:       Everything went fine.
 *            AS_NETIO_CONTINUE: The IO was requeued. Generally a no-op in finish_cb.
 *            AS_NETIO_ERR:      IO erred out due to some issue.
 *
 *            The callback should do any required cleanup, e.g. release the
 *            reference to the user data.
 *
 * Return code:
 * AS_NETIO_OK:  Everything is fine, normal code flow. Both start_cb and
 *               finish_cb were called.
 *
 * AS_NETIO_ERR: Something failed, either in the calling module's start_cb or
 *               while doing network IO. finish_cb is called.
 *
 * Consumption:
 * This function consumes the qtr reference. It calls finish_cb, which releases
 * the reference to qtr.
 * In case of AS_NETIO_CONTINUE, this function also consumes bb_r and the
 * reference to fd_h. The background thread is then responsible for freeing
 * bb_r and releasing the reference to fd_h.
 */
int as_netio_send(as_netio *io, void *q_to_use, bool blocking)
{
	cf_queue *q = (cf_queue *)q_to_use;

	int ret = io->start_cb(io, io->seq);

	if (ret == AS_NETIO_OK) {
		ret = io->finish_cb(io, as_netio_send_packet(io->fd_h, io->bb_r,
				&io->offset, blocking));
	}
	else {
		ret = io->finish_cb(io, ret);
	}

	// Requeue if needed.
	switch (ret) {
	case AS_NETIO_CONTINUE:
		if (! q) {
			cf_queue_push(g_netio_queue, io);
		}
		else {
			io->slow = true;
			cf_queue_push(q, io);
		}
		break;
	default:
		ret = AS_NETIO_OK;
		break;
	}

	return ret;
}
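/*
 * A minimal sketch of how a module might satisfy the start_cb / finish_cb
 * contract described above. The callback names, the my_qtr type and its
 * fields, my_qtr_release(), and the io->data / seq types are illustrative
 * assumptions, not the actual server implementation.
 */
static int query_netio_start_cb(as_netio *io, uint32_t seq)
{
	my_qtr *qtr = (my_qtr *)io->data;	// hypothetical user data

	// Abort the IO up front if the transaction has already failed or timed out.
	if (qtr->err || cf_getms() > qtr->deadline) {
		return AS_NETIO_ERR;
	}

	return AS_NETIO_OK;
}

static int query_netio_finish_cb(as_netio *io, int status)
{
	my_qtr *qtr = (my_qtr *)io->data;

	// Requeued - ownership passes to the background netio thread.
	if (status == AS_NETIO_CONTINUE) {
		return status;
	}

	// Terminal status (OK or ERR) - release the reference taken at submit time.
	my_qtr_release(qtr);

	return status;
}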
int cf_queue_priority_push(cf_queue_priority *q, void *ptr, int pri)
{
	if (q->threadsafe && (0 != pthread_mutex_lock(&q->LOCK)))
		return(-1);

	int rv;

	if (pri == CF_QUEUE_PRIORITY_HIGH)
		rv = cf_queue_push(q->high_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_MEDIUM)
		rv = cf_queue_push(q->medium_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_LOW)
		rv = cf_queue_push(q->low_q, ptr);
	else {
		// cf_warning(CF_QUEUE, "SERIOUS! bad priority %d passed into queue priority push", pri);
		rv = -1;
	}

	if (rv == 0 && q->threadsafe)
		pthread_cond_signal(&q->CV);

	/* FIXME blow a gasket */
	if (q->threadsafe && (0 != pthread_mutex_unlock(&q->LOCK)))
		return(-1);

	return(rv);
}
void demarshal_file_handle_init()
{
	struct rlimit rl;

	pthread_mutex_lock(&g_file_handle_a_LOCK);

	if (g_file_handle_a == 0) {
		if (-1 == getrlimit(RLIMIT_NOFILE, &rl)) {
			cf_crash(AS_DEMARSHAL, "getrlimit: %s", cf_strerror(errno));
		}

		// Initialize the message pointer array and the unread byte counters.
		g_file_handle_a = cf_calloc(rl.rlim_cur, sizeof(as_proto *));
		cf_assert(g_file_handle_a, AS_DEMARSHAL, CF_CRITICAL, "allocation: %s", cf_strerror(errno));
		g_file_handle_a_sz = rl.rlim_cur;

		for (int i = 0; i < g_file_handle_a_sz; i++) {
			cf_queue_push(g_freeslot, &i);
		}

		pthread_create(&g_demarshal_reaper_th, 0, thr_demarshal_reaper_fn, 0);

		// If the config value is 0, set a default maximum number of client
		// file descriptors based on the rlimit.
		if (g_config.n_proto_fd_max == 0) {
			g_config.n_proto_fd_max = rl.rlim_cur / 2;
			cf_info(AS_DEMARSHAL, "setting default client file descriptors to %d", g_config.n_proto_fd_max);
		}
	}

	pthread_mutex_unlock(&g_file_handle_a_LOCK);
}
//
// This assumes the element we're looking for is unique! Returns
// CF_QUEUE_NOMATCH if the element is not found or not moved.
//
int cf_queue_priority_change(cf_queue_priority *priority_q, const void *ptr, int new_pri)
{
	cf_queue_priority_lock(priority_q);

	cf_queue *queues[3];

	queues[0] = priority_q->high_q;
	queues[1] = priority_q->medium_q;
	queues[2] = priority_q->low_q;

	int dest_q_itr = CF_QUEUE_PRIORITY_HIGH - new_pri;
	cf_queue *q;

	for (int q_itr = 0; q_itr < 3; q_itr++) {
		q = queues[q_itr];

		if (q_itr == dest_q_itr || CF_Q_SZ(q) == 0) {
			continue;
		}

		for (uint32_t i = q->read_offset; i < q->write_offset; i++) {
			if (memcmp(CF_Q_ELEM_PTR(q, i), ptr, q->element_sz) == 0) {
				// Move it to the queue with desired priority.
				cf_queue_delete_offset(q, i);
				cf_queue_push(queues[dest_q_itr], ptr);

				cf_queue_priority_unlock(priority_q);
				return CF_QUEUE_OK;
			}
		}
	}

	cf_queue_priority_unlock(priority_q);
	return CF_QUEUE_NOMATCH;
}
/* Processing reads when they return from aio_read */
static void process_read(as_async_info_t *info)
{
	if (!g_running) {
		return;
	}

	cf_atomic_int_decr(&g_read_reqs_queued);

	uint64_t stop_time = cf_getms();

	fd_put(info->p_readreq.p_device, info->fd);

	if (stop_time != -1) {
		histogram_insert_data_point(g_p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, stop_time));
		histogram_insert_data_point(g_p_read_histogram,
				safe_delta_ms(info->p_readreq.start_time, stop_time));
		histogram_insert_data_point(info->p_readreq.p_device->p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, stop_time));
	}

	if (g_use_valloc && info->p_buffer) {
		free(info->p_buffer);
	}

	uintptr_t temp = (uintptr_t)info;
	cf_queue_push(async_info_queue, (void*)&temp);
}
void cl_cluster_scan_shutdown(cl_cluster* asc)
{
	// Check whether we ever (lazily) initialized scan machinery.
	if (cf_atomic32_get(asc->scan_initialized) == 0 && ! asc->scan_q) {
		return;
	}

	// This tells the worker threads to stop. We do this (instead of using a
	// "running" flag) to allow the workers to "wait forever" on processing the
	// work dispatch queue, which has minimum impact when the queue is empty.
	// This also means all queued requests get processed when shutting down.
	for (int i = 0; i < NUM_SCAN_THREADS; i++) {
		cl_scan_task task;

		task.asc = NULL;
		cf_queue_push(asc->scan_q, &task);
	}

	for (int i = 0; i < NUM_SCAN_THREADS; i++) {
		pthread_join(asc->scan_threads[i], NULL);
	}

	cf_queue_destroy(asc->scan_q);
	asc->scan_q = NULL;
	cf_atomic32_set(&asc->scan_initialized, 0);
}
//
// Close async worker threads gracefully.
//
void citrusleaf_async_shutdown()
{
	if (g_cl_async_q == 0)
		return;

	/*
	 * When a process forks, its threads are not duplicated in the child.
	 * citrusleaf_init() records the id of the process that spawned the
	 * background threads (g_init_pid). If the current process is not that
	 * process, it must not call pthread_join() on threads that do not exist
	 * in this process.
	 */
	if (g_init_pid == getpid()) {
		// Send shutdown message to each worker thread.
		cl_async_work *workitem = malloc(sizeof(cl_async_work));
		memset(workitem, 0, sizeof(cl_async_work));
		workitem->fd = -1;

		uint i;

		for (i = 0; i < g_async_num_threads; i++) {
			cf_queue_push(g_cl_async_q, &workitem);
		}

		for (i = 0; i < g_async_num_threads; i++) {
			pthread_join(g_async_reciever[i], NULL);
		}

		free(workitem);
		cf_queue_destroy(g_cl_async_q);
		g_cl_async_q = 0;
	}
}
void releaseDigArrToQueue(void *v)
{
	dig_arr_t *dt = (dig_arr_t *)v;

	if (cf_queue_sz(g_q_dig_arr) < DIG_ARRAY_QUEUE_HIGHWATER) {
		cf_queue_push(g_q_dig_arr, &dt);
	}
	else {
		cf_free(dt);
	}
}
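/*
 * For symmetry, a sketch of the allocation-side helper such a pool usually
 * pairs with: reuse a recycled dig_arr_t when one is available, otherwise
 * allocate a fresh one. The function name and the 'num' count field are
 * assumptions for illustration, not the actual client code.
 */
dig_arr_t *getDigArrFromQueue(void)
{
	dig_arr_t *dt;

	// Non-blocking pop - fall through to a fresh allocation on an empty pool.
	if (cf_queue_pop(g_q_dig_arr, &dt, CF_QUEUE_NOWAIT) != CF_QUEUE_OK) {
		dt = cf_malloc(sizeof(dig_arr_t));
	}

	if (dt) {
		dt->num = 0;	// assumed element-count field, reset before reuse
	}

	return dt;
}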
static void as_scan_worker(void* data)
{
	as_scan_task* task = (as_scan_task*)data;

	as_scan_complete_task complete_task;
	complete_task.node = task->node;
	complete_task.task_id = task->task_id;
	complete_task.result = as_scan_command_execute(task);

	cf_queue_push(task->complete_q, &complete_task);
}
int cf_queue_priority_push(cf_queue_priority *q, const void *ptr, int pri)
{
	cf_queue_priority_lock(q);

	int rv = CF_QUEUE_ERR;

	if (pri == CF_QUEUE_PRIORITY_HIGH) {
		rv = cf_queue_push(q->high_q, ptr);
	}
	else if (pri == CF_QUEUE_PRIORITY_MEDIUM) {
		rv = cf_queue_push(q->medium_q, ptr);
	}
	else if (pri == CF_QUEUE_PRIORITY_LOW) {
		rv = cf_queue_push(q->low_q, ptr);
	}

	if (rv == 0 && q->threadsafe) {
		pthread_cond_signal(&q->CV);
	}

	cf_queue_priority_unlock(q);
	return rv;
}
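/*
 * A usage sketch for the priority queue API above, assuming the companion
 * cf_queue_priority_create(element_sz, threadsafe),
 * cf_queue_priority_pop(q, buf, wait_ms) and cf_queue_priority_destroy(q)
 * calls from the same library; treat those exact signatures as assumptions.
 */
void priority_push_example(void)
{
	cf_queue_priority *q = cf_queue_priority_create(sizeof(int), true);

	int low = 1;
	int high = 2;

	cf_queue_priority_push(q, &low, CF_QUEUE_PRIORITY_LOW);
	cf_queue_priority_push(q, &high, CF_QUEUE_PRIORITY_HIGH);

	int out;

	// The high-priority element is drained first - 'out' is 2 here.
	cf_queue_priority_pop(q, &out, CF_QUEUE_NOWAIT);

	cf_queue_priority_destroy(q);
}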
int cf_queue_priority_push(cf_queue_priority *q, void *ptr, int pri)
{
	QUEUE_LOCK(q);

	int rv;

	if (pri == CF_QUEUE_PRIORITY_HIGH)
		rv = cf_queue_push(q->high_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_MEDIUM)
		rv = cf_queue_push(q->medium_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_LOW)
		rv = cf_queue_push(q->low_q, ptr);
	else {
		rv = -1;
	}

#ifndef EXTERNAL_LOCKS
	if (rv == 0 && q->threadsafe)
		pthread_cond_signal(&q->CV);
#endif

	QUEUE_UNLOCK(q);

	return(rv);
}
int cf_queue_priority_push(cf_queue_priority *q, void *ptr, int pri)
{
	if (q->threadsafe && (0 != pthread_mutex_lock(&q->LOCK)))
		return(-1);

	int rv;

	if (pri == CF_QUEUE_PRIORITY_HIGH)
		rv = cf_queue_push(q->high_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_MEDIUM)
		rv = cf_queue_push(q->medium_q, ptr);
	else if (pri == CF_QUEUE_PRIORITY_LOW)
		rv = cf_queue_push(q->low_q, ptr);
	else {
		rv = -1;
	}

	if (rv == 0 && q->threadsafe)
		pthread_cond_signal(&q->CV);

	if (q->threadsafe && (0 != pthread_mutex_unlock(&q->LOCK)))
		return(-1);

	return(rv);
}
static void as_scan_worker(void* data)
{
	as_scan_task* task = (as_scan_task*)data;

	as_scan_complete_task complete_task;
	complete_task.node = task->node;
	complete_task.task_id = task->task_id;

	if (as_load_uint32(task->error_mutex) == 0) {
		complete_task.result = as_scan_command_execute(task);
	}
	else {
		complete_task.result = AEROSPIKE_ERR_SCAN_ABORTED;
	}

	cf_queue_push(task->complete_q, &complete_task);
}
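/*
 * A sketch of the consumer side of complete_q in this pattern: the dispatching
 * thread pops one as_scan_complete_task per node and keeps the first failure
 * it sees. The surrounding task setup is assumed rather than shown.
 */
static as_status wait_for_scan_workers(cf_queue *complete_q, uint32_t n_nodes)
{
	as_status status = AEROSPIKE_OK;

	for (uint32_t i = 0; i < n_nodes; i++) {
		as_scan_complete_task complete;

		cf_queue_pop(complete_q, &complete, CF_QUEUE_FOREVER);

		// Remember the first failure, but keep draining later completions.
		if (complete.result != AEROSPIKE_OK && status == AEROSPIKE_OK) {
			status = complete.result;
		}
	}

	return status;
}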
//
// Reduce the inner queues whose priorities differ from 'new_pri'. If the
// callback returns -1, move that element to the inner queue whose priority is
// 'new_pri' and return CF_QUEUE_OK. Returns CF_QUEUE_NOMATCH if the callback
// never triggers a move.
//
int cf_queue_priority_reduce_change(cf_queue_priority *priority_q, int new_pri, cf_queue_reduce_fn cb, void *udata)
{
	cf_queue_priority_lock(priority_q);

	cf_queue *queues[3];

	queues[0] = priority_q->high_q;
	queues[1] = priority_q->medium_q;
	queues[2] = priority_q->low_q;

	int dest_q_itr = CF_QUEUE_PRIORITY_HIGH - new_pri;
	cf_queue *q;

	for (int q_itr = 0; q_itr < 3; q_itr++) {
		q = queues[q_itr];

		if (q_itr == dest_q_itr || CF_Q_SZ(q) == 0) {
			continue;
		}

		for (uint32_t i = q->read_offset; i < q->write_offset; i++) {
			int rv = cb(CF_Q_ELEM_PTR(q, i), udata);

			if (rv == 0) {
				continue;
			}

			if (rv == -1) {
				// Found it - move to desired queue and return.
				uint8_t* buf = alloca(q->element_sz);

				memcpy(buf, CF_Q_ELEM_PTR(q, i), q->element_sz);
				cf_queue_delete_offset(q, i);
				cf_queue_push(queues[dest_q_itr], buf);

				cf_queue_priority_unlock(priority_q);
				return CF_QUEUE_OK;
			}
		}
	}

	cf_queue_priority_unlock(priority_q);
	return CF_QUEUE_NOMATCH;
}
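/*
 * An illustrative reduce callback for cf_queue_priority_reduce_change() above.
 * It returns -1 on the element that should be moved to the new priority and 0
 * otherwise. The queued element type (a transaction with an 'id') is an
 * assumption for the example.
 */
typedef struct example_tx_s {
	uint64_t id;
} example_tx;

static int match_tx_cb(void *buf, void *udata)
{
	const example_tx *element = (const example_tx *)buf;
	uint64_t wanted_id = *(const uint64_t *)udata;

	// -1 tells the reducer to move this element; 0 keeps scanning.
	return element->id == wanted_id ? -1 : 0;
}

// Usage - promote the transaction with id 42 to the high-priority queue:
// uint64_t id = 42;
// cf_queue_priority_reduce_change(priority_q, CF_QUEUE_PRIORITY_HIGH,
//         match_tx_cb, &id);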
//------------------------------------------------
// Runs in thr_add_readreqs, adds readreq objects
// to all read queues in an even, random spread.
//
static void* run_add_readreqs(void* pv_unused)
{
	uint64_t count = 0;

	while (g_running) {
		if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
			fprintf(stdout, "ERROR: too many read reqs queued\n");
			fprintf(stdout, "drive(s) can't keep up - test stopped\n");
			g_running = false;
			break;
		}

		uint32_t random_queue_index = rand_32() % g_num_queues;
		uint32_t random_device_index = g_queue_per_device ?
				random_queue_index : rand_32() % g_num_devices;

		device* p_random_device = &g_devices[random_device_index];
		readreq* p_readreq = malloc(sizeof(readreq));

		p_readreq->p_device = p_random_device;
		p_readreq->offset = random_read_offset(p_random_device);
		p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
		p_readreq->start_time = cf_getus();

		cf_queue_push(g_readqs[random_queue_index].p_req_queue, &p_readreq);

		count++;

		int sleep_us = (int)
				(((count * 1000000) / g_read_reqs_per_sec) -
						(cf_getus() - g_run_start_us));

		if (sleep_us > 0) {
			usleep((uint32_t)sleep_us);
		}

		if (sleep_us != 0) {
			fprintf(stdout, "%" PRIu64 ", sleep_us = %d\n", count, sleep_us);
		}
	}

	return (0);
}
void *cf_queue_test_1_write(void *arg)
{
	cf_queue *q = (cf_queue *) arg;

	for (int i = 0; i < TEST1_SZ; i++) {
		usleep(TEST1_INTERVAL * 1000);

		int rv = cf_queue_push(q, &i);

		if (0 != rv) {
			fprintf(stderr, "queue push failed: error %d", rv);
			return((void *)-1);
		}
	}

	return((void *)0);
}
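/*
 * A hedged sketch of a reader to pair with cf_queue_test_1_write() - it pops
 * TEST1_SZ elements and checks they arrive in order. This is the obvious
 * counterpart for illustration, not the actual test harness code.
 */
void *cf_queue_test_1_read(void *arg)
{
	cf_queue *q = (cf_queue *) arg;

	for (int i = 0; i < TEST1_SZ; i++) {
		int v = -1;

		// Block until the writer pushes the next element.
		if (0 != cf_queue_pop(q, &v, CF_QUEUE_FOREVER)) {
			fprintf(stderr, "queue pop failed\n");
			return((void *)-1);
		}

		if (v != i) {
			fprintf(stderr, "pop value error: expected %d got %d\n", i, v);
			return((void *)-1);
		}
	}

	return((void *)0);
}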
void *cl_scan_worker(void * pv_asc)
{
	cl_cluster* asc = (cl_cluster*)pv_asc;

	while (true) {
		// Response structure to be pushed onto the complete q.
		cl_node_response response;
		memset(&response, 0, sizeof(cl_node_response));

		cl_scan_task task;

		if ( 0 != cf_queue_pop(asc->scan_q, &task, CF_QUEUE_FOREVER) ) {
			LOG("[WARNING] cl_scan_worker: queue pop failed\n");
		}

		if ( cf_debug_enabled() ) {
			LOG("[DEBUG] cl_scan_worker: getting one task item\n");
		}

		// This is how scan shutdown signals we're done.
		if ( ! task.asc ) {
			break;
		}

		// Query if the node is still around.
		int rc = CITRUSLEAF_FAIL_UNAVAILABLE;

		cl_cluster_node * node = cl_cluster_node_get_byname(task.asc, task.node_name);

		if ( node ) {
			rc = cl_scan_worker_do(node, &task);
		}
		else {
			LOG("[INFO] cl_scan_worker: No node found with the name %s\n", task.node_name);
		}

		strncpy(response.node_name, task.node_name, strlen(task.node_name));
		response.node_response = rc;
		response.job_id = task.job_id;

		cf_queue_push(task.complete_q, (void *)&response);
	}

	return NULL;
}
static void create_async_info_queue()
{
	int i;
	uintptr_t info;
	as_async_info_t *temp_info;

	async_info_queue = cf_queue_create(sizeof(uintptr_t), true);

	async_info_array = (as_async_info_t*)malloc(MAX_READ_REQS_QUEUED * sizeof(as_async_info_t));

	if (async_info_array == NULL) {
		fprintf(stdout, "Error: Malloc info structs failed.\n Exiting. \n");
		cf_queue_destroy(async_info_queue);
		exit(-1);
	}

	for (i = 0; i < MAX_READ_REQS_QUEUED; i++) {
		temp_info = async_info_array + i;
		info = (uintptr_t)temp_info;
		cf_queue_push(async_info_queue, (void*)&info);
	}
}
void emigrate_queue_push(emigration *emig)
{
	cf_queue_push(&g_emigration_q, &emig);
}
static void* generate_async_reads(void* aio_context)
{
	uint64_t count = 0;

	while (g_running) {
		/* Create the struct of info needed at the process_read end */
		uintptr_t info_ptr;

		if (cf_queue_pop(async_info_queue, (void*)&info_ptr, CF_QUEUE_NOWAIT) !=
				CF_QUEUE_OK) {
			fprintf(stdout, "Error: Could not pop info struct \n");
			return (void*)(-1);
		}

		as_async_info_t *info = (as_async_info_t*)info_ptr;
		memset(info, 0, sizeof(as_async_info_t));

		/* Generate the actual read request */
		uint32_t random_device_index = rand_32() % g_num_devices;
		device* p_random_device = &g_devices[random_device_index];
		readreq* p_readreq = &(info->p_readreq);

		if (p_readreq == NULL) {
			fprintf(stdout, "Error: preadreq null \n");
			goto fail;
		}

		p_readreq->p_device = p_random_device;
		p_readreq->offset = random_read_offset(p_random_device);
		p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
		p_readreq->start_time = cf_getms();

		/* Async read */
		if (g_use_valloc) {
			uint8_t* p_buffer = cf_valloc(p_readreq->size);
			info->p_buffer = p_buffer;

			if (p_buffer) {
				uint64_t raw_start_time = cf_getms();
				info->raw_start_time = raw_start_time;

				if (read_async_from_device(info, *(aio_context_t *)aio_context) < 0) {
					fprintf(stdout, "Error: Async read failed \n");
					free(p_buffer);
					goto fail;
				}
			}
			else {
				fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
			}
		}
		else {
			uint8_t stack_buffer[p_readreq->size + 4096];
			uint8_t* p_buffer = align_4096(stack_buffer);
			info->p_buffer = p_buffer;

			uint64_t raw_start_time = cf_getms();
			info->raw_start_time = raw_start_time;

			if (read_async_from_device(info, *(aio_context_t*)aio_context) < 0) {
				fprintf(stdout, "Error: Async read failed \n");
				goto fail;
			}
		}

		if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
			fprintf(stdout, "ERROR: too many read reqs queued\n");
			fprintf(stdout, "drive(s) can't keep up - test stopped\n");
			g_running = false;
			return (void*)-1;
		}

		count++;

		int sleep_ms = (int)
				(((count * 1000) / g_read_reqs_per_sec) -
						(cf_getms() - g_run_start_ms));

		if (sleep_ms > 0) {
			usleep((uint32_t)sleep_ms * 1000);
		}

		continue;

		/* Rollback for failure */
	fail:
		if (info) {
			uintptr_t temp = (uintptr_t)info;
			cf_queue_push(async_info_queue, (void*)&temp);
		}
	}

	return (0);
}
cf_vector *cl_scan_execute(cl_cluster * cluster, const cl_scan * scan, char * node_name, cl_rv * res, int (* callback)(as_val *, void *), void * udata)
{
	cl_rv rc = CITRUSLEAF_OK;
	uint8_t wr_stack_buf[STACK_BUF_SZ] = { 0 };
	uint8_t * wr_buf = wr_stack_buf;
	size_t wr_buf_sz = sizeof(wr_stack_buf);
	int node_count = 0;
	cl_node_response response;

	rc = scan_compile(scan, &wr_buf, &wr_buf_sz);

	if ( rc != CITRUSLEAF_OK ) {
		LOG("[ERROR] cl_scan_execute: scan compile failed: \n");
		*res = rc;
		return NULL;
	}

	// Setup worker.
	cl_scan_task task = {
		.asc = cluster,
		.ns = scan->ns,
		.scan_buf = wr_buf,
		.scan_sz = wr_buf_sz,
		.udata = udata,
		.callback = callback,
		.job_id = scan->job_id,
		.type = scan->udf.type,
	};

	task.complete_q = cf_queue_create(sizeof(cl_node_response), true);
	cf_vector * result_v = NULL;

	// If node_name is not null, we are executing the scan on a particular node.
	if (node_name) {
		// Copy the node name into the task and push it onto the global scan
		// queue - one task for each node.
		strcpy(task.node_name, node_name);
		cf_queue_push(cluster->scan_q, &task);
		node_count = 1;
	}
	else {
		// Node name is NULL, we have to scan all nodes.
		char *node_names = NULL;

		// Get a list of the node names, so we can send work to each node.
		cl_cluster_get_node_names(cluster, &node_count, &node_names);

		if ( node_count == 0 ) {
			LOG("[ERROR] cl_scan_execute: don't have any nodes?\n");
			*res = CITRUSLEAF_FAIL_CLIENT;
			goto Cleanup;
		}

		// Dispatch work to the worker queue to allow the transactions in parallel.
		// NOTE: if a new node is introduced in the middle, it is NOT taken care of.
		node_name = node_names;

		for ( int i = 0; i < node_count; i++ ) {
			// Fill in per-request specifics.
			strcpy(task.node_name, node_name);
			cf_queue_push(cluster->scan_q, &task);
			node_name += NODE_NAME_SIZE;
		}

		free(node_names);
		node_names = NULL;
	}

	// Wait for the work to complete from all the nodes.
	// For every node, fill in the return value in the result vector.
	result_v = cf_vector_create(sizeof(cl_node_response), node_count, 0);

	for ( int i = 0; i < node_count; i++ ) {
		// Pop the response structure.
		cf_queue_pop(task.complete_q, &response, CF_QUEUE_FOREVER);
		cf_vector_append(result_v, &response);
	}

Cleanup:
	if ( wr_buf && (wr_buf != wr_stack_buf) ) {
		free(wr_buf);
		wr_buf = 0;
	}

	cf_queue_destroy(task.complete_q);

	return result_v;
}

/**
 * Allocates and initializes a new cl_scan.
 */
cl_scan *cl_scan_new(const char * ns, const char * setname, uint64_t *job_id)
{
	cl_scan * scan = (cl_scan*) malloc(sizeof(cl_scan));
	memset(scan, 0, sizeof(cl_scan));
	return cl_scan_init(scan, ns, setname, job_id);
}
//------------------------------------------------
// Recycle a safe file descriptor for a device.
//
static void fd_put(device* p_device, int fd)
{
	cf_queue_push(p_device->p_fd_queue, (void*)&fd);
}
// Put batch request on a separate batch queue.
int as_batch(as_transaction* tr)
{
	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);

	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);

	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return -1;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);

	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return -1;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = ! (msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);

	btr.ns = as_namespace_get_bymsgfield(nsfp);

	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));

	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return -1;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;

	uint8_t* digest_field_data = dfp->data;

	for (int i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	cf_atomic_int_incr(&g_config.batch_initiate);
	cf_queue_push(g_batch_queue, &btr);

	return 0;
}
// Same as do_the_full_monte, but only till the command is sent to the node.
// Most of the code is duplicated. Bad.
int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set,
		const cl_object *key, const cf_digest *digest, cl_bin **values, cl_operator operator,
		cl_operation **operations, int *n_values, uint32_t *cl_gen,
		const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata)
{
	cl_async_work *workitem = NULL;

	uint8_t wr_stack_buf[STACK_BUF_SZ];
	uint8_t *wr_buf = wr_stack_buf;
	size_t wr_buf_sz = sizeof(wr_stack_buf);

	int progress_timeout_ms;
	uint64_t deadline_ms;
	uint64_t starttime, endtime;
	bool network_error;
	int fd = -1;
	int rv = CITRUSLEAF_FAIL_CLIENT;	// Assume that this is a failure.

	// as_msg msg;
	cf_digest d_ret;
	cl_cluster_node *node = 0;

#if ONEASYNCFD
	if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) {
		//cf_error("Async hashtab is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#else
	// If the async buffer is at the max limit, do not entertain more requests.
	if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) {
		//cf_error("Async buffer is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#endif

	// Allocate memory for the work item that will be added to the async work list.
	if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) {
		cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER);
	}
	else {
		workitem = malloc(sizeof(cl_async_work));
		if (workitem == NULL) {
			return CITRUSLEAF_FAIL_CLIENT;
		}
	}

	// Compile the write buffer to be sent to the cluster.
	if (n_values && (values || operations)) {
		cl_compile(info1, info2, 0, ns, set, key, digest, values ? *values : NULL, operator,
				operations ? *operations : NULL, *n_values, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret,
				*trid, NULL, NULL, 0 /*udf_type*/);
	}
	else {
		cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz,
				cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
	}

	deadline_ms = 0;
	progress_timeout_ms = 0;

	if (cl_w_p && cl_w_p->timeout_ms) {
		deadline_ms = cf_getms() + cl_w_p->timeout_ms;
		// Policy: if asking for a long timeout, give enough time to try twice.
		if (cl_w_p->timeout_ms > 700) {
			progress_timeout_ms = cl_w_p->timeout_ms / 2;
		}
		else {
			progress_timeout_ms = cl_w_p->timeout_ms;
		}
	}
	else {
		progress_timeout_ms = g_async_nw_progress_timeout;
	}

	// Initialize the async work unit.
	workitem->trid = *trid;
	workitem->deadline = deadline_ms;
	workitem->starttime = cf_getms();
	workitem->udata = udata;

	as_msg *msgp;

	// Hate special cases, but we have to clear the verify bit on delete verify.
	if ((info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) {
		msgp = (as_msg *)wr_buf;
		msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY;
	}

	if (asc->compression_stat.compression_threshold > 0 &&
			wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) {
		/*
		 * Compression is enabled and the packet size is above the threshold -
		 * compress the data.
		 */
		uint8_t *compressed_buf = NULL;
		size_t compressed_buf_sz = 0;

		// Construct the packet for compressed data.
		cf_packet_compression(wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz);

		if (compressed_buf) {
			// If the original packet size is > 16k, cl_compile had allocated
			// memory for it - free that memory. cf_packet_compression will
			// allocate memory for the compressed packet.
			if (wr_buf != wr_stack_buf) {
				free(wr_buf);
			}

			// Update stats.
			citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz);

			wr_buf = compressed_buf;
			wr_buf_sz = compressed_buf_sz;
			//memcpy (wr_buf, compressed_buf, compressed_buf_sz);
			//wr_buf_sz = compressed_buf_sz;
			//free (compressed_buf);
		}
		else {
			// Compression failed, continue with the uncompressed packet.
			// Set compression stat.
			citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz);
		}
	}

	int try = 0;

	// Retry the request based on the write_policy.
	do {
		network_error = false;
		try++;

#ifdef DEBUG
		if (try > 1) {
			cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self());
		}
#endif

		// Get an FD from the cluster. First get the probable node for the given digest.
		node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? true : false);
		if (!node) {
#ifdef DEBUG
			cf_debug("warning: no healthy nodes in cluster, retrying");
#endif
			usleep(10000);	// Sleep for 10ms.
			goto Retry;
		}

		// Now get the dedicated async FD of this node.
		starttime = cf_getms();
		fd = cl_cluster_node_fd_get(node, true);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (fd == -1) {
#ifdef DEBUG
			cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",
					node->name, (uint64_t)pthread_self());
#endif
			usleep(1000);
			goto Retry;
		}

		// Send the command to the node.
		starttime = cf_getms();
		rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (rv != 0) {
			cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)",
					rv, fd, errno, (uint64_t)pthread_self());
			if (rv != ETIMEDOUT)
				network_error = true;
			goto Retry;
		}
		goto Ok;

Retry:
		if (network_error == true) {
			/*
			 * In case of async work (for XDS), it may be extreme to dun a node
			 * on a network error. We just clean things up and retry connecting
			 * to the remote cluster. The network error may be a transient one.
			 * As this is a network error, it is better to wait for some
			 * significant time before retrying.
			 */
			sleep(1);	// Sleep for 1 sec.
#if ONEASYNCFD
			// Do not close the FD.
#else
			cf_error("async sender: Closing the fd %d because of network error", fd);
			cf_close(fd);
			fd = -1;
#endif
		}

		if (fd != -1) {
			cf_error("async sender: Closing the fd %d because of retry", fd);
			cf_close(fd);
			fd = -1;
		}

		if (node) {
			cl_cluster_node_put(node);
			node = 0;
		}

		if (deadline_ms && (deadline_ms < cf_getms())) {
#ifdef DEBUG
			cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64,
					deadline_ms, cf_getms());
#endif
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		}
	} while ((cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY));

Error:
#ifdef DEBUG
	cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d",
			(int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0),
			(int)(deadline_ms - cf_getms()), rv);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

#if ONEASYNCFD
	// Do not close the FD.
#else
	// If it is a network error, the fd would be closed and set to -1.
	// So, we reach this place with a valid FD in case of timeout.
	if (fd != -1) {
		cf_error("async sender: Closing the fd %d because of timeout", fd);
		cf_close(fd);
	}
#endif

	return(rv);

Ok:
	/*
	 * We cannot release the node here as the async FD associated with this
	 * node may get closed. We should do it only when we get back the ack for
	 * the async command that we just did.
	 */

	// As we sent the command successfully, add it to the async work list.
	workitem->node = node;
	workitem->fd = fd;

	// We are storing only the pointer to the workitem.
#if ONEASYNCFD
	if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) {
		// This should always succeed.
		cf_error("Unable to add unique entry into the hash table");
	}
	cf_queue_push(node->asyncwork_q, &workitem);	// Also put it in the node's queue.
#else
	cf_queue_push(g_cl_async_q, &workitem);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

	rv = CITRUSLEAF_OK;
	return rv;
}

int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads)
{
	// int num_threads;

	if (0 == cf_atomic32_get(g_async_initialized)) {
		cf_error("Async client not initialized cannot reinit");
		return -1;
	}

	if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) {
		// Limit the threads to the max value even if the caller asks for more.
		num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS;
	}

	// If the number of threads is increased, create more threads.
	if (num_receiver_threads > g_async_num_threads) {
		unsigned int i;
		for (i = g_async_num_threads; i < num_receiver_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
	}
	else {
		// Else just reset the number - the extra async threads will kill themselves.
		cf_atomic32_set(&g_async_num_threads, num_receiver_threads);
	}

	cf_atomic32_set(&g_async_q_szlimit, size_limit);
	return ( 0 );
}

int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn)
{
	int i, num_threads;

	// Make sure that we do the initialization only once.
	if (1 == cf_atomic32_incr(&g_async_initialized)) {
		// Start the receiver threads.
		num_threads = num_receiver_threads;
		if (num_threads > MAX_ASYNC_RECEIVER_THREADS) {
			// Limit the threads to the max value even if the caller asks for more.
			num_threads = MAX_ASYNC_RECEIVER_THREADS;
		}

#if ONEASYNCFD
		g_async_h_szlimit = size_limit * 3;		// Max number of elements in the hash table.
		g_async_h_buckets = g_async_h_szlimit / 10;	// Number of buckets in the hash table.

		if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *),
				g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) {
			cf_error("Failed to initialize the async work hashtable");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}
#else
		// Create the work queue.
		g_async_q_szlimit = size_limit;
		if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to initialize the async work queue");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}

		for (i = 0; i < num_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
		g_async_num_threads = num_threads;
#endif

		if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to create memory pool for workitems");
			return -1;
		}

		g_fail_cb_fn = fail_cb_fn;
		g_success_cb_fn = success_cb_fn;

		// Initialize the stats.
		g_async_stats.retries = 0;
		g_async_stats.dropouts = 0;
	}

	return(0);
}
static void* async_receiver_fn(void *thdata)
{
	int rv = -1;
	bool network_error = false;
	cl_async_work *workitem = NULL;
	// cl_async_work *tmpworkitem = NULL;
	as_msg msg;
	cf_queue *q_to_use = NULL;
	cl_cluster_node *thisnode = NULL;

	uint8_t rd_stack_buf[STACK_BUF_SZ];
	uint8_t *rd_buf = rd_stack_buf;
	size_t rd_buf_sz = 0;

	uint64_t acktrid;
	// uint64_t starttime, endtime;
	int progress_timeout_ms;
	unsigned int thread_id = cf_atomic32_incr(&g_thread_count);

	if (thdata == NULL) {
		q_to_use = g_cl_async_q;
	}
	else {
		thisnode = (cl_cluster_node *)thdata;
		q_to_use = thisnode->asyncwork_q;
	}

	// Infinite loop which keeps picking work items from the list and tries to
	// find the end result.
	while (1) {
		network_error = false;

#if ONEASYNCFD
		if (thisnode->dunned == true) {
			do {
				rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);
				if (rv == CF_QUEUE_OK) {
					cl_cluster_node_put(thisnode);
					free(workitem);
				}
			} while (rv == CF_QUEUE_OK);

			// We want to delete all the workitems of this node.
			shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
			break;
		}
#endif

		// This call will block if there is no element in the queue.
		cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
		// TODO: What if the node gets dunned while this pop call is blocked?

#if ONEASYNCFD
		//cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
		//		cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

		// If we have no progress in 50ms, we should move to the next workitem
		// and revisit this workitem at a later stage.
		progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

		// Read into this fine cl_msg, which is the short header.
		rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg),
				workitem->deadline, progress_timeout_ms);
		if (rv) {
#if DEBUG
			cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
#endif
			if (rv != ETIMEDOUT) {
				cf_error("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
				network_error = true;
				goto Error;
			}
			else {
				goto Retry;
			}
		}

#ifdef DEBUG_VERBOSE
		dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif

		cl_proto_swap(&msg.proto);
		cl_msg_swap_header(&msg.m);

		// Second read for the remainder of the message.
		rd_buf_sz = msg.proto.sz - msg.m.header_sz;
		if (rd_buf_sz > 0) {
			if (rd_buf_sz > sizeof(rd_stack_buf)) {
				rd_buf = malloc(rd_buf_sz);
				if (!rd_buf) {
					cf_error("malloc fail: trying %zu", rd_buf_sz);
					rv = -1;
					goto Error;
				}
			}

			rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz,
					workitem->deadline, progress_timeout_ms);
			if (rv) {
				// We already read part of the message but failed to read the
				// remaining data for whatever reason (network error or
				// timeout). We cannot re-read as we already read partial data.
				// Declare this as an error.
				cf_error("Timeout after reading the header but before reading the body");
				goto Error;
			}

#ifdef DEBUG_VERBOSE
			dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif
		}

		rv = CITRUSLEAF_OK;
		goto Ok;

Retry:
		// We are trying to postpone the reading.
		if (workitem->deadline && workitem->deadline < cf_getms()) {
			cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
					workitem->deadline, cf_getms());
			//cf_error("async receiver: Workitem missed the final deadline");
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		}
		else {
			// We have time. Push the element back onto the queue to be
			// considered later.
			cf_queue_push(q_to_use, &workitem);
		}

		// If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		cf_atomic_int_incr(&g_async_stats.retries);

		continue;

Error:
		if (network_error == true) {
			/*
			 * In case of async work (for XDS), it may be extreme to dun a node
			 * on a network error. We just clean things up and retry connecting
			 * to the remote cluster. The network error may be a transient one.
			 */
		}

#if ONEASYNCFD
		// Do not close the FD.
#else
		// We do not know the state of the FD. It may have pending data to be
		// read. We cannot reuse the FD, so close it to be on the safe side.
		cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
		cf_close(workitem->fd);
		workitem->fd = -1;
#endif

		cf_atomic_int_incr(&g_async_stats.dropouts);
		// Continue down with what we do during an Ok.

		// Inform the caller that there is no response from the server for this
		// workitem. No response does not mean that the work is not done. The
		// work might be successfully completed on the server side, we just
		// didn't get a response for it.
		if (g_fail_cb_fn) {
			g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
		}

Ok:
		// rd_buf may not be there during an error condition.
		if (rd_buf && (rv == CITRUSLEAF_OK)) {
			// As of now, async functionality is there only for the put call.
			// In the put call, we do not get anything back other than the trid
			// field. So, just pass a variable to get back the trid and ignore
			// the others.
			if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
				rv = CITRUSLEAF_FAIL_UNKNOWN;
			}
			else {
				rv = msg.m.result_code;
				if (workitem->trid != acktrid) {
#if ONEASYNCFD
					// It is likely that we may get a response for a different
					// trid. Just delete the correct one from the queue and put
					// the current workitem back in the queue.
					shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
					cf_queue_delete(q_to_use, &tmpworkitem, true);
					cf_queue_push(q_to_use, &workitem);
					// From now on workitem will be the one for which we got the ack.
					workitem = tmpworkitem;
#endif
#ifdef DEBUG
					cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
							workitem->trid, acktrid, workitem->fd);
#endif
				}
			}

			if (g_success_cb_fn) {
				g_success_cb_fn(workitem->udata, rv, workitem->starttime);
			}
		}

		// Remember to put the FD back into the pool, if it is re-usable.
		if (workitem->fd != -1) {
			cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
		}

		// Also decrement the reference count for this node.
		cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
		// Delete the item from the global hashtable.
		if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) {
#if DEBUG
			cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid);
#endif
		}
#endif

		// Push it back into the free pool. If the attempt fails, free it.
		if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
			free(workitem);
		}

		// If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		// Kick this thread out if its ID is greater than the total.
		if (thread_id > cf_atomic32_get(g_async_num_threads)) {
			cf_atomic32_decr(&g_thread_count);
			return NULL;
		}
	} // The infinite loop.

	return NULL;
}
// Keep track of the connections, since they're precious. Kill anything that
// hasn't been used in a while. The file handle array keeps a reference count,
// and allows a reaper to run through and find the ones to reap. The table is
// only written by the demarshal threads, and only read by the reaper thread.
void *thr_demarshal_reaper_fn(void *arg)
{
	uint64_t last = cf_getms();

	while (true) {
		uint64_t now = cf_getms();
		uint inuse_cnt = 0;
		uint64_t kill_ms = g_config.proto_fd_idle_ms;
		bool refresh = false;

		if (now - last > (uint64_t)(g_config.sec_cfg.privilege_refresh_period * 1000)) {
			refresh = true;
			last = now;
		}

		pthread_mutex_lock(&g_file_handle_a_LOCK);

		for (int i = 0; i < g_file_handle_a_sz; i++) {
			if (g_file_handle_a[i]) {
				as_file_handle *fd_h = g_file_handle_a[i];

				if (refresh) {
					as_security_refresh(fd_h);
				}

				// Reap, if asked to.
				if (fd_h->reap_me) {
					cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
				}
				// Reap if past kill time.
				else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) {
					if (fd_h->fh_info & FH_INFO_DONOT_REAP) {
						cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", fd_h->fd);
						inuse_cnt++;
						continue;
					}

					shutdown(fd_h->fd, SHUT_RDWR); // will trigger epoll errors
					cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
					cf_atomic_int_incr(&g_config.reaper_count);
				}
				else {
					inuse_cnt++;
				}
			}
		}

		pthread_mutex_unlock(&g_file_handle_a_LOCK);

		if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) {
			cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse",
					g_file_handle_a_sz, inuse_cnt);
		}

		// Validate the system statistics.
		if (g_config.proto_connections_opened - g_config.proto_connections_closed != inuse_cnt) {
			cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %d in stats vs %d calculated",
					g_config.proto_connections_opened - g_config.proto_connections_closed,
					inuse_cnt);
		}

		sleep(1);
	}

	return NULL;
}