int cf_queue_priority_pop(cf_queue_priority *q, void *buf, int ms_wait)
{
    if (q->threadsafe && (0 != pthread_mutex_lock(&q->LOCK))) {
        return(-1);
    }

    struct timespec tp;

    if (ms_wait > 0) {
        clock_gettime(CLOCK_REALTIME, &tp);
        tp.tv_sec += ms_wait / 1000;
        tp.tv_nsec += (ms_wait % 1000) * 1000000;

        if (tp.tv_nsec > 1000000000) {
            tp.tv_nsec -= 1000000000;
            tp.tv_sec++;
        }
    }

    if (q->threadsafe) {
        while (CF_Q_PRI_EMPTY(q)) {
            if (CF_QUEUE_FOREVER == ms_wait) {
                pthread_cond_wait(&q->CV, &q->LOCK);
            }
            else if (CF_QUEUE_NOWAIT == ms_wait) {
                pthread_mutex_unlock(&q->LOCK);
                return(CF_QUEUE_EMPTY);
            }
            else {
                pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);

                if (CF_Q_PRI_EMPTY(q)) {
                    pthread_mutex_unlock(&q->LOCK);
                    return(CF_QUEUE_EMPTY);
                }
            }
        }
    }

    int rv;

    if (CF_Q_SZ(q->high_q))
        rv = cf_queue_pop(q->high_q, buf, 0);
    else if (CF_Q_SZ(q->medium_q))
        rv = cf_queue_pop(q->medium_q, buf, 0);
    else if (CF_Q_SZ(q->low_q))
        rv = cf_queue_pop(q->low_q, buf, 0);
    else
        rv = CF_QUEUE_EMPTY;

    if (q->threadsafe && (0 != pthread_mutex_unlock(&q->LOCK))) {
        return(-1);
    }

    return(rv);
}
//------------------------------------------------
// Runs in every thread of every read queue, pops
// readreq objects, does the read and reports the
// read transaction duration.
//
static void* run_reads(void* pv_req_queue)
{
    cf_queue* p_req_queue = (cf_queue*)pv_req_queue;
    readreq* p_readreq;

    while (g_running) {
        if (cf_queue_pop(p_req_queue, (void*)&p_readreq, 100) != CF_QUEUE_OK) {
            continue;
        }

        if (g_use_valloc) {
            uint8_t* p_buffer = cf_valloc(p_readreq->size);

            if (p_buffer) {
                read_and_report(p_readreq, p_buffer);
                free(p_buffer);
            }
            else {
                fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
            }
        }
        else {
            uint8_t stack_buffer[p_readreq->size + 4096];
            uint8_t* p_buffer = align_4096(stack_buffer);

            read_and_report(p_readreq, p_buffer);
        }

        free(p_readreq);
        cf_atomic_int_decr(&g_read_reqs_queued);
    }

    return (0);
}
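//------------------------------------------------
// The snippet above relies on align_4096() to round a stack buffer up to a
// 4 KiB boundary, as required for O_DIRECT reads. A minimal sketch of such a
// helper, assuming simple pointer rounding (an illustration, not the original
// tool's implementation):
//
static uint8_t* align_4096(uint8_t* stack_buffer)
{
    // Round the address up to the next multiple of 4096.
    return (uint8_t*)(((uintptr_t)stack_buffer + 4095) & ~(uintptr_t)4095);
}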
// Process one queue's batch requests.
void* batch_process_queue(void* q_to_wait_on)
{
    cf_queue* worker_queue = (cf_queue*)q_to_wait_on;
    batch_transaction btr;
    uint64_t start;

    while (1) {
        if (cf_queue_pop(worker_queue, &btr, CF_QUEUE_FOREVER) != 0) {
            cf_crash(AS_BATCH, "Failed to pop from batch worker queue.");
        }

        // Check for timeouts.
        if (btr.end_time != 0 && cf_getns() > btr.end_time) {
            cf_atomic_int_incr(&g_config.batch_timeout);

            if (btr.fd_h) {
                as_msg_send_reply(btr.fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
                        0, 0, 0, 0, 0, 0, 0, btr.trid, NULL);
                btr.fd_h = 0;
            }

            batch_transaction_done(&btr);
            continue;
        }

        // Process batch request.
        start = cf_getns();
        batch_process_request(&btr);
        histogram_insert_data_point(g_config.batch_q_process_hist, start);
    }

    return 0;
}
//------------------------------------------------
// Close all file descriptors for a device.
//
static void fd_close_all(device* p_device)
{
    int fd;

    while (cf_queue_pop(p_device->p_fd_queue, (void*)&fd, CF_QUEUE_NOWAIT) ==
            CF_QUEUE_OK) {
        close(fd);
    }
}
int cf_queue_priority_pop(cf_queue_priority *q, void *buf, int ms_wait)
{
    cf_queue_priority_lock(q);

    struct timespec tp;

    if (ms_wait > 0) {
        cf_set_wait_timespec(ms_wait, &tp);
    }

    if (q->threadsafe) {
        while (CF_Q_PRI_EMPTY(q)) {
            if (CF_QUEUE_FOREVER == ms_wait) {
                pthread_cond_wait(&q->CV, &q->LOCK);
            }
            else if (CF_QUEUE_NOWAIT == ms_wait) {
                pthread_mutex_unlock(&q->LOCK);
                return CF_QUEUE_EMPTY;
            }
            else {
                pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);

                if (CF_Q_PRI_EMPTY(q)) {
                    pthread_mutex_unlock(&q->LOCK);
                    return CF_QUEUE_EMPTY;
                }
            }
        }
    }

    int rv = CF_QUEUE_EMPTY;

    if (CF_Q_SZ(q->high_q)) {
        rv = cf_queue_pop(q->high_q, buf, 0);
    }
    else if (CF_Q_SZ(q->medium_q)) {
        rv = cf_queue_pop(q->medium_q, buf, 0);
    }
    else if (CF_Q_SZ(q->low_q)) {
        rv = cf_queue_pop(q->low_q, buf, 0);
    }

    cf_queue_priority_unlock(q);

    return rv;
}
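//------------------------------------------------
// The refactored pop above assumes helpers that are not shown in these
// snippets. A minimal sketch of what they could look like, inferred from the
// inline logic of the first version (the names match the calls above, but the
// bodies here are assumptions and error handling is simplified):
//
static inline void cf_queue_priority_lock(cf_queue_priority *q)
{
    if (q->threadsafe) {
        pthread_mutex_lock(&q->LOCK);
    }
}

static inline void cf_queue_priority_unlock(cf_queue_priority *q)
{
    if (q->threadsafe) {
        pthread_mutex_unlock(&q->LOCK);
    }
}

// Convert a relative millisecond wait into the absolute CLOCK_REALTIME
// deadline that pthread_cond_timedwait() expects.
static inline void cf_set_wait_timespec(int ms_wait, struct timespec *tp)
{
    clock_gettime(CLOCK_REALTIME, tp);

    tp->tv_sec += ms_wait / 1000;
    tp->tv_nsec += (ms_wait % 1000) * 1000000;

    if (tp->tv_nsec > 999999999) {
        tp->tv_nsec -= 1000000000;
        tp->tv_sec++;
    }
}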
static dig_arr_t *
getDigestArray(void)
{
    dig_arr_t *dt;

    if (cf_queue_pop(g_q_dig_arr, &dt, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) {
        dt = cf_malloc(sizeof(dig_arr_t));
    }

    dt->num = 0;
    return dt;
}
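//------------------------------------------------
// A pool such as g_q_dig_arr only pays off if arrays are returned to it once
// the caller is done with them. A minimal sketch of the release side,
// assuming the array is pushed back and freed only if the push fails (the
// function name and the fallback policy are assumptions for illustration):
//
static void
releaseDigestArray(dig_arr_t *dt)
{
    if (cf_queue_push(g_q_dig_arr, &dt) == -1) {
        cf_free(dt);
    }
}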
void *
as_netio_th(void *q_to_wait_on)
{
    cf_queue *q = (cf_queue*)q_to_wait_on;

    while (true) {
        as_netio io;

        if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) {
            cf_crash(AS_PROTO, "Failed to pop from IO worker queue.");
        }

        if (io.slow) {
            usleep(g_config.proto_slow_netio_sleep_ms * 1000);
        }

        as_netio_send(&io, g_netio_slow_queue, false);
    }
}
//------------------------------------------------
// Get a safe file descriptor for a device.
//
static int fd_get(device* p_device)
{
    int fd = -1;

    if (cf_queue_pop(p_device->p_fd_queue, (void*)&fd, CF_QUEUE_NOWAIT) !=
            CF_QUEUE_OK) {
        fd = open(p_device->name, O_DIRECT | O_RDWR, S_IRUSR | S_IWUSR);

        if (fd == -1) {
            fprintf(stdout, "ERROR: open device %s\n", p_device->name);
        }
    }

    return (fd);
}
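//------------------------------------------------
// fd_get() and fd_close_all() imply a matching "put" that recycles a
// descriptor once an I/O completes. A minimal sketch, assuming descriptors
// are simply pushed back onto the per-device queue (the function name is an
// assumption for illustration):
//
static void fd_put(device* p_device, int fd)
{
    cf_queue_push(p_device->p_fd_queue, (void*)&fd);
}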
void as_node_destroy(as_node* node)
{
    // Drain out the queue and close the FDs
    int rv;

    do {
        int fd;
        rv = cf_queue_pop(node->conn_q, &fd, CF_QUEUE_NOWAIT);

        if (rv == CF_QUEUE_OK)
            cf_close(fd);
    } while (rv == CF_QUEUE_OK);

    /*
    do {
        int fd;
        rv = cf_queue_pop(node->conn_q_asyncfd, &fd, CF_QUEUE_NOWAIT);

        if (rv == CF_QUEUE_OK)
            cf_close(fd);
    } while (rv == CF_QUEUE_OK);
    */

    /*
    do {
        // When we reach this point, ideally there should not be any workitems.
        cl_async_work *aw;
        rv = cf_queue_pop(node->asyncwork_q, &aw, CF_QUEUE_NOWAIT);

        if (rv == CF_QUEUE_OK) {
            free(aw);
        }
    } while (rv == CF_QUEUE_OK);

    // We want to delete all the workitems of this node
    if (g_cl_async_hashtab) {
        shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, node);
    }
    */

    as_vector_destroy(&node->addresses);
    cf_queue_destroy(node->conn_q);
    //cf_queue_destroy(node->conn_q_asyncfd);
    //cf_queue_destroy(node->asyncwork_q);

    if (node->info_fd >= 0) {
        cf_close(node->info_fd);
    }

    cf_free(node);
}
void *
as_netio_th(void *q_to_wait_on)
{
    cf_queue *q = (cf_queue*)q_to_wait_on;

    while (true) {
        as_netio io;

        if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) {
            cf_crash(AS_PROTO, "Failed to pop from IO worker queue.");
        }

        if (io.slow) {
            usleep(g_config.proto_slow_netio_sleep_ms * 1000);
        }

        if (as_netio_send(&io, g_netio_slow_queue, false) != AS_NETIO_CONTINUE) {
            AS_RELEASE_FILE_HANDLE(io.fd_h);
            cf_buf_builder_free(io.bb_r);
        }
    }
}
int as_node_get_connection(as_node* node, int* fd)
{
    //cf_queue* q = asyncfd ? node->conn_q_asyncfd : node->conn_q;
    cf_queue* q = node->conn_q;

    while (1) {
        int rv = cf_queue_pop(q, fd, CF_QUEUE_NOWAIT);

        if (rv == CF_QUEUE_OK) {
            int rv2 = is_connected(*fd);

            switch (rv2) {
                case CONNECTED:
                    // It's still good.
                    return 0;

                case CONNECTED_BADFD:
                    // Local problem, don't try closing.
                    cf_warn("Found bad file descriptor in queue: fd %d", *fd);
                    break;

                case CONNECTED_NOT:
                    // Can't use it - the remote end closed it.
                case CONNECTED_ERROR:
                    // Some other problem, could have to do with remote end.
                default:
                    cf_close(*fd);
                    break;
            }
        }
        else if (rv == CF_QUEUE_EMPTY) {
            // We exhausted the queue. Try creating a fresh socket.
            return as_node_create_connection(node, fd);
        }
        else {
            cf_error("Bad return value from cf_queue_pop");
            *fd = -1;
            return CITRUSLEAF_FAIL_CLIENT;
        }
    }
}
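//------------------------------------------------
// The pool above is only useful if healthy sockets are returned to it after a
// transaction finishes. A minimal sketch of the companion "put", assuming a
// hypothetical per-node cap beyond which sockets are closed rather than
// queued (the function name and MAX_CONNS_PER_NODE are assumptions for
// illustration, not the client library's actual API):
//
#define MAX_CONNS_PER_NODE 300 // hypothetical cap, for illustration only

void as_node_put_connection(as_node* node, int fd)
{
    // If the pool has grown past the (hypothetical) limit, close the socket
    // instead of queuing it for reuse.
    if (cf_queue_sz(node->conn_q) >= MAX_CONNS_PER_NODE) {
        cf_close(fd);
        return;
    }

    cf_queue_push(node->conn_q, &fd);
}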
void * cl_scan_worker(void * pv_asc)
{
    cl_cluster* asc = (cl_cluster*)pv_asc;

    while (true) {
        // Response structure to be pushed in the complete q
        cl_node_response response;
        memset(&response, 0, sizeof(cl_node_response));

        cl_scan_task task;

        if ( 0 != cf_queue_pop(asc->scan_q, &task, CF_QUEUE_FOREVER) ) {
            LOG("[WARNING] cl_scan_worker: queue pop failed\n");
        }

        if ( cf_debug_enabled() ) {
            LOG("[DEBUG] cl_scan_worker: getting one task item\n");
        }

        // This is how scan shutdown signals we're done.
        if ( ! task.asc ) {
            break;
        }

        // query if the node is still around
        int rc = CITRUSLEAF_FAIL_UNAVAILABLE;
        cl_cluster_node * node = cl_cluster_node_get_byname(task.asc, task.node_name);

        if ( node ) {
            rc = cl_scan_worker_do(node, &task);
        }
        else {
            LOG("[INFO] cl_scan_worker: No node found with the name %s\n", task.node_name);
        }

        strncpy(response.node_name, task.node_name, strlen(task.node_name));
        response.node_response = rc;
        response.job_id = task.job_id;
        cf_queue_push(task.complete_q, (void *)&response);
    }

    return NULL;
}
void cl_scan_destroy(cl_scan *scan)
{
    if ( scan == NULL ) return;

    cl_scan_udf_destroy(&scan->udf);

    if (scan->ns)      free(scan->ns);
    if (scan->setname) free(scan->setname);

    if ( scan->res_streamq ) {
        as_val *val = NULL;

        while (CF_QUEUE_OK == cf_queue_pop(scan->res_streamq, &val, CF_QUEUE_NOWAIT)) {
            as_val_destroy(val);
            val = NULL;
        }

        cf_queue_destroy(scan->res_streamq);
        scan->res_streamq = NULL;
    }

    free(scan);
    scan = NULL;
}
void * cf_queue_test_1_read(void *arg)
{
    cf_queue *q = (cf_queue *) arg;

    for (int i = 0; i < TEST1_SZ; i++) {
        // sleep twice as long as the inserter, to test overflow
        usleep(TEST1_INTERVAL * 1000 * 2);

        int v = -1;
        int rv = cf_queue_pop(q, &v, CF_QUEUE_FOREVER);

        if (rv != CF_QUEUE_OK) {
            fprintf(stderr, "cf_queue_test1: pop error %d", rv);
            return((void *) -1);
        }

        if (v != i) {
            fprintf(stderr, "cf_queue_test1: pop value error: %d should be %d", v, i);
            return((void *) -1);
        }
    }

    return((void *) 0);
}
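//------------------------------------------------
// The reader above only makes sense alongside its producer. A minimal sketch
// of the writer side, assuming the same TEST1_SZ / TEST1_INTERVAL constants
// and one push per interval (the function name mirrors the reader but is an
// assumption, not copied from the original test):
//
void * cf_queue_test_1_write(void *arg)
{
    cf_queue *q = (cf_queue *) arg;

    for (int i = 0; i < TEST1_SZ; i++) {
        usleep(TEST1_INTERVAL * 1000);

        if (0 != cf_queue_push(q, &i)) {
            fprintf(stderr, "cf_queue_test1: push error on %d", i);
            return((void *) -1);
        }
    }

    return((void *) 0);
}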
static void* async_receiver_fn(void *thdata) { int rv = -1; bool network_error = false; cl_async_work *workitem = NULL; // cl_async_work *tmpworkitem = NULL; as_msg msg; cf_queue *q_to_use = NULL; cl_cluster_node *thisnode = NULL; uint8_t rd_stack_buf[STACK_BUF_SZ]; uint8_t *rd_buf = rd_stack_buf; size_t rd_buf_sz = 0; uint64_t acktrid; // uint64_t starttime, endtime; int progress_timeout_ms; unsigned int thread_id = cf_atomic32_incr(&g_thread_count); if (thdata == NULL) { q_to_use = g_cl_async_q; } else { thisnode = (cl_cluster_node *)thdata; q_to_use = thisnode->asyncwork_q; } //Infinite loop which keeps picking work items from the list and try to find the end result while(1) { network_error = false; #if ONEASYNCFD if(thisnode->dunned == true) { do { rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT); if (rv == CF_QUEUE_OK) { cl_cluster_node_put(thisnode); free(workitem); } } while (rv == CF_QUEUE_OK); //We want to delete all the workitems of this node shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode); break; } #endif //This call will block if there is no element in the queue cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER); //TODO: What if the node gets dunned while this pop call is blocked ? #if ONEASYNCFD //cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d", // cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab)); #endif // If we have no progress in 50ms, we should move to the next workitem // and revisit this workitem at a later stage progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT; // Read into this fine cl_msg, which is the short header rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms); if (rv) { #if DEBUG cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd); #endif if (rv != ETIMEDOUT) { cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd); network_error = true; goto Error; } else { goto Retry; } } #ifdef DEBUG_VERBOSE dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg)); #endif cl_proto_swap(&msg.proto); cl_msg_swap_header(&msg.m); // second read for the remainder of the message rd_buf_sz = msg.proto.sz - msg.m.header_sz; if (rd_buf_sz > 0) { if (rd_buf_sz > sizeof(rd_stack_buf)) { rd_buf = malloc(rd_buf_sz); if (!rd_buf) { cf_error("malloc fail: trying %zu",rd_buf_sz); rv = -1; goto Error; } } rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms); if (rv) { //We already read some part of the message before but failed to read the //remaining data for whatever reason (network error or timeout). We cannot //reread as we already read partial data. Declare this as error. cf_error("Timeout after reading the header but before reading the body"); goto Error; } #ifdef DEBUG_VERBOSE dump_buf("read body from cluster", rd_buf, rd_buf_sz); #endif } rv = CITRUSLEAF_OK; goto Ok; Retry: //We are trying to postpone the reading if (workitem->deadline && workitem->deadline < cf_getms()) { cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64, workitem->deadline, cf_getms()); //cf_error("async receiver: Workitem missed the final deadline"); rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } else { //We have time. Push the element back to the queue to be considered later cf_queue_push(q_to_use, &workitem); } //If we allocated memory in this loop, release it. 
if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } cf_atomic_int_incr(&g_async_stats.retries); continue; Error: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. */ } #if ONEASYNCFD //Do not close FD #else //We do not know the state of FD. It may have pending data to be read. //We cannot reuse the FD. So, close it to be on safe side. cf_error("async receiver: Closing the fd %d because of error", workitem->fd); cf_close(workitem->fd); workitem->fd = -1; #endif cf_atomic_int_incr(&g_async_stats.dropouts); //Continue down with what we do during an Ok //Inform the caller that there is no response from the server for this workitem. //No response does not mean that the work is not done. The work might be //successfully completed on the server side, we just didnt get response for it. if (g_fail_cb_fn) { g_fail_cb_fn(workitem->udata, rv, workitem->starttime); } Ok: //rd_buf may not be there during an error condition. if (rd_buf && (rv == CITRUSLEAF_OK)) { //As of now, async functionality is there only for put call. //In put call, we do not get anything back other than the trid field. //So, just pass variable to get back the trid and ignore others. if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) { rv = CITRUSLEAF_FAIL_UNKNOWN; } else { rv = msg.m.result_code; if (workitem->trid != acktrid) { #if ONEASYNCFD //It is likely that we may get response for a different trid. //Just delete the correct one from the queue //put back the current workitem back in the queue. shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem); cf_queue_delete(q_to_use, &tmpworkitem, true); cf_queue_push(q_to_use, &workitem); //From now on workitem will be the one for which we got ack workitem = tmpworkitem; #endif #ifdef DEBUG cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d", workitem->trid, acktrid, workitem->fd); #endif } } if (g_success_cb_fn) { g_success_cb_fn(workitem->udata, rv, workitem->starttime); } } //Remember to put back the FD into the pool, if it is re-usable. if (workitem->fd != -1) { cl_cluster_node_fd_put(workitem->node, workitem->fd, true); } //Also decrement the reference count for this node cl_cluster_node_put(workitem->node); #if ONEASYNCFD //Delete the item from the global hashtable if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK) { #if DEBUG cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid); #endif } #endif //Push it back into the free pool. If the attempt fails, free it. if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) { free(workitem); } //If we allocated memory in this loop, release it. if (rd_buf && (rd_buf != rd_stack_buf)) { free(rd_buf); } // Kick this thread out if its ID is greater than total if (thread_id > cf_atomic32_get(g_async_num_threads)) { cf_atomic32_decr(&g_thread_count); return NULL; } }//The infnite loop return NULL; }
cf_vector * cl_scan_execute(cl_cluster * cluster, const cl_scan * scan, char * node_name,
        cl_rv * res, int (* callback)(as_val *, void *), void * udata)
{
    cl_rv            rc = CITRUSLEAF_OK;
    uint8_t          wr_stack_buf[STACK_BUF_SZ] = { 0 };
    uint8_t *        wr_buf = wr_stack_buf;
    size_t           wr_buf_sz = sizeof(wr_stack_buf);
    int              node_count = 0;
    cl_node_response response;

    rc = scan_compile(scan, &wr_buf, &wr_buf_sz);

    if ( rc != CITRUSLEAF_OK ) {
        LOG("[ERROR] cl_scan_execute: scan compile failed: \n");
        *res = rc;
        return NULL;
    }

    // Setup worker
    cl_scan_task task = {
        .asc      = cluster,
        .ns       = scan->ns,
        .scan_buf = wr_buf,
        .scan_sz  = wr_buf_sz,
        .udata    = udata,
        .callback = callback,
        .job_id   = scan->job_id,
        .type     = scan->udf.type,
    };

    task.complete_q = cf_queue_create(sizeof(cl_node_response), true);
    cf_vector * result_v = NULL;

    // If node_name is not null, we are executing the scan on a particular node.
    if (node_name) {
        // Copy the node name into the task and push it onto the global scan
        // queue - one task for each node.
        strcpy(task.node_name, node_name);
        cf_queue_push(cluster->scan_q, &task);
        node_count = 1;
    }
    else {
        // Node name is NULL - we have to scan all nodes.
        char *node_names = NULL;

        // Get a list of the node names, so we can send work to each node.
        cl_cluster_get_node_names(cluster, &node_count, &node_names);

        if ( node_count == 0 ) {
            LOG("[ERROR] cl_scan_execute: don't have any nodes?\n");
            *res = CITRUSLEAF_FAIL_CLIENT;
            goto Cleanup;
        }

        // Dispatch work to the worker queue to allow the transactions in parallel.
        // NOTE: if a new node is introduced in the middle, it is NOT taken care of.
        node_name = node_names;

        for ( int i = 0; i < node_count; i++ ) {
            // fill in per-request specifics
            strcpy(task.node_name, node_name);
            cf_queue_push(cluster->scan_q, &task);
            node_name += NODE_NAME_SIZE;
        }

        free(node_names);
        node_names = NULL;
    }

    // Wait for the work to complete from all the nodes.
    // For every node, fill in the return value in the result vector.
    result_v = cf_vector_create(sizeof(cl_node_response), node_count, 0);

    for ( int i = 0; i < node_count; i++ ) {
        // Pop the response structure.
        cf_queue_pop(task.complete_q, &response, CF_QUEUE_FOREVER);
        cf_vector_append(result_v, &response);
    }

Cleanup:
    if ( wr_buf && (wr_buf != wr_stack_buf) ) {
        free(wr_buf);
        wr_buf = 0;
    }

    cf_queue_destroy(task.complete_q);

    return result_v;
}

/**
 * Allocates and initializes a new cl_scan.
 */
cl_scan * cl_scan_new(const char * ns, const char * setname, uint64_t *job_id)
{
    cl_scan * scan = (cl_scan*) malloc(sizeof(cl_scan));
    memset(scan, 0, sizeof(cl_scan));
    return cl_scan_init(scan, ns, setname, job_id);
}
static void* generate_async_reads(void* aio_context)
{
    uint64_t count = 0;

    while (g_running) {
        /* Create the struct of info needed at the process_read end */
        uintptr_t info_ptr;

        if (cf_queue_pop(async_info_queue, (void*)&info_ptr, CF_QUEUE_NOWAIT) !=
                CF_QUEUE_OK) {
            fprintf(stdout, "Error: Could not pop info struct \n");
            return (void*)(-1);
        }

        as_async_info_t *info = (as_async_info_t*)info_ptr;
        memset(info, 0, sizeof(as_async_info_t));

        /* Generate the actual read request */
        uint32_t random_device_index = rand_32() % g_num_devices;
        device* p_random_device = &g_devices[random_device_index];
        readreq* p_readreq = &(info->p_readreq);

        if (p_readreq == NULL) {
            fprintf(stdout, "Error: preadreq null \n");
            goto fail;
        }

        p_readreq->p_device = p_random_device;
        p_readreq->offset = random_read_offset(p_random_device);
        p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
        p_readreq->start_time = cf_getms();

        /* Async read */
        if (g_use_valloc) {
            uint8_t* p_buffer = cf_valloc(p_readreq->size);
            info->p_buffer = p_buffer;

            if (p_buffer) {
                uint64_t raw_start_time = cf_getms();
                info->raw_start_time = raw_start_time;

                if (read_async_from_device(info, *(aio_context_t *)aio_context) < 0) {
                    fprintf(stdout, "Error: Async read failed \n");
                    free(p_buffer);
                    goto fail;
                }
            }
            else {
                fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
            }
        }
        else {
            uint8_t stack_buffer[p_readreq->size + 4096];
            uint8_t* p_buffer = align_4096(stack_buffer);
            info->p_buffer = p_buffer;

            uint64_t raw_start_time = cf_getms();
            info->raw_start_time = raw_start_time;

            if (read_async_from_device(info, *(aio_context_t*)aio_context) < 0) {
                fprintf(stdout, "Error: Async read failed \n");
                goto fail;
            }
        }

        if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
            fprintf(stdout, "ERROR: too many read reqs queued\n");
            fprintf(stdout, "drive(s) can't keep up - test stopped\n");
            g_running = false;
            return (void*)-1;
        }

        count++;

        int sleep_ms = (int)
                (((count * 1000) / g_read_reqs_per_sec) -
                        (cf_getms() - g_run_start_ms));

        if (sleep_ms > 0) {
            usleep((uint32_t)sleep_ms * 1000);
        }

        continue;

        /* Rollback for failure */
    fail:
        if (info) {
            uintptr_t temp = (uintptr_t)info;
            cf_queue_push(async_info_queue, (void*)&temp);
        }
    }

    return (0);
}
static as_status
as_scan_generic(
    aerospike* as, as_error* err, const as_policy_scan* policy,
    const as_scan* scan, aerospike_scan_foreach_callback callback, void* udata,
    uint64_t* task_id_ptr)
{
    as_error_reset(err);

    if (! policy) {
        policy = &as->config.policies.scan;
    }

    as_cluster* cluster = as->cluster;
    as_nodes* nodes = as_nodes_reserve(cluster);
    uint32_t n_nodes = nodes->size;

    if (n_nodes == 0) {
        as_nodes_release(nodes);
        return as_error_set_message(err, AEROSPIKE_ERR_SERVER,
                "Scan command failed because cluster is empty.");
    }

    // Reserve each node in cluster.
    for (uint32_t i = 0; i < n_nodes; i++) {
        as_node_reserve(nodes->array[i]);
    }

    uint64_t task_id;

    if (task_id_ptr) {
        if (*task_id_ptr == 0) {
            *task_id_ptr = cf_get_rand64() / 2;
        }
        task_id = *task_id_ptr;
    }
    else {
        task_id = cf_get_rand64() / 2;
    }

    // Create scan command.
    as_buffer argbuffer;
    uint16_t n_fields = 0;
    size_t size = as_scan_command_size(scan, &n_fields, &argbuffer);
    uint8_t* cmd = as_command_init(size);
    size = as_scan_command_init(cmd, policy, scan, task_id, n_fields, &argbuffer);

    // Initialize task.
    uint32_t error_mutex = 0;
    as_scan_task task;
    task.cluster = as->cluster;
    task.policy = policy;
    task.scan = scan;
    task.callback = callback;
    task.udata = udata;
    task.err = err;
    task.error_mutex = &error_mutex;
    task.task_id = task_id;
    task.cmd = cmd;
    task.cmd_size = size;

    as_status status = AEROSPIKE_OK;

    if (scan->concurrent) {
        uint32_t n_wait_nodes = n_nodes;
        task.complete_q = cf_queue_create(sizeof(as_scan_complete_task), true);

        // Run node scans in parallel.
        for (uint32_t i = 0; i < n_nodes; i++) {
            // Stack allocate task for each node. It should be fine since the
            // task only needs to be valid within this function.
            as_scan_task* task_node = alloca(sizeof(as_scan_task));
            memcpy(task_node, &task, sizeof(as_scan_task));
            task_node->node = nodes->array[i];

            int rc = as_thread_pool_queue_task(&cluster->thread_pool, as_scan_worker, task_node);

            if (rc) {
                // Thread could not be added. Abort entire scan.
                if (ck_pr_fas_32(task.error_mutex, 1) == 0) {
                    status = as_error_update(task.err, AEROSPIKE_ERR_CLIENT,
                            "Failed to add scan thread: %d", rc);
                }

                // Reset node count to threads that were run.
                n_wait_nodes = i;
                break;
            }
        }

        // Wait for tasks to complete.
        for (uint32_t i = 0; i < n_wait_nodes; i++) {
            as_scan_complete_task complete;
            cf_queue_pop(task.complete_q, &complete, CF_QUEUE_FOREVER);

            if (complete.result != AEROSPIKE_OK && status == AEROSPIKE_OK) {
                status = complete.result;
            }
        }

        // Release temporary queue.
        cf_queue_destroy(task.complete_q);
    }
    else {
        task.complete_q = 0;

        // Run node scans in series.
        for (uint32_t i = 0; i < n_nodes && status == AEROSPIKE_OK; i++) {
            task.node = nodes->array[i];
            status = as_scan_command_execute(&task);
        }
    }

    // Release each node in cluster.
    for (uint32_t i = 0; i < n_nodes; i++) {
        as_node_release(nodes->array[i]);
    }

    // Release nodes array.
    as_nodes_release(nodes);

    // Free command memory.
    as_command_free(cmd, size);

    // If user aborts query, command is considered successful.
    if (status == AEROSPIKE_ERR_CLIENT_ABORT) {
        status = AEROSPIKE_OK;
    }

    // If completely successful, make the callback that signals completion.
    if (callback && status == AEROSPIKE_OK) {
        callback(NULL, udata);
    }

    return status;
}
// Set of threads which talk to client over the connection for doing the needful // processing. Note that once fd is assigned to a thread all the work on that fd // is done by that thread. Fair fd usage is expected of the client. First thread // is special - also does accept [listens for new connections]. It is the only // thread which does it. void * thr_demarshal(void *arg) { cf_socket_cfg *s, *ls; // Create my epoll fd, register in the global list. struct epoll_event ev; int nevents, i, n, epoll_fd; cf_clock last_fd_print = 0; #if defined(USE_SYSTEMTAP) uint64_t nodeid = g_config.self_node; #endif // Early stage aborts; these will cause faults in process scope. cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument"); s = &g_config.socket; ls = &g_config.localhost_socket; #ifdef USE_JEM int orig_arena; if (0 > (orig_arena = jem_get_arena())) { cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!"); } else { cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena); } #endif // Figure out my thread index. pthread_t self = pthread_self(); int thr_id; for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) { if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) break; } if (thr_id == MAX_DEMARSHAL_THREADS) { cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!"); return(0); } // First thread accepts new connection at interface socket. if (thr_id == 0) { demarshal_file_handle_init(); epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); memset(&ev, 0, sizeof (ev)); ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = s->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port); if (ls->sock) { ev.events = EPOLLIN | EPOLLERR | EPOLLHUP; ev.data.fd = ls->sock; if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev)) cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno)); cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port); } } else { epoll_fd = epoll_create(EPOLL_SZ); if (epoll_fd == -1) cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno)); } g_demarshal_args->epoll_fd[thr_id] = epoll_fd; cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id); int id_cntr = 0; // Demarshal transactions from the socket. for ( ; ; ) { struct epoll_event events[EPOLL_SZ]; cf_detail(AS_DEMARSHAL, "calling epoll"); nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1); if (0 > nevents) { cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno)); } cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents); uint64_t now_ns = cf_getns(); uint64_t now_ms = now_ns / 1000000; // Iterate over all events. for (i = 0; i < nevents; i++) { if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) { // Accept new connections on the service socket. int csocket = -1; struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); char cpaddr[64]; if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) { // This means we're out of file descriptors - could be a SYN // flood attack or misbehaving client. Eventually we'd like // to make the reaper fairer, but for now we'll just have to // ignore the accept error and move on. 
if ((errno == EMFILE) || (errno == ENFILE)) { if (last_fd_print != (cf_getms() / 1000L)) { cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit"); last_fd_print = cf_getms() / 1000L; } continue; } cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); } // Get the client IP address in string form. if (caddr.sin_family == AF_INET) { if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else if (caddr.sin_family == AF_INET6) { struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr; if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) { cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno); } } else { cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family); } cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket); // Validate the limit of protocol connections we allow. uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed; if (conns_open > g_config.n_proto_fd_max) { if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); last_fd_print = cf_getms(); } shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Set the socket to nonblocking. if (-1 == cf_socket_set_nonblocking(csocket)) { cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; continue; } // Create as_file_handle and queue it up in epoll_fd for further // communication on one of the demarshal threads. as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); if (!fd_h) { cf_crash(AS_DEMARSHAL, "malloc"); } sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port)); fd_h->fd = csocket; fd_h->last_used = cf_getms(); fd_h->reap_me = false; fd_h->trans_active = false; fd_h->proto = 0; fd_h->proto_unread = 0; fd_h->fh_info = 0; fd_h->security_filter = as_security_filter_create(); // Insert into the global table so the reaper can manage it. Do // this before queueing it up for demarshal threads - once // EPOLL_CTL_ADD is done it's difficult to back out (if insert // into global table fails) because fd state could be anything. cf_rc_reserve(fd_h); pthread_mutex_lock(&g_file_handle_a_LOCK); int j; bool inserted = true; if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { inserted = false; } else { g_file_handle_a[j] = fd_h; } pthread_mutex_unlock(&g_file_handle_a_LOCK); if (!inserted) { cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); shutdown(csocket, SHUT_RDWR); close(csocket); csocket = -1; cf_rc_free(fd_h); // will free even with ref-count of 2 } else { // Place the client socket in the event queue. memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ; ev.data.ptr = fd_h; // Round-robin pick up demarshal thread epoll_fd and add // this new connection to epoll. 
int id; while (true) { id = (id_cntr++) % g_demarshal_args->num_threads; if (g_demarshal_args->epoll_fd[id] != 0) { break; } } fd_h->epoll_fd = g_demarshal_args->epoll_fd[id]; if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) { cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads); pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); } else { cf_atomic_int_incr(&g_config.proto_connections_opened); } } } else { bool has_extra_ref = false; as_file_handle *fd_h = events[i].data.ptr; if (fd_h == 0) { cf_info(AS_DEMARSHAL, "event with null handle, continuing"); goto NextEvent; } cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events); // Process data on an existing connection: this might be more // activity on an already existing transaction, so we have some // state to manage. as_proto *proto_p = 0; int fd = fd_h->fd; if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events); // no longer in use: out of epoll etc goto NextEvent_FD_Cleanup; } if (fd_h->trans_active) { goto NextEvent; } // If pointer is NULL, then we need to create a transaction and // store it in the buffer. if (fd_h->proto == NULL) { as_proto proto; int sz; /* Get the number of available bytes */ if (-1 == ioctl(fd, FIONREAD, &sz)) { cf_info(AS_DEMARSHAL, "unable to get number of available bytes"); goto NextEvent_FD_Cleanup; } // If we don't have enough data to fill the message buffer, // just wait and we'll come back to this one. However, we'll // let messages with zero size through, since they are // likely errors. We don't cleanup the FD in this case since // we'll get more data on it. if (sz < sizeof(as_proto) && sz != 0) { goto NextEvent; } // Do a preliminary read of the header into a stack- // allocated structure, so that later on we can allocate the // entire message buffer. if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) { cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno); goto NextEvent_FD_Cleanup; } if (proto.version != PROTO_VERSION && // For backward compatibility, allow version 0 with // security messages. ! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) { cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", fd_h->client, proto.version); goto NextEvent_FD_Cleanup; } // Swap the necessary elements of the as_proto. as_proto_swap(&proto); if (proto.sz > PROTO_SIZE_MAX) { cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64, fd_h->client, PROTO_SIZE_MAX, proto.sz); goto NextEvent_FD_Cleanup; } #ifdef USE_JEM // Attempt to peek the namespace and set the JEMalloc arena accordingly. size_t peeked_data_sz = 0; size_t min_field_sz = sizeof(uint32_t) + sizeof(char); size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz; size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.) uint8_t peekbuf[peekbuf_sz]; if (PROTO_TYPE_AS_MSG == proto.type) { size_t offset = sizeof(as_msg); // Number of bytes to peek from the socket. // size_t peek_sz = peekbuf_sz; // Peak up to the size of the peek buffer. 
size_t peek_sz = MIN(proto.sz, peekbuf_sz); // Peek only up to the minimum necessary number of bytes. if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) { // That's actually legitimate. The as_proto may have gone into one // packet, the as_msg into the next one, which we haven't yet received. // This just "never happened" without async. cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz); } if (peeked_data_sz > min_as_msg_sz) { // cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz); if (peeked_data_sz > proto.sz) { cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd); log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz); goto NextEvent_FD_Cleanup; } if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) { jem_set_arena(orig_arena); } else { uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0; bool found = false; // cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields); while (!found && (field_num < n_fields)) { as_msg_field *field = (as_msg_field *) (&peekbuf[offset]); uint32_t value_sz = ntohl(field->field_sz) - 1; // cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset); // cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz); // cf_debug(AS_DEMARSHAL, "\ttype %d", field->type); if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) { if (value_sz >= AS_ID_NAMESPACE_SZ) { cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz); goto NextEvent_FD_Cleanup; } char ns[AS_ID_NAMESPACE_SZ]; found = true; memcpy(ns, field->data, value_sz); ns[value_sz] = '\0'; // cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num); jem_set_arena(as_namespace_get_jem_arena(ns)); } else { // cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type); field_num++; offset += sizeof(as_msg_field) + value_sz; if (offset >= peeked_data_sz) { break; } } } if (!found) { cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz); jem_set_arena(orig_arena); } } } else { jem_set_arena(orig_arena); } } else { jem_set_arena(orig_arena); } #endif // Allocate the complete message buffer. proto_p = cf_malloc(sizeof(as_proto) + proto.sz); cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno)); memcpy(proto_p, &proto, sizeof(as_proto)); #ifdef USE_JEM // Jam in the peeked data. if (peeked_data_sz) { memcpy(proto_p->data, &peekbuf, peeked_data_sz); } fd_h->proto_unread = proto_p->sz - peeked_data_sz; #else fd_h->proto_unread = proto_p->sz; #endif fd_h->proto = (void *) proto_p; } else { proto_p = fd_h->proto; } if (fd_h->proto_unread > 0) { // Read the data. n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0); if (0 >= n) { if (errno == EAGAIN) { continue; } cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno)); goto NextEvent_FD_Cleanup; } // Decrement bytes-unread counter. cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread); fd_h->proto_unread -= n; } // Check for a finished read. if (0 == fd_h->proto_unread) { // It's only really live if it's injecting a transaction. 
fd_h->last_used = now_ms; thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress fd_h->proto = 0; fd_h->proto_unread = 0; // INIT_TR as_transaction tr; as_transaction_init(&tr, NULL, (cl_msg *)proto_p); cf_rc_reserve(fd_h); has_extra_ref = true; tr.proto_fd_h = fd_h; tr.start_time = now_ns; // set transaction start time tr.preprocessed = false; if (! as_proto_is_valid_type(proto_p)) { cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); // We got a proto message type we don't recognize, so it // may not do any good to send back an as_msg error, but // it's the best we can do. At least we can keep the fd. as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } if (g_config.microbenchmarks) { histogram_insert_data_point(g_config.demarshal_hist, now_ns); tr.microbenchmark_time = cf_getns(); } // Check if it's compressed. if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { // Decompress it - allocate buffer to hold decompressed // packet. uint8_t *decompressed_buf = NULL; size_t decompressed_buf_size = 0; int rv = 0; if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv); cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Count the packets. cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1); // Free the compressed packet since we'll be using the // decompressed packet from now on. cf_free(proto_p); proto_p = NULL; // Get original packet. tr.msgp = (cl_msg *)decompressed_buf; as_proto_swap(&(tr.msgp->proto)); if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size); as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } } // Security protocol transactions. if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) { as_security_transact(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Info protocol requests. if (tr.msgp->proto.type == PROTO_TYPE_INFO) { if (as_info(&tr)) { cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer"); goto NextEvent_FD_Cleanup; } cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp); // Fast path for batch requests. if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) { as_batch_queue_task(&tr); cf_atomic_int_incr(&g_config.proto_transactions); goto NextEvent; } // Either process the transaction directly in this thread, // or queue it for processing by another thread (tsvc/info). if (0 != thr_tsvc_process_or_enqueue(&tr)) { cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread"); goto NextEvent_FD_Cleanup; } else { cf_atomic_int_incr(&g_config.proto_transactions); } } // Jump the proto message free & FD cleanup. If we get here, the // above operations went smoothly. The message free & FD cleanup // job is handled elsewhere as directed by // thr_tsvc_process_or_enqueue(). 
goto NextEvent; NextEvent_FD_Cleanup: // If we allocated memory for the incoming message, free it. if (proto_p) { cf_free(proto_p); fd_h->proto = 0; } // If fd has extra reference for transaction, release it. if (has_extra_ref) { cf_rc_release(fd_h); } // Remove the fd from the events list. if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) { cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)", fd, epoll_fd, errno, cf_strerror(errno)); } pthread_mutex_lock(&g_file_handle_a_LOCK); fd_h->reap_me = true; as_release_file_handle(fd_h); fd_h = 0; pthread_mutex_unlock(&g_file_handle_a_LOCK); NextEvent: ; } // We should never be canceled externally, but just in case... pthread_testcancel(); } } return NULL; }
//Same as do_the_full_monte, but only till the command is sent to the node. //Most of the code is duplicated. Bad. int cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key, const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata) { cl_async_work *workitem = NULL; uint8_t wr_stack_buf[STACK_BUF_SZ]; uint8_t *wr_buf = wr_stack_buf; size_t wr_buf_sz = sizeof(wr_stack_buf); int progress_timeout_ms; uint64_t deadline_ms; uint64_t starttime, endtime; bool network_error; int fd = -1; int rv = CITRUSLEAF_FAIL_CLIENT; //Assume that this is a failure; // as_msg msg; cf_digest d_ret; cl_cluster_node *node = 0; #if ONEASYNCFD if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) { //cf_error("Async hashtab is full. Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #else //If the async buffer is at the max limit, do not entertain more requests. if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) { //cf_error("Async buffer is full. Cannot insert any more elements"); return CITRUSLEAF_FAIL_ASYNCQ_FULL; } #endif //Allocate memory for work item that will be added to the async work list if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) { cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER); } else { workitem = malloc(sizeof(cl_async_work)); if (workitem == NULL) { return CITRUSLEAF_FAIL_CLIENT; } } //Compile the write buffer to be sent to the cluster if (n_values && ( values || operations) ){ cl_compile(info1, info2, 0, ns, set, key, digest, values?*values:NULL, operator, operations?*operations:NULL, *n_values , &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); }else{ cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/); } deadline_ms = 0; progress_timeout_ms = 0; if (cl_w_p && cl_w_p->timeout_ms) { deadline_ms = cf_getms() + cl_w_p->timeout_ms; // policy: if asking for a long timeout, give enough time to try twice if (cl_w_p->timeout_ms > 700) { progress_timeout_ms = cl_w_p->timeout_ms / 2; } else { progress_timeout_ms = cl_w_p->timeout_ms; } } else { progress_timeout_ms = g_async_nw_progress_timeout; } //Initialize the async work unit workitem->trid = *trid; workitem->deadline = deadline_ms; workitem->starttime = cf_getms(); workitem->udata = udata; as_msg *msgp; // Hate special cases, but we have to clear the verify bit on delete verify if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY)) { msgp = (as_msg *)wr_buf; msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY; } if (asc->compression_stat.compression_threshold > 0 && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold) { /* Compression is enabled. * Packet size is above threshold. * Compress the data */ uint8_t *compressed_buf = NULL; size_t compressed_buf_sz = 0; // Contstruct packet for compressed data. cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz); if (compressed_buf) { // If original packet size is > 16k, cl_compile had allocated memory for it. // Free that memory. // cf_packet_compression will allocate memory for compressed packet if (wr_buf != wr_stack_buf) { free(wr_buf); } // Update stats. 
citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz); wr_buf = compressed_buf; wr_buf_sz = compressed_buf_sz; //memcpy (wr_buf, compressed_buf, compressed_buf_sz); //wr_buf_sz = compressed_buf_sz; //free (compressed_buf); } //else compression failed, continue with uncompressed packet else { // Set compression stat citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz); } } int try = 0; // retry request based on the write_policy do { network_error = false; try++; #ifdef DEBUG if (try > 1) { cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self()); } #endif // Get an FD from a cluster. First get the probable node for the given digest. node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? true : false); if (!node) { #ifdef DEBUG cf_debug("warning: no healthy nodes in cluster, retrying"); #endif usleep(10000); //Sleep for 10ms goto Retry; } // Now get the dedicated async FD of this node starttime = cf_getms(); fd = cl_cluster_node_fd_get(node, true); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime)); } if (fd == -1) { #ifdef DEBUG cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() ); #endif usleep(1000); goto Retry; } // Send the command to the node starttime = cf_getms(); rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms); endtime = cf_getms(); if ((endtime - starttime) > 10) { cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime)); } if (rv != 0) { cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)", rv,fd,errno,(uint64_t)pthread_self()); if (rv != ETIMEDOUT) network_error = true; goto Retry; } goto Ok; Retry: if (network_error == true) { /* * In case of Async work (for XDS), it may be extreme to * dun a node in case of network error. We just cleanup * things and retry to connect to the remote cluster. * The network error may be a transient one. As this is a * network error, its is better to wait for some significant * time before retrying. */ sleep(1); //Sleep for 1sec #if ONEASYNCFD //Do not close the FD #else cf_error("async sender: Closing the fd %d because of network error", fd); cf_close(fd); fd = -1; #endif } if (fd != -1) { cf_error("async sender: Closing the fd %d because of retry", fd); cf_close(fd); fd = -1; } if (node) { cl_cluster_node_put(node); node = 0; } if (deadline_ms && (deadline_ms < cf_getms() ) ) { #ifdef DEBUG cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms()); #endif rv = CITRUSLEAF_FAIL_TIMEOUT; goto Error; } } while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) ); Error: #ifdef DEBUG cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d", (int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), (int)(deadline_ms - cf_getms() ), rv ); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } #if ONEASYNCFD //Do not close the FD #else //If it is a network error, the fd would be closed and set to -1. //So, we reach this place with a valid FD in case of timeout. if (fd != -1) { cf_error("async sender: Closing the fd %d because of timeout", fd); cf_close(fd); } #endif return(rv); Ok: /* * We cannot release the node here as the asyc FD associated * with this node may get closed. We should do it only when * we got back the ack for the async command that we just did. 
*/ //As we sent the command successfully, add it to the async work list workitem->node = node; workitem->fd = fd; //We are storing only the pointer to the workitem #if ONEASYNCFD if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) { //This should always succeed. cf_error("Unable to add unique entry into the hash table"); } cf_queue_push(node->asyncwork_q, &workitem); //Also put in the node's q #else cf_queue_push(g_cl_async_q, &workitem); #endif if (wr_buf != wr_stack_buf) { free(wr_buf); } rv = CITRUSLEAF_OK; return rv; } int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads) { // int num_threads; if (0 == cf_atomic32_get(g_async_initialized)) { cf_error("Async client not initialized cannot reinit"); return -1; } if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS; } // If number of thread is increased create more threads if (num_receiver_threads > g_async_num_threads) { unsigned int i; for (i = g_async_num_threads; i < num_receiver_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } } else { // else just reset the number the async threads will kill themselves cf_atomic32_set(&g_async_num_threads, num_receiver_threads); } cf_atomic32_set(&g_async_q_szlimit , size_limit); return ( 0 ); } int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn) { int i, num_threads; //Make sure that we do the initialization only once if (1 == cf_atomic32_incr(&g_async_initialized)) { // Start the receiver threads num_threads = num_receiver_threads; if (num_threads > MAX_ASYNC_RECEIVER_THREADS) { //Limit the threads to the max value even if caller asks for it num_threads = MAX_ASYNC_RECEIVER_THREADS; } #if ONEASYNCFD g_async_h_szlimit = size_limit * 3; //Max number of elements in the hash table g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *), g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) { cf_error("Failed to initialize the async work hastable"); cf_atomic32_decr(&g_async_initialized); return -1; } #else // create work queue g_async_q_szlimit = size_limit; if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to initialize the async work queue"); cf_atomic32_decr(&g_async_initialized); return -1; } for (i=0; i<num_threads; i++) { pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL); } g_async_num_threads = num_threads; #endif if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) { cf_error("Failed to create memory pool for workitems"); return -1; } g_fail_cb_fn = fail_cb_fn; g_success_cb_fn = success_cb_fn; // Initialize the stats g_async_stats.retries = 0; g_async_stats.dropouts = 0; } return(0); }