Example #1
int
cf_queue_priority_pop(cf_queue_priority *q, void *buf, int ms_wait)
{
    if (q->threadsafe && (0 != pthread_mutex_lock(&q->LOCK)))
        return(-1);

    struct timespec tp;
    if (ms_wait > 0) {
        clock_gettime( CLOCK_REALTIME, &tp);
        tp.tv_sec += ms_wait / 1000;
        tp.tv_nsec += (ms_wait % 1000) * 1000000;
        if (tp.tv_nsec >= 1000000000) { // tv_nsec must stay in [0, 1e9)
            tp.tv_nsec -= 1000000000;
            tp.tv_sec++;
        }
    }

    if (q->threadsafe) {
        while (CF_Q_PRI_EMPTY(q)) {
            if (CF_QUEUE_FOREVER == ms_wait) {
                pthread_cond_wait(&q->CV, &q->LOCK);
            }
            else if (CF_QUEUE_NOWAIT == ms_wait) {
                pthread_mutex_unlock(&q->LOCK);
                return(CF_QUEUE_EMPTY);
            }
            else {
                pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);
                if (CF_Q_PRI_EMPTY(q)) {
                    pthread_mutex_unlock(&q->LOCK);
                    return(CF_QUEUE_EMPTY);
                }
            }
        }
    }

    int rv;
    if (CF_Q_SZ(q->high_q))
        rv = cf_queue_pop(q->high_q, buf, 0);
    else if (CF_Q_SZ(q->medium_q))
        rv = cf_queue_pop(q->medium_q, buf, 0);
    else if (CF_Q_SZ(q->low_q))
        rv = cf_queue_pop(q->low_q, buf, 0);
    else rv = CF_QUEUE_EMPTY;

    if (q->threadsafe && (0 != pthread_mutex_unlock(&q->LOCK)))
        return(-1);


    return(rv);
}
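For context, the three ms_wait modes drive everything above: CF_QUEUE_NOWAIT returns CF_QUEUE_EMPTY immediately, a positive value waits up to that many milliseconds, and CF_QUEUE_FOREVER blocks until an item arrives. A minimal consumer sketch, not from the original project, assuming the queue holds int items:

// Hypothetical caller showing the three wait modes of cf_queue_priority_pop().
static int
drain_one(cf_queue_priority *q)
{
    int item;

    // Non-blocking: comes back with CF_QUEUE_EMPTY if nothing is queued.
    if (CF_QUEUE_OK == cf_queue_priority_pop(q, &item, CF_QUEUE_NOWAIT)) {
        return item;
    }

    // Bounded wait: give producers up to 100 ms.
    if (CF_QUEUE_OK == cf_queue_priority_pop(q, &item, 100)) {
        return item;
    }

    // Unbounded wait: block until an item of any priority arrives.
    cf_queue_priority_pop(q, &item, CF_QUEUE_FOREVER);
    return item;
}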
Example #2
File: act.c Project: aanguss/act
//------------------------------------------------
// Runs in every thread of every read queue, pops
// readreq objects, does the read and reports the
// read transaction duration.
//
static void* run_reads(void* pv_req_queue) {
	cf_queue* p_req_queue = (cf_queue*)pv_req_queue;
	readreq* p_readreq;

	while (g_running) {
		if (cf_queue_pop(p_req_queue, (void*)&p_readreq, 100) != CF_QUEUE_OK) {
			continue;
		}

		if (g_use_valloc) {
			uint8_t* p_buffer = cf_valloc(p_readreq->size);

			if (p_buffer) {
				read_and_report(p_readreq, p_buffer);
				free(p_buffer);
			}
			else {
				fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
			}
		}
		else {
			uint8_t stack_buffer[p_readreq->size + 4096];
			uint8_t* p_buffer = align_4096(stack_buffer);

			read_and_report(p_readreq, p_buffer);
		}

		free(p_readreq);
		cf_atomic_int_decr(&g_read_reqs_queued);
	}

	return (0);
}
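align_4096() is used above but not shown. Since the stack buffer is over-allocated by 4096 bytes, a plausible reconstruction simply rounds the pointer up to the next 4 KiB boundary (the O_DIRECT reads in this project need aligned buffers):

#include <stdint.h>

// Hypothetical reconstruction of align_4096(): round a pointer up to the
// next 4096-byte boundary. The caller over-allocates by 4096 bytes
// (p_readreq->size + 4096 above), so the aligned pointer still covers
// the full requested size.
static uint8_t* align_4096(uint8_t* stack_buffer) {
	return (uint8_t*)(((uintptr_t)stack_buffer + 4095) & ~(uintptr_t)4095);
}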
Example #3
// Process one queue's batch requests.
void*
batch_process_queue(void* q_to_wait_on)
{
	cf_queue* worker_queue = (cf_queue*)q_to_wait_on;
	batch_transaction btr;
	uint64_t start;

	while (1) {
		if (cf_queue_pop(worker_queue, &btr, CF_QUEUE_FOREVER) != 0) {
			cf_crash(AS_BATCH, "Failed to pop from batch worker queue.");
		}

		// Check for timeouts.
		if (btr.end_time != 0 && cf_getns() > btr.end_time) {
			cf_atomic_int_incr(&g_config.batch_timeout);

			if (btr.fd_h) {
				as_msg_send_reply(btr.fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
						0, 0, 0, 0, 0, 0, 0, btr.trid, NULL);
				btr.fd_h = 0;
			}
			batch_transaction_done(&btr);
			continue;
		}

		// Process batch request.
		start = cf_getns();
		batch_process_request(&btr);
		histogram_insert_data_point(g_config.batch_q_process_hist, start);
	}

	return 0;
}
Example #4
File: act.c Project: aanguss/act
//------------------------------------------------
// Close all file descriptors for a device.
//
static void fd_close_all(device* p_device) {
	int fd;

	while (cf_queue_pop(p_device->p_fd_queue, (void*)&fd, CF_QUEUE_NOWAIT) ==
			CF_QUEUE_OK) {
		close(fd);
	}
}
Example #5
int cf_queue_priority_pop(cf_queue_priority *q, void *buf, int ms_wait)
{
	cf_queue_priority_lock(q);

	struct timespec tp;

	if (ms_wait > 0) {
		cf_set_wait_timespec(ms_wait, &tp);
	}

	if (q->threadsafe) {
		while (CF_Q_PRI_EMPTY(q)) {
			if (CF_QUEUE_FOREVER == ms_wait) {
				pthread_cond_wait(&q->CV, &q->LOCK);
			}
			else if (CF_QUEUE_NOWAIT == ms_wait) {
				pthread_mutex_unlock(&q->LOCK);
				return CF_QUEUE_EMPTY;
			}
			else {
				pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);

				if (CF_Q_PRI_EMPTY(q)) {
					pthread_mutex_unlock(&q->LOCK);
					return CF_QUEUE_EMPTY;
				}
			}
		}
	}

	int rv = CF_QUEUE_EMPTY;

	if (CF_Q_SZ(q->high_q)) {
		rv = cf_queue_pop(q->high_q, buf, 0);
	}
	else if (CF_Q_SZ(q->medium_q)) {
		rv = cf_queue_pop(q->medium_q, buf, 0);
	}
	else if (CF_Q_SZ(q->low_q)) {
		rv = cf_queue_pop(q->low_q, buf, 0);
	}

	cf_queue_priority_unlock(q);
	return rv;
}
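cf_set_wait_timespec() is not shown here, but Example #1 contains the inline arithmetic it presumably replaces. A sketch consistent with that code, with the nanosecond carry checked against >= 1000000000 since tv_nsec must stay below one second:

#include <time.h>

// Presumed body of cf_set_wait_timespec(): convert a relative wait in
// milliseconds into the absolute CLOCK_REALTIME deadline that
// pthread_cond_timedwait() expects.
static void cf_set_wait_timespec(int ms_wait, struct timespec* tp)
{
	clock_gettime(CLOCK_REALTIME, tp);
	tp->tv_sec += ms_wait / 1000;
	tp->tv_nsec += (ms_wait % 1000) * 1000000;

	if (tp->tv_nsec >= 1000000000) {
		tp->tv_nsec -= 1000000000;
		tp->tv_sec++;
	}
}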
Example #6
static dig_arr_t *
getDigestArray(void)
{
	dig_arr_t *dt;
	if (cf_queue_pop(g_q_dig_arr, &dt, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) {
		dt = cf_malloc(sizeof(dig_arr_t));
	}
	dt->num = 0;
	return dt;
}
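getDigestArray() implements the pop-or-allocate half of a free-pool idiom; the matching release function is not shown. A hedged sketch of the likely counterpart (name hypothetical), returning the object to g_q_dig_arr and freeing it only if the push fails:

// Hypothetical counterpart to getDigestArray(): recycle a dig_arr_t into
// the g_q_dig_arr free pool rather than freeing it, so the next
// getDigestArray() call can skip the cf_malloc().
static void
releaseDigestArray(dig_arr_t *dt)
{
	if (cf_queue_push(g_q_dig_arr, &dt) == -1) {
		cf_free(dt);
	}
}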
Example #7
void *
as_netio_th(void *q_to_wait_on) {
	cf_queue *           q = (cf_queue*)q_to_wait_on;
	while (true) {
		as_netio io;
		if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) {
			cf_crash(AS_PROTO, "Failed to pop from IO worker queue.");
		}
		if (io.slow) {
			usleep(g_config.proto_slow_netio_sleep_ms * 1000);
		}
		as_netio_send(&io, g_netio_slow_queue, false);
	}
}
Example #8
File: act.c Project: aanguss/act
//------------------------------------------------
// Get a safe file descriptor for a device.
//
static int fd_get(device* p_device) {
	int fd = -1;

	if (cf_queue_pop(p_device->p_fd_queue, (void*)&fd, CF_QUEUE_NOWAIT) !=
			CF_QUEUE_OK) {
		fd = open(p_device->name, O_DIRECT | O_RDWR, S_IRUSR | S_IWUSR);

		if (fd == -1) {
			fprintf(stdout, "ERROR: open device %s\n", p_device->name);
		}
	}

	return (fd);
}
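The counterpart that returns a descriptor to the pool after use is not shown; judging from fd_get() and fd_close_all() (Example #4), it is presumably a single push. A sketch (name hypothetical):

//------------------------------------------------
// Hypothetical fd_put(): recycle a file descriptor into the device's
// queue so a later fd_get() pops it instead of re-opening the device.
//
static void fd_put(device* p_device, int fd) {
	cf_queue_push(p_device->p_fd_queue, (void*)&fd);
}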
Example #9
void
as_node_destroy(as_node* node)
{
	// Drain out the queue and close the FDs
	int rv;
	do {
		int	fd;
		rv = cf_queue_pop(node->conn_q, &fd, CF_QUEUE_NOWAIT);
		if (rv == CF_QUEUE_OK)
			cf_close(fd);
	} while (rv == CF_QUEUE_OK);
	
	/*
	 do {
	 int	fd;
	 rv = cf_queue_pop(node->conn_q_asyncfd, &fd, CF_QUEUE_NOWAIT);
	 if (rv == CF_QUEUE_OK)
	 cf_close(fd);
	 } while (rv == CF_QUEUE_OK);
	 */
	
	/*
	 do {
	 //When we reach this point, ideally there should not be any workitems.
	 cl_async_work *aw;
	 rv = cf_queue_pop(node->asyncwork_q, &aw, CF_QUEUE_NOWAIT);
	 if (rv == CF_QUEUE_OK) {
	 free(aw);
	 }
	 } while (rv == CF_QUEUE_OK);
	 
	 //We want to delete all the workitems of this node
	 if (g_cl_async_hashtab) {
	 shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, node);
	 }
	 */
	
	as_vector_destroy(&node->addresses);
	cf_queue_destroy(node->conn_q);
	//cf_queue_destroy(node->conn_q_asyncfd);
	//cf_queue_destroy(node->asyncwork_q);
	
	if (node->info_fd >= 0) {
		cf_close(node->info_fd);
	}

	cf_free(node);
}
Example #10
void *
as_netio_th(void *q_to_wait_on) {
	cf_queue *           q = (cf_queue*)q_to_wait_on;
	while (true) {
		as_netio io;
		if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) {
			cf_crash(AS_PROTO, "Failed to pop from IO worker queue.");
		}
		if (io.slow) {
			usleep(g_config.proto_slow_netio_sleep_ms * 1000);
		}
		if (as_netio_send(&io, g_netio_slow_queue, false) != AS_NETIO_CONTINUE) {
			AS_RELEASE_FILE_HANDLE(io.fd_h);
			cf_buf_builder_free(io.bb_r);
		}
	}
}
Example #11
int
as_node_get_connection(as_node* node, int* fd)
{
	//cf_queue* q = asyncfd ? node->conn_q_asyncfd : node->conn_q;
	cf_queue* q = node->conn_q;
	
	while (1) {
		int rv = cf_queue_pop(q, fd, CF_QUEUE_NOWAIT);
		
		if (rv == CF_QUEUE_OK) {
			int rv2 = is_connected(*fd);
			
			switch (rv2) {
				case CONNECTED:
					// It's still good.
					return 0;
					
				case CONNECTED_BADFD:
					// Local problem, don't try closing.
					cf_warn("Found bad file descriptor in queue: fd %d", *fd);
					break;
				
				case CONNECTED_NOT:
					// Can't use it - the remote end closed it.
				case CONNECTED_ERROR:
					// Some other problem, could have to do with remote end.
				default:
					cf_close(*fd);
					break;
			}
		}
		else if (rv == CF_QUEUE_EMPTY) {
			// We exhausted the queue. Try creating a fresh socket.
			return as_node_create_connection(node, fd);
		}
		else {
			cf_error("Bad return value from cf_queue_pop");
			*fd = -1;
			return CITRUSLEAF_FAIL_CLIENT;
		}
	}
}
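is_connected() and the CONNECTED_* codes it returns are not shown. A common way to implement such a liveness probe is a non-blocking MSG_PEEK recv(); a sketch under that assumption:

#include <errno.h>
#include <sys/socket.h>

// Sketch of is_connected(), assuming the usual non-blocking peek idiom:
// a zero-byte read means the remote end closed the socket, EAGAIN or
// EWOULDBLOCK means the connection is idle but alive, and EBADF means
// the descriptor itself is bad.
static int is_connected(int fd)
{
	char buf[8];
	int rv = (int)recv(fd, buf, sizeof(buf), MSG_PEEK | MSG_DONTWAIT | MSG_NOSIGNAL);

	if (rv == 0) {
		return CONNECTED_NOT;        // remote end closed the connection
	}

	if (rv < 0) {
		if (errno == EBADF) {
			return CONNECTED_BADFD;  // local problem with the fd
		}
		if (errno == EWOULDBLOCK || errno == EAGAIN) {
			return CONNECTED;        // nothing to read, still alive
		}
		return CONNECTED_ERROR;      // some other failure
	}

	return CONNECTED;                // unread data is pending, still alive
}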
Example #12
void * cl_scan_worker(void * pv_asc) {
	cl_cluster* asc = (cl_cluster*)pv_asc;

	while (true) {
        // Response structure to be pushed in the complete q
        cl_node_response response; 
        memset(&response, 0, sizeof(cl_node_response));

        cl_scan_task task;

        if ( 0 != cf_queue_pop(asc->scan_q, &task, CF_QUEUE_FOREVER) ) {
            LOG("[WARNING] cl_scan_worker: queue pop failed\n");
        }

        if ( cf_debug_enabled() ) {
            LOG("[DEBUG] cl_scan_worker: getting one task item\n");
        }

        // This is how scan shutdown signals we're done.
        if ( ! task.asc ) {
            break;
        }

        // query if the node is still around
        int rc = CITRUSLEAF_FAIL_UNAVAILABLE;

        cl_cluster_node * node = cl_cluster_node_get_byname(task.asc, task.node_name);
        if ( node ) {
            rc = cl_scan_worker_do(node, &task);
        }
        else {
            LOG("[INFO] cl_scan_worker: No node found with the name %s\n", task.node_name);
        }
        strncpy(response.node_name, task.node_name, strlen(task.node_name));
        response.node_response = rc;
        response.job_id = task.job_id;
        cf_queue_push(task.complete_q, (void *)&response);
    }

	return NULL;
}
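The `! task.asc` check above is the shutdown hook: whoever stops scanning pushes one zeroed task per worker so each thread breaks out of its loop. A hedged sketch of that sender side (helper name hypothetical):

// Hypothetical shutdown helper: wake every scan worker with a zeroed
// task; task.asc == NULL is the sentinel cl_scan_worker checks for.
static void cl_scan_workers_shutdown(cl_cluster* asc, int n_workers) {
    cl_scan_task task;
    memset(&task, 0, sizeof(cl_scan_task));

    for (int i = 0; i < n_workers; i++) {
        cf_queue_push(asc->scan_q, &task);
    }
}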
Example #13
void cl_scan_destroy(cl_scan *scan) {

    if ( scan == NULL ) return;

    cl_scan_udf_destroy(&scan->udf);
    if (scan->ns)      free(scan->ns);
    if (scan->setname) free(scan->setname);

    if ( scan->res_streamq ) {
        as_val *val = NULL;
        while (CF_QUEUE_OK == cf_queue_pop (scan->res_streamq, 
                    &val, CF_QUEUE_NOWAIT)) {
            as_val_destroy(val);
            val = NULL;
        }

        cf_queue_destroy(scan->res_streamq);
        scan->res_streamq = NULL;
    }

    free(scan);
    scan = NULL;
}
Example #14
void *
cf_queue_test_1_read(void *arg)
{
    cf_queue *q = (cf_queue *) arg;

    for (int i=0; i<TEST1_SZ; i++) {

        // sleep twice as long as the inserter, to test overflow
        usleep(TEST1_INTERVAL * 1000 * 2);

        int  v = -1;
        int rv = cf_queue_pop(q, &v, CF_QUEUE_FOREVER);
        if (rv != CF_QUEUE_OK) {
            fprintf(stderr, "cf_queue_test1: pop error %d",rv);
            return((void *) -1);
        }
        if (v != i) {
            fprintf(stderr, "cf_queue_test1: pop value error: %d should be %d",v,i);
            return((void *) -1);
        }

    }
    return((void *) 0);
}
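The matching inserter thread is not shown; given the reader's comment about sleeping twice as long as the inserter, it presumably pushes the values 0..TEST1_SZ-1, one every TEST1_INTERVAL milliseconds. A sketch under that assumption:

// Presumed counterpart to cf_queue_test_1_read(): push 0..TEST1_SZ-1,
// one value every TEST1_INTERVAL ms. Because the reader sleeps twice as
// long between pops, the queue steadily grows, exercising its
// overflow/resize path.
void *
cf_queue_test_1_write(void *arg)
{
    cf_queue *q = (cf_queue *) arg;

    for (int i=0; i<TEST1_SZ; i++) {
        usleep(TEST1_INTERVAL * 1000);

        if (0 != cf_queue_push(q, &i)) {
            fprintf(stderr, "cf_queue_test1: push error on %d",i);
            return((void *) -1);
        }
    }
    return((void *) 0);
}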
Example #15
static void* 
async_receiver_fn(void *thdata)
{
	int 		rv = -1;
	bool 		network_error = false;
	cl_async_work	*workitem = NULL;
	// cl_async_work	*tmpworkitem = NULL;
	as_msg 		msg;
	cf_queue	*q_to_use = NULL;
	cl_cluster_node	*thisnode = NULL;

	uint8_t		rd_stack_buf[STACK_BUF_SZ];	
	uint8_t		*rd_buf = rd_stack_buf;
	size_t		rd_buf_sz = 0;

	uint64_t	acktrid;
	// uint64_t	starttime, endtime;
	int		progress_timeout_ms;
	unsigned int 	thread_id = cf_atomic32_incr(&g_thread_count);

	if (thdata == NULL) {
		q_to_use = g_cl_async_q;
	} else {
		thisnode = (cl_cluster_node *)thdata;
		q_to_use = thisnode->asyncwork_q;
	}
    
	//Infinite loop which keeps picking work items from the queue and tries to find the end result
	while(1) {
		network_error = false;
#if ONEASYNCFD
		if(thisnode->dunned == true) {
			do {
				rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);
				if (rv == CF_QUEUE_OK) {
					cl_cluster_node_put(thisnode);
					free(workitem);
				}
			} while (rv == CF_QUEUE_OK);

			//We want to delete all the workitems of this node
			shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
			break;
		}
#endif
		//This call will block if there is no element in the queue
		cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
		//TODO: What if the node gets dunned while this pop call is blocked ?
#if ONEASYNCFD
		//cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
		//		cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

		// If we have no progress in 50ms, we should move to the next workitem 
		// and revisit this workitem at a later stage
		progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

		// Read into this fine cl_msg, which is the short header
		rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms);
		if (rv) {
#if DEBUG
			cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
#endif
			if (rv != ETIMEDOUT) {
				cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd);
				network_error = true;
				goto Error;
			} else {
				goto Retry;
			}

		}
#ifdef DEBUG_VERBOSE
		dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif
		cl_proto_swap(&msg.proto);
		cl_msg_swap_header(&msg.m);

		// second read for the remainder of the message 
		rd_buf_sz =  msg.proto.sz  - msg.m.header_sz;
		if (rd_buf_sz > 0) {
			if (rd_buf_sz > sizeof(rd_stack_buf)) {
				rd_buf = malloc(rd_buf_sz);
				if (!rd_buf) {
					cf_error("malloc fail: trying %zu",rd_buf_sz);
					rv = -1; 
					goto Error; 
				}
			}

			rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms);
			if (rv) {
				//We already read some part of the message before but failed to read the
				//remaining data for whatever reason (network error or timeout). We cannot
				//reread as we already read partial data. Declare this as error.
				cf_error("Timeout after reading the header but before reading the body");
				goto Error;
			}
#ifdef DEBUG_VERBOSE
			dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif	
		}

		rv = CITRUSLEAF_OK;
		goto Ok;

Retry:
		//We are trying to postpone the reading
		if (workitem->deadline && workitem->deadline < cf_getms()) {
			cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
					workitem->deadline, cf_getms());
			//cf_error("async receiver: Workitem missed the final deadline");
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		} else {
			//We have time. Push the element back to the queue to be considered later
			cf_queue_push(q_to_use, &workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		cf_atomic_int_incr(&g_async_stats.retries);

		continue;

Error:
		if (network_error == true) {
			/* 
			 * In case of Async work (for XDS), it may be extreme to
			 * dun a node in case of network error. We just cleanup
			 * things and retry to connect to the remote cluster.
			 * The network error may be a transient one.
			 */
		} 

#if ONEASYNCFD
//Do not close FD
#else
		//We do not know the state of the FD. It may have pending data to be read.
		//We cannot reuse the FD. So, close it to be on the safe side.
		cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
		cf_close(workitem->fd);
		workitem->fd = -1;
#endif
		cf_atomic_int_incr(&g_async_stats.dropouts);
		//Continue down with what we do during an Ok

		//Inform the caller that there is no response from the server for this workitem.
		//No response does not mean that the work is not done. The work might be 
		//successfully completed on the server side; we just didn't get a response for it.
		if (g_fail_cb_fn) {
			g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
		}
Ok:
		//rd_buf may not be there during an error condition.
		if (rd_buf && (rv == CITRUSLEAF_OK)) {
			//As of now, async functionality is there only for put call.
			//In put call, we do not get anything back other than the trid field.
			//So, just pass variable to get back the trid and ignore others.
			if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
				rv = CITRUSLEAF_FAIL_UNKNOWN;
			}
			else {
				rv = msg.m.result_code;
				if (workitem->trid != acktrid) {
#if ONEASYNCFD
					//It is likely that we may get a response for a different trid.
					//Just delete the correct one from the queue and
					//put the current workitem back in the queue.
					shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
					cf_queue_delete(q_to_use, &tmpworkitem, true);
					cf_queue_push(q_to_use, &workitem);
					//From now on workitem will be the one for which we got ack
					workitem = tmpworkitem;
#endif
#ifdef DEBUG
					cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
							workitem->trid, acktrid, workitem->fd);
#endif
				}
			}

			if (g_success_cb_fn) {
				g_success_cb_fn(workitem->udata, rv, workitem->starttime);
			}
		}

		//Remember to put back the FD into the pool, if it is re-usable.
		if (workitem->fd != -1) {
			cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
		}
		//Also decrement the reference count for this node
		cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
		//Delete the item from the global hashtable
		if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK)
		{
#if DEBUG
			cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid);
#endif
		}
#endif

		//Push it back into the free pool. If the attempt fails, free it.
		if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
			free(workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		// Kick this thread out if its ID is greater than total
		if (thread_id > cf_atomic32_get(g_async_num_threads)) {
			cf_atomic32_decr(&g_thread_count);
			return NULL;
		}
	}//The infinite loop

	return NULL;
}
Example #16
cf_vector * cl_scan_execute(cl_cluster * cluster, const cl_scan * scan, char * node_name, cl_rv * res, int (* callback)(as_val *, void *), void * udata) {

    cl_rv           rc                          = CITRUSLEAF_OK;
    uint8_t         wr_stack_buf[STACK_BUF_SZ]  = { 0 };
    uint8_t *       wr_buf                      = wr_stack_buf;
    size_t          wr_buf_sz                   = sizeof(wr_stack_buf);
    int             node_count                  = 0;
    cl_node_response  response;
    rc = scan_compile(scan, &wr_buf, &wr_buf_sz);

    if ( rc != CITRUSLEAF_OK ) {
        LOG("[ERROR] cl_scan_execute: scan compile failed: \n");
        *res = rc;
        return NULL;
    }

    // Setup worker
    cl_scan_task task = {
        .asc                = cluster,
        .ns                 = scan->ns,
        .scan_buf          = wr_buf,
        .scan_sz           = wr_buf_sz,
        .udata              = udata,
        .callback           = callback,
        .job_id                = scan->job_id,
        .type                = scan->udf.type,
    };

    task.complete_q      = cf_queue_create(sizeof(cl_node_response), true);
    cf_vector * result_v = NULL;

    // If node_name is not null, we are executing scan on a particular node
    if (node_name) {
        // Copy the node name in the task and push it in the global scan queue. One task for each node
        strcpy(task.node_name, node_name);
        cf_queue_push(cluster->scan_q, &task);
        node_count = 1;
    }
    else {
        // Node name is NULL, we have to scan all nodes 
        char *node_names    = NULL;    

        // Get a list of the node names, so we can send work to each node
        cl_cluster_get_node_names(cluster, &node_count, &node_names);
        if ( node_count == 0 ) {
            LOG("[ERROR] cl_scan_execute: don't have any nodes?\n");
            *res = CITRUSLEAF_FAIL_CLIENT;
            goto Cleanup;
        }

        // Dispatch work to the worker queue to allow the transactions in parallel
        // NOTE: if a new node is introduced in the middle, it is NOT taken care of
        node_name = node_names;
        for ( int i=0; i < node_count; i++ ) {
            // fill in per-request specifics
            strcpy(task.node_name, node_name);
            cf_queue_push(cluster->scan_q, &task);
            node_name += NODE_NAME_SIZE;                    
        }
        free(node_names);
        node_names = NULL;
    }

    // Wait for the work to complete from all the nodes.
    // For every node, fill in the return value in the result vector
    result_v = cf_vector_create(sizeof(cl_node_response), node_count, 0);
    for ( int i=0; i < node_count; i++ ) {
        // Pop the response structure
        cf_queue_pop(task.complete_q, &response, CF_QUEUE_FOREVER);
        cf_vector_append(result_v, &response);
    }

Cleanup:
    if ( wr_buf && (wr_buf != wr_stack_buf) ) { 
        free(wr_buf); 
        wr_buf = 0;
    }
    cf_queue_destroy(task.complete_q);

    return result_v;
}

/**
 * Allocates and initializes a new cl_scan.
 */
cl_scan * cl_scan_new(const char * ns, const char * setname, uint64_t *job_id) {
    cl_scan * scan = (cl_scan*) malloc(sizeof(cl_scan));
    memset(scan, 0, sizeof(cl_scan));
    return cl_scan_init(scan, ns, setname, job_id);
}
Example #17
static void* generate_async_reads(void* aio_context)
{
	uint64_t count = 0;
	while(g_running)
	{
		/* Create the struct of info needed at the process_read end */
		uintptr_t info_ptr;
		if (cf_queue_pop(async_info_queue, (void*)&info_ptr, CF_QUEUE_NOWAIT) !=
				CF_QUEUE_OK) 
		{
			fprintf(stdout, "Error: Could not pop info struct \n");
			return (void*)(-1);
		}
		as_async_info_t *info = (as_async_info_t*)info_ptr;
		memset(info, 0, sizeof(as_async_info_t));
		/* Generate the actual read request */
		uint32_t random_device_index = rand_32() % g_num_devices;
		device* p_random_device = &g_devices[random_device_index];
		readreq* p_readreq = &(info->p_readreq);
		if(p_readreq == NULL)
		{
			fprintf(stdout, "Error: preadreq null \n");
			goto fail;
		}
		p_readreq->p_device = p_random_device;
		p_readreq->offset = random_read_offset(p_random_device);
		p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
		p_readreq->start_time = cf_getms();

		/* Async read */
		if (g_use_valloc) 
		{
			uint8_t* p_buffer = cf_valloc(p_readreq->size);
			info->p_buffer = p_buffer;
			if (p_buffer) 
			{
				uint64_t raw_start_time = cf_getms();
				info->raw_start_time = raw_start_time;
				if(read_async_from_device(info, *(aio_context_t *)aio_context) < 0)
				{
					fprintf(stdout, "Error: Async read failed \n");
					free(p_buffer);
					goto fail; 
				}
			}
			else 
			{
				fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
			}
		}
		else 
		{
			uint8_t stack_buffer[p_readreq->size + 4096];
			uint8_t* p_buffer = align_4096(stack_buffer);
			info->p_buffer = p_buffer;
			uint64_t raw_start_time = cf_getms();
			info->raw_start_time = raw_start_time;
			if(read_async_from_device(info, *(aio_context_t*)aio_context) < 0)
			{
				fprintf(stdout, "Error: Async read failed \n");
				goto fail;
			}
		}
		if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) 
		{
		  fprintf(stdout, "ERROR: too many read reqs queued\n");
		  fprintf(stdout, "drive(s) can't keep up - test stopped\n");
		  g_running = false;
		  return (void*)-1;
		}

		count++;

		int sleep_ms = (int)
			(((count * 1000) / g_read_reqs_per_sec) -
				(cf_getms() - g_run_start_ms));

		if (sleep_ms > 0) {
			usleep((uint32_t)sleep_ms * 1000);
		}

		continue;

		/* Rollback for failure */
fail:
		if(info)
		{
			uintptr_t temp = (uintptr_t)info;
			cf_queue_push(async_info_queue, (void*)&temp);
		}
	}
	return (0);
}
Example #18
static as_status
as_scan_generic(
	aerospike* as, as_error* err, const as_policy_scan* policy, const as_scan* scan,
	aerospike_scan_foreach_callback callback, void* udata, uint64_t* task_id_ptr)
{
	as_error_reset(err);
	
	if (! policy) {
		policy = &as->config.policies.scan;
	}
	
	as_cluster* cluster = as->cluster;
	as_nodes* nodes = as_nodes_reserve(cluster);
	uint32_t n_nodes = nodes->size;
	
	if (n_nodes == 0) {
		as_nodes_release(nodes);
		return as_error_set_message(err, AEROSPIKE_ERR_SERVER, "Scan command failed because cluster is empty.");
	}
	
	// Reserve each node in cluster.
	for (uint32_t i = 0; i < n_nodes; i++) {
		as_node_reserve(nodes->array[i]);
	}
	
	uint64_t task_id;
	if (task_id_ptr) {
		if (*task_id_ptr == 0) {
			*task_id_ptr = cf_get_rand64() / 2;
		}
		task_id = *task_id_ptr;
	}
	else {
		task_id = cf_get_rand64() / 2;
	}

	// Create scan command
	as_buffer argbuffer;
	uint16_t n_fields = 0;
	size_t size = as_scan_command_size(scan, &n_fields, &argbuffer);
	uint8_t* cmd = as_command_init(size);
	size = as_scan_command_init(cmd, policy, scan, task_id, n_fields, &argbuffer);
	
	// Initialize task.
	uint32_t error_mutex = 0;
	as_scan_task task;
	task.cluster = as->cluster;
	task.policy = policy;
	task.scan = scan;
	task.callback = callback;
	task.udata = udata;
	task.err = err;
	task.error_mutex = &error_mutex;
	task.task_id = task_id;
	task.cmd = cmd;
	task.cmd_size = size;
	
	as_status status = AEROSPIKE_OK;
	
	if (scan->concurrent) {
		uint32_t n_wait_nodes = n_nodes;
		task.complete_q = cf_queue_create(sizeof(as_scan_complete_task), true);

		// Run node scans in parallel.
		for (uint32_t i = 0; i < n_nodes; i++) {
			// Stack allocate task for each node.  It should be fine since the task
			// only needs to be valid within this function.
			as_scan_task* task_node = alloca(sizeof(as_scan_task));
			memcpy(task_node, &task, sizeof(as_scan_task));
			task_node->node = nodes->array[i];
			
			int rc = as_thread_pool_queue_task(&cluster->thread_pool, as_scan_worker, task_node);
			
			if (rc) {
				// Thread could not be added. Abort entire scan.
				if (ck_pr_fas_32(task.error_mutex, 1) == 0) {
					status = as_error_update(task.err, AEROSPIKE_ERR_CLIENT, "Failed to add scan thread: %d", rc);
				}
				
				// Reset node count to threads that were run.
				n_wait_nodes = i;
				break;
			}
		}

		// Wait for tasks to complete.
		for (uint32_t i = 0; i < n_wait_nodes; i++) {
			as_scan_complete_task complete;
			cf_queue_pop(task.complete_q, &complete, CF_QUEUE_FOREVER);
			
			if (complete.result != AEROSPIKE_OK && status == AEROSPIKE_OK) {
				status = complete.result;
			}
		}
		
		// Release temporary queue.
		cf_queue_destroy(task.complete_q);
	}
	else {
		task.complete_q = 0;
		
		// Run node scans in series.
		for (uint32_t i = 0; i < n_nodes && status == AEROSPIKE_OK; i++) {
			task.node = nodes->array[i];
			status = as_scan_command_execute(&task);
		}
	}
	
	// Release each node in cluster.
	for (uint32_t i = 0; i < n_nodes; i++) {
		as_node_release(nodes->array[i]);
	}
	
	// Release nodes array.
	as_nodes_release(nodes);

	// Free command memory.
	as_command_free(cmd, size);

	// If user aborts query, command is considered successful.
	if (status == AEROSPIKE_ERR_CLIENT_ABORT) {
		status = AEROSPIKE_OK;
	}

	// If completely successful, make the callback that signals completion.
	if (callback && status == AEROSPIKE_OK) {
		callback(NULL, udata);
	}
	return status;
}
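as_scan_worker() is referenced but not shown. From the pop loop above it must run the node scan and push an as_scan_complete_task onto task->complete_q; a sketch, with the field names of as_scan_complete_task being assumptions:

// Sketch of as_scan_worker(): execute one node's scan, then report the
// result to the complete_q that as_scan_generic() is blocked on.
static void
as_scan_worker(void* data)
{
	as_scan_task* task = (as_scan_task*)data;

	as_scan_complete_task complete;
	complete.node = task->node;
	complete.task_id = task->task_id;
	complete.result = as_scan_command_execute(task);

	cf_queue_push(task->complete_q, &complete);
}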
Example #19
// Set of threads which talk to the client over the connection and do the required
// processing. Note that once an fd is assigned to a thread, all the work on that fd
// is done by that thread. Fair fd usage is expected of the client. The first thread
// is special - it also does accept() [listens for new connections]. It is the only
// thread which does so.
void *
thr_demarshal(void *arg)
{
	cf_socket_cfg *s, *ls;
	// Create my epoll fd, register in the global list.
	struct epoll_event ev;
	int nevents, i, n, epoll_fd;
	cf_clock last_fd_print = 0;

#if defined(USE_SYSTEMTAP)
	uint64_t nodeid = g_config.self_node;
#endif

	// Early stage aborts; these will cause faults in process scope.
	cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument");
	s = &g_config.socket;
	ls = &g_config.localhost_socket;

#ifdef USE_JEM
	int orig_arena;
	if (0 > (orig_arena = jem_get_arena())) {
		cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!");
	} else {
		cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena);
	}
#endif

	// Figure out my thread index.
	pthread_t self = pthread_self();
	int thr_id;
	for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
		if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self))
			break;
	}

	if (thr_id == MAX_DEMARSHAL_THREADS) {
		cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!");
		return(0);
	}

	// First thread accepts new connection at interface socket.
	if (thr_id == 0) {
		demarshal_file_handle_init();
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));

		memset(&ev, 0, sizeof (ev));
		ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
		ev.data.fd = s->sock;
		if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev))
			cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
		cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port);

		if (ls->sock) {
			ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
			ev.data.fd = ls->sock;
			if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev))
			  cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
			cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port);
		}
	}
	else {
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
	}

	g_demarshal_args->epoll_fd[thr_id] = epoll_fd;
	cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);

	int id_cntr = 0;

	// Demarshal transactions from the socket.
	for ( ; ; ) {
		struct epoll_event events[EPOLL_SZ];

		cf_detail(AS_DEMARSHAL, "calling epoll");

		nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1);

		if (0 > nevents) {
			cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno));
		}

		cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);

		uint64_t now_ns = cf_getns();
		uint64_t now_ms = now_ns / 1000000;

		// Iterate over all events.
		for (i = 0; i < nevents; i++) {
			if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) {
				// Accept new connections on the service socket.
				int csocket = -1;
				struct sockaddr_in caddr;
				socklen_t clen = sizeof(caddr);
				char cpaddr[64];

				if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) {
					// This means we're out of file descriptors - could be a SYN
					// flood attack or misbehaving client. Eventually we'd like
					// to make the reaper fairer, but for now we'll just have to
					// ignore the accept error and move on.
					if ((errno == EMFILE) || (errno == ENFILE)) {
						if (last_fd_print != (cf_getms() / 1000L)) {
							cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit");
							last_fd_print = cf_getms() / 1000L;
						}
						continue;
					}
					cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno);
				}

				// Get the client IP address in string form.
				if (caddr.sin_family == AF_INET) {
					if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else if (caddr.sin_family == AF_INET6) {
					struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr;

					if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else {
					cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family);
				}

				cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket);

				// Validate the limit of protocol connections we allow.
				uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed;
				if (conns_open > g_config.n_proto_fd_max) {
					if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs
						cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open);
						last_fd_print = cf_getms();
					}
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Set the socket to nonblocking.
				if (-1 == cf_socket_set_nonblocking(csocket)) {
					cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Create as_file_handle and queue it up in epoll_fd for further
				// communication on one of the demarshal threads.
				as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle));
				if (!fd_h) {
					cf_crash(AS_DEMARSHAL, "malloc");
				}

				sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port));
				fd_h->fd = csocket;

				fd_h->last_used = cf_getms();
				fd_h->reap_me = false;
				fd_h->trans_active = false;
				fd_h->proto = 0;
				fd_h->proto_unread = 0;
				fd_h->fh_info = 0;
				fd_h->security_filter = as_security_filter_create();

				// Insert into the global table so the reaper can manage it. Do
				// this before queueing it up for demarshal threads - once
				// EPOLL_CTL_ADD is done it's difficult to back out (if insert
				// into global table fails) because fd state could be anything.
				cf_rc_reserve(fd_h);

				pthread_mutex_lock(&g_file_handle_a_LOCK);

				int j;
				bool inserted = true;

				if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) {
					inserted = false;
				}
				else {
					g_file_handle_a[j] = fd_h;
				}

				pthread_mutex_unlock(&g_file_handle_a_LOCK);

				if (!inserted) {
					cf_info(AS_DEMARSHAL, "unable to add socket to file handle table");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					cf_rc_free(fd_h); // will free even with ref-count of 2
				}
				else {
					// Place the client socket in the event queue.
					memset(&ev, 0, sizeof(ev));
					ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ;
					ev.data.ptr = fd_h;

					// Round-robin pick up demarshal thread epoll_fd and add
					// this new connection to epoll.
					int id;
					while (true) {
						id = (id_cntr++) % g_demarshal_args->num_threads;
						if (g_demarshal_args->epoll_fd[id] != 0) {
							break;
						}
					}

					fd_h->epoll_fd = g_demarshal_args->epoll_fd[id];

					if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) {
						cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads);
						pthread_mutex_lock(&g_file_handle_a_LOCK);
						fd_h->reap_me = true;
						as_release_file_handle(fd_h);
						fd_h = 0;
						pthread_mutex_unlock(&g_file_handle_a_LOCK);
					}
					else {
						cf_atomic_int_incr(&g_config.proto_connections_opened);
					}
				}
			}
			else {
				bool has_extra_ref   = false;
				as_file_handle *fd_h = events[i].data.ptr;
				if (fd_h == 0) {
					cf_info(AS_DEMARSHAL, "event with null handle, continuing");
					goto NextEvent;
				}

				cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events);

				// Process data on an existing connection: this might be more
				// activity on an already existing transaction, so we have some
				// state to manage.
				as_proto *proto_p = 0;
				int fd = fd_h->fd;

				if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
					cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events);
					// no longer in use: out of epoll etc
					goto NextEvent_FD_Cleanup;
				}

				if (fd_h->trans_active) {
					goto NextEvent;
				}

				// If pointer is NULL, then we need to create a transaction and
				// store it in the buffer.
				if (fd_h->proto == NULL) {
					as_proto proto;
					int sz;

					/* Get the number of available bytes */
					if (-1 == ioctl(fd, FIONREAD, &sz)) {
						cf_info(AS_DEMARSHAL, "unable to get number of available bytes");
						goto NextEvent_FD_Cleanup;
					}

					// If we don't have enough data to fill the message buffer,
					// just wait and we'll come back to this one. However, we'll
					// let messages with zero size through, since they are
					// likely errors. We don't cleanup the FD in this case since
					// we'll get more data on it.
					if (sz < sizeof(as_proto) && sz != 0) {
						goto NextEvent;
					}

					// Do a preliminary read of the header into a stack-
					// allocated structure, so that later on we can allocate the
					// entire message buffer.
					if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) {
						cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno);
						goto NextEvent_FD_Cleanup;
					}

					if (proto.version != PROTO_VERSION &&
							// For backward compatibility, allow version 0 with
							// security messages.
							! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u",
								fd_h->client, proto.version);
						goto NextEvent_FD_Cleanup;
					}

					// Swap the necessary elements of the as_proto.
					as_proto_swap(&proto);

					if (proto.sz > PROTO_SIZE_MAX) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64,
								fd_h->client, PROTO_SIZE_MAX, proto.sz);
						goto NextEvent_FD_Cleanup;
					}

#ifdef USE_JEM
					// Attempt to peek the namespace and set the JEMalloc arena accordingly.
					size_t peeked_data_sz = 0;
					size_t min_field_sz = sizeof(uint32_t) + sizeof(char);
					size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz;
					size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.)
					uint8_t peekbuf[peekbuf_sz];
					if (PROTO_TYPE_AS_MSG == proto.type) {
						size_t offset = sizeof(as_msg);
						// Number of bytes to peek from the socket.
//						size_t peek_sz = peekbuf_sz;                 // Peek up to the size of the peek buffer.
						size_t peek_sz = MIN(proto.sz, peekbuf_sz);  // Peek only up to the minimum necessary number of bytes.
						if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) {
							// That's actually legitimate. The as_proto may have gone into one
							// packet, the as_msg into the next one, which we haven't yet received.
							// This just "never happened" without async.
							cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz);
						}
						if (peeked_data_sz > min_as_msg_sz) {
//							cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz);
							if (peeked_data_sz > proto.sz) {
								cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd);
								log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz);
								goto NextEvent_FD_Cleanup;
							}

							if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) {
								jem_set_arena(orig_arena);
							} else {
								uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0;
								bool found = false;
	//							cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields);
								while (!found && (field_num < n_fields)) {
									as_msg_field *field = (as_msg_field *) (&peekbuf[offset]);
									uint32_t value_sz = ntohl(field->field_sz) - 1;
	//								cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset);
	//								cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz);
	//								cf_debug(AS_DEMARSHAL, "\ttype %d", field->type);
									if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) {
										if (value_sz >= AS_ID_NAMESPACE_SZ) {
											cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz);
											goto NextEvent_FD_Cleanup;
										}
										char ns[AS_ID_NAMESPACE_SZ];
										found = true;
										memcpy(ns, field->data, value_sz);
										ns[value_sz] = '\0';
	//									cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num);
										jem_set_arena(as_namespace_get_jem_arena(ns));
									} else {
	//									cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type);
										field_num++;
										offset += sizeof(as_msg_field) + value_sz;
										if (offset >= peeked_data_sz) {
											break;
										}
									}
								}
								if (!found) {
									cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz);
									jem_set_arena(orig_arena);
								}
							}
						} else {
							jem_set_arena(orig_arena);
						}
					} else {
						jem_set_arena(orig_arena);
					}
#endif

					// Allocate the complete message buffer.
					proto_p = cf_malloc(sizeof(as_proto) + proto.sz);

					cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno));
					memcpy(proto_p, &proto, sizeof(as_proto));

#ifdef USE_JEM
					// Jam in the peeked data.
					if (peeked_data_sz) {
						memcpy(proto_p->data, &peekbuf, peeked_data_sz);
					}
					fd_h->proto_unread = proto_p->sz - peeked_data_sz;
#else
					fd_h->proto_unread = proto_p->sz;
#endif
					fd_h->proto = (void *) proto_p;
				}
				else {
					proto_p = fd_h->proto;
				}

				if (fd_h->proto_unread > 0) {

					// Read the data.
					n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0);
					if (0 >= n) {
						if (errno == EAGAIN) {
							continue;
						}
						cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno));
						goto NextEvent_FD_Cleanup;
					}

					// Decrement bytes-unread counter.
					cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread);
					fd_h->proto_unread -= n;
				}

				// Check for a finished read.
				if (0 == fd_h->proto_unread) {

					// It's only really live if it's injecting a transaction.
					fd_h->last_used = now_ms;

					thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress
					fd_h->proto = 0;
					fd_h->proto_unread = 0;

					// INIT_TR
					as_transaction tr;
					as_transaction_init(&tr, NULL, (cl_msg *)proto_p);

					cf_rc_reserve(fd_h);
					has_extra_ref   = true;
					tr.proto_fd_h   = fd_h;
					tr.start_time   = now_ns; // set transaction start time
					tr.preprocessed = false;

					if (! as_proto_is_valid_type(proto_p)) {
						cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type);
						// We got a proto message type we don't recognize, so it
						// may not do any good to send back an as_msg error, but
						// it's the best we can do. At least we can keep the fd.
						as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					if (g_config.microbenchmarks) {
						histogram_insert_data_point(g_config.demarshal_hist, now_ns);
						tr.microbenchmark_time = cf_getns();
					}

					// Check if it's compressed.
					if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) {
						// Decompress it - allocate buffer to hold decompressed
						// packet.
						uint8_t *decompressed_buf = NULL;
						size_t decompressed_buf_size = 0;
						int rv = 0;
						if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) {
							cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv);
							cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p");
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
						// Count the packets.
						cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1);
						// Free the compressed packet since we'll be using the
						// decompressed packet from now on.
						cf_free(proto_p);
						proto_p = NULL;
						// Get original packet.
						tr.msgp = (cl_msg *)decompressed_buf;
						as_proto_swap(&(tr.msgp->proto));

						if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) {
							cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]",
									tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size);
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
					}

					// Security protocol transactions.
					if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
						as_security_transact(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Info protocol requests.
					if (tr.msgp->proto.type == PROTO_TYPE_INFO) {
						if (as_info(&tr)) {
							cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer");
							goto NextEvent_FD_Cleanup;
						}
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp);

					// Fast path for batch requests.
					if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
						as_batch_queue_task(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Either process the transaction directly in this thread,
					// or queue it for processing by another thread (tsvc/info).
					if (0 != thr_tsvc_process_or_enqueue(&tr)) {
						cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread");
						goto NextEvent_FD_Cleanup;
					}
					else {
						cf_atomic_int_incr(&g_config.proto_transactions);
					}
				}

				// Jump the proto message free & FD cleanup. If we get here, the
				// above operations went smoothly. The message free & FD cleanup
				// job is handled elsewhere as directed by
				// thr_tsvc_process_or_enqueue().
				goto NextEvent;

NextEvent_FD_Cleanup:
				// If we allocated memory for the incoming message, free it.
				if (proto_p) {
					cf_free(proto_p);
					fd_h->proto = 0;
				}
				// If fd has extra reference for transaction, release it.
				if (has_extra_ref) {
					cf_rc_release(fd_h);
				}
				// Remove the fd from the events list.
				if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) {
					cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)",
							fd, epoll_fd, errno, cf_strerror(errno));
				}
				pthread_mutex_lock(&g_file_handle_a_LOCK);
				fd_h->reap_me = true;
				as_release_file_handle(fd_h);
				fd_h = 0;
				pthread_mutex_unlock(&g_file_handle_a_LOCK);
NextEvent:
				;
			}

			// We should never be canceled externally, but just in case...
			pthread_testcancel();
		}
	}

	return NULL;
}
Example #20
//Same as do_the_full_monte, but only till the command is sent to the node.
//Most of the code is duplicated. Bad.
int
cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key,
			const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, 
			int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata)

{
	cl_async_work	*workitem = NULL;

	uint8_t		wr_stack_buf[STACK_BUF_SZ];
	uint8_t		*wr_buf = wr_stack_buf;
	size_t		wr_buf_sz = sizeof(wr_stack_buf);
	int        	progress_timeout_ms;
	uint64_t 	deadline_ms;
	uint64_t	starttime, endtime;
	bool 		network_error;
	int 		fd = -1;
	int		rv = CITRUSLEAF_FAIL_CLIENT;	//Assume that this is a failure;

	// as_msg 		msg;
	cf_digest	d_ret;
	cl_cluster_node	*node = 0;

#if ONEASYNCFD
	if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) {
		//cf_error("Async hashtab is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#else
	//If the async buffer is at the max limit, do not entertain more requests.
	if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) {
		//cf_error("Async buffer is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#endif

	//Allocate memory for work item that will be added to the async work list

	if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) {
		cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER);
	} else {
		workitem = malloc(sizeof(cl_async_work));
		if (workitem == NULL) {
			return CITRUSLEAF_FAIL_CLIENT;
		}
	}

	//Compile the write buffer to be sent to the cluster
	if (n_values && ( values || operations) ){
		cl_compile(info1, info2, 0, ns, set, key, digest, values?*values:NULL, operator, operations?*operations:NULL,
				*n_values , &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/);
	}else{
		cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid,NULL,NULL, 0 /*udf_type*/);
	}	

	deadline_ms = 0;
	progress_timeout_ms = 0;
	if (cl_w_p && cl_w_p->timeout_ms) {
		deadline_ms = cf_getms() + cl_w_p->timeout_ms;
		// policy: if asking for a long timeout, give enough time to try twice
		if (cl_w_p->timeout_ms > 700) {
			progress_timeout_ms = cl_w_p->timeout_ms / 2;
		}
		else {
			progress_timeout_ms = cl_w_p->timeout_ms;
		}
	}
	else {
		progress_timeout_ms = g_async_nw_progress_timeout;
	}

	//Initialize the async work unit
	workitem->trid = *trid;
	workitem->deadline = deadline_ms;
	workitem->starttime = cf_getms();
	workitem->udata = udata;

    as_msg *msgp;
    // Hate special cases, but we have to clear the verify bit on delete verify
    if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY))
    {
        msgp = (as_msg *)wr_buf;
        msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY;
    }
    
    if (asc->compression_stat.compression_threshold > 0 
     && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold)
    {
        /* Compression is enabled.
         * Packet size is above threshold.
         * Compress the data
         */
        uint8_t *compressed_buf = NULL;
        size_t compressed_buf_sz = 0;

        // Construct packet for compressed data.
        cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz);
        if (compressed_buf)
        {
            // If original packet size is > 16k, cl_compile had allocated memory for it.
            // Free that memory.
            // cf_packet_compression will allocate memory for compressed packet
            if (wr_buf != wr_stack_buf) {
                free(wr_buf);
            }
             // Update stats.
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz);	
            wr_buf =  compressed_buf;
            wr_buf_sz = compressed_buf_sz;
            //memcpy (wr_buf, compressed_buf, compressed_buf_sz);
            //wr_buf_sz = compressed_buf_sz;
            //free (compressed_buf);
        }
        //else compression failed, continue with uncompressed packet
        else
        {
            // Set compression stat
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz);	
        }
    }

	int try = 0;
	// retry request based on the write_policy
	do {
		network_error = false;
		try++;
#ifdef DEBUG		
		if (try > 1) {
			cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self());
		}
#endif        

		// Get an FD from a cluster. First get the probable node for the given digest.
		node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? true : false);
		if (!node) {
#ifdef DEBUG
			cf_debug("warning: no healthy nodes in cluster, retrying");
#endif
			usleep(10000);	//Sleep for 10ms
			goto Retry;
		}

		// Now get the dedicated async FD of this node
		starttime = cf_getms();
		fd = cl_cluster_node_fd_get(node, true);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (fd == -1) {
#ifdef DEBUG			
			cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() );
#endif			
			usleep(1000);
			goto Retry;
		}

		// Send the command to the node
		starttime = cf_getms();
		rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (rv != 0) {
			cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)",
					rv,fd,errno,(uint64_t)pthread_self());
			if (rv != ETIMEDOUT)
				network_error = true;
			goto Retry;
		}
		goto Ok;

Retry:
		if (network_error == true) {
			/* 
			 * In case of Async work (for XDS), it may be extreme to
			 * dun a node in case of network error. We just cleanup
			 * things and retry to connect to the remote cluster.
			 * The network error may be a transient one. As this is a
			 * network error, its is better to wait for some significant
			 * time before retrying.
			 */
			sleep(1);	//Sleep for 1sec
#if ONEASYNCFD
//Do not close the FD
#else
			cf_error("async sender: Closing the fd %d because of network error", fd);
			cf_close(fd);
			fd = -1;
#endif
		}

		if (fd != -1) {
			cf_error("async sender: Closing the fd %d because of retry", fd);
			cf_close(fd);
			fd = -1;
		}

		if (node) {
			cl_cluster_node_put(node); 
			node = 0; 
		}

		if (deadline_ms && (deadline_ms < cf_getms() ) ) {
#ifdef DEBUG            
			cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms());
#endif            
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		}
	} while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) );

Error:	
#ifdef DEBUG	
	cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d",
			(int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), 
			(int)(deadline_ms - cf_getms() ), rv );
#endif	

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

#if ONEASYNCFD
	//Do not close the FD
#else
	//If it is a network error, the fd would be closed and set to -1.
	//So, we reach this place with a valid FD in case of timeout.
	if (fd != -1) {
		cf_error("async sender: Closing the fd %d because of timeout", fd);
		cf_close(fd);
	}
#endif

	return(rv);
Ok:
	/*
	 * We cannot release the node here as the asyc FD associated
	 * with this node may get closed. We should do it only when
	 * we got back the ack for the async command that we just did.
	 */

	//As we sent the command successfully, add it to the async work list
	workitem->node = node;
	workitem->fd = fd;
	//We are storing only the pointer to the workitem
#if ONEASYNCFD
	if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) {
		//This should always succeed.
		cf_error("Unable to add unique entry into the hash table");
	}
	cf_queue_push(node->asyncwork_q, &workitem);	//Also put in the node's q
#else
	cf_queue_push(g_cl_async_q, &workitem);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

	rv = CITRUSLEAF_OK;
	return rv;

}

int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads)
{
	// int num_threads;

	if (0 == cf_atomic32_get(g_async_initialized)) {
		cf_error("Async client not initialized cannot reinit");
		return -1;
	}
	
	if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) {
			//Limit the threads to the max value even if caller asks for it
			num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS;
	}

	// If the number of threads is increased, create more threads
	if (num_receiver_threads > g_async_num_threads) {
		unsigned int i;
		for (i = g_async_num_threads; i < num_receiver_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
	}
	else {
		// else just reset the number; surplus async threads will kill themselves
		cf_atomic32_set(&g_async_num_threads, num_receiver_threads);
	}

	cf_atomic32_set(&g_async_q_szlimit , size_limit);
	return ( 0 );

}
int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn)
{
	int i, num_threads;

	//Make sure that we do the initialization only once
	if (1 == cf_atomic32_incr(&g_async_initialized)) {

		// Start the receiver threads
		num_threads = num_receiver_threads;
		if (num_threads > MAX_ASYNC_RECEIVER_THREADS) {
			//Limit the threads to the max value even if caller asks for it
			num_threads = MAX_ASYNC_RECEIVER_THREADS;
		}

#if ONEASYNCFD
		g_async_h_szlimit = size_limit * 3;	//Max number of elements in the hash table
		g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table

		if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *),
					g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) {
			cf_error("Failed to initialize the async work hastable");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}
#else
		// create work queue
		g_async_q_szlimit = size_limit;
		if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to initialize the async work queue");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}

		for (i=0; i<num_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
		g_async_num_threads = num_threads;
#endif

		if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to create memory pool for workitems");
			return -1;
		}

		g_fail_cb_fn = fail_cb_fn;
		g_success_cb_fn = success_cb_fn;

		// Initialize the stats
		g_async_stats.retries = 0;
		g_async_stats.dropouts = 0;

	}
	
	return(0);	
}