/** * Adds a connection socket to the chain of connections. * Connection sockets are those which need to be read from. * Returns the connection id, or -1 if a failure occurred. */ int dispatch_addconnection(int sock) { size_t c; pthread_rwlock_rdlock(&connectionslock); for (c = 0; c < connectionslen; c++) if (__sync_bool_compare_and_swap(&(connections[c].takenby), -1, -2)) break; pthread_rwlock_unlock(&connectionslock); if (c == connectionslen) { connection *newlst; pthread_rwlock_wrlock(&connectionslock); if (connectionslen > c) { /* another dispatcher just extended the list */ pthread_rwlock_unlock(&connectionslock); return dispatch_addconnection(sock); } newlst = realloc(connections, sizeof(connection) * (connectionslen + CONNGROWSZ)); if (newlst == NULL) { logerr("cannot add new connection: " "out of memory allocating more slots (max = %zd)\n", connectionslen); pthread_rwlock_unlock(&connectionslock); return -1; } memset(&newlst[connectionslen], '\0', sizeof(connection) * CONNGROWSZ); for (c = connectionslen; c < connectionslen + CONNGROWSZ; c++) newlst[c].takenby = -1; /* free */ connections = newlst; c = connectionslen; /* for the setup code below */ newlst[c].takenby = -2; connectionslen += CONNGROWSZ; pthread_rwlock_unlock(&connectionslock); } (void) fcntl(sock, F_SETFL, O_NONBLOCK); connections[c].sock = sock; connections[c].buflen = 0; connections[c].needmore = 0; connections[c].noexpire = 0; connections[c].destlen = 0; connections[c].wait = 0; connections[c].takenby = 0; /* now dispatchers will pick this one up */ acceptedconnections++; return c; }
/**
 * Registers a datagram (UDP) socket as a pseudo-listener.  Under the
 * hood this is an ordinary connection, but it is flagged so it is not
 * closed after being idle, and it is left out of the count of accepted
 * incoming connections.
 *
 * Returns 0 on success, 1 when the connection could not be added.
 */
int
dispatch_addlistener_udp(int sock)
{
	int id = dispatch_addconnection(sock);

	if (id != -1) {
		connections[id].noexpire = 1;
		/* undo the increment done by dispatch_addconnection */
		acceptedconnections--;
		return 0;
	}

	return 1;
}
/** * Adds a connection which we know is from an aggregator, so direct * pipe. This is different from normal connections that we don't want * to count them, never expire them, and want to recognise them when * we're doing reloads. */ int dispatch_addconnection_aggr(int sock) { int conn = dispatch_addconnection(sock); if (conn == -1) return 1; connections[conn].noexpire = 1; connections[conn].isaggr = 1; acceptedconnections--; return 0; }
/** * Reads from the queue and sends items to the remote server. This * function is designed to be a thread. Data sending is attempted to be * batched, but sent one by one to reduce loss on sending failure. * A connection with the server is maintained for as long as there is * data to be written. As soon as there is none, the connection is * dropped if a timeout of DISCONNECT_WAIT_TIME exceeds. */ static void * server_queuereader(void *d) { server *self = (server *)d; size_t len; ssize_t slen; const char **metric = self->batch; struct timeval start, stop; struct timeval timeout; queue *squeue; char idle = 0; size_t *secpos = NULL; *metric = NULL; self->metrics = 0; self->ticks = 0; #define FAIL_WAIT_TIME 6 /* 6 * 250ms = 1.5s */ #define DISCONNECT_WAIT_TIME 12 /* 12 * 250ms = 3s */ #define LEN_CRITICAL(Q) (queue_free(Q) < self->bsize) self->running = 1; while (1) { if (queue_len(self->queue) == 0) { /* if we're idling, close the TCP connection, this allows us * to reduce connections, while keeping the connection alive * if we're writing a lot */ gettimeofday(&start, NULL); if (self->ctype == CON_TCP && self->fd >= 0 && idle++ > DISCONNECT_WAIT_TIME) { close(self->fd); self->fd = -1; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); if (!self->keep_running) break; /* nothing to do, so slow down for a bit */ usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ /* if we are in failure mode, keep checking if we can * connect, this avoids unnecessary queue moves */ if (!self->failure) /* it makes no sense to try and do something, so skip */ continue; } else if (self->secondariescnt > 0 && (self->failure >= FAIL_WAIT_TIME || (!self->failover && LEN_CRITICAL(self->queue)))) { size_t i; gettimeofday(&start, NULL); if (self->secondariescnt > 0) { if (secpos == NULL) { secpos = malloc(sizeof(size_t) * self->secondariescnt); if (secpos == NULL) { logerr("server: failed to allocate memory " "for secpos\n"); gettimeofday(&stop, NULL); self->ticks += 
timediff(start, stop); continue; } for (i = 0; i < self->secondariescnt; i++) secpos[i] = i; } if (!self->failover) { /* randomise the failover list such that in the * grand scheme of things we don't punish the first * working server in the list to deal with all * traffic meant for a now failing server */ for (i = 0; i < self->secondariescnt; i++) { size_t n = rand() % (self->secondariescnt - i); if (n != i) { size_t t = secpos[n]; secpos[n] = secpos[i]; secpos[i] = t; } } } } /* offload data from our queue to our secondaries * when doing so, observe the following: * - avoid nodes that are in failure mode * - avoid nodes which queues are >= critical_len * when no nodes remain given the above * - send to nodes which queue size < critical_len * where there are no such nodes * - do nothing (we will overflow, since we can't send * anywhere) */ *metric = NULL; squeue = NULL; for (i = 0; i < self->secondariescnt; i++) { /* both conditions below make sure we skip ourself */ if (self->secondaries[secpos[i]]->failure) continue; squeue = self->secondaries[secpos[i]]->queue; if (!self->failover && LEN_CRITICAL(squeue)) { squeue = NULL; continue; } if (*metric == NULL) { /* send up to batch size of our queue to this queue */ len = queue_dequeue_vector( self->batch, self->queue, self->bsize); self->batch[len] = NULL; metric = self->batch; } for (; *metric != NULL; metric++) if (!queue_putback(squeue, *metric)) break; /* try to put back stuff that didn't fit */ for (; *metric != NULL; metric++) if (!queue_putback(self->queue, *metric)) break; } for (; *metric != NULL; metric++) { if (mode & MODE_DEBUG) logerr("dropping metric: %s", *metric); free((char *)*metric); self->dropped++; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); if (squeue == NULL) { /* we couldn't do anything, take it easy for a bit */ if (self->failure) self->failure = 1; if (!self->keep_running) break; usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ } } else if (self->failure) { 
if (!self->keep_running) break; usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ } /* at this point we've got work to do, if we're instructed to * shut down, however, try to get everything out of the door * (until we fail, see top of this loop) */ gettimeofday(&start, NULL); /* try to connect */ if (self->fd < 0) { if (self->ctype == CON_PIPE) { int intconn[2]; if (pipe(intconn) < 0) { if (!self->failure) logerr("failed to create pipe: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } dispatch_addconnection(intconn[0]); self->fd = intconn[1]; } else if (self->ctype == CON_UDP) { if ((self->fd = socket(self->saddr->ai_family, self->saddr->ai_socktype, self->saddr->ai_protocol)) < 0) { if (!self->failure) logerr("failed to create udp socket: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } if (connect(self->fd, self->saddr->ai_addr, self->saddr->ai_addrlen) < 0) { if (!self->failure) logerr("failed to connect udp socket: %s\n", strerror(errno)); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } else if (self->ctype == CON_FILE) { if ((self->fd = open(self->ip, O_WRONLY | O_APPEND | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { if (!self->failure) logerr("failed to open file '%s': %s\n", self->ip, strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } else { int ret; int args; if ((self->fd = socket(self->saddr->ai_family, self->saddr->ai_socktype, self->saddr->ai_protocol)) < 0) { if (!self->failure) logerr("failed to create socket: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; continue; } /* put socket in non-blocking mode such that we can * poll() (time-out) on the connect() call */ args = fcntl(self->fd, F_GETFL, NULL); (void) fcntl(self->fd, F_SETFL, args | O_NONBLOCK); ret = connect(self->fd, self->saddr->ai_addr, self->saddr->ai_addrlen); if (ret < 0 && errno == EINPROGRESS) { /* wait for connection to succeed if the OS thinks * it can succeed */ struct pollfd ufds[1]; ufds[0].fd = self->fd; ufds[0].events = POLLIN | POLLOUT; ret = poll(ufds, 1, self->iotimeout + (rand() % 100)); if (ret == 0) { /* time limit expired */ if (!self->failure) logerr("failed to connect() to " "%s:%u: Operation timed out\n", self->ip, self->port); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } else if (ret < 0) { /* some select error occurred */ if (!self->failure) logerr("failed to poll() for %s:%u: %s\n", self->ip, self->port, strerror(errno)); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } else { if (ufds[0].revents & POLLHUP) { if (!self->failure) logerr("failed to connect() for %s:%u: " "Hangup\n", self->ip, self->port); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } } else if (ret < 0) { if (!self->failure) { logerr("failed to connect() to %s:%u: %s\n", self->ip, self->port, strerror(errno)); dispatch_check_rlimit_and_warn(); } close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; continue; } /* make socket blocking again */ (void) fcntl(self->fd, F_SETFL, args); } /* ensure we will break out of connections being stuck more * quickly than the kernel would give up */ timeout.tv_sec = 10; timeout.tv_usec = (rand() % 300) * 1000; setsockopt(self->fd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)); #ifdef SO_NOSIGPIPE setsockopt(self->fd, SOL_SOCKET, SO_NOSIGPIPE, NULL, 0); #endif } /* send up to batch size */ len = queue_dequeue_vector(self->batch, self->queue, self->bsize); self->batch[len] = NULL; metric = self->batch; if (len != 0 && !self->keep_running) { /* be noisy during shutdown so we can track any slowing down * servers, possibly preventing us to shut down */ logerr("shutting down %s:%u: waiting for %zu metrics\n", self->ip, self->port, len + queue_len(self->queue)); } if (len == 0 && self->failure) { /* if we don't have anything to send, we have at least a * connection succeed, so assume the server is up again, * this is in particular important for recovering this * node by probes, to avoid starvation of this server since * its queue is possibly being offloaded to secondaries */ if (self->ctype != CON_UDP) logerr("server %s:%u: OK after probe\n", self->ip, self->port); self->failure = 0; } for (; *metric != NULL; metric++) { len = strlen(*metric); if ((slen = write(self->fd, *metric, len)) != len) { /* not fully sent, or failure, close connection * regardless so we don't get synchonisation problems, * partially sent data is an error for us, since we use * blocking sockets, and hence partial sent is * indication of a failure */ if (self->ctype != CON_UDP && !self->failure) logerr("failed to write() to %s:%u: %s\n", self->ip, self->port, (slen < 0 ? strerror(errno) : "uncomplete write")); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; /* put back stuff we couldn't process */ for (; *metric != NULL; metric++) { if (!queue_putback(self->queue, *metric)) { if (mode & MODE_DEBUG) logerr("server %s:%u: dropping metric: %s", self->ip, self->port, *metric); free((char *)*metric); self->dropped++; } } break; } else if (self->failure) { if (self->ctype != CON_UDP) logerr("server %s:%u: OK\n", self->ip, self->port); self->failure = 0; } free((char *)*metric); self->metrics++; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); idle = 0; } self->running = 0; if (self->fd >= 0) close(self->fd); return NULL; }
/** * pthread compatible routine that handles connections and processes * whatever comes in on those. */ static void * dispatch_runner(void *arg) { dispatcher *self = (dispatcher *)arg; connection *conn; int c; self->metrics = 0; self->blackholes = 0; self->ticks = 0; self->sleeps = 0; self->prevmetrics = 0; self->prevblackholes = 0; self->prevticks = 0; self->prevsleeps = 0; if (self->type == LISTENER) { struct pollfd ufds[sizeof(listeners) / sizeof(connection *)]; while (self->keep_running) { for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++) { if (listeners[c] == NULL) break; ufds[c].fd = listeners[c]->sock; ufds[c].events = POLLIN; } if (poll(ufds, c, 1000) > 0) { for (--c; c >= 0; c--) { if (ufds[c].revents & POLLIN) { int client; struct sockaddr addr; socklen_t addrlen = sizeof(addr); if ((client = accept(ufds[c].fd, &addr, &addrlen)) < 0) { logerr("dispatch: failed to " "accept() new connection: %s\n", strerror(errno)); dispatch_check_rlimit_and_warn(); continue; } if (dispatch_addconnection(client) == -1) { close(client); continue; } } } } } } else if (self->type == CONNECTION) { int work; struct timeval start, stop; while (self->keep_running) { work = 0; if (self->route_refresh_pending) { self->rtr = self->pending_rtr; self->pending_rtr = NULL; self->route_refresh_pending = 0; self->hold = 0; } gettimeofday(&start, NULL); pthread_rwlock_rdlock(&connectionslock); for (c = 0; c < connectionslen; c++) { conn = &(connections[c]); /* atomically try to "claim" this connection */ if (!__sync_bool_compare_and_swap(&(conn->takenby), 0, self->id)) continue; if (self->hold && !conn->isaggr) { conn->takenby = 0; continue; } work += dispatch_connection(conn, self, start); } pthread_rwlock_unlock(&connectionslock); gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); /* nothing done, avoid spinlocking */ if (self->keep_running && work == 0) { gettimeofday(&start, NULL); usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ 
gettimeofday(&stop, NULL); self->sleeps += timediff(start, stop); } } } else { logerr("huh? unknown self type!\n"); } return NULL; }
/** * Adds a connection socket to the chain of connections. * Connection sockets are those which need to be read from. * Returns the connection id, or -1 if a failure occurred. */ int dispatch_addconnection(int sock) { size_t c; struct sockaddr_in6 saddr; socklen_t saddr_len = sizeof(saddr); pthread_rwlock_rdlock(&connectionslock); for (c = 0; c < connectionslen; c++) if (__sync_bool_compare_and_swap(&(connections[c].takenby), -1, -2)) break; pthread_rwlock_unlock(&connectionslock); if (c == connectionslen) { connection *newlst; pthread_rwlock_wrlock(&connectionslock); if (connectionslen > c) { /* another dispatcher just extended the list */ pthread_rwlock_unlock(&connectionslock); return dispatch_addconnection(sock); } newlst = realloc(connections, sizeof(connection) * (connectionslen + CONNGROWSZ)); if (newlst == NULL) { logerr("cannot add new connection: " "out of memory allocating more slots (max = %zu)\n", connectionslen); pthread_rwlock_unlock(&connectionslock); return -1; } memset(&newlst[connectionslen], '\0', sizeof(connection) * CONNGROWSZ); for (c = connectionslen; c < connectionslen + CONNGROWSZ; c++) newlst[c].takenby = -1; /* free */ connections = newlst; c = connectionslen; /* for the setup code below */ newlst[c].takenby = -2; connectionslen += CONNGROWSZ; pthread_rwlock_unlock(&connectionslock); } /* figure out who's calling */ if (getpeername(sock, (struct sockaddr *)&saddr, &saddr_len) == 0) { snprintf(connections[c].srcaddr, sizeof(connections[c].srcaddr), "(unknown)"); switch (saddr.sin6_family) { case PF_INET: inet_ntop(saddr.sin6_family, &((struct sockaddr_in *)&saddr)->sin_addr, connections[c].srcaddr, sizeof(connections[c].srcaddr)); break; case PF_INET6: inet_ntop(saddr.sin6_family, &saddr.sin6_addr, connections[c].srcaddr, sizeof(connections[c].srcaddr)); break; } } (void) fcntl(sock, F_SETFL, O_NONBLOCK); connections[c].sock = sock; connections[c].buflen = 0; connections[c].needmore = 0; connections[c].noexpire = 0; connections[c].isaggr 
= 0; connections[c].destlen = 0; gettimeofday(&connections[c].lastwork, NULL); connections[c].hadwork = 1; /* force first iteration before stalling */ connections[c].takenby = 0; /* now dispatchers will pick this one up */ acceptedconnections++; return c; }
/** * pthread compatible routine that handles connections and processes * whatever comes in on those. */ static void * dispatch_runner(void *arg) { dispatcher *self = (dispatcher *)arg; connection *conn; int work; int c; self->metrics = 0; self->ticks = 0; self->state = SLEEPING; if (self->type == LISTENER) { fd_set fds; int maxfd = -1; struct timeval tv; while (self->keep_running) { FD_ZERO(&fds); tv.tv_sec = 0; tv.tv_usec = 250 * 1000; /* 250 ms */ for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++) { conn = listeners[c]; if (conn == NULL) break; FD_SET(conn->sock, &fds); if (conn->sock > maxfd) maxfd = conn->sock; } if (select(maxfd + 1, &fds, NULL, NULL, &tv) > 0) { for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++) { conn = listeners[c]; if (conn == NULL) break; if (FD_ISSET(conn->sock, &fds)) { int client; struct sockaddr addr; socklen_t addrlen = sizeof(addr); if ((client = accept(conn->sock, &addr, &addrlen)) < 0) { logerr("dispatch: failed to " "accept() new connection: %s\n", strerror(errno)); dispatch_check_rlimit_and_warn(); continue; } if (dispatch_addconnection(client) == -1) { close(client); continue; } } } } } } else if (self->type == CONNECTION) { while (self->keep_running) { work = 0; if (self->route_refresh_pending) { self->routes = self->pending_routes; self->pending_routes = NULL; self->route_refresh_pending = 0; } pthread_rwlock_rdlock(&connectionslock); for (c = 0; c < connectionslen; c++) { conn = &(connections[c]); /* atomically try to "claim" this connection */ if (!__sync_bool_compare_and_swap(&(conn->takenby), 0, self->id)) continue; self->state = RUNNING; work += dispatch_connection(conn, self); } pthread_rwlock_unlock(&connectionslock); self->state = SLEEPING; /* nothing done, avoid spinlocking */ if (self->keep_running && work == 0) usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ } } else { logerr("huh? unknown self type!\n"); } return NULL; }