/*
 * Point the disk writer at the spill file that belongs to blob_epoch.
 *
 * Nothing happens when blob_epoch equals self->last_epoch. Otherwise the
 * file of the previous epoch (if there was one) is fsync()ed and closed and,
 * when blob_epoch is non-zero, "<fallback_path>/<blob_epoch>.srlc" is opened
 * for appending (and created if missing). Calling with blob_epoch == 0
 * therefore only closes the current file.
 *
 * If open() fails, self->fd is negative and self->last_epoch is reset to 0,
 * so the next call for any non-zero epoch tries to open the file again.
 * The process dies via DIE_RC() if the file name does not fit in PATH_MAX.
 */
static void setup_for_epoch(disk_writer_t *self, time_t blob_epoch)
{
    if (self->last_epoch == blob_epoch)
        return;

    if (self->last_epoch) {
        if (fsync(self->fd)) {
            WARN_ERRNO("failed to fsync '%s', everyting is lost!", self->last_file_path);
        }
        if (close(self->fd)) {
            WARN_ERRNO("failed to close '%s', everyting is lost!", self->last_file_path);
        }
        /* Do not keep a closed descriptor around: callers test "fd >= 0"
         * before writing, and the number may be reused by another file or
         * socket in the meantime. */
        self->fd = -1;
    }

    if (blob_epoch) {
        /* time_t has no portable printf conversion, so pass it as long to
         * match the %li in the format. */
        int len = snprintf(self->last_file_path, PATH_MAX, "%s/%li.srlc",
                           self->fallback_path, (long) blob_epoch);
        if (len >= PATH_MAX) {
            /* XXX: should this really die? */
            DIE_RC(EXIT_FAILURE, "filename was truncated to %d bytes: '%s'",
                   PATH_MAX, self->last_file_path);
        }

        recreate_fallback_path(self->fallback_path);

        self->fd = open(self->last_file_path, O_WRONLY | O_APPEND | O_CREAT, 0640);
        if (self->fd < 0) {
            WARN_ERRNO("failed to open '%s', everyting is lost!", self->last_file_path);
            /* Forget the epoch so that the next blob retries the open. */
            blob_epoch = 0;
        }
    }

    self->last_epoch = blob_epoch;
}
/*
 * Ask the OS to batch outgoing data on a TCP socket so that our packets are
 * sent more efficiently.
 *
 * s    - socket wrapper; NULL and non-TCP sockets are silently ignored.
 * flag - value handed to the socket option (TCP_CORK where available,
 *        otherwise TCP_NOPUSH).
 *
 * A failing setsockopt() is only logged.
 */
static void cork(relay_socket_t * s, int flag)
{
    int rc;

    if (s == NULL)
        return;
    if (s->proto != IPPROTO_TCP)
        return;

#if defined(TCP_CORK)
    /* Linux */
    rc = setsockopt(s->socket, IPPROTO_TCP, TCP_CORK, (char *) &flag, sizeof flag);
    if (rc < 0) {
        WARN_ERRNO("setsockopt TCP_CORK: %s", strerror(errno));
    }
#elif defined(TCP_NOPUSH)
    /* BSD */
    rc = setsockopt(s->socket, IPPROTO_TCP, TCP_NOPUSH, (char *) &flag, sizeof flag);
    if (rc < 0) {
        WARN_ERRNO("setsockopt TCP_NOPUSH: %s", strerror(errno));
    }
#else
#error No TCP_CORK or TCP_NOPUSH
#endif
}
void *udp_server(void *arg) { sock_t *s = (sock_t *) arg; ssize_t received; #ifdef PACKETS_PER_SECOND uint32_t packets = 0, prev_packets = 0; uint32_t epoch, prev_epoch = 0; #endif char buf[MAX_CHUNK_SIZE]; // unused, but makes recv() happy while (not_aborted()) { received = recv(s->socket, buf, MAX_CHUNK_SIZE, 0); #ifdef PACKETS_PER_SECOND if ((epoch = time(0)) != prev_epoch) { SAY("packets: %d", packets - prev_packets); prev_epoch = epoch; prev_packets = packets; } packets++; #endif if (received < 0) break; buf_to_blob_enqueue(buf,received); } WARN_ERRNO("recv failed"); set_aborted(); pthread_exit(NULL); }
/*
 * Append one blob to the spill file of the epoch it was received in.
 *
 * The target file is selected with setup_for_epoch() from the seconds part
 * of the blob's receive time. A single write() is issued for the whole blob
 * buffer: disk_count is bumped when it wrote exactly BLOB_BUF_SIZE(b) bytes,
 * disk_error_count when no file is open or the write came up short.
 */
static void write_blob_to_disk(disk_writer_t *self, blob_t *b)
{
    assert(BLOB_REF_PTR(b));

    setup_for_epoch(self, BLOB_RECEIVED_TIME(b).tv_sec);

    /* No usable file for this epoch: count the blob as lost. */
    if (self->fd < 0) {
        RELAY_ATOMIC_INCREMENT( self->pcounters->disk_error_count, 1 );
        return;
    }

    ssize_t n_written = write(self->fd, BLOB_BUF(b), BLOB_BUF_SIZE(b));
    if (n_written != BLOB_BUF_SIZE(b)) {
        WARN_ERRNO("Wrote only %ld of %i bytes to '%s', error:",
                   n_written, BLOB_BUF_SIZE(b), self->last_file_path);
        RELAY_ATOMIC_INCREMENT( self->pcounters->disk_error_count, 1 );
        return;
    }

    RELAY_ATOMIC_INCREMENT( self->pcounters->disk_count, 1 );
}
void *tcp_server(void *arg) { sock_t *s = (sock_t *) arg; int i,fd,try_to_read,received; struct tcp_client *clients,*client; struct pollfd *pfds = NULL; volatile nfds_t nfds; setnonblocking(s->socket); nfds = 1; pfds = mallocz_or_die(nfds * sizeof(struct pollfd)); pfds->fd = s->socket; pfds->events = POLLIN; clients = NULL; RELAY_ATOMIC_AND( RECEIVED_STATS.active_connections, 0); int rc; for (;;) { rc = poll(pfds,nfds,CONFIG.polling_interval_ms); if (rc == -1) { if (rc == EINTR) continue; WARN_ERRNO("poll"); goto out; } for (i = 0; i < nfds; i++) { if (!pfds[i].revents) continue; if (pfds[i].fd == s->socket) { fd = accept(s->socket, NULL, NULL); if (fd == -1) { WARN_ERRNO("accept"); goto out; } setnonblocking(fd); RELAY_ATOMIC_INCREMENT( RECEIVED_STATS.active_connections, 1 ); pfds = realloc_or_die(pfds, (nfds + 1) * sizeof(*pfds)); clients = realloc_or_die(clients,(nfds + 1) * sizeof(*clients)); clients[nfds].pos = 0; clients[nfds].buf = mallocz_or_die(ASYNC_BUFFER_SIZE); // WARN("[%d] CREATE %p fd: %d",i,clients[nfds].buf,fd); pfds[nfds].fd = fd; pfds[nfds].events = POLLIN; pfds[nfds].revents = 0; nfds++; } else { client = &clients[i]; try_to_read = ASYNC_BUFFER_SIZE - client->pos; // try to read as much as possible if (try_to_read <= 0) { WARN("disconnecting, try to read: %d, pos: %d", try_to_read, client->pos); goto disconnect; } received = recv(pfds[i].fd, client->buf + client->pos, try_to_read,0); if (received <= 0) { if (received == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) continue; disconnect: shutdown(pfds[i].fd,SHUT_RDWR); close(pfds[i].fd); // WARN("[%d] DESTROY %p %d %d fd: %d vs %d",i,client->buf,client->x,i,pfds[i].fd,client->fd); free(client->buf); // shft left memcpy(pfds + i,pfds + i + 1, (nfds - i - 1) * sizeof(struct pollfd)); memcpy(clients + i,clients + i + 1, (nfds - i - 1) * sizeof(struct tcp_client)); nfds--; pfds = realloc_or_die(pfds, nfds * sizeof(struct pollfd)); clients = realloc_or_die(clients, nfds * sizeof(struct 
tcp_client)); RELAY_ATOMIC_DECREMENT( RECEIVED_STATS.active_connections, 1 ); continue; } client->pos += received; try_to_consume_one_more: if (client->pos < EXPECTED_HEADER_SIZE) continue; if (EXPECTED(client) > MAX_CHUNK_SIZE) { WARN("received frame (%d) > MAX_CHUNK_SIZE(%d)",EXPECTED(client),MAX_CHUNK_SIZE); goto disconnect; } if (client->pos >= EXPECTED(client) + EXPECTED_HEADER_SIZE) { buf_to_blob_enqueue(client->buf,EXPECTED(client)); client->pos -= EXPECTED(client) + EXPECTED_HEADER_SIZE; if (client->pos < 0) { WARN("BAD PACKET wrong 'next' position(< 0) pos: %d expected packet size:%d header_size: %d", client->pos, EXPECTED(client),EXPECTED_HEADER_SIZE); goto disconnect; } if (client->pos > 0) { // [ h ] [ h ] [ h ] [ h ] [ D ] [ D ] [ D ] [ h ] [ h ] [ h ] [ h ] [ D ] // ^ pos(12) // after we remove the first packet + header it becomes: // [ h ] [ h ] [ h ] [ h ] [ D ] [ D ] [ D ] [ h ] [ h ] [ h ] [ h ] [ D ] // ^ pos (5) // and then we copy from header + data, to position 0, 5 bytes // // [ h ] [ h ] [ h ] [ h ] [ D ] // ^ pos (5) memmove(client->buf, client->buf + EXPECTED_HEADER_SIZE + EXPECTED(client), client->pos); if (client->pos >= EXPECTED_HEADER_SIZE) goto try_to_consume_one_more; } } } } } out: for (i = 0; i < nfds; i++) { if (pfds[i].fd != s->socket) free(clients[i].buf); shutdown(pfds[i].fd, SHUT_RDWR); close(pfds[i].fd); } free(pfds); free(clients); set_aborted(); pthread_exit(NULL); }
/* the main loop for the socket worker process */ void *socket_worker_thread(void *arg) { socket_worker_t *self = (socket_worker_t *) arg; queue_t *main_queue = &self->queue; relay_socket_t *sck = NULL; queue_t private_queue; queue_t spill_queue; memset(&private_queue, 0, sizeof(queue_t)); memset(&spill_queue, 0, sizeof(queue_t)); const config_t *config = self->base.config; int join_err; #define RATE_UPDATE_PERIOD 15 time_t last_rate_update = 0; while (!RELAY_ATOMIC_READ(self->base.stopping)) { time_t now = time(NULL); if (!sck) { SAY("Opening forwarding socket"); sck = open_output_socket_eventually(&self->base); if (sck == NULL || !(sck->type == SOCK_DGRAM || sck->type == SOCK_STREAM)) { FATAL_ERRNO("Failed to open forwarding socket"); break; } connected_inc(); } long since_rate_update = now - last_rate_update; if (since_rate_update >= RATE_UPDATE_PERIOD) { last_rate_update = now; update_rates(&self->rates[0], &self->totals, since_rate_update); update_rates(&self->rates[1], &self->totals, since_rate_update); update_rates(&self->rates[2], &self->totals, since_rate_update); } /* if we dont have anything in our local queue we need to hijack the main one */ if (private_queue.head == NULL) { /* hijack the queue - copy the queue state into our private copy * and then reset the queue state to empty. So the formerly * shared queue is now private. We only do this if necessary. 
*/ if (!queue_hijack(main_queue, &private_queue, &GLOBAL.pool.lock)) { /* nothing to do, so sleep a while and redo the loop */ worker_wait_millisec(config->polling_interval_millisec); continue; } } RELAY_ATOMIC_INCREMENT(self->counters.received_count, private_queue.count); /* ok, so we should have something in our queue to process */ if (private_queue.head == NULL) { WARN("Empty private queue"); break; } ssize_t wrote = 0; if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) { if (!RELAY_ATOMIC_READ(self->base.stopping)) { WARN("Closing forwarding socket"); close(sck->socket); sck = NULL; connected_dec(); } } accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals); } if (control_is(RELAY_STOPPING)) { SAY("Socket worker stopping, trying forwarding flush"); stats_count_t old_sent = self->totals.sent_count; stats_count_t old_spilled = self->totals.spilled_count; stats_count_t old_dropped = self->totals.dropped_count; if (sck) { ssize_t wrote = 0; if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) { WARN_ERRNO("Forwarding flush failed"); } accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals); SAY("Forwarding flush forwarded %zd bytes in %llu events, spilled %llu events, dropped %llu events ", wrote, (unsigned long long) (self->totals.sent_count - old_sent), (unsigned long long) (self->totals.spilled_count - old_spilled), (unsigned long long) (self->totals.dropped_count - old_dropped)); } else { WARN("No forwarding socket to flush to"); } SAY("Socket worker spilling any remaining events to disk"); stats_count_t spilled = spill_all(self, &private_queue, &spill_queue); SAY("Socket worker spilled %llu events to disk", (unsigned long long) spilled); } else { accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals); } SAY("worker[%s] in its lifetime received %lu sent %lu spilled %lu dropped %lu", (sck ? 
sck->to_string : self->base.arg), (unsigned long) RELAY_ATOMIC_READ(self->totals.received_count), (unsigned long) RELAY_ATOMIC_READ(self->totals.sent_count), (unsigned long) RELAY_ATOMIC_READ(self->totals.spilled_count), (unsigned long) RELAY_ATOMIC_READ(self->totals.dropped_count)); if (sck) { close(sck->socket); connected_dec(); } /* we are done so shut down our "pet" disk worker, and then exit with a message */ RELAY_ATOMIC_OR(self->disk_writer->base.stopping, WORKER_STOPPING); join_err = pthread_join(self->disk_writer->base.tid, NULL); if (join_err) FATAL("shutting down disk_writer thread error: pthread error %d", join_err); free(self->disk_writer); return NULL; }
/*
 * Send the blobs of private_queue to the forwarding socket, oldest first.
 *
 * self          - the worker; supplies config, counters and the stop flag.
 * sck           - forwarding socket; NULL is logged and reported as failure.
 * private_queue - blobs to send; each blob sent completely is removed from
 *                 the queue and destroyed.
 * spill_queue   - handed to spill_by_age() for blobs over the spill age.
 * wrote         - out: total number of bytes handed to sendto(), including
 *                 the bytes of a blob that was only sent partially. Not
 *                 written when sck is NULL.
 *
 * Before each blob, over-age blobs are passed to spill_by_age() unless a
 * grace period is running because not all backends are connected.
 *
 * Processing stops at the first blob that cannot be sent completely; that
 * blob stays at the head of the queue.
 *
 * Returns 1 when the loop ended without a send failure, 0 otherwise.
 *
 * NOTE(review): the inner send loop does not run while self->base.stopping
 * is set, so a call made after the flag is raised reports the head blob as
 * a partial send and fails - confirm this is what the shutdown flush wants.
 */
static int process_queue(socket_worker_t * self, relay_socket_t * sck, queue_t * private_queue, queue_t * spill_queue, ssize_t * wrote)
{
    if (sck == NULL) {
        WARN("NULL forwarding socket");
        return 0;
    }

    blob_t *cur_blob;
    struct timeval now;
    struct timeval send_start_time;
    struct timeval send_end_time;
    stats_count_t spilled = 0;

    const config_t *config = self->base.config;
    /* Widen before multiplying so a large millisecond setting cannot
     * overflow in the (narrower) type of the config field. */
    const uint64_t spill_microsec = (uint64_t) 1000 * config->spill_millisec;
    const uint64_t grace_microsec = (uint64_t) 1000 * config->spill_grace_millisec;

    const struct sockaddr *dest_addr = (const struct sockaddr *) &sck->sa.in;
    socklen_t addr_len = sck->addrlen;

    int in_grace_period = 0;
    struct timeval grace_period_start;

    int failed = 0;

    *wrote = 0;

    get_time(&send_start_time);

    cork(sck, 1);

    while (private_queue->head != NULL) {
        get_time(&now);

        /* While not all the socket backends are present, for a configured maximum time,
         * do not spill/drop. This is a bit crude, better rules/heuristics welcome. */
        if (!connected_all()) {
            if (in_grace_period == 0) {
                in_grace_period = 1;
                get_time(&grace_period_start);
                SAY("Spill/drop grace period of %d millisec started", config->spill_grace_millisec);
            }
            if (elapsed_usec(&grace_period_start, &now) >= grace_microsec) {
                in_grace_period = 0;
                SAY("Spill/drop grace period of %d millisec expired", config->spill_grace_millisec);
            }
        } else {
            if (in_grace_period) {
                SAY("Spill/drop grace period of %d millisec canceled", config->spill_grace_millisec);
            }
            in_grace_period = 0;
        }

        if (in_grace_period == 0) {
            spilled += spill_by_age(self, config->spill_enabled, private_queue, spill_queue, spill_microsec, &now);
        }

        /* spill_by_age() may have emptied the queue. */
        cur_blob = private_queue->head;
        if (!cur_blob)
            break;

        /* Datagram sockets get the whole blob buffer, stream sockets only
         * its data member. */
        void *blob_data;
        ssize_t blob_size;

        if (sck->type == SOCK_DGRAM) {
            blob_size = BLOB_BUF_SIZE(cur_blob);
            blob_data = BLOB_BUF_addr(cur_blob);
        } else {                /* sck->type == SOCK_STREAM */
            blob_size = BLOB_DATA_MBR_SIZE(cur_blob);
            blob_data = BLOB_DATA_MBR_addr(cur_blob);
        }

        ssize_t blob_left = blob_size;
        ssize_t blob_sent = 0;
        int sendto_errno = 0;

        failed = 0;

        /* Keep sending while we have data left since a single sendto()
         * doesn't necessarily send all of it.  This may eventually fail
         * if sendto() returns -1. */
        while (!RELAY_ATOMIC_READ(self->base.stopping) && blob_left > 0) {
            const void *data = (const char *) blob_data + blob_sent;
            ssize_t sent;

            if (sck->type == SOCK_DGRAM) {
                sent = sendto(sck->socket, data, blob_left, MSG_NOSIGNAL, dest_addr, addr_len);
            } else {            /* sck->type == SOCK_STREAM */
                sent = sendto(sck->socket, data, blob_left, MSG_NOSIGNAL, NULL, 0);
            }
            /* errno only means something when the call failed; after a
             * success it may hold a stale value from an earlier call. */
            sendto_errno = (sent == -1) ? errno : 0;

            if (0) {            /* For debugging. */
                peek_send(sck, data, blob_left, sent);
            }

            if (sent == -1) {
                WARN_ERRNO("sendto() tried sending %zd bytes to %s but sent none", blob_left, sck->to_string);
                RELAY_ATOMIC_INCREMENT(self->counters.error_count, 1);
                if (sendto_errno == EINTR) {
                    /* sendto() got interrupted by a signal.  Wait a while and retry. */
                    WARN("Interrupted, resuming");
                    worker_wait_millisec(config->sleep_after_disaster_millisec);
                    continue;
                }
                failed = 1;
                break;          /* stop sending from the hijacked queue */
            }

            blob_sent += sent;
            blob_left -= sent;
        }

        if (blob_sent == blob_size) {
            RELAY_ATOMIC_INCREMENT(self->counters.sent_count, 1);
        } else if (blob_sent < blob_size) {
            /* Despite the send-loop above, we failed to send all the bytes. */
            WARN("sendto() tried sending %zd bytes to %s but sent only %zd", blob_size, sck->to_string, blob_sent);
            RELAY_ATOMIC_INCREMENT(self->counters.partial_count, 1);
            failed = 1;
        }

        *wrote += blob_sent;

        if (failed) {
            /* We failed to send this packet.  Exit the loop, and
             * right after the loop close the socket, and get out,
             * letting the main loop to reconnect. */
            if ((sendto_errno == EAGAIN || sendto_errno == EWOULDBLOCK)) {
                /* Traffic jam.  Wait a while, but still get out. */
                WARN("Traffic jam");
                worker_wait_millisec(config->sleep_after_disaster_millisec);
            }
            break;
        } else {
            queue_shift_nolock(private_queue);
            blob_destroy(cur_blob);
        }
    }

    cork(sck, 0);

    get_time(&send_end_time);

    if (spilled) {
        if (config->spill_enabled) {
            WARN("Wrote %lu items which were over spill threshold", (unsigned long) spilled);
        } else {
            WARN("Spill disabled: DROPPED %lu items which were over spill threshold", (unsigned long) spilled);
        }
    }

    /* this assumes end_time >= start_time */
    uint64_t usec = elapsed_usec(&send_start_time, &send_end_time);
    RELAY_ATOMIC_INCREMENT(self->counters.send_elapsed_usec, usec);

    return failed == 0;
}