/* destroy a worker */
void socket_worker_destroy(socket_worker_t * worker)
{
    uint32_t was_stopping = RELAY_ATOMIC_OR(worker->base.stopping, WORKER_STOPPING);

    /* Avoid the race between worker_pool_reload_static() and
     * worker_pool_destroy_static().
     *
     * TODO: Another possible solution for this race could be a destructor
     * thread that waits on a semaphore and then destroys all.  Possible
     * flaw: what if a thread doesn't decrement the semaphore?
     *
     * Note that a similar solution is also used by the graphite worker. */
    if (was_stopping & WORKER_STOPPING)
        return;

    pthread_join(worker->base.tid, NULL);

    LOCK_DESTROY(&worker->lock);

    free(worker->base.arg);
    free(worker);
}
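/* Illustrative note on the WORKER_STOPPING handshake above (a sketch of the
 * intended interleaving, assuming RELAY_ATOMIC_OR() is a fetch-and-OR that
 * returns the previous value, as the check above implies): only the first of
 * two racing callers observes the bit clear and proceeds to join and free;
 * the loser returns immediately.
 *
 *     caller A: was_stopping = RELAY_ATOMIC_OR(stopping, WORKER_STOPPING) == 0
 *               -> joins the thread, destroys the lock, frees the worker
 *     caller B: was_stopping already has WORKER_STOPPING set
 *               -> returns without touching the (soon to be freed) worker
 */

/* set abort bits: atomically OR the given bits into the global ABORT flag */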
void set_abort_bits(uint32_t v)
{
    RELAY_ATOMIC_OR(ABORT, v);
}
/* initialize a worker safely */
socket_worker_t *socket_worker_create(const char *arg, const config_t * config)
{
    socket_worker_t *worker = calloc_or_fatal(sizeof(*worker));
    disk_writer_t *disk_writer = calloc_or_fatal(sizeof(disk_writer_t));

    if (worker == NULL || disk_writer == NULL)
        return NULL;

    int create_err;

    worker->base.config = config;
    worker->base.arg = strdup(arg);
    worker->exists = 1;

    if (!socketize(arg, &worker->base.output_socket, IPPROTO_TCP, RELAY_CONN_IS_OUTBOUND, "worker")) {
        FATAL("Failed to socketize worker");
        return NULL;
    }

    worker->disk_writer = disk_writer;

    disk_writer->base.config = config;
    disk_writer->counters = &worker->counters;
    disk_writer->recents = &worker->recents;
    disk_writer->totals = &worker->totals;

#define DECAY_1MIN  60
#define DECAY_5MIN  (5 * DECAY_1MIN)
#define DECAY_15MIN (15 * DECAY_1MIN)

    rates_init(&worker->rates[0], DECAY_1MIN);
    rates_init(&worker->rates[1], DECAY_5MIN);
    rates_init(&worker->rates[2], DECAY_15MIN);

    LOCK_INIT(&worker->lock);

    /* setup spill_path */
    int wrote = snprintf(disk_writer->spill_path, PATH_MAX, "%s/event_relay.%s",
                         config->spill_root, worker->base.output_socket.arg_clean);
    if (wrote < 0 || wrote >= PATH_MAX) {
        FATAL("Failed to construct spill_path %s", disk_writer->spill_path);
        return NULL;
    }

    /* Create the disk_writer before we create the main worker.
     * We do this because the disk_writer only consumes things
     * that have been handled by the main worker, and vice versa:
     * when the main worker fails to send, it might want to give
     * the item to the disk worker.  If we did it the other way round
     * we might have something to assign to the disk worker but no
     * disk worker to assign it to. */
    create_err = pthread_create(&disk_writer->base.tid, NULL, disk_writer_thread, disk_writer);
    if (create_err) {
        FATAL("Failed to create disk worker, pthread error: %d", create_err);
        return NULL;
    }

    /* and finally create the thread */
    create_err = pthread_create(&worker->base.tid, NULL, socket_worker_thread, worker);
    if (create_err) {
        int join_err;

        /* we died, so shut down our "pet" disk worker, and then exit with a message */
        RELAY_ATOMIC_OR(disk_writer->base.stopping, WORKER_STOPPING);

        /* have to handle failure of the shutdown too */
        join_err = pthread_join(disk_writer->base.tid, NULL);

        if (join_err) {
            FATAL("Failed to create socket worker, pthread error: %d, and also failed to join disk worker, pthread error: %d",
                  create_err, join_err);
        } else {
            FATAL("Failed to create socket worker, pthread error: %d, disk worker shut down ok", create_err);
        }
        return NULL;
    }

    /* return the worker */
    return worker;
}
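/* Usage sketch (illustrative only; the worker-pool code that actually drives
 * these calls, and the exact socketize() address syntax, are assumptions):
 *
 *     socket_worker_t *w = socket_worker_create(destination_arg, config);
 *     if (w == NULL)
 *         return;                    // creation failed; FATAL already said why
 *     ...
 *     socket_worker_destroy(w);      // safe even if shutdown paths race, see above
 */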
/* the main loop for the socket worker process */
void *socket_worker_thread(void *arg)
{
    socket_worker_t *self = (socket_worker_t *) arg;
    queue_t *main_queue = &self->queue;
    relay_socket_t *sck = NULL;
    queue_t private_queue;
    queue_t spill_queue;

    memset(&private_queue, 0, sizeof(queue_t));
    memset(&spill_queue, 0, sizeof(queue_t));

    const config_t *config = self->base.config;

    int join_err;

#define RATE_UPDATE_PERIOD 15
    time_t last_rate_update = 0;

    while (!RELAY_ATOMIC_READ(self->base.stopping)) {
        time_t now = time(NULL);

        if (!sck) {
            SAY("Opening forwarding socket");
            sck = open_output_socket_eventually(&self->base);
            if (sck == NULL || !(sck->type == SOCK_DGRAM || sck->type == SOCK_STREAM)) {
                FATAL_ERRNO("Failed to open forwarding socket");
                break;
            }
            connected_inc();
        }

        long since_rate_update = now - last_rate_update;
        if (since_rate_update >= RATE_UPDATE_PERIOD) {
            last_rate_update = now;
            update_rates(&self->rates[0], &self->totals, since_rate_update);
            update_rates(&self->rates[1], &self->totals, since_rate_update);
            update_rates(&self->rates[2], &self->totals, since_rate_update);
        }

        /* if we don't have anything in our local queue we need to hijack the main one */
        if (private_queue.head == NULL) {
            /* Hijack the queue: copy the queue state into our private copy
             * and then reset the queue state to empty.  So the formerly
             * shared queue is now private.  We only do this if necessary. */
            if (!queue_hijack(main_queue, &private_queue, &GLOBAL.pool.lock)) {
                /* nothing to do, so sleep a while and redo the loop */
                worker_wait_millisec(config->polling_interval_millisec);
                continue;
            }
        }

        RELAY_ATOMIC_INCREMENT(self->counters.received_count, private_queue.count);

        /* ok, so we should have something in our queue to process */
        if (private_queue.head == NULL) {
            WARN("Empty private queue");
            break;
        }

        ssize_t wrote = 0;
        if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) {
            if (!RELAY_ATOMIC_READ(self->base.stopping)) {
                WARN("Closing forwarding socket");
                close(sck->socket);
                sck = NULL;
                connected_dec();
            }
        }

        accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
    }

    if (control_is(RELAY_STOPPING)) {
        SAY("Socket worker stopping, trying forwarding flush");

        stats_count_t old_sent = self->totals.sent_count;
        stats_count_t old_spilled = self->totals.spilled_count;
        stats_count_t old_dropped = self->totals.dropped_count;

        if (sck) {
            ssize_t wrote = 0;
            if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) {
                WARN_ERRNO("Forwarding flush failed");
            }
            accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
            SAY("Forwarding flush forwarded %zd bytes in %llu events, spilled %llu events, dropped %llu events",
                wrote,
                (unsigned long long) (self->totals.sent_count - old_sent),
                (unsigned long long) (self->totals.spilled_count - old_spilled),
                (unsigned long long) (self->totals.dropped_count - old_dropped));
        } else {
            WARN("No forwarding socket to flush to");
        }

        SAY("Socket worker spilling any remaining events to disk");
        stats_count_t spilled = spill_all(self, &private_queue, &spill_queue);
        SAY("Socket worker spilled %llu events to disk", (unsigned long long) spilled);
    } else {
        accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
    }

    SAY("worker[%s] in its lifetime received %lu sent %lu spilled %lu dropped %lu",
        (sck ? sck->to_string : self->base.arg),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.received_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.sent_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.spilled_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.dropped_count));

    if (sck) {
        close(sck->socket);
        connected_dec();
    }

    /* we are done, so shut down our "pet" disk worker, and then exit with a message */
    RELAY_ATOMIC_OR(self->disk_writer->base.stopping, WORKER_STOPPING);

    join_err = pthread_join(self->disk_writer->base.tid, NULL);
    if (join_err)
        FATAL("shutting down disk_writer thread error: pthread error %d", join_err);

    free(self->disk_writer);

    return NULL;
}
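/* Note on queue_hijack() as used in the loop above: the snippet below is an
 * assumption about its behaviour, reconstructed from the calling code and the
 * comment there ("copy the queue state into our private copy and then reset
 * the queue state to empty"), not its actual implementation.  The point is a
 * constant-time swap under the pool lock, so enqueuing threads never wait on
 * the (potentially slow) network send:
 *
 *     LOCK(lock);
 *     if (shared->head == NULL) {
 *         UNLOCK(lock);
 *         return 0;                   // nothing to take, caller sleeps
 *     }
 *     *private_copy = *shared;        // take head, tail and count in one go
 *     memset(shared, 0, sizeof(*shared));
 *     UNLOCK(lock);
 *     return 1;
 */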