Example #1
0
relay_socket_t *open_output_socket_eventually(struct worker_base *base)
{
    const config_t *config = base->config;
    relay_socket_t *sck = NULL;
    int nap = config->sleep_after_disaster_millisec;
    int max = config->max_socket_open_wait_millisec;

    while (!RELAY_ATOMIC_READ(base->stopping) && !sck) {
        if (open_socket(&base->output_socket, DO_CONNECT, config->server_socket_sndbuf_bytes, 0)) {
            sck = &base->output_socket;
        } else {
            /* no socket - wait a while, double the wait (up to a limit), and then redo the loop */
            SAY("waiting %d millisec to retry socket %s", nap, base->output_socket.to_string);
            worker_wait_millisec(nap);
            if (nap < max) {
                nap = 2 * nap + (time(NULL) & 31);      /* "Random" fuzz of up to 0.031s. */
            }
            if (nap > max) {
                nap = max;
            }
        }
    }

    if (RELAY_ATOMIC_READ(base->stopping)) {
        WARN("Stopping, not opening sockets");
    }

    return sck;
}
Example #2
0
void mark_second_elapsed() {
    char str[MAX_BUF_LEN+1];
    stats_count_t received= RELAY_ATOMIC_READ(RECEIVED_STATS.received_count);
    stats_count_t active  = RELAY_ATOMIC_READ(RECEIVED_STATS.active_connections);

    /* set it in the process name */
    int wrote= snprintf(
        str, MAX_BUF_LEN,
        STATSfmt " ^" STATSfmt , received, active
    );

    add_worker_stats_to_ps_str(str + wrote, MAX_BUF_LEN - wrote);
    setproctitle(str);
}
Example #3
0
void
snapshot_stats(stats_basic_counters_t *counters, stats_basic_counters_t *totals) {
    stats_count_t received_count= RELAY_ATOMIC_READ(counters->received_count);  /* number of items we have received */
    stats_count_t sent_count=     RELAY_ATOMIC_READ(counters->sent_count);      /* number of items we have sent */
    stats_count_t partial_count=  RELAY_ATOMIC_READ(counters->partial_count);   /* number of items we have spilled */
    stats_count_t spilled_count=  RELAY_ATOMIC_READ(counters->spilled_count);   /* number of items we have spilled */
    stats_count_t error_count=    RELAY_ATOMIC_READ(counters->error_count);     /* number of items that had an error */
    stats_count_t disk_count=     RELAY_ATOMIC_READ(counters->disk_count);      /* number of items written to disk */
    stats_count_t disk_error_count=  RELAY_ATOMIC_READ(counters->disk_error_count);  /* number of items we failed to write to disk */
    stats_count_t send_elapsed_usec= RELAY_ATOMIC_READ(counters->send_elapsed_usec); /* time to send */

    RELAY_ATOMIC_INCREMENT(totals->received_count, received_count);     /* number of items we have received */
    RELAY_ATOMIC_INCREMENT(totals->sent_count, sent_count);             /* number of items we have sent */
    RELAY_ATOMIC_INCREMENT(totals->partial_count, partial_count);       /* number of items we have spilled */
    RELAY_ATOMIC_INCREMENT(totals->spilled_count, spilled_count);       /* number of items we have spilled */
    RELAY_ATOMIC_INCREMENT(totals->error_count, error_count);           /* number of items that had an error */
    RELAY_ATOMIC_INCREMENT(totals->disk_count, disk_count);             /* number of items written to disk */
    RELAY_ATOMIC_INCREMENT(totals->disk_error_count, disk_error_count);   /* number of items written to disk */
    RELAY_ATOMIC_INCREMENT(totals->send_elapsed_usec, send_elapsed_usec); /* time to send */
    
    RELAY_ATOMIC_DECREMENT(counters->received_count, received_count);   /* number of items we have received */
    RELAY_ATOMIC_DECREMENT(counters->sent_count, sent_count);           /* number of items we have sent */
    RELAY_ATOMIC_DECREMENT(counters->partial_count, partial_count);     /* number of items we have spilled */
    RELAY_ATOMIC_DECREMENT(counters->spilled_count, spilled_count);     /* number of items we have spilled */
    RELAY_ATOMIC_DECREMENT(counters->error_count, error_count);         /* number of items that had an error */
    RELAY_ATOMIC_DECREMENT(counters->disk_count, disk_count);           /* number of items written to disk */
    RELAY_ATOMIC_DECREMENT(counters->disk_error_count, disk_error_count);   /* number of items written to disk */
    RELAY_ATOMIC_DECREMENT(counters->send_elapsed_usec, send_elapsed_usec); /* time to send */
}
Example #4
0
/* create a disk writer worker thread
 * main loop for the disk writer worker process */
void *disk_writer_thread(void *arg) {
    disk_writer_t *self = (disk_writer_t *)arg;

    queue_t private_queue;
    queue_t *main_queue = &self->queue;
    blob_t *b;
    uint32_t done_work= 0;
    
    recreate_fallback_path(self->fallback_path);
    SAY("disk writer started using path '%s' for files", self->fallback_path);

    memset(&private_queue, 0, sizeof(private_queue));

    while( 1 ){

        q_hijack(main_queue, &private_queue, &POOL.lock);
        b= private_queue.head;

        if ( b == NULL ) {
            if (done_work) {
                SAY("cleared disk queue of %d items", done_work);
                done_work= 0;
            }

            setup_for_epoch(self, 0);
            if (RELAY_ATOMIC_READ(self->exit)) {
                /* nothing to do and we have been asked to exit, so break from the loop */
                break;
            }
            else {
                w_wait( CONFIG.polling_interval_ms );
            }
        } else {
            do {
                done_work++;
                write_blob_to_disk(self, b);
                b_destroy( q_shift_nolock( &private_queue) );
            } while ((b = private_queue.head) != NULL);

            (void)snapshot_stats( self->pcounters, self->ptotals );
        }
    }

    (void)snapshot_stats( self->pcounters, self->ptotals );

    SAY("disk_writer saved " STATSfmt " packets in its lifetime", self->ptotals->disk_count);

    return NULL;
}
Example #5
0
uint32_t is_aborted() {
    uint32_t v= RELAY_ATOMIC_READ(ABORT);
    return (v & DIE) == DIE;
}
Example #6
0
uint32_t not_aborted() {
    uint32_t v= RELAY_ATOMIC_READ(ABORT);
    return (v & DIE) == 0;
}
Example #7
0
uint32_t get_abort_val() {
    return RELAY_ATOMIC_READ(ABORT);
}
Example #8
0
/* the main loop for the socket worker process */
void *socket_worker_thread(void *arg)
{
    socket_worker_t *self = (socket_worker_t *) arg;

    queue_t *main_queue = &self->queue;
    relay_socket_t *sck = NULL;

    queue_t private_queue;
    queue_t spill_queue;

    memset(&private_queue, 0, sizeof(queue_t));
    memset(&spill_queue, 0, sizeof(queue_t));

    const config_t *config = self->base.config;

    int join_err;

#define RATE_UPDATE_PERIOD 15
    time_t last_rate_update = 0;

    while (!RELAY_ATOMIC_READ(self->base.stopping)) {
        time_t now = time(NULL);

        if (!sck) {
            SAY("Opening forwarding socket");
            sck = open_output_socket_eventually(&self->base);
            if (sck == NULL || !(sck->type == SOCK_DGRAM || sck->type == SOCK_STREAM)) {
                FATAL_ERRNO("Failed to open forwarding socket");
                break;
            }
            connected_inc();
        }

        long since_rate_update = now - last_rate_update;
        if (since_rate_update >= RATE_UPDATE_PERIOD) {
            last_rate_update = now;
            update_rates(&self->rates[0], &self->totals, since_rate_update);
            update_rates(&self->rates[1], &self->totals, since_rate_update);
            update_rates(&self->rates[2], &self->totals, since_rate_update);
        }

        /* if we dont have anything in our local queue we need to hijack the main one */
        if (private_queue.head == NULL) {
            /* hijack the queue - copy the queue state into our private copy
             * and then reset the queue state to empty. So the formerly
             * shared queue is now private. We only do this if necessary.
             */
            if (!queue_hijack(main_queue, &private_queue, &GLOBAL.pool.lock)) {
                /* nothing to do, so sleep a while and redo the loop */
                worker_wait_millisec(config->polling_interval_millisec);
                continue;
            }
        }

        RELAY_ATOMIC_INCREMENT(self->counters.received_count, private_queue.count);

        /* ok, so we should have something in our queue to process */
        if (private_queue.head == NULL) {
            WARN("Empty private queue");
            break;
        }

        ssize_t wrote = 0;
        if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) {
            if (!RELAY_ATOMIC_READ(self->base.stopping)) {
                WARN("Closing forwarding socket");
                close(sck->socket);
                sck = NULL;
                connected_dec();
            }
        }

        accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
    }

    if (control_is(RELAY_STOPPING)) {
        SAY("Socket worker stopping, trying forwarding flush");
        stats_count_t old_sent = self->totals.sent_count;
        stats_count_t old_spilled = self->totals.spilled_count;
        stats_count_t old_dropped = self->totals.dropped_count;
        if (sck) {
            ssize_t wrote = 0;
            if (!process_queue(self, sck, &private_queue, &spill_queue, &wrote)) {
                WARN_ERRNO("Forwarding flush failed");
            }
            accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
            SAY("Forwarding flush forwarded %zd bytes in %llu events, spilled %llu events, dropped %llu events ",
                wrote, (unsigned long long) (self->totals.sent_count - old_sent),
                (unsigned long long) (self->totals.spilled_count - old_spilled),
                (unsigned long long) (self->totals.dropped_count - old_dropped));
        } else {
            WARN("No forwarding socket to flush to");
        }
        SAY("Socket worker spilling any remaining events to disk");
        stats_count_t spilled = spill_all(self, &private_queue, &spill_queue);
        SAY("Socket worker spilled %llu events to disk", (unsigned long long) spilled);
    } else {
        accumulate_and_clear_stats(&self->counters, &self->recents, &self->totals);
    }

    SAY("worker[%s] in its lifetime received %lu sent %lu spilled %lu dropped %lu",
        (sck ? sck->to_string : self->base.arg),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.received_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.sent_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.spilled_count),
        (unsigned long) RELAY_ATOMIC_READ(self->totals.dropped_count));

    if (sck) {
        close(sck->socket);
        connected_dec();
    }

    /* we are done so shut down our "pet" disk worker, and then exit with a message */
    RELAY_ATOMIC_OR(self->disk_writer->base.stopping, WORKER_STOPPING);

    join_err = pthread_join(self->disk_writer->base.tid, NULL);

    if (join_err)
        FATAL("shutting down disk_writer thread error: pthread error %d", join_err);

    free(self->disk_writer);

    return NULL;
}
Example #9
0
static int process_queue(socket_worker_t * self, relay_socket_t * sck, queue_t * private_queue, queue_t * spill_queue,
                         ssize_t * wrote)
{
    if (sck == NULL) {
        WARN("NULL forwarding socket");
        return 0;
    }

    blob_t *cur_blob;
    struct timeval now;
    struct timeval send_start_time;
    struct timeval send_end_time;
    stats_count_t spilled = 0;

    const config_t *config = self->base.config;
    const uint64_t spill_microsec = 1000 * config->spill_millisec;
    const uint64_t grace_microsec = 1000 * config->spill_grace_millisec;

    const struct sockaddr *dest_addr = (const struct sockaddr *) &sck->sa.in;
    socklen_t addr_len = sck->addrlen;

    int in_grace_period = 0;
    struct timeval grace_period_start;

    int failed = 0;

    *wrote = 0;

    get_time(&send_start_time);

    cork(sck, 1);

    while (private_queue->head != NULL) {
        get_time(&now);

        /* While not all the socket backends are present, for a configured maximum time,
         * do not spill/drop. This is a bit crude, better rules/heuristics welcome. */
        if (!connected_all()) {
            if (in_grace_period == 0) {
                in_grace_period = 1;
                get_time(&grace_period_start);
                SAY("Spill/drop grace period of %d millisec started", config->spill_grace_millisec);
            }
            if (elapsed_usec(&grace_period_start, &now) >= grace_microsec) {
                in_grace_period = 0;
                SAY("Spill/drop grace period of %d millisec expired", config->spill_grace_millisec);
            }
        } else {
            if (in_grace_period) {
                SAY("Spill/drop grace period of %d millisec canceled", config->spill_grace_millisec);
            }
            in_grace_period = 0;
        }

        if (in_grace_period == 0) {
            spilled += spill_by_age(self, config->spill_enabled, private_queue, spill_queue, spill_microsec, &now);
        }

        cur_blob = private_queue->head;
        if (!cur_blob)
            break;

        void *blob_data;
        ssize_t blob_size;

        if (sck->type == SOCK_DGRAM) {
            blob_size = BLOB_BUF_SIZE(cur_blob);
            blob_data = BLOB_BUF_addr(cur_blob);
        } else {                /* sck->type == SOCK_STREAM */
            blob_size = BLOB_DATA_MBR_SIZE(cur_blob);
            blob_data = BLOB_DATA_MBR_addr(cur_blob);
        }

        ssize_t blob_left = blob_size;
        ssize_t blob_sent = 0;
        int sendto_errno = 0;

        failed = 0;

        /* Keep sending while we have data left since a single sendto()
         * doesn't necessarily send all of it.  This may eventually fail
         * if sendto() returns -1. */
        while (!RELAY_ATOMIC_READ(self->base.stopping) && blob_left > 0) {
            const void *data = (const char *) blob_data + blob_sent;
            ssize_t sent;

            sendto_errno = 0;
            if (sck->type == SOCK_DGRAM) {
                sent = sendto(sck->socket, data, blob_left, MSG_NOSIGNAL, dest_addr, addr_len);
            } else {            /* sck->type == SOCK_STREAM */
                sent = sendto(sck->socket, data, blob_left, MSG_NOSIGNAL, NULL, 0);
            }
            sendto_errno = errno;

            if (0) {            /* For debugging. */
                peek_send(sck, data, blob_left, sent);
            }

            if (sent == -1) {
                WARN_ERRNO("sendto() tried sending %zd bytes to %s but sent none", blob_left, sck->to_string);
                RELAY_ATOMIC_INCREMENT(self->counters.error_count, 1);
                if (sendto_errno == EINTR) {
                    /* sendto() got interrupted by a signal.  Wait a while and retry. */
                    WARN("Interrupted, resuming");
                    worker_wait_millisec(config->sleep_after_disaster_millisec);
                    continue;
                }
                failed = 1;
                break;          /* stop sending from the hijacked queue */
            }

            blob_sent += sent;
            blob_left -= sent;
        }

        if (blob_sent == blob_size) {
            RELAY_ATOMIC_INCREMENT(self->counters.sent_count, 1);
        } else if (blob_sent < blob_size) {
            /* Despite the send-loop above, we failed to send all the bytes. */
            WARN("sendto() tried sending %zd bytes to %s but sent only %zd", blob_size, sck->to_string, blob_sent);
            RELAY_ATOMIC_INCREMENT(self->counters.partial_count, 1);
            failed = 1;
        }

        *wrote += blob_sent;

        if (failed) {
            /* We failed to send this packet.  Exit the loop, and
             * right after the loop close the socket, and get out,
             * letting the main loop to reconnect. */
            if ((sendto_errno == EAGAIN || sendto_errno == EWOULDBLOCK)) {
                /* Traffic jam.  Wait a while, but still get out. */
                WARN("Traffic jam");
                worker_wait_millisec(config->sleep_after_disaster_millisec);
            }
            break;
        } else {
            queue_shift_nolock(private_queue);
            blob_destroy(cur_blob);
        }
    }

    cork(sck, 0);

    get_time(&send_end_time);

    if (spilled) {
        if (config->spill_enabled) {
            WARN("Wrote %lu items which were over spill threshold", (unsigned long) spilled);
        } else {
            WARN("Spill disabled: DROPPED %lu items which were over spill threshold", (unsigned long) spilled);
        }
    }

    /* this assumes end_time >= start_time */
    uint64_t usec = elapsed_usec(&send_start_time, &send_end_time);
    RELAY_ATOMIC_INCREMENT(self->counters.send_elapsed_usec, usec);

    return failed == 0;
}