int run_watchdog (int pids[], int num_pids, char* port, char* email_address, char* mail_server) { signal (SIGPIPE, SIG_IGN); pid_t pid = fork(); if (pid < 0) { PERR ("fork"); exit (EXIT_FAILURE); } if (pid == 0) { umask(0); openlog ("SPROCKETS_WATCHDOG", LOG_CONS, LOG_DAEMON); pid_t sid = setsid(); if (sid < 0) { PMSG ("Sprockets watchdog failed to setsid. Exiting now.\n"); exit (EXIT_FAILURE); } if ((chdir("/")) < 0) { PMSG ("Sprockets watchdog failed to chdir. Exiting now.\n"); exit (EXIT_FAILURE); } int sigs[] = { SIGINT, SIGQUIT, SIGHUP, SIGCHLD }; int sfd = setup_sighandlers (sigs, 3); if (sfd == 0) { PMSG ("Sprockets watchdog failed to set sighandlers. Exiting now.\n"); exit (EXIT_FAILURE); } int epfd = init_epoll (sfd); if (epfd == 0) { PMSG ("Sprockets watchdog failed to initialize epoll. Exiting now.\n"); exit (EXIT_FAILURE); } int server_fd = NULL; if (port != NULL) { server_fd = sprocket_tcp_server (port, NULL); if (server_fd == 0) { PMSG ("Couldn't start watchdog server\n"); exit (EXIT_FAILURE); } if (add_fd_to_epoll (epfd, server_fd) == 0) { PERR ("Couldn't add watchdog server to epoll\n"); exit (EXIT_FAILURE); } } close(STDIN_FILENO); close(STDOUT_FILENO); close(STDERR_FILENO); syslog (LOG_NOTICE, "Sprockets watchdog successfully started\n"); int paused_for_signal = 0; char* stats = NULL; auto_string* stat_buffer = NULL; pthread_mutex_t stats_mutex; pthread_mutex_init(&stats_mutex, NULL); auto_array* thread_array = auto_array_create (10); if (thread_array == NULL) { syslog (LOG_CRIT, "Unable to create thread array - out of memory"); exit (EXIT_FAILURE); } while (1) { struct epoll_event events[MAX_EPOLL_EVENTS]; int num_fd = epoll_wait(epfd, events, MAX_EPOLL_EVENTS, 500); if (num_fd == -1) { if (errno == EINTR) { syslog (LOG_NOTICE, "epoll_wait interrupted. Continuing\n"); continue; } syslog (LOG_CRIT, "epoll_wait error: %s\n", strerror (errno)); exit (EXIT_FAILURE); } if (num_fd != 0) { // no fds timeout occurred for (int i = 0; i < num_fd; ++i) { if (events[i].data.fd == sfd) { // caught signal struct signalfd_siginfo fdsi; int s = read (sfd, &fdsi, sizeof fdsi); if (s != sizeof fdsi) { syslog (LOG_CRIT, "Read signal error: %s\n", strerror (errno)); continue; } switch (fdsi.ssi_signo) { case SIGINT: syslog (LOG_NOTICE, "Caught SIGINT - pausing\n"); paused_for_signal = 1; break; case SIGQUIT: syslog (LOG_NOTICE, "Caught SIGQUIT - exiting\n"); for (int ii = 0; ii < num_pids; ++ii) { if (pids[ii] != 0) { kill (pids[ii], SIGTERM); } } for (int ii = 0; ii < thread_array->count; ++ii) { watchdog_thread* wt = auto_array_get (thread_array, ii); char ex[2] = "EX"; write (wt->pipe_write, ex, 2); auto_array_delete (thread_array, free); } exit (EXIT_SUCCESS); case SIGHUP: syslog (LOG_NOTICE, "Caught SIGHUP\n"); paused_for_signal = 0; break; case SIGCHLD: syslog (LOG_NOTICE, "Caught SIGCHLD\n"); break; default: syslog (LOG_NOTICE, "Caught unknown signal\n"); break; } } else if (events[i].data.fd == server_fd) { struct sockaddr_in addr; socklen_t addr_sz = 0; int client_fd = accept (server_fd, (struct sockaddr*) &addr, &addr_sz); watchdog_thread_args* thread_args = malloc (sizeof thread_args); if (thread_args == NULL) { syslog (LOG_CRIT, "Malloc returned NULL: %s", strerror (errno)); exit (EXIT_FAILURE); } watchdog_thread* dog_thread = malloc (sizeof dog_thread); if (dog_thread == NULL) { syslog (LOG_CRIT, "Malloc returned NULL: %s", strerror (errno)); exit (EXIT_FAILURE); } int pipefd[2]; if (pipe (pipefd) < 0) { syslog (LOG_CRIT, "watchdog pipe error: %s", strerror (errno)); continue; } dog_thread->pipe_write = pipefd[1]; thread_args->pipe_read = pipefd[0]; thread_args->client_fd = client_fd; thread_args->stats = &stats; thread_args->stats_mutex = &stats_mutex; pthread_attr_t attr; int trv = 0; if ((trv = pthread_attr_init (&attr)) != 0) { syslog (LOG_CRIT, "pthread_attr_init: %s\n", strerror (trv)); continue; } if ((trv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) != 0) { syslog (LOG_CRIT, "pthread_attr_setdetachstate: %s\n", strerror (trv)); continue; } if ((trv = pthread_create(&dog_thread->th_id, &attr, client_thread, thread_args)) != 0) { syslog (LOG_CRIT, "pthread_create: %s\n", strerror (trv)); continue; } auto_array_add (thread_array, dog_thread); pthread_attr_destroy(&attr); } } } if (paused_for_signal) { syslog (LOG_NOTICE, "Paused for signal\n"); } else { auto_array* stat_files = auto_array_create (num_pids); for (int i = 0; i < num_pids; ++i) { if (pids[i] == 0) { continue; } char pid[32]; sprintf (pid, "%d", pids[i]); FILE* p = get_stat_filep (pid); if (p == NULL) { syslog (LOG_CRIT, "Process with pid %s has halted", pid); if (email_address != NULL && mail_server != NULL) { char msg[2048]; snprintf (msg, 2048, email_alert_msg_format, email_address, email_address, pid); send_email (mail_server, email_address, email_address, msg); pids[i] = 0; } continue; } auto_array_add (stat_files, p); } pthread_mutex_lock(&stats_mutex); if (stat_buffer != NULL) { auto_string_delete (stat_buffer); } stat_buffer = auto_string_create (1024); if (stat_buffer == NULL) { syslog (LOG_CRIT, "Unable to create stat buffer - out of memory"); exit (EXIT_FAILURE); } for (int i = 0; i < stat_files->count; ++i) { FILE* f = auto_array_get (stat_files, i); char* tmp = get_proc_string (f); if (tmp != NULL) { auto_string_append (stat_buffer, tmp); free (tmp); } } stats = stat_buffer->buf; pthread_mutex_unlock (&stats_mutex); for (int i = 0; i < thread_array->count; ++i) { watchdog_thread* wt = auto_array_get (thread_array, i); char ok[2] = "OK"; if (write (wt->pipe_write, ok, 2) <= 0) { syslog (LOG_CRIT, "watchdog write client thread: %s", strerror (errno)); auto_array_remove (thread_array, i); close (wt->pipe_write); free (wt); } } auto_array_delete (stat_files, close_file); } } } return pid; }
static void event_loop(int sockfd, int hidfd) { int efd, i, s; struct epoll_event events[MAXEVENTS]; struct timeval last_recv, last_heartbeat; int clientfds[MAXCLIENTS]; int client_count = 0; efd = epoll_create1(0); if (efd == -1) { perror("epoll_create"); goto out; } if (add_fd_to_epoll(efd, hidfd) < 0 || add_fd_to_epoll(efd, sockfd) < 0) { goto out; } memset(events, 0, sizeof(events)); memset(&last_recv, 0, sizeof(last_recv)); memset(&last_heartbeat, 0, sizeof(last_heartbeat)); /* The event loop */ while (1) { int n, item; n = epoll_wait (efd, events, MAXEVENTS, 2000); if (n < 0) { if (errno != EINTR) { perror("epoll_wait"); } goto out; } if (client_count > 0) { struct timeval now; gettimeofday(&now, NULL); if (now.tv_sec - last_recv.tv_sec > RESET_TIMEOUT) { send_hmr_reset(hidfd); last_recv = now; } if (now.tv_sec - last_heartbeat.tv_sec > HEARTBEAT_INTERVAL) { send_hmr_heartbeat(hidfd); last_heartbeat = now; } } for (item = 0; item < n; item++) { struct epoll_event *ev = &events[item]; if (ev->events & (EPOLLERR | EPOLLHUP)) { fprintf (stderr, "epoll error\n"); if (ev->data.fd == sockfd || ev->data.fd == hidfd) { goto out; } else { remove_client(ev->data.fd, clientfds, &client_count); continue; } } else if (sockfd == ev->data.fd) { /* We have a notification on the listening socket, which * means one or more incoming connections. */ while (1) { struct sockaddr in_addr; socklen_t in_len; int infd; char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; in_len = sizeof(in_addr); infd = accept(sockfd, &in_addr, &in_len); if (infd == -1) { if (errno == EAGAIN || errno == EWOULDBLOCK) { /* We have processed all incoming * connections. */ break; } else { perror("accept"); break; } } s = getnameinfo(&in_addr, in_len, hbuf, sizeof(hbuf), sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV); if (s == 0) { printf("Accepted connection on descriptor %d " "(host=%s, port=%s)\n", infd, hbuf, sbuf); } /* Make the incoming socket non-blocking and add it to the * list of fds to monitor. */ s = make_fd_non_blocking(infd); if (s == -1) { close(infd); continue; } if (add_fd_to_epoll(efd, infd) < 0) { close(infd); continue; } if (client_count < MAXCLIENTS) { if (client_count == 0) { send_hmr_reset(hidfd); send_hmr_heartbeat(hidfd); gettimeofday(&last_recv, NULL); last_heartbeat = last_recv; } clientfds[client_count++] = infd; } } } else if (hidfd == ev->data.fd) { char buf[8]; ssize_t count = read(hidfd, buf, sizeof(buf)); gettimeofday(&last_recv, NULL); if (count == 8) { int cl, length = buf[0]; if (length < 8) { for (cl = 0; cl < client_count; ) { if (write_exact(clientfds[cl], buf + 1, length) < 0) { remove_client(clientfds[cl], clientfds, &client_count); /* keep cl, as remove_client shifted the array */ } else { cl++; } } } } } else { /* We have data on the fd waiting to be read. Read and * discard it. We must read whatever data is available * completely, as we are running in edge-triggered mode * and won't get a notification again for the same * data. */ int done = 0; while (1) { ssize_t count; char buf[512]; count = read (ev->data.fd, buf, sizeof buf); if (count == -1) { /* If errno == EAGAIN, that means we have read all * data. So go back to the main loop. */ if (errno != EAGAIN && errno != EWOULDBLOCK) { perror ("read"); done = 1; } break; } else if (count == 0) { /* End of file. The remote has closed the connection */ done = 1; break; } } if (done) { printf ("Closed connection on descriptor %d\n", ev->data.fd); /* Closing the descriptor will make epoll remove it * from the set of descriptors which are monitored. */ remove_client(ev->data.fd, clientfds, &client_count); } } } } out: for (i = 0; i < client_count; i++) { close(clientfds[i]); } if (efd >= 0) { close(efd); } }
static void worker_thread_handle_msgqueue_event( struct worker_thread* state, uint32_t event_flags ) { if (event_flags & EPOLLERR) { D_FMTSTRING("Error on message queue!"); return; } for (;;) { /* * Drain all the bytes from the queue descriptor. */ struct message msg; ssize_t bytes = HANDLE_EINTR_ON_SYSCALL( mq_receive(state->wk_messagequeue.mq_queuefds, (char*) &msg, sizeof(msg), NULL /* don't care about message priority */)); if (bytes == -1) { if (errno == EAGAIN) { return; } D_FUNCFAIL_ERRNO(mq_receive); return; } BUGSTOP_IF((msg.msg_code != kITTMessageAddClient), "Unknown message code"); /* * The client object gets ownership of the socket descriptor. * Should anything go wrong in the creation process it will * close the socket descriptor. */ struct client* clnt = client_create(msg.msg_data.msg_fd, state->wk_allocator); if (!clnt) { /* * Client creation failed. Try to get next message, if any. * Close client socket. */ HANDLE_EINTR_ON_SYSCALL(close(msg.msg_data.msg_fd)); continue; } if (add_fd_to_epoll(state->wk_epoll_fds, clnt->cl_sockfd, EPOLLIN | EPOLLRDHUP, kDataTypePTR, clnt) == -1) { /* * Failed to add client socket to epoll so release all resources and get * the next message. */ client_destroy(clnt, state->wk_allocator); continue; } /* * Add client to list. */ dlist_push_tail(state->wk_clients, clnt); } }
static int server_init( struct server* p_srv ) { assert(p_srv); memset(p_srv, 0, sizeof(*p_srv)); p_srv->sv_allocator = allocator_handle; p_srv->sv_quitflag = 0; p_srv->sv_acceptfd = create_server_socket(); if (-1 == p_srv->sv_acceptfd) { return -1; } /* * Level 1 - accept socket created. */ ++p_srv->sv_rollback; p_srv->sv_epollfd = epoll_create(kMaxEpollCompletionEntries); if (-1 == p_srv->sv_epollfd) { return -1; } /* * Level 2 - epoll descriptor allocated. */ ++p_srv->sv_rollback; /* * Block SIGINT and create a signal descriptor to receive it via epoll. */ sigset_t sig_mask; sigemptyset(&sig_mask); if (-1 == sigaddset(&sig_mask, SIGINT)) { return -1; } if (-1 == sigprocmask(SIG_BLOCK, &sig_mask, NULL)) { return -1; } p_srv->sv_sigfds = signalfd(-1, &sig_mask, SFD_NONBLOCK); if (-1 == p_srv->sv_sigfds) { return -1; } /* * Level 3 - signal descriptor for SIGINT allocated. */ ++p_srv->sv_rollback; /* * Add termination signal and accept socket to epoll interface. */ if (-1 == add_fd_to_epoll(p_srv->sv_epollfd, p_srv->sv_sigfds, EPOLLIN | EPOLLET, kDataTypeFD, p_srv->sv_sigfds)) { return -1; } if (-1 == add_fd_to_epoll(p_srv->sv_epollfd, p_srv->sv_acceptfd, EPOLLIN | EPOLLET | EPOLLRDHUP, kDataTypeFD, p_srv->sv_acceptfd)) { return -1; } p_srv->sv_threadrdy_eventfds = eventfd(0, 0); if (p_srv->sv_threadrdy_eventfds == -1) { D_FUNCFAIL_ERRNO(eventfd); return -1; } /* * Level 4 - thread notification event created. */ ++p_srv->sv_rollback; /* * Get number of available processors. The number of spawned threads is * nr_processors * thread_to_proc_ratio. */ long nr_procs = sysconf(_SC_NPROCESSORS_ONLN); if (nr_procs == -1) { D_FUNCFAIL_ERRNO(sysconf); return -1; } D_FMTSTRING("Online processors %d, will spawn %d threads.", nr_procs, nr_procs); p_srv->sv_workers = p_srv->sv_allocator->al_mem_alloc( p_srv->sv_allocator, (sizeof(struct worker_thread*) * nr_procs)); if (!p_srv->sv_workers) { D_FMTSTRING("Out of memory!"); return -1; } /* * Level 5 - memory for worker thread data allocated. */ ++p_srv->sv_rollback; memset(p_srv->sv_workers, 0, sizeof(struct worker_thread*) * nr_procs); /* * Initialize data and start worker threads. */ for (long l = 0; l < nr_procs; ++l) { char thread_msgqueue[NAME_MAX]; snprintf(thread_msgqueue, sizeof(thread_msgqueue) - 1, "/__msgqueue_thread_%d__", (int) l); struct worker_thread* current = worker_thread_create(thread_msgqueue, p_srv->sv_allocator); if (current) { if (worker_thread_start(p_srv, current, NULL) == 0) { /* * Thread successfully initialized, add it to list. */ p_srv->sv_workers[p_srv->sv_worker_count++] = current; } else { /* * Cleanup thread data since pthread_create() failed. */ worker_thread_destroy(current); } } } if (!p_srv->sv_worker_count) { D_FMTSTRING("Fatal : failed to initialize at least one worker thread!"); return -1; } D_FMTSTRING("Started a total of %d worker threads", p_srv->sv_worker_count); /* * Server is up and running. */ return 0; }
/* * @@ Not implemented. @@ */ static void* worker_thread_proc( void* args ) { struct worker_thread* state = (struct worker_thread*) args; BUGSTOP_IF((!state), "Invalid thread state specified!"); D_FMTSTRING("Client thread (%u) starting\n", syscall(SYS_gettid)); /* * Add the message queue and the termination event to epoll. */ int result = 0; if (add_fd_to_epoll(state->wk_epoll_fds, state->wk_termsig.so_sigfds, EPOLLIN | EPOLLET, kDataTypePTR, (void*) &state->wk_termsig) == 0) { ++result; } if (add_fd_to_epoll(state->wk_epoll_fds, state->wk_messagequeue.mq_queuefds, EPOLLIN | EPOLLET, kDataTypePTR, (void*) &state->wk_messagequeue) == 0) { ++result; } /* * Notify waiter with our initialize status. */ uint64_t init_status = (result == 2 ? kThreadInitOk : kThreadInitFail); HANDLE_EINTR_ON_SYSCALL(write(state->wk_readyevent, &init_status, sizeof(init_status))); if (result != 2) { /* * Failed to init so return. */ return NULL; } /* * Loop forever waiting for events. */ for (; !state->wk_quitflag;) { struct epoll_event rec_events[kMaxEpollCompletionEntries]; int ev_count = HANDLE_EINTR_ON_SYSCALL(epoll_wait(state->wk_epoll_fds, rec_events, kMaxEpollCompletionEntries, -1 /* don;t timeout */)); if (ev_count == -1) { D_FUNCFAIL_ERRNO(epoll_wait); break; } for (int i = 0; i < ev_count && !state->wk_quitflag; ++i) { worker_thread_handle_event(state, rec_events + i); } } return NULL; }