int hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp) { struct hast_main_header hdr; struct nv *nv; struct ebuf *eb; void *hptr; eb = NULL; nv = NULL; if (proto_recv(conn, &hdr, sizeof(hdr)) == -1) goto fail; if (hdr.version > HAST_PROTO_VERSION) { errno = ERPCMISMATCH; goto fail; } hdr.size = le32toh(hdr.size); eb = ebuf_alloc(hdr.size); if (eb == NULL) goto fail; if (ebuf_add_tail(eb, NULL, hdr.size) == -1) goto fail; hptr = ebuf_data(eb, NULL); PJDLOG_ASSERT(hptr != NULL); if (proto_recv(conn, hptr, hdr.size) == -1) goto fail; nv = nv_ntoh(eb); if (nv == NULL) goto fail; *nvp = nv; return (0); fail: if (eb != NULL) ebuf_free(eb); return (-1); }
int hast_proto_recv_data(const struct hast_resource *res, struct proto_conn *conn, struct nv *nv, void *data, size_t size) { unsigned int ii; bool freedata; size_t dsize; void *dptr; int ret; PJDLOG_ASSERT(data != NULL); PJDLOG_ASSERT(size > 0); ret = -1; freedata = false; dptr = data; dsize = nv_get_uint32(nv, "size"); if (dsize > size) { errno = EINVAL; goto end; } else if (dsize == 0) { (void)nv_set_error(nv, 0); } else { if (proto_recv(conn, data, dsize) == -1) goto end; for (ii = sizeof(pipeline) / sizeof(pipeline[0]); ii > 0; ii--) { ret = pipeline[ii - 1].hps_recv(res, nv, &dptr, &dsize, &freedata); if (ret == -1) goto end; } ret = -1; if (dsize > size) { errno = EINVAL; goto end; } if (dptr != data) bcopy(dptr, data, dsize); } ret = 0; end: if (freedata) free(dptr); return (ret); }
static void tls_call_exec_client(struct proto_conn *sock, const char *srcaddr, const char *dstaddr, int timeout) { char *timeoutstr, *startfdstr, *debugstr; int startfd; /* Declare that we are receiver. */ proto_recv(sock, NULL, 0); if (pjdlog_mode_get() == PJDLOG_MODE_STD) startfd = 3; else /* if (pjdlog_mode_get() == PJDLOG_MODE_SYSLOG) */ startfd = 0; if (proto_descriptor(sock) != startfd) { /* Move socketpair descriptor to descriptor number startfd. */ if (dup2(proto_descriptor(sock), startfd) == -1) pjdlog_exit(EX_OSERR, "dup2() failed"); proto_close(sock); } else { /* * The FD_CLOEXEC is cleared by dup2(2), so when we not * call it, we have to clear it by hand in case it is set. */ if (fcntl(startfd, F_SETFD, 0) == -1) pjdlog_exit(EX_OSERR, "fcntl() failed"); } closefrom(startfd + 1); if (asprintf(&startfdstr, "%d", startfd) == -1) pjdlog_exit(EX_TEMPFAIL, "asprintf() failed"); if (timeout == -1) timeout = TLS_DEFAULT_TIMEOUT; if (asprintf(&timeoutstr, "%d", timeout) == -1) pjdlog_exit(EX_TEMPFAIL, "asprintf() failed"); if (asprintf(&debugstr, "%d", pjdlog_debug_get()) == -1) pjdlog_exit(EX_TEMPFAIL, "asprintf() failed"); execl(proto_get("execpath"), proto_get("execpath"), "proto", "tls", proto_get("user"), "client", startfdstr, srcaddr == NULL ? "" : srcaddr, dstaddr, proto_get("tls:fingerprint"), proto_get("tcp:port"), timeoutstr, debugstr, NULL); pjdlog_exit(EX_SOFTWARE, "execl() failed"); }
static int tls_recv(void *ctx, unsigned char *data, size_t size, int *fdp) { struct tls_ctx *tlsctx = ctx; PJDLOG_ASSERT(tlsctx != NULL); PJDLOG_ASSERT(tlsctx->tls_magic == TLS_CTX_MAGIC); PJDLOG_ASSERT(tlsctx->tls_side == TLS_SIDE_CLIENT || tlsctx->tls_side == TLS_SIDE_SERVER_WORK); PJDLOG_ASSERT(tlsctx->tls_sock != NULL); PJDLOG_ASSERT(tlsctx->tls_wait_called); PJDLOG_ASSERT(fdp == NULL); if (proto_recv(tlsctx->tls_sock, data, size) == -1) return (errno); return (0); }
void hastd_secondary(struct hast_resource *res, struct nv *nvin) { sigset_t mask; pthread_t td; pid_t pid; int error, mode, debuglevel; /* * Create communication channel between parent and child. */ if (proto_client(NULL, "socketpair://", &res->hr_ctrl) < 0) { KEEP_ERRNO((void)pidfile_remove(pfh)); pjdlog_exit(EX_OSERR, "Unable to create control sockets between parent and child"); } /* * Create communication channel between child and parent. */ if (proto_client(NULL, "socketpair://", &res->hr_event) < 0) { KEEP_ERRNO((void)pidfile_remove(pfh)); pjdlog_exit(EX_OSERR, "Unable to create event sockets between child and parent"); } pid = fork(); if (pid < 0) { KEEP_ERRNO((void)pidfile_remove(pfh)); pjdlog_exit(EX_OSERR, "Unable to fork"); } if (pid > 0) { /* This is parent. */ proto_close(res->hr_remotein); res->hr_remotein = NULL; proto_close(res->hr_remoteout); res->hr_remoteout = NULL; /* Declare that we are receiver. */ proto_recv(res->hr_event, NULL, 0); /* Declare that we are sender. */ proto_send(res->hr_ctrl, NULL, 0); res->hr_workerpid = pid; return; } gres = res; mode = pjdlog_mode_get(); debuglevel = pjdlog_debug_get(); /* Declare that we are sender. */ proto_send(res->hr_event, NULL, 0); /* Declare that we are receiver. */ proto_recv(res->hr_ctrl, NULL, 0); descriptors_cleanup(res); descriptors_assert(res, mode); pjdlog_init(mode); pjdlog_debug_set(debuglevel); pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); PJDLOG_VERIFY(sigemptyset(&mask) == 0); PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); /* Error in setting timeout is not critical, but why should it fail? */ if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) < 0) pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0) pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); init_local(res); init_environment(); if (drop_privs(res) != 0) exit(EX_CONFIG); pjdlog_info("Privileges successfully dropped."); /* * Create the control thread before sending any event to the parent, * as we can deadlock when parent sends control request to worker, * but worker has no control thread started yet, so parent waits. * In the meantime worker sends an event to the parent, but parent * is unable to handle the event, because it waits for control * request response. */ error = pthread_create(&td, NULL, ctrl_thread, res); PJDLOG_ASSERT(error == 0); init_remote(res, nvin); event_send(res, EVENT_CONNECT); error = pthread_create(&td, NULL, recv_thread, res); PJDLOG_ASSERT(error == 0); error = pthread_create(&td, NULL, disk_thread, res); PJDLOG_ASSERT(error == 0); (void)send_thread(res); }
static void init_remote(struct hast_resource *res, struct nv *nvin) { uint64_t resuid; struct nv *nvout; unsigned char *map; size_t mapsize; #ifdef notyet /* Setup direction. */ if (proto_send(res->hr_remoteout, NULL, 0) == -1) pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); #endif map = NULL; mapsize = 0; nvout = nv_alloc(); nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize"); nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize"); resuid = nv_get_uint64(nvin, "resuid"); res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt"); res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt"); nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt"); nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt"); mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); map = malloc(mapsize); if (map == NULL) { pjdlog_exitx(EX_TEMPFAIL, "Unable to allocate memory (%zu bytes) for activemap.", mapsize); } /* * When we work as primary and secondary is missing we will increase * localcnt in our metadata. When secondary is connected and synced * we make localcnt be equal to remotecnt, which means nodes are more * or less in sync. * Split-brain condition is when both nodes are not able to communicate * and are both configured as primary nodes. In turn, they can both * make incompatible changes to the data and we have to detect that. * Under split-brain condition we will increase our localcnt on first * write and remote node will increase its localcnt on first write. * When we connect we can see that primary's localcnt is greater than * our remotecnt (primary was modified while we weren't watching) and * our localcnt is greater than primary's remotecnt (we were modified * while primary wasn't watching). * There are many possible combinations which are all gathered below. * Don't pay too much attention to exact numbers, the more important * is to compare them. We compare secondary's local with primary's * remote and secondary's remote with primary's local. * Note that every case where primary's localcnt is smaller than * secondary's remotecnt and where secondary's localcnt is smaller than * primary's remotecnt should be impossible in practise. We will perform * full synchronization then. Those cases are marked with an asterisk. * Regular synchronization means that only extents marked as dirty are * synchronized (regular synchronization). * * SECONDARY METADATA PRIMARY METADATA * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary. * local=3 remote=3 local=2 remote=3* ?! Full sync from primary. * local=3 remote=3 local=2 remote=4* ?! Full sync from primary. * local=3 remote=3 local=3 remote=2 Primary is out-of-date, * regular sync from secondary. * local=3 remote=3 local=3 remote=3 Regular sync just in case. * local=3 remote=3 local=3 remote=4* ?! Full sync from primary. * local=3 remote=3 local=4 remote=2 Split-brain condition. * local=3 remote=3 local=4 remote=3 Secondary out-of-date, * regular sync from primary. * local=3 remote=3 local=4 remote=4* ?! Full sync from primary. */ if (res->hr_resuid == 0) { /* * Provider is used for the first time. If primary node done no * writes yet as well (we will find "virgin" argument) then * there is no need to synchronize anything. If primary node * done any writes already we have to synchronize everything. */ PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); res->hr_resuid = resuid; if (metadata_write(res) < 0) exit(EX_NOINPUT); if (nv_exists(nvin, "virgin")) { free(map); map = NULL; mapsize = 0; } else { memset(map, 0xff, mapsize); } nv_add_int8(nvout, 1, "virgin"); nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); } else if (res->hr_resuid != resuid) { char errmsg[256]; free(map); (void)snprintf(errmsg, sizeof(errmsg), "Resource unique ID mismatch (primary=%ju, secondary=%ju).", (uintmax_t)resuid, (uintmax_t)res->hr_resuid); pjdlog_error("%s", errmsg); nv_add_string(nvout, errmsg, "errmsg"); if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to send response to %s", res->hr_remoteaddr); } nv_free(nvout); exit(EX_CONFIG); } else if ( /* Is primary out-of-date? */ (res->hr_secondary_localcnt > res->hr_primary_remotecnt && res->hr_secondary_remotecnt == res->hr_primary_localcnt) || /* Are the nodes more or less in sync? */ (res->hr_secondary_localcnt == res->hr_primary_remotecnt && res->hr_secondary_remotecnt == res->hr_primary_localcnt) || /* Is secondary out-of-date? */ (res->hr_secondary_localcnt == res->hr_primary_remotecnt && res->hr_secondary_remotecnt < res->hr_primary_localcnt)) { /* * Nodes are more or less in sync or one of the nodes is * out-of-date. * It doesn't matter at this point which one, we just have to * send out local bitmap to the remote node. */ if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) != (ssize_t)mapsize) { pjdlog_exit(LOG_ERR, "Unable to read activemap"); } if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && res->hr_secondary_remotecnt == res->hr_primary_localcnt) { /* Primary is out-of-date, sync from secondary. */ nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); } else { /* * Secondary is out-of-date or counts match. * Sync from primary. */ nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); } } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && res->hr_primary_localcnt > res->hr_secondary_remotecnt) { /* * Not good, we have split-brain condition. */ free(map); pjdlog_error("Split-brain detected, exiting."); nv_add_string(nvout, "Split-brain condition!", "errmsg"); if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to send response to %s", res->hr_remoteaddr); } nv_free(nvout); /* Exit on split-brain. */ event_send(res, EVENT_SPLITBRAIN); exit(EX_CONFIG); } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt || res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ { /* * This should never happen in practise, but we will perform * full synchronization. */ PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt || res->hr_primary_localcnt < res->hr_secondary_remotecnt); mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); memset(map, 0xff, mapsize); if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) { /* In this one of five cases sync from secondary. */ nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); } else { /* For the rest four cases sync from primary. */ nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); } pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).", (uintmax_t)res->hr_primary_localcnt, (uintmax_t)res->hr_primary_remotecnt, (uintmax_t)res->hr_secondary_localcnt, (uintmax_t)res->hr_secondary_remotecnt); } nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize"); if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s", res->hr_remoteaddr); } if (map != NULL) free(map); nv_free(nvout); #ifdef notyet /* Setup direction. */ if (proto_recv(res->hr_remotein, NULL, 0) == -1) pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); #endif }
static int tls_accept(void *ctx, void **newctxp) { struct tls_ctx *tlsctx = ctx; struct tls_ctx *newtlsctx; struct proto_conn *sock, *tcp; pid_t pid; int error; PJDLOG_ASSERT(tlsctx != NULL); PJDLOG_ASSERT(tlsctx->tls_magic == TLS_CTX_MAGIC); PJDLOG_ASSERT(tlsctx->tls_side == TLS_SIDE_SERVER_LISTEN); if (proto_connect(NULL, "socketpair://", -1, &sock) == -1) return (errno); /* Accept TCP connection. */ if (proto_accept(tlsctx->tls_tcp, &tcp) == -1) { error = errno; proto_close(sock); return (error); } pid = fork(); switch (pid) { case -1: /* Failure. */ error = errno; proto_close(sock); return (error); case 0: /* Child. */ pjdlog_prefix_set("[TLS sandbox] (server) "); #ifdef HAVE_SETPROCTITLE setproctitle("[TLS sandbox] (server) "); #endif /* Close listen socket. */ proto_close(tlsctx->tls_tcp); tls_call_exec_server(sock, tcp); /* NOTREACHED */ PJDLOG_ABORT("Unreachable."); default: /* Parent. */ newtlsctx = calloc(1, sizeof(*tlsctx)); if (newtlsctx == NULL) { error = errno; proto_close(sock); proto_close(tcp); (void)kill(pid, SIGKILL); return (error); } proto_local_address(tcp, newtlsctx->tls_laddr, sizeof(newtlsctx->tls_laddr)); PJDLOG_ASSERT(strncmp(newtlsctx->tls_laddr, "tcp://", 6) == 0); bcopy("tls://", newtlsctx->tls_laddr, 6); *strrchr(newtlsctx->tls_laddr, ':') = '\0'; proto_remote_address(tcp, newtlsctx->tls_raddr, sizeof(newtlsctx->tls_raddr)); PJDLOG_ASSERT(strncmp(newtlsctx->tls_raddr, "tcp://", 6) == 0); bcopy("tls://", newtlsctx->tls_raddr, 6); *strrchr(newtlsctx->tls_raddr, ':') = '\0'; proto_close(tcp); proto_recv(sock, NULL, 0); newtlsctx->tls_sock = sock; newtlsctx->tls_tcp = NULL; newtlsctx->tls_wait_called = true; newtlsctx->tls_side = TLS_SIDE_SERVER_WORK; newtlsctx->tls_magic = TLS_CTX_MAGIC; *newctxp = newtlsctx; return (0); } }
static uint8_t recv_u8(void) { return proto_recv(); }
static int sender_connect(void) { unsigned char rnd[32], hash[32], resp[32]; struct proto_conn *conn; char welcome[8]; int16_t val; val = 1; if (proto_send(adhost->adh_conn, &val, sizeof(val)) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to send connection request to parent"); } if (proto_recv(adhost->adh_conn, &val, sizeof(val)) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to receive reply to connection request from parent"); } if (val != 0) { errno = val; pjdlog_errno(LOG_WARNING, "Unable to connect to %s", adhost->adh_remoteaddr); return (-1); } if (proto_connection_recv(adhost->adh_conn, true, &conn) < 0) { pjdlog_exit(EX_TEMPFAIL, "Unable to receive connection from parent"); } if (proto_connect_wait(conn, adcfg->adc_timeout) < 0) { pjdlog_errno(LOG_WARNING, "Unable to connect to %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Connected to %s.", adhost->adh_remoteaddr); /* Error in setting timeout is not critical, but why should it fail? */ if (proto_timeout(conn, adcfg->adc_timeout) < 0) pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); else pjdlog_debug(1, "Timeout set to %d.", adcfg->adc_timeout); /* Exchange welcome message, which includes version number. */ (void)snprintf(welcome, sizeof(welcome), "ADIST%02d", ADIST_VERSION); if (proto_send(conn, welcome, sizeof(welcome)) < 0) { pjdlog_errno(LOG_WARNING, "Unable to send welcome message to %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Welcome message sent (%s).", welcome); bzero(welcome, sizeof(welcome)); if (proto_recv(conn, welcome, sizeof(welcome)) < 0) { pjdlog_errno(LOG_WARNING, "Unable to receive welcome message from %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } if (strncmp(welcome, "ADIST", 5) != 0 || !isdigit(welcome[5]) || !isdigit(welcome[6]) || welcome[7] != '\0') { pjdlog_warning("Invalid welcome message from %s.", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Welcome message received (%s).", welcome); /* * Receiver can only reply with version number lower or equal to * the one we sent. */ adhost->adh_version = atoi(welcome + 5); if (adhost->adh_version > ADIST_VERSION) { pjdlog_warning("Invalid version number from %s (%d received, up to %d supported).", adhost->adh_remoteaddr, adhost->adh_version, ADIST_VERSION); proto_close(conn); return (-1); } pjdlog_debug(1, "Version %d negotiated with %s.", adhost->adh_version, adhost->adh_remoteaddr); if (proto_send(conn, adcfg->adc_name, sizeof(adcfg->adc_name)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to send name to %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Name (%s) sent.", adcfg->adc_name); if (proto_recv(conn, rnd, sizeof(rnd)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to receive challenge from %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Challenge received."); if (HMAC(EVP_sha256(), adhost->adh_password, (int)strlen(adhost->adh_password), rnd, (int)sizeof(rnd), hash, NULL) == NULL) { pjdlog_warning("Unable to generate response."); proto_close(conn); return (-1); } pjdlog_debug(1, "Response generated."); if (proto_send(conn, hash, sizeof(hash)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to send response to %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Response sent."); if (adist_random(rnd, sizeof(rnd)) == -1) { pjdlog_warning("Unable to generate challenge."); proto_close(conn); return (-1); } pjdlog_debug(1, "Challenge generated."); if (proto_send(conn, rnd, sizeof(rnd)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to send challenge to %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Challenge sent."); if (proto_recv(conn, resp, sizeof(resp)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to receive response from %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Response received."); if (HMAC(EVP_sha256(), adhost->adh_password, (int)strlen(adhost->adh_password), rnd, (int)sizeof(rnd), hash, NULL) == NULL) { pjdlog_warning("Unable to generate hash."); proto_close(conn); return (-1); } pjdlog_debug(1, "Hash generated."); if (memcmp(resp, hash, sizeof(hash)) != 0) { pjdlog_warning("Invalid response from %s (wrong password?).", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_info("Receiver authenticated."); if (proto_recv(conn, &adhost->adh_trail_offset, sizeof(adhost->adh_trail_offset)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to receive size of the most recent trail file from %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } adhost->adh_trail_offset = le64toh(adhost->adh_trail_offset); if (proto_recv(conn, &adhost->adh_trail_name, sizeof(adhost->adh_trail_name)) == -1) { pjdlog_errno(LOG_WARNING, "Unable to receive name of the most recent trail file from %s", adhost->adh_remoteaddr); proto_close(conn); return (-1); } pjdlog_debug(1, "Trail name (%s) and offset (%ju) received.", adhost->adh_trail_name, (uintmax_t)adhost->adh_trail_offset); rw_wlock(&adist_remote_lock); mtx_lock(&adist_remote_mtx); PJDLOG_ASSERT(adhost->adh_remote == NULL); PJDLOG_ASSERT(conn != NULL); adhost->adh_remote = conn; mtx_unlock(&adist_remote_mtx); rw_unlock(&adist_remote_lock); cv_signal(&adist_remote_cond); return (0); }