/*
 * Validate a join request while the cluster is already running.
 *
 * A brand-new node (jm->nr_nodes == 0) carries no epoch history, so there
 * is nothing to cross-check against the running cluster; only nodes with
 * history are passed through cluster_sanity_check().  On acceptance the
 * epoch is ordered to be incremented.
 *
 * Returns CJ_RES_SUCCESS on acceptance, otherwise the failure code from
 * cluster_sanity_check().  jm->inc_epoch is set only on success.
 */
static int cluster_running_check(struct join_message *jm)
{
	int result = CJ_RES_SUCCESS;

	/* Only nodes that bring epoch history need validation. */
	if (jm->nr_nodes != 0)
		result = cluster_sanity_check(jm);

	if (result == CJ_RES_SUCCESS)
		jm->inc_epoch = 1;

	return result;
}
/*
 * Compute the cluster status a joining node should observe, and whether
 * its join must bump the epoch.
 *
 * @from:       the node asking to join (used only for error logging and
 *              the WAIT_FOR_JOIN membership scan)
 * @entries:    epoch-log entries reported by the joiner
 * @nr_entries: number of entries in @entries
 * @ctime:      cluster creation time reported by the joiner
 * @epoch:      epoch reported by the joiner
 * @status:     out: status to report back (defaults to the local status)
 * @inc_epoch:  optional out: set to 1 when the join requires an epoch bump
 *
 * Returns SD_RES_SUCCESS or an SD_RES_* error code; errors are logged with
 * the joiner's address.
 */
static int get_cluster_status(struct sd_node *from, struct sd_node *entries,
			      int nr_entries, uint64_t ctime, uint32_t epoch,
			      uint32_t *status, uint8_t *inc_epoch)
{
	int i, j, ret = SD_RES_SUCCESS;
	int nr, nr_local_entries, nr_leave_entries;
	struct sd_node local_entries[SD_MAX_NODES];
	char str[256];
	uint32_t sys_stat = sys_stat_get();

	/* Default answer: report our own status, no epoch bump. */
	*status = sys_stat;
	if (inc_epoch)
		*inc_epoch = 0;

	ret = cluster_sanity_check(entries, nr_entries, ctime, epoch);
	if (ret)
		goto out;

	switch (sys_stat) {
	case SD_STATUS_HALT:
	case SD_STATUS_OK:
		/* Cluster is live: any accepted join advances the epoch. */
		if (inc_epoch)
			*inc_epoch = 1;
		break;
	case SD_STATUS_WAIT_FOR_FORMAT:
		/* An unformatted cluster must only see history-free nodes. */
		if (nr_entries != 0)
			ret = SD_RES_NOT_FORMATTED;
		break;
	case SD_STATUS_WAIT_FOR_JOIN:
		/* Member count including the joiner. */
		nr = sys->nr_nodes + 1;
		nr_local_entries = epoch_log_read_nr(epoch, (char *)local_entries,
						     sizeof(local_entries));
		if (nr != nr_local_entries) {
			nr_leave_entries = get_nodes_nr_from(&sys->leave_list);
			/*
			 * NOTE(review): this compares the *logged* count against
			 * current members plus explicitly-left nodes — i.e. every
			 * missing node is accounted for by the leave list.
			 */
			if (nr_local_entries == nr + nr_leave_entries) {
				/* Even though some nodes have left, we can make do without them.
				 * Order cluster to do recovery right now. */
				if (inc_epoch)
					*inc_epoch = 1;
				*status = SD_STATUS_OK;
			}
			break;
		}

		/*
		 * Counts match: verify every node from the epoch log is either
		 * the joiner or already in the in-memory member list.
		 */
		for (i = 0; i < nr_local_entries; i++) {
			if (node_eq(local_entries + i, from))
				goto next;
			for (j = 0; j < sys->nr_nodes; j++) {
				if (node_eq(local_entries + i, sys->nodes + j))
					goto next;
			}
			/*
			 * NOTE(review): a logged node that is neither the joiner
			 * nor a current member only breaks out of the scan early;
			 * *status is still set to SD_STATUS_OK below.  Confirm the
			 * cluster is really meant to go live in that case.
			 */
			break;
		next:
			;
		}

		*status = SD_STATUS_OK;
		break;
	case SD_STATUS_SHUTDOWN:
		ret = SD_RES_SHUTDOWN;
		break;
	default:
		break;
	}
out:
	if (ret)
		eprintf("%x, %s\n", ret,
			addr_to_str(str, sizeof(str), from->addr, from->port));
	return ret;
}
/*
 * Decide how to treat a join request while the cluster is waiting for its
 * previous members to reassemble.
 *
 * @joined: the joining node (currently unused in the body — kept for the
 *          join-check callback signature)
 * @jm:     the join message; on acceptance jm->cluster_status and possibly
 *          jm->inc_epoch are updated for broadcast
 *
 * Returns CJ_RES_SUCCESS / CJ_RES_FAIL / CJ_RES_JOIN_LATER, or
 * CJ_RES_MASTER_TRANSFER when the joiner's epoch proves it is more
 * up to date than us.
 */
static int cluster_wait_for_join_check(struct sd_node *joined,
				       struct join_message *jm)
{
	struct sd_node local_entries[SD_MAX_NODES];
	int nr, nr_local_entries, nr_failed_entries, nr_delayed_nodes;
	uint32_t local_epoch = get_latest_epoch();
	int ret;

	/* A node with no epoch history must wait until the cluster is live. */
	if (jm->nr_nodes == 0)
		return CJ_RES_JOIN_LATER;

	ret = cluster_sanity_check(jm);
	if (ret != CJ_RES_SUCCESS) {
		/*
		 * The joiner saw a newer epoch than we did, so it is more
		 * current: hand mastership over instead of rejecting it.
		 */
		if (jm->epoch > sys->epoch) {
			eprintf("transfer mastership (%d, %d)\n", jm->epoch,
				sys->epoch);
			return CJ_RES_MASTER_TRANSFER;
		}
		return ret;
	}

	/*
	 * NOTE(review): the sibling call in get_cluster_status() casts the
	 * buffer to (char *); confirm epoch_log_read() takes the same buffer
	 * type here.  Also note the log is read for jm->epoch *before* the
	 * jm->epoch < local_epoch check below — verify reading a stale epoch
	 * is harmless.
	 */
	nr_local_entries = epoch_log_read(jm->epoch, local_entries,
					  sizeof(local_entries));
	if (nr_local_entries == -1)
		return CJ_RES_FAIL;

	if (jm->epoch < local_epoch) {
		eprintf("joining node epoch too small: %" PRIu32 " vs %" PRIu32 "\n",
			jm->epoch, local_epoch);
		return CJ_RES_JOIN_LATER;
	}

	/* The joiner's view of the last epoch must match ours exactly. */
	if (jm->nr_nodes != nr_local_entries) {
		eprintf("epoch log entries do not match: %d vs %d\n",
			jm->nr_nodes, nr_local_entries);
		return CJ_RES_FAIL;
	}

	if (memcmp(jm->nodes, local_entries,
		   sizeof(jm->nodes[0]) * jm->nr_nodes) != 0) {
		eprintf("epoch log entries does not match\n");
		return CJ_RES_FAIL;
	}

	/* Current member count including the joiner itself. */
	if (!current_vnode_info)
		nr = 1;
	else
		nr = current_vnode_info->nr_nodes + 1;
	nr_delayed_nodes = get_nodes_nr_from(&sys->delayed_nodes);

	/*
	 * If we have all members from the last epoch log in the in-memory
	 * node list, and no new nodes joining we can set the cluster live
	 * now without incrementing the epoch.
	 */
	if (nr == nr_local_entries && !nr_delayed_nodes) {
		jm->cluster_status = SD_STATUS_OK;
		return CJ_RES_SUCCESS;
	}

	/*
	 * If we reach the old node count, but some node failed we have to
	 * update the epoch before setting the cluster live.
	 *
	 * NOTE(review): delayed nodes are subtracted here because they are
	 * not yet members but will rejoin — confirm this accounting against
	 * the maintainers of failed_nodes/delayed_nodes.
	 */
	nr_failed_entries = get_nodes_nr_from(&sys->failed_nodes);
	if (nr_local_entries == nr + nr_failed_entries - nr_delayed_nodes) {
		jm->inc_epoch = 1;
		jm->cluster_status = SD_STATUS_OK;
		return CJ_RES_SUCCESS;
	}

	/*
	 * The join was successful, but we don't have enough nodes yet to set
	 * the cluster live.
	 */
	return CJ_RES_SUCCESS;
}