/* * check_replica_cycles -- (internal) check if healthy replicas form cycles * shorter than the number of all replicas */ static int check_replica_cycles(struct pool_set *set, struct poolset_health_status *set_hs) { LOG(3, "set %p, set_hs %p", set, set_hs); unsigned first_healthy; unsigned count_healthy = 0; for (unsigned r = 0; r < set->nreplicas; ++r) { if (!replica_is_replica_healthy(r, set_hs)) { count_healthy = 0; continue; } if (count_healthy == 0) first_healthy = r; ++count_healthy; struct pool_hdr *hdrh = PART(REP(set, first_healthy), 0).hdr; struct pool_hdr *hdr = PART(REP(set, r), 0).hdr; if (uuidcmp(hdrh->uuid, hdr->next_repl_uuid) == 0 && count_healthy < set->nreplicas) { /* * healthy replicas form a cycle shorter than * the number of all replicas; for the user it * means that: */ ERR("there exist healthy replicas which come" " from a different poolset file"); return -1; } } return 0; }
/* * create_remote_replicas -- (internal) recreate all broken replicas */ static int create_remote_replicas(struct pool_set *set, struct poolset_health_status *set_hs) { for (unsigned r = 0; r < set->nreplicas; r++) { struct pool_replica *rep = set->replica[r]; if (!rep->remote) continue; if (replica_is_replica_healthy(r, set_hs)) continue; /* ignore errors from remove operation */ remove_remote(rep->remote->node_addr, rep->remote->pool_desc); unsigned nlanes = REMOTE_NLANES; int ret = util_poolset_remote_replica_open(set, r, set->poolsize, 1, &nlanes); if (ret) { LOG(1, "Creating '%s' on '%s' failed", rep->remote->pool_desc, rep->remote->node_addr); return ret; } } return 0; }
/* * open_remote_replicas -- (internal) open all unbroken remote replicas */ static int open_remote_replicas(struct pool_set *set, struct poolset_health_status *set_hs) { LOG(3, "set %p, set_hs %p", set, set_hs); for (unsigned r = 0; r < set->nreplicas; r++) { struct pool_replica *rep = set->replica[r]; if (!rep->remote) continue; if (!replica_is_replica_healthy(r, set_hs)) continue; unsigned nlanes = REMOTE_NLANES; int ret = util_poolset_remote_replica_open(set, r, set->poolsize, 0, &nlanes); if (ret) { LOG(1, "Opening '%s' on '%s' failed", rep->remote->pool_desc, rep->remote->node_addr); return ret; } } return 0; }
/*
 * replica_is_poolset_healthy -- check if every replica recorded in the
 *	helping structure is reported healthy
 *
 * Returns 1 when replica_is_replica_healthy() holds for all replicas
 * (including the case of zero replicas), 0 as soon as one is not healthy.
 */
int
replica_is_poolset_healthy(struct poolset_health_status *set_hs)
{
	unsigned r = 0;

	/* advance past the leading healthy replicas */
	while (r < set_hs->nreplicas && replica_is_replica_healthy(r, set_hs))
		++r;

	/* the scan reached the end only if no unhealthy replica stopped it */
	return r == set_hs->nreplicas;
}
/* * check_replica_sizes -- (internal) check if all replicas are large * enough to hold data from a healthy replica */ static int check_replica_sizes(struct pool_set *set, struct poolset_health_status *set_hs) { LOG(3, "set %p, set_hs %p", set, set_hs); ssize_t pool_size = -1; for (unsigned r = 0; r < set->nreplicas; ++r) { /* skip broken replicas */ if (!replica_is_replica_healthy(r, set_hs)) continue; /* get the size of a pool in the replica */ ssize_t replica_pool_size; if (REP(set, r)->remote) /* XXX: no way to get the size of a remote pool yet */ replica_pool_size = (ssize_t)set->poolsize; else replica_pool_size = replica_get_pool_size(set, r); if (replica_pool_size < 0) { LOG(1, "getting pool size from replica %u failed", r); set_hs->replica[r]->flags |= IS_BROKEN; continue; } /* check if the pool is bigger than minimum size */ enum pool_type type = pool_hdr_get_type(HDR(REP(set, r), 0)); if ((size_t)replica_pool_size < pool_get_min_size(type)) { LOG(1, "pool size from replica %u is smaller than the minimum size allowed for the pool", r); set_hs->replica[r]->flags |= IS_BROKEN; continue; } /* check if each replica is big enough to hold the pool data */ if (set->poolsize < (size_t)replica_pool_size) { ERR( "some replicas are too small to hold synchronized data"); return -1; } if (pool_size < 0) { pool_size = replica_pool_size; continue; } /* check if pools in all healthy replicas are of equal size */ if (pool_size != replica_pool_size) { ERR("pool sizes from different replicas differ"); return -1; } } return 0; }
/*
 * update_uuids -- (internal) set all uuids that might have changed or be
 *	unset after recreating parts
 *
 * For each replica: part linkage is updated only when the replica is not
 * healthy; replica linkage and poolset uuids are updated unconditionally.
 * Always returns 0.
 */
static int
update_uuids(struct pool_set *set, struct poolset_health_status *set_hs)
{
	unsigned rep_idx = 0;

	while (rep_idx < set->nreplicas) {
		int healthy = replica_is_replica_healthy(rep_idx, set_hs);

		/* only replicas that needed repair get their parts relinked */
		if (!healthy) {
			update_parts_linkage(set, rep_idx, set_hs);
		}

		update_replicas_linkage(set, rep_idx);
		update_poolset_uuids(set, rep_idx, set_hs);

		++rep_idx;
	}

	return 0;
}
/*
 * check_store_all_sizes -- (internal) store sizes from pool descriptor for
 *	all healthy replicas
 *
 * Calls replica_check_store_size() for each healthy replica.  Returns -1 on
 * the first non-zero result, 0 if every call succeeds.
 */
static int
check_store_all_sizes(struct pool_set *set,
		struct poolset_health_status *set_hs)
{
	for (unsigned r = 0; r < set->nreplicas; ++r) {
		/* unhealthy replicas are skipped by the short-circuit */
		if (replica_is_replica_healthy(r, set_hs) &&
				replica_check_store_size(set, set_hs, r) != 0)
			return -1;
	}

	return 0;
}
/*
 * replica_find_healthy_replica -- find a replica number which is a good
 *	source of data
 *
 * With exactly one replica, replica 0 is returned unless it is reported
 * broken.  Otherwise the lowest-numbered healthy replica is returned.
 * UNDEF_REPLICA is returned when no suitable replica exists.
 */
unsigned
replica_find_healthy_replica(struct poolset_health_status *set_hs)
{
	/* single replica: only the "broken" state disqualifies it */
	if (set_hs->nreplicas == 1) {
		if (replica_is_replica_broken(0, set_hs))
			return UNDEF_REPLICA;
		return 0;
	}

	/* several replicas: take the first fully healthy one */
	unsigned r = 0;
	while (r < set_hs->nreplicas) {
		if (replica_is_replica_healthy(r, set_hs))
			return r;
		++r;
	}

	return UNDEF_REPLICA;
}
/*
 * update_uuids -- (internal) set all uuids that might have changed or be
 *	unset after recreating parts
 *
 * For each replica: part linkage is updated only when the replica is not
 * healthy; replica linkage and poolset uuids are updated unconditionally.
 * Afterwards update_remote_headers() is called.  Returns -1 if that call
 * reports a non-zero result, 0 otherwise.
 */
static int
update_uuids(struct pool_set *set, struct poolset_health_status *set_hs)
{
	LOG(3, "set %p, set_hs %p", set, set_hs);

	unsigned rep_idx = 0;

	while (rep_idx < set->nreplicas) {
		int healthy = replica_is_replica_healthy(rep_idx, set_hs);

		/* only replicas that needed repair get their parts relinked */
		if (!healthy) {
			update_parts_linkage(set, rep_idx, set_hs);
		}

		update_replicas_linkage(set, rep_idx);
		update_poolset_uuids(set, rep_idx, set_hs);

		++rep_idx;
	}

	return update_remote_headers(set) ? -1 : 0;
}
/* * copy_data_to_broken_parts -- (internal) copy data to all parts created * in place of the broken ones */ static int copy_data_to_broken_parts(struct pool_set *set, unsigned healthy_replica, unsigned flags, struct poolset_health_status *set_hs) { size_t poolsize = replica_get_pool_size(set, healthy_replica); for (unsigned r = 0; r < set_hs->nreplicas; ++r) { /* skip unbroken and consistent replicas */ if (replica_is_replica_healthy(r, set_hs)) continue; struct pool_replica *rep = REP(set, r); struct pool_replica *rep_h = REP(set, healthy_replica); for (unsigned p = 0; p < rep->nparts; ++p) { /* skip unbroken parts from consistent replicas */ if (!replica_is_part_broken(r, p, set_hs) && replica_is_replica_consistent(r, set_hs)) continue; const struct pool_set_part *part = &rep->part[p]; size_t off = replica_get_part_data_offset(set, r, p); size_t len = replica_get_part_data_len(set, r, p); /* do not allow copying too much data */ if (off >= poolsize) continue; if (off + len > poolsize) len = poolsize - off; void *src_addr = ADDR_SUM(rep_h->part[0].addr, off); /* First part of replica is mapped with header */ size_t fpoff = (p == 0) ? POOL_HDR_SIZE : 0; /* copy all data */ if (!is_dry_run(flags)) { memcpy(ADDR_SUM(part->addr, fpoff), src_addr, len); pmem_msync(ADDR_SUM(part->addr, fpoff), len); } } } return 0; }
/* * copy_data_to_broken_parts -- (internal) copy data to all parts created * in place of the broken ones */ static int copy_data_to_broken_parts(struct pool_set *set, unsigned healthy_replica, unsigned flags, struct poolset_health_status *set_hs) { /* get pool size from healthy replica */ size_t poolsize = set->poolsize; for (unsigned r = 0; r < set_hs->nreplicas; ++r) { /* skip unbroken and consistent replicas */ if (replica_is_replica_healthy(r, set_hs)) continue; struct pool_replica *rep = REP(set, r); struct pool_replica *rep_h = REP(set, healthy_replica); for (unsigned p = 0; p < rep->nparts; ++p) { /* skip unbroken parts from consistent replicas */ if (!replica_is_part_broken(r, p, set_hs) && replica_is_replica_consistent(r, set_hs)) continue; const struct pool_set_part *part = &rep->part[p]; size_t off = replica_get_part_data_offset(set, r, p); size_t len = replica_get_part_data_len(set, r, p); if (rep->remote) len = poolsize - off; /* do not allow copying too much data */ if (off >= poolsize) continue; /* * First part of replica is mapped * with header */ size_t fpoff = (p == 0) ? POOL_HDR_SIZE : 0; void *dst_addr = ADDR_SUM(part->addr, fpoff); if (rep->remote) { int ret = Rpmem_persist(rep->remote->rpp, off - POOL_HDR_SIZE, len, 0); if (ret) { LOG(1, "Copying data to remote node " "failed -- '%s' on '%s'", rep->remote->pool_desc, rep->remote->node_addr); return -1; } } else if (rep_h->remote) { int ret = Rpmem_read(rep_h->remote->rpp, dst_addr, off - POOL_HDR_SIZE, len); if (ret) { LOG(1, "Reading data from remote node " "failed -- '%s' on '%s'", rep_h->remote->pool_desc, rep_h->remote->node_addr); return -1; } } else { if (off + len > poolsize) len = poolsize - off; void *src_addr = ADDR_SUM(rep_h->part[0].addr, off); /* copy all data */ memcpy(dst_addr, src_addr, len); pmem_msync(dst_addr, len); } } } return 0; }