/*
 * replica_check_poolset_health -- check if a given poolset can be considered
 * healthy, and store the status in a helping structure
 */
int
replica_check_poolset_health(struct pool_set *set,
		struct poolset_health_status **set_hsp, unsigned flags)
{
	if (replica_create_poolset_health_status(set, set_hsp)) {
		LOG(1, "Creating poolset health status failed");
		return -1;
	}

	struct poolset_health_status *set_hs = *set_hsp;

	/* check if part files exist; if not, create them, then open them */
	check_and_open_poolset_part_files(set, set_hs, flags);

	/* map all headers */
	map_all_unbroken_headers(set, set_hs);

	/* check if checksums are correct for parts in all replicas */
	check_checksums(set, set_hs);

	/* check if uuids in parts across each replica are consistent */
	if (check_replicas_consistency(set, set_hs)) {
		LOG(1, "Replica consistency check failed");
		goto err;
	}

	/* check poolset_uuid values between replicas */
	if (check_poolset_uuids(set, set_hs)) {
		LOG(1, "Poolset uuids check failed");
		goto err;
	}

	/* check if uuids for adjacent replicas are consistent */
	if (check_uuids_between_replicas(set, set_hs)) {
		LOG(1, "Replica uuids check failed");
		goto err;
	}

	if (check_store_all_sizes(set, set_hs)) {
		LOG(1, "Reading pool sizes failed");
		goto err;
	}

	unmap_all_headers(set);
	util_poolset_fdclose(set);
	return 0;

err:
	unmap_all_headers(set);
	util_poolset_fdclose(set);
	replica_free_poolset_health_status(set_hs);
	return -1;
}
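/*
 * Caller-side sketch (not part of the original source): on success the
 * health status is owned by the caller and must be freed; on failure the
 * error path above has already freed it. The replica_is_poolset_healthy()
 * helper is an assumption used here for illustration only.
 */
static int
example_check_health(struct pool_set *set, unsigned flags)
{
	struct poolset_health_status *set_hs;

	/* on failure, set_hs has already been freed by the err path */
	if (replica_check_poolset_health(set, &set_hs, flags))
		return -1;

	int healthy = replica_is_poolset_healthy(set_hs); /* hypothetical */
	replica_free_poolset_health_status(set_hs);
	return healthy ? 0 : -1;
}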
/*
 * pmemlog_open_common -- (internal) open a log memory pool
 *
 * This routine does all the work, but takes a cow flag so internal
 * calls can map a read-only pool if required.
 */
static PMEMlogpool *
pmemlog_open_common(const char *path, int cow)
{
	LOG(3, "path %s cow %d", path, cow);

	struct pool_set *set;

	if (util_pool_open(&set, path, cow, PMEMLOG_MIN_POOL,
			LOG_HDR_SIG, LOG_FORMAT_MAJOR,
			LOG_FORMAT_COMPAT, LOG_FORMAT_INCOMPAT,
			LOG_FORMAT_RO_COMPAT, NULL) != 0) {
		LOG(2, "cannot open pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMlogpool *plp = rep->part[0].addr;

	VALGRIND_REMOVE_PMEM_MAPPING(&plp->addr,
		sizeof(struct pmemlog) -
		((uintptr_t)&plp->addr - (uintptr_t)&plp->hdr));

	plp->addr = plp;
	plp->size = rep->repsize;
	plp->set = set;
	plp->is_pmem = rep->is_pmem;
	plp->is_dax = rep->part[0].is_dax;

	if (set->nreplicas > 1) {
		errno = ENOTSUP;
		ERR("!replicas not supported");
		goto err;
	}

	/* validate pool descriptor */
	if (pmemlog_descr_check(plp, rep->repsize) != 0) {
		LOG(2, "descriptor check failed");
		goto err;
	}

	/* initialize runtime parts */
	if (pmemlog_runtime_init(plp, set->rdonly) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	util_poolset_fdclose(set);

	LOG(3, "plp %p", plp);
	return plp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 0);
	errno = oerrno;
	return NULL;
}
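/*
 * Sketch of the public entry point (assumed to follow the wrapper pattern
 * implied by the "internal calls" comment above): pmemlog_open() is a thin
 * wrapper that calls the common routine with copy-on-write disabled.
 */
PMEMlogpool *
pmemlog_open(const char *path)
{
	LOG(3, "path %s", path);

	return pmemlog_open_common(path, 0);
}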
/*
 * pmemblk_open_common -- (internal) open a block memory pool
 *
 * This routine does all the work, but takes a cow flag so internal
 * calls can map a read-only pool if required.
 *
 * Passing in bsize == 0 means a valid pool header must exist (which
 * will supply the block size).
 */
static PMEMblkpool *
pmemblk_open_common(const char *path, size_t bsize, int cow)
{
	LOG(3, "path %s bsize %zu cow %d", path, bsize, cow);

	struct pool_set *set;

	if (util_pool_open(&set, path, cow, PMEMBLK_MIN_POOL,
			roundup(sizeof (struct pmemblk), Pagesize),
			BLK_HDR_SIG, BLK_FORMAT_MAJOR,
			BLK_FORMAT_COMPAT, BLK_FORMAT_INCOMPAT,
			BLK_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot open pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMblkpool *pbp = rep->part[0].addr;

	pbp->addr = pbp;
	pbp->size = rep->repsize;

	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
		sizeof (struct pmemblk) -
		((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));

	if (set->nreplicas > 1) {
		ERR("replicas not supported");
		goto err;
	}

	/* validate pool descriptor */
	if (pmemblk_descr_check(pbp, &bsize) != 0) {
		LOG(2, "descriptor check failed");
		goto err;
	}

	/* initialize runtime parts */
	if (pmemblk_runtime_init(pbp, bsize, set->rdonly, rep->is_pmem) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	util_poolset_fdclose(set);
	util_poolset_free(set);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 0);
	errno = oerrno;
	return NULL;
}
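/*
 * Sketch of the public entry point (assumed, following the same wrapper
 * pattern as the log pool): cow is disabled, and bsize == 0 lets the
 * block size be taken from the existing pool header, per the comment above.
 */
PMEMblkpool *
pmemblk_open(const char *path, size_t bsize)
{
	LOG(3, "path %s bsize %zu", path, bsize);

	return pmemblk_open_common(path, bsize, 0);
}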
/*
 * pmemlog_create -- create a log memory pool
 */
PMEMlogpool *
pmemlog_create(const char *path, size_t poolsize, mode_t mode)
{
	LOG(3, "path %s poolsize %zu mode %d", path, poolsize, mode);

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMLOG_MIN_POOL,
			LOG_HDR_SIG, LOG_FORMAT_MAJOR,
			LOG_FORMAT_COMPAT, LOG_FORMAT_INCOMPAT,
			LOG_FORMAT_RO_COMPAT, NULL,
			REPLICAS_DISABLED) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMlogpool *plp = rep->part[0].addr;

	VALGRIND_REMOVE_PMEM_MAPPING(&plp->addr,
		sizeof(struct pmemlog) -
		((uintptr_t)&plp->addr - (uintptr_t)&plp->hdr));

	plp->addr = plp;
	plp->size = rep->repsize;
	plp->set = set;
	plp->is_pmem = rep->is_pmem;
	plp->is_dax = rep->part[0].is_dax;

	/* create pool descriptor */
	if (pmemlog_descr_create(plp, rep->repsize) != 0) {
		LOG(2, "descriptor creation failed");
		goto err;
	}

	/* initialize runtime parts */
	if (pmemlog_runtime_init(plp, 0) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	util_poolset_fdclose(set);

	LOG(3, "plp %p", plp);
	return plp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 1);
	errno = oerrno;
	return NULL;
}
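/*
 * Application-level usage sketch (illustrative only; the path argument is
 * supplied by the caller and the open-or-create fallback is the common
 * client pattern, not part of the library source above). pmemlog_create()
 * returns NULL with errno set on failure.
 */
static PMEMlogpool *
example_open_or_create_log(const char *path)
{
	PMEMlogpool *plp = pmemlog_create(path, PMEMLOG_MIN_POOL, 0666);
	if (plp == NULL)
		plp = pmemlog_open(path);	/* maybe it already exists */
	return plp;
}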
/*
 * replica_open_poolset_part_files -- open all part files for a poolset
 */
int
replica_open_poolset_part_files(struct pool_set *set)
{
	for (unsigned r = 0; r < set->nreplicas; ++r) {
		if (set->replica[r]->remote)
			continue;
		if (replica_open_replica_part_files(set, r)) {
			LOG(1, "Opening part files of replica %u failed", r);
			goto err;
		}
	}

	return 0;

err:
	util_poolset_fdclose(set);
	return -1;
}
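/*
 * Caller-side sketch (illustrative): on failure the error path above has
 * already closed any file descriptors it opened, so the caller only closes
 * on success. The do_work_on_parts() step is a hypothetical placeholder.
 */
static int
example_with_open_parts(struct pool_set *set)
{
	if (replica_open_poolset_part_files(set))
		return -1;	/* fds already closed by the error path */

	int ret = do_work_on_parts(set);	/* hypothetical */
	util_poolset_fdclose(set);
	return ret;
}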
/*
 * pmemobj_open_common -- open a transactional memory pool (set)
 *
 * This routine does all the work, but takes a cow flag so internal
 * calls can map a read-only pool if required.
 */
static PMEMobjpool *
pmemobj_open_common(const char *path, const char *layout, int cow, int boot)
{
	LOG(3, "path %s layout %s cow %d", path, layout, cow);

	struct pool_set *set;

	if (util_pool_open(&set, path, cow, PMEMOBJ_MIN_POOL,
			roundup(sizeof (struct pmemobjpool), Pagesize),
			OBJ_HDR_SIG, OBJ_FORMAT_MAJOR,
			OBJ_FORMAT_COMPAT, OBJ_FORMAT_INCOMPAT,
			OBJ_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot open pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	/* read-only mode is not supported in libpmemobj */
	if (set->rdonly) {
		ERR("read-only mode is not supported");
		errno = EINVAL;
		goto err;
	}

	PMEMobjpool *pop;
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		pop = rep->part[0].addr;

		VALGRIND_REMOVE_PMEM_MAPPING(&pop->addr,
			sizeof (struct pmemobjpool) -
			((uintptr_t)&pop->addr - (uintptr_t)&pop->hdr));

		pop->addr = pop;
		pop->size = rep->repsize;

		if (pmemobj_descr_check(pop, layout, set->poolsize) != 0) {
			LOG(2, "descriptor check failed");
			goto err;
		}

		/* initialize replica runtime - is_pmem, funcs, ... */
		if (pmemobj_replica_init(pop, rep->is_pmem) != 0) {
			ERR("pool initialization failed");
			goto err;
		}

		/* link replicas */
		if (r < set->nreplicas - 1)
			pop->replica = set->replica[r + 1]->part[0].addr;
	}

	/*
	 * If there is more than one replica, check if all of them are
	 * consistent (recoverable).
	 * On success, choose any replica and copy entire lanes (redo logs)
	 * to all the other replicas to synchronize them.
	 */
	if (set->nreplicas > 1) {
		for (unsigned r = 0; r < set->nreplicas; r++) {
			pop = set->replica[r]->part[0].addr;
			if (pmemobj_check_basic(pop) == 0) {
				ERR("inconsistent replica #%u", r);
				goto err;
			}
		}

		/* copy lanes */
		pop = set->replica[0]->part[0].addr;
		void *src = (void *)((uintptr_t)pop + pop->lanes_offset);
		size_t len = pop->nlanes * sizeof (struct lane_layout);

		for (unsigned r = 1; r < set->nreplicas; r++) {
			pop = set->replica[r]->part[0].addr;
			void *dst = (void *)((uintptr_t)pop +
						pop->lanes_offset);
			pop->memcpy_persist_local(dst, src, len);
		}
	}

	pop = set->replica[0]->part[0].addr;
	pop->is_master_replica = 1;

	for (unsigned r = 1; r < set->nreplicas; r++) {
		PMEMobjpool *rep = set->replica[r]->part[0].addr;
		rep->is_master_replica = 0;
	}

#ifdef USE_VG_MEMCHECK
	heap_vg_open(pop);
#endif

	VALGRIND_DO_CREATE_MEMPOOL(pop, 0, 0);

	/* initialize runtime parts - lanes, obj stores, ... */
	if (pmemobj_runtime_init(pop, 0, boot) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	util_poolset_fdclose(set);
	util_poolset_free(set);

#ifdef USE_VG_MEMCHECK
	if (boot)
		pmemobj_vg_boot(pop);
#endif

	LOG(3, "pop %p", pop);
	return pop;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 0);
	errno = oerrno;
	return NULL;
}
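/*
 * Sketch of the public entry point (assumed to follow the same wrapper
 * pattern as the other pools): cow is disabled and boot is enabled, so the
 * runtime (lanes, heap) is fully initialized for the application.
 */
PMEMobjpool *
pmemobj_open(const char *path, const char *layout)
{
	LOG(3, "path %s layout %s", path, layout);

	return pmemobj_open_common(path, layout, 0, 1);
}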
/*
 * pmemobj_create -- create a transactional memory pool (set)
 */
PMEMobjpool *
pmemobj_create(const char *path, const char *layout, size_t poolsize,
		mode_t mode)
{
	LOG(3, "path %s layout %s poolsize %zu mode %o",
			path, layout, poolsize, mode);

	/* check length of layout */
	if (layout && (strlen(layout) >= PMEMOBJ_MAX_LAYOUT)) {
		ERR("Layout too long");
		errno = EINVAL;
		return NULL;
	}

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMOBJ_MIN_POOL,
			roundup(sizeof (struct pmemobjpool), Pagesize),
			OBJ_HDR_SIG, OBJ_FORMAT_MAJOR,
			OBJ_FORMAT_COMPAT, OBJ_FORMAT_INCOMPAT,
			OBJ_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	PMEMobjpool *pop;
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		pop = rep->part[0].addr;

		VALGRIND_REMOVE_PMEM_MAPPING(&pop->addr,
			sizeof (struct pmemobjpool) -
			((uintptr_t)&pop->addr - (uintptr_t)&pop->hdr));

		pop->addr = pop;
		pop->size = rep->repsize;

		/* create pool descriptor */
		if (pmemobj_descr_create(pop, layout, set->poolsize) != 0) {
			LOG(2, "descriptor creation failed");
			goto err;
		}

		/* initialize replica runtime - is_pmem, funcs, ... */
		if (pmemobj_replica_init(pop, rep->is_pmem) != 0) {
			ERR("pool initialization failed");
			goto err;
		}

		/* link replicas */
		if (r < set->nreplicas - 1)
			pop->replica = set->replica[r + 1]->part[0].addr;
	}

	pop = set->replica[0]->part[0].addr;
	pop->is_master_replica = 1;

	for (unsigned r = 1; r < set->nreplicas; r++) {
		PMEMobjpool *rep = set->replica[r]->part[0].addr;
		rep->is_master_replica = 0;
	}

	VALGRIND_DO_CREATE_MEMPOOL(pop, 0, 0);

	/* initialize runtime parts - lanes, obj stores, ... */
	if (pmemobj_runtime_init(pop, 0, 1 /* boot */) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	util_poolset_fdclose(set);
	util_poolset_free(set);

	LOG(3, "pop %p", pop);
	return pop;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 1);
	errno = oerrno;
	return NULL;
}
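/*
 * Application-level usage sketch (illustrative only; the path and layout
 * name below are hypothetical). pmemobj_create() returns NULL with errno
 * set on failure, and the layout string must stay under PMEMOBJ_MAX_LAYOUT.
 */
static void
example_create_obj_pool(void)
{
	PMEMobjpool *pop = pmemobj_create("/pmem/obj.pool", "example_layout",
			PMEMOBJ_MIN_POOL, 0666);
	if (pop == NULL) {
		perror("pmemobj_create");
		return;
	}

	/* ... allocate objects, run transactions ... */

	pmemobj_close(pop);
}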
/*
 * pmemblk_create -- create a block memory pool
 */
PMEMblkpool *
pmemblk_create(const char *path, size_t bsize, size_t poolsize, mode_t mode)
{
	LOG(3, "path %s bsize %zu poolsize %zu mode %o",
			path, bsize, poolsize, mode);

	/* check if bsize is valid */
	if (bsize == 0 || bsize > UINT32_MAX) {
		ERR("Invalid block size %zu", bsize);
		errno = EINVAL;
		return NULL;
	}

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMBLK_MIN_POOL,
			roundup(sizeof (struct pmemblk), Pagesize),
			BLK_HDR_SIG, BLK_FORMAT_MAJOR,
			BLK_FORMAT_COMPAT, BLK_FORMAT_INCOMPAT,
			BLK_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMblkpool *pbp = rep->part[0].addr;

	pbp->addr = pbp;
	pbp->size = rep->repsize;

	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
		sizeof (struct pmemblk) -
		((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));

	if (set->nreplicas > 1) {
		ERR("replicas not supported");
		goto err;
	}

	/* create pool descriptor */
	if (pmemblk_descr_create(pbp, (uint32_t)bsize, set->zeroed) != 0) {
		LOG(2, "descriptor creation failed");
		goto err;
	}

	/* initialize runtime parts */
	if (pmemblk_runtime_init(pbp, bsize, 0, rep->is_pmem) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	util_poolset_fdclose(set);
	util_poolset_free(set);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 1);
	errno = oerrno;
	return NULL;
}
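/*
 * Application-level usage sketch (illustrative only; the path and block
 * size are hypothetical). The element size must fit in uint32_t, matching
 * the validity checks above.
 */
static void
example_create_blk_pool(void)
{
	PMEMblkpool *pbp = pmemblk_create("/pmem/blk.pool", 4096,
			PMEMBLK_MIN_POOL, 0666);
	if (pbp == NULL) {
		perror("pmemblk_create");
		return;
	}

	unsigned char buf[4096] = { 0 };
	if (pmemblk_write(pbp, buf, 0) < 0)	/* write block 0 */
		perror("pmemblk_write");

	pmemblk_close(pbp);
}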
/*
 * pmemblk_createU -- create a block memory pool
 */
#ifndef _WIN32
static inline
#endif
PMEMblkpool *
pmemblk_createU(const char *path, size_t bsize, size_t poolsize, mode_t mode)
{
	LOG(3, "path %s bsize %zu poolsize %zu mode %o",
			path, bsize, poolsize, mode);

	/* check if bsize is valid */
	if (bsize == 0 || bsize > UINT32_MAX) {
		ERR("Invalid block size %zu", bsize);
		errno = EINVAL;
		return NULL;
	}

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMBLK_MIN_POOL,
			BLK_HDR_SIG, BLK_FORMAT_MAJOR,
			BLK_FORMAT_COMPAT, BLK_FORMAT_INCOMPAT,
			BLK_FORMAT_RO_COMPAT, NULL,
			REPLICAS_DISABLED) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMblkpool *pbp = rep->part[0].addr;

	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
		sizeof(struct pmemblk) -
		((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));

	pbp->addr = pbp;
	pbp->size = rep->repsize;
	pbp->set = set;
	pbp->is_pmem = rep->is_pmem;
	pbp->is_dev_dax = rep->part[0].is_dev_dax;

	/* is_dev_dax implies is_pmem */
	ASSERT(!pbp->is_dev_dax || pbp->is_pmem);

	/* create pool descriptor */
	blk_descr_create(pbp, (uint32_t)bsize, set->zeroed);

	/* initialize runtime parts */
	if (blk_runtime_init(pbp, bsize, 0) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	util_poolset_fdclose(set);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, DELETE_CREATED_PARTS);
	errno = oerrno;
	return NULL;
}
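/*
 * Sketch of the Windows wide-character counterpart (an assumption based on
 * the U/W wrapper pattern implied by the createU name; util_toUTF8() and
 * util_free_UTF8() are assumed helpers): convert the path to UTF-8, call
 * the U-variant, then free the converted string.
 */
#ifdef _WIN32
PMEMblkpool *
pmemblk_createW(const wchar_t *path, size_t bsize, size_t poolsize,
		mode_t mode)
{
	char *upath = util_toUTF8(path);	/* assumed helper */
	if (upath == NULL)
		return NULL;

	PMEMblkpool *ret = pmemblk_createU(upath, bsize, poolsize, mode);

	util_free_UTF8(upath);			/* assumed helper */
	return ret;
}
#endif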