/*
 * pmemobj_open_common -- open a transactional memory pool (set)
 *
 * This routine does all the work, but takes a cow flag so internal
 * calls can map a read-only pool if required.
 *
 * Returns the runtime-initialized master replica on success, or NULL
 * (with errno set) on failure.  On success ownership of the pool-set
 * structure is released (util_poolset_free); the mappings live on in
 * the returned pool.
 */
static PMEMobjpool *
pmemobj_open_common(const char *path, const char *layout, int cow, int boot)
{
	LOG(3, "path %s layout %s cow %d", path, layout, cow);

	struct pool_set *set;

	/* map the pool set; cow != 0 requests a copy-on-write mapping */
	if (util_pool_open(&set, path, cow, PMEMOBJ_MIN_POOL,
			roundup(sizeof (struct pmemobjpool), Pagesize),
			OBJ_HDR_SIG, OBJ_FORMAT_MAJOR, OBJ_FORMAT_COMPAT,
			OBJ_FORMAT_INCOMPAT, OBJ_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot open pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	/* read-only mode is not supported in libpmemobj */
	if (set->rdonly) {
		ERR("read-only mode is not supported");
		errno = EINVAL;
		goto err;
	}

	PMEMobjpool *pop;
	/* per-replica setup: verify descriptor, init runtime, chain replicas */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		pop = rep->part[0].addr;

		/*
		 * The runtime part of the pool header (everything from
		 * 'addr' to the end of the struct) is volatile state that
		 * is rebuilt on every open, so drop Valgrind's pmem
		 * tracking for that range before writing it.
		 */
		VALGRIND_REMOVE_PMEM_MAPPING(&pop->addr,
			sizeof (struct pmemobjpool) -
			((uintptr_t)&pop->addr - (uintptr_t)&pop->hdr));

		pop->addr = pop;
		pop->size = rep->repsize;

		if (pmemobj_descr_check(pop, layout, set->poolsize) != 0) {
			LOG(2, "descriptor check failed");
			goto err;
		}

		/* initialize replica runtime - is_pmem, funcs, ... */
		if (pmemobj_replica_init(pop, rep->is_pmem) != 0) {
			ERR("pool initialization failed");
			goto err;
		}

		/* link replicas */
		if (r < set->nreplicas - 1)
			pop->replica = set->replica[r + 1]->part[0].addr;
	}

	/*
	 * If there is more than one replica, check if all of them are
	 * consistent (recoverable).
	 * On success, choose any replica and copy entire lanes (redo logs)
	 * to all the other replicas to synchronize them.
	 */
	if (set->nreplicas > 1) {
		for (unsigned r = 0; r < set->nreplicas; r++) {
			pop = set->replica[r]->part[0].addr;
			if (pmemobj_check_basic(pop) == 0) {
				ERR("inconsistent replica #%u", r);
				goto err;
			}
		}

		/* copy lanes from replica 0 to all the others */
		pop = set->replica[0]->part[0].addr;
		void *src = (void *)((uintptr_t)pop + pop->lanes_offset);
		size_t len = pop->nlanes * sizeof (struct lane_layout);

		for (unsigned r = 1; r < set->nreplicas; r++) {
			pop = set->replica[r]->part[0].addr;
			void *dst = (void *)((uintptr_t)pop +
					pop->lanes_offset);
			/* persistent copy within the target replica */
			pop->memcpy_persist_local(dst, src, len);
		}
	}

	/* replica 0 is the master; the others are mirrors kept in sync */
	pop = set->replica[0]->part[0].addr;
	pop->is_master_replica = 1;

	for (unsigned r = 1; r < set->nreplicas; r++) {
		PMEMobjpool *rep = set->replica[r]->part[0].addr;
		rep->is_master_replica = 0;
	}

#ifdef USE_VG_MEMCHECK
	heap_vg_open(pop);
#endif

	VALGRIND_DO_CREATE_MEMPOOL(pop, 0, 0);

	/* initialize runtime parts - lanes, obj stores, ... */
	if (pmemobj_runtime_init(pop, 0, boot) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	/* file descriptors and the set descriptor are no longer needed */
	util_poolset_fdclose(set);
	util_poolset_free(set);

#ifdef USE_VG_MEMCHECK
	if (boot)
		pmemobj_vg_boot(pop);
#endif

	LOG(3, "pop %p", pop);
	return pop;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, 0);
	errno = oerrno;
	return NULL;
}
/*
 * util_pool_open -- open a memory pool (set or a single file)
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * On success *setp points to the opened pool set; returns 0.
 * On failure all replica part-0 mappings are torn down, the set is
 * closed, and -1 is returned with errno set.
 */
int
util_pool_open(struct pool_set **setp, const char *path, int rdonly,
	size_t minsize, size_t hdrsize, const char *sig, uint32_t major,
	uint32_t compat, uint32_t incompat, uint32_t ro_compat)
{
	/* fixed typo in trace format: "ro_comapt" -> "ro_compat" */
	LOG(3, "setp %p path %s rdonly %d minsize %zu "
		"hdrsize %zu sig %s major %u "
		"compat %#x incompat %#x ro_compat %#x",
		setp, path, rdonly, minsize, hdrsize,
		sig, major, compat, incompat, ro_compat);

	/* read-only opens get a private mapping so stores never hit media */
	int flags = rdonly ? MAP_PRIVATE|MAP_NORESERVE : MAP_SHARED;

	int ret = util_poolset_open(setp, path, minsize);
	if (ret < 0) {
		LOG(2, "cannot open pool set");
		return -1;
	}

	struct pool_set *set = *setp;
	ASSERT(set->nreplicas > 0);

	set->rdonly = 0;
	set->poolsize = SIZE_MAX;

	/* map each replica and validate its headers */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (util_replica_open(set, r, flags, hdrsize, sig, major,
				compat, incompat, ro_compat) != 0) {
			LOG(2, "replica open failed");
			goto err;
		}
	}

	/*
	 * check replicas linkage -- each replica's header must name its
	 * neighbors' UUIDs (REP() wraps the index modulo nreplicas)
	 */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (memcmp(HDR(REP(set, r - 1), 0)->uuid,
				HDR(REP(set, r), 0)->prev_repl_uuid,
				POOL_HDR_UUID_LEN) ||
			memcmp(HDR(REP(set, r + 1), 0)->uuid,
				HDR(REP(set, r), 0)->next_repl_uuid,
				POOL_HDR_UUID_LEN)) {
			ERR("wrong replica UUID");
			errno = EINVAL;
			goto err;
		}
	}

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	/*
	 * NOTE(review): this unmaps part[0] of every replica, including
	 * replicas whose util_replica_open() failed -- presumably their
	 * part[0].addr is NULL/size 0 then; verify util_unmap tolerates it.
	 */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		VALGRIND_REMOVE_PMEM_MAPPING(rep->part[0].addr,
			rep->part[0].size);
		util_unmap(rep->part[0].addr, rep->part[0].size);
	}
	util_poolset_close(set, 0);
	errno = oerrno;
	return -1;
}
/* * pmempool_transform -- alter poolset structure */ int pmempool_transform(const char *poolset_file_src, const char *poolset_file_dst, unsigned flags) { ASSERTne(poolset_file_src, NULL); ASSERTne(poolset_file_dst, NULL); /* check if the source poolset has correct signature */ if (util_is_poolset_file(poolset_file_src) != 1) { ERR("source file is not a poolset file"); goto err; } /* check if the destination poolset has correct signature */ if (util_is_poolset_file(poolset_file_dst) != 1) { ERR("destination file is not a poolset file"); goto err; } /* open the source poolset file */ int fd_in = util_file_open(poolset_file_src, NULL, 0, O_RDONLY); if (fd_in < 0) { ERR("cannot open source poolset file"); goto err; } /* parse the source poolset file */ struct pool_set *set_in = NULL; if (util_poolset_parse(&set_in, poolset_file_src, fd_in)) { ERR("parsing source poolset failed"); close(fd_in); goto err; } close(fd_in); /* open the destination poolset file */ int fd_out = util_file_open(poolset_file_dst, NULL, 0, O_RDONLY); if (fd_out < 0) { ERR("cannot open destination poolset file"); goto err; } /* parse the destination poolset file */ struct pool_set *set_out = NULL; if (util_poolset_parse(&set_out, poolset_file_dst, fd_out)) { ERR("parsing destination poolset failed"); close(fd_out); goto err_free_poolin; } close(fd_out); /* check if the source poolset is of a correct type */ if (pool_set_type(set_in) != POOL_TYPE_OBJ) { ERR("source poolset is of a wrong type"); goto err_free_poolout; } /* check if the source poolset is healthy */ struct poolset_health_status *set_in_hs = NULL; if (replica_check_poolset_health(set_in, &set_in_hs, flags)) { ERR("source poolset health check failed"); goto err_free_poolout; } if (!replica_is_poolset_healthy(set_in_hs)) { ERR("source poolset is broken"); replica_free_poolset_health_status(set_in_hs); goto err_free_poolout; } replica_free_poolset_health_status(set_in_hs); /* transform poolset */ if (transform_replica(set_in, set_out, 
flags)) { ERR("transformation failed"); goto err_free_poolin; } util_poolset_close(set_in, 0); util_poolset_close(set_out, 0); return 0; err_free_poolout: util_poolset_close(set_out, 0); err_free_poolin: util_poolset_close(set_in, 0); err: if (errno == 0) errno = EINVAL; return -1; }
/*
 * pmemobj_create -- create a transactional memory pool (set)
 *
 * Creates the pool (or pool set) files, writes a fresh pool descriptor
 * into every replica, initializes the runtime, and applies 'mode' to
 * the created files.  Returns the master replica or NULL with errno
 * set; on failure the created parts are deleted (util_poolset_close
 * with delete flag).
 */
PMEMobjpool *
pmemobj_create(const char *path, const char *layout, size_t poolsize,
		mode_t mode)
{
	LOG(3, "path %s layout %s poolsize %zu mode %o",
			path, layout, poolsize, mode);

	/* check length of layout */
	if (layout && (strlen(layout) >= PMEMOBJ_MAX_LAYOUT)) {
		ERR("Layout too long");
		errno = EINVAL;
		return NULL;
	}

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMOBJ_MIN_POOL,
			roundup(sizeof (struct pmemobjpool), Pagesize),
			OBJ_HDR_SIG, OBJ_FORMAT_MAJOR, OBJ_FORMAT_COMPAT,
			OBJ_FORMAT_INCOMPAT, OBJ_FORMAT_RO_COMPAT) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	PMEMobjpool *pop;
	/* per-replica setup: write descriptor, init runtime, chain replicas */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		pop = rep->part[0].addr;

		/*
		 * The runtime part of the header (from 'addr' onward) is
		 * volatile state -- drop Valgrind's pmem tracking for it.
		 */
		VALGRIND_REMOVE_PMEM_MAPPING(&pop->addr,
			sizeof (struct pmemobjpool) -
			((uintptr_t)&pop->addr - (uintptr_t)&pop->hdr));

		pop->addr = pop;
		pop->size = rep->repsize;

		/* create pool descriptor */
		if (pmemobj_descr_create(pop, layout, set->poolsize) != 0) {
			LOG(2, "descriptor creation failed");
			goto err;
		}

		/* initialize replica runtime - is_pmem, funcs, ... */
		if (pmemobj_replica_init(pop, rep->is_pmem) != 0) {
			ERR("pool initialization failed");
			goto err;
		}

		/* link replicas */
		if (r < set->nreplicas - 1)
			pop->replica = set->replica[r + 1]->part[0].addr;
	}

	/* replica 0 is the master; the others are mirrors */
	pop = set->replica[0]->part[0].addr;
	pop->is_master_replica = 1;

	for (unsigned r = 1; r < set->nreplicas; r++) {
		PMEMobjpool *rep = set->replica[r]->part[0].addr;
		rep->is_master_replica = 0;
	}

	VALGRIND_DO_CREATE_MEMPOOL(pop, 0, 0);

	/* initialize runtime parts - lanes, obj stores, ... */
	if (pmemobj_runtime_init(pop, 0, 1 /* boot*/) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	/* file descriptors and the set descriptor are no longer needed */
	util_poolset_fdclose(set);
	util_poolset_free(set);

	LOG(3, "pop %p", pop);
	return pop;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	/* second arg != 0: delete the partially created part files */
	util_poolset_close(set, 1);
	errno = oerrno;
	return NULL;
}
/* * pool_parse_params -- parse pool type, file size and block size */ static int pool_params_parse(const PMEMpoolcheck *ppc, struct pool_params *params, int check) { LOG(3, NULL); int is_btt = ppc->args.pool_type == PMEMPOOL_POOL_TYPE_BTT; params->type = POOL_TYPE_UNKNOWN; params->is_poolset = util_is_poolset_file(ppc->path) == 1; int fd = util_file_open(ppc->path, NULL, 0, O_RDONLY); if (fd < 0) return -1; int ret = 0; util_stat_t stat_buf; ret = util_fstat(fd, &stat_buf); if (ret) goto out_close; ASSERT(stat_buf.st_size >= 0); params->mode = stat_buf.st_mode; struct pool_set *set; void *addr; if (params->is_poolset) { /* * Need to close the poolset because it will be opened with * flock in the following instructions. */ close(fd); fd = -1; if (check) { if (pool_set_map(ppc->path, &set, 1)) return -1; } else { ret = util_poolset_create_set(&set, ppc->path, 0, 0); if (ret < 0) { LOG(2, "cannot open pool set -- '%s'", ppc->path); return -1; } if (set->remote) { ERR("poolsets with remote replicas are not " "supported"); return -1; } if (util_pool_open_nocheck(set, 1)) return -1; } params->size = set->poolsize; addr = set->replica[0]->part[0].addr; } else if (is_btt) { params->size = (size_t)stat_buf.st_size; #ifndef _WIN32 if (params->mode & S_IFBLK) if (ioctl(fd, BLKGETSIZE64, ¶ms->size)) { ERR("!ioctl"); goto out_close; } #endif addr = NULL; } else { params->size = (size_t)stat_buf.st_size; addr = mmap(NULL, (uint64_t)stat_buf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) { ret = -1; goto out_close; } } /* stop processing for BTT device */ if (is_btt) { params->type = POOL_TYPE_BTT; params->is_part = false; goto out_close; } struct pool_hdr hdr; memcpy(&hdr, addr, sizeof(hdr)); util_convert2h_hdr_nocheck(&hdr); pool_params_from_header(params, &hdr); if (ppc->args.pool_type != PMEMPOOL_POOL_TYPE_DETECT) { enum pool_type declared_type = pmempool_check_type_to_pool_type(ppc->args.pool_type); if ((params->type & ~declared_type) != 0) { ERR("declared 
pool type does not match"); ret = 1; goto out_unmap; } } if (params->type == POOL_TYPE_BLK) { struct pmemblk pbp; memcpy(&pbp, addr, sizeof(pbp)); params->blk.bsize = le32toh(pbp.bsize); } else if (params->type == POOL_TYPE_OBJ) { struct pmemobjpool pop; memcpy(&pop, addr, sizeof(pop)); memcpy(params->obj.layout, pop.layout, PMEMOBJ_MAX_LAYOUT); } out_unmap: if (params->is_poolset) { ASSERTeq(fd, -1); ASSERTne(addr, NULL); util_poolset_close(set, 0); } else if (!is_btt) { ASSERTne(fd, -1); ASSERTne(addr, NULL); munmap(addr, params->size); } out_close: if (fd != -1) close(fd); return ret; }
/*
 * util_poolset_open -- (internal) open memory pool set
 *
 * On success returns 0 and a pointer to a newly allocated structure
 * containing the info of all the parts of the pool set and replicas.
 *
 * The file at 'path' may be either a pool set descriptor (recognized
 * by its signature) or a single pool file; a single file is wrapped
 * in a one-part set and its descriptor fd is kept open.  minsize is
 * only enforced for the single-file case here (util_poolset_files
 * checks the parts of a real set).
 */
static int
util_poolset_open(struct pool_set **setp, const char *path, size_t minsize)
{
	LOG(3, "setp %p path %s minsize %zu", setp, path, minsize);

	int oerrno;
	int ret = 0;
	int fd;
	size_t size = 0;

	/* do not check minsize */
	if ((fd = util_file_open(path, &size, 0, O_RDONLY)) == -1)
		return -1;

	char signature[POOLSET_HDR_SIG_LEN];
	/*
	 * read returns ssize_t, but we know it will return value between -1
	 * and POOLSET_HDR_SIG_LEN (11), so we can safely cast it to int
	 */
	ret = (int)read(fd, signature, POOLSET_HDR_SIG_LEN);
	if (ret < 0) {
		ERR("!read %d", fd);
		goto err;
	}

	if (ret < POOLSET_HDR_SIG_LEN ||
	    strncmp(signature, POOLSET_HDR_SIG, POOLSET_HDR_SIG_LEN)) {
		LOG(4, "not a pool set header");

		/* single pool file: size must satisfy the caller's minimum */
		if (size < minsize) {
			ERR("size %zu smaller than %zu", size, minsize);
			errno = EINVAL;
			ret = -1;
			goto err;
		}

		/* close the file and open with O_RDWR */
		(void) close(fd);
		size = 0;
		if ((fd = util_file_open(path, &size, 0, O_RDWR)) == -1)
			return -1;

		*setp = util_poolset_single(path, size, fd, 0);
		if (*setp == NULL) {
			ret = -1;
			goto err;
		}

		/* do not close the file */
		return 0;
	}

	ret = util_poolset_parse(path, fd, setp);
	if (ret != 0)
		goto err;

	ret = util_poolset_files(*setp, minsize, 0);
	if (ret != 0)
		util_poolset_close(*setp, 0);

	/*
	 * fall through: on the parse path the descriptor fd is closed on
	 * success as well -- the err label doubles as common cleanup here
	 */
err:
	oerrno = errno;
	(void) close(fd);
	errno = oerrno;
	return ret;
}
/*
 * util_pool_open -- open a memory pool (set or a single file)
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * Opens and maps all replicas, validates every part header and the
 * replica UUID linkage, then unmaps the headers.  Returns 0 on
 * success; on failure closes all replicas and the set, returning -1
 * with errno set.
 */
int
util_pool_open(struct pool_set **setp, const char *path, int rdonly,
	size_t minsize, size_t hdrsize, const char *sig, uint32_t major,
	uint32_t compat, uint32_t incompat, uint32_t ro_compat)
{
	/* fixed typo in trace format: "ro_comapt" -> "ro_compat" */
	LOG(3, "setp %p path %s rdonly %d minsize %zu "
		"hdrsize %zu sig %.8s major %u "
		"compat %#x incompat %#x ro_compat %#x",
		setp, path, rdonly, minsize, hdrsize,
		sig, major, compat, incompat, ro_compat);

	/* read-only opens get a private mapping so stores never hit media */
	int flags = rdonly ? MAP_PRIVATE|MAP_NORESERVE : MAP_SHARED;

	int ret = util_poolset_open(setp, path, minsize);
	if (ret < 0) {
		LOG(2, "cannot open pool set");
		return -1;
	}

	struct pool_set *set = *setp;
	ASSERT(set->nreplicas > 0);

	set->rdonly = 0;
	set->poolsize = SIZE_MAX;

	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (util_replica_open(set, r, flags, hdrsize) != 0) {
			LOG(2, "replica open failed");
			goto err;
		}
	}

	/* check headers, check UUID's, check replicas linkage */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		for (unsigned p = 0; p < rep->nparts; p++) {
			if (util_header_check(set, r, p, sig, major,
					compat, incompat, ro_compat) != 0) {
				/* BUGFIX: %u for unsigned p (was %d) */
				LOG(2, "header check failed - part #%u", p);
				goto err;
			}
			/* the whole set is read-only if any part is */
			set->rdonly |= rep->part[p].rdonly;
		}

		/* REP() wraps the replica index modulo nreplicas */
		if (memcmp(HDR(REP(set, r - 1), 0)->uuid,
				HDR(REP(set, r), 0)->prev_repl_uuid,
				POOL_HDR_UUID_LEN) ||
			memcmp(HDR(REP(set, r + 1), 0)->uuid,
				HDR(REP(set, r), 0)->next_repl_uuid,
				POOL_HDR_UUID_LEN)) {
			ERR("wrong replica UUID");
			errno = EINVAL;
			goto err;
		}
	}

	/* unmap all headers -- no longer needed after validation */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		for (unsigned p = 0; p < rep->nparts; p++)
			util_unmap_hdr(&rep->part[p]);
	}

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	for (unsigned r = 0; r < set->nreplicas; r++)
		util_replica_close(set, r);
	util_poolset_close(set, 0);
	errno = oerrno;
	return -1;
}
/*
 * blk_open_common -- (internal) open a block memory pool
 *
 * This routine does all the work, but takes a cow flag so internal
 * calls can map a read-only pool if required.
 *
 * Passing in bsize == 0 means a valid pool header must exist (which
 * will supply the block size).
 *
 * Returns the runtime-initialized pool or NULL with errno set.  The
 * pool set structure is retained in pbp->set; only the part file
 * descriptors are closed on success.
 */
static PMEMblkpool *
blk_open_common(const char *path, size_t bsize, int cow)
{
	LOG(3, "path %s bsize %zu cow %d", path, bsize, cow);

	struct pool_set *set;

	/* cow != 0 requests a copy-on-write (private) mapping */
	if (util_pool_open(&set, path, cow, PMEMBLK_MIN_POOL,
			BLK_HDR_SIG, BLK_FORMAT_MAJOR, BLK_FORMAT_COMPAT,
			BLK_FORMAT_INCOMPAT, BLK_FORMAT_RO_COMPAT,
			NULL) != 0) {
		LOG(2, "cannot open pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMblkpool *pbp = rep->part[0].addr;

	/*
	 * The runtime part of the pool header (from 'addr' to the end of
	 * the struct) is volatile state rebuilt on every open -- drop
	 * Valgrind's pmem tracking for that range before writing it.
	 */
	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
			sizeof(struct pmemblk) -
			((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));

	pbp->addr = pbp;
	pbp->size = rep->repsize;
	pbp->set = set;
	pbp->is_pmem = rep->is_pmem;
	pbp->is_dev_dax = rep->part[0].is_dev_dax;

	/* is_dev_dax implies is_pmem */
	ASSERT(!pbp->is_dev_dax || pbp->is_pmem);

	/* libpmemblk does not support replicated pool sets */
	if (set->nreplicas > 1) {
		errno = ENOTSUP;
		ERR("!replicas not supported");
		goto err;
	}

	/* validate pool descriptor; fills in bsize when it was passed as 0 */
	if (blk_descr_check(pbp, &bsize) != 0) {
		LOG(2, "descriptor check failed");
		goto err;
	}

	/* initialize runtime parts */
	if (blk_runtime_init(pbp, bsize, set->rdonly) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	/* part fds are no longer needed; the set itself stays in pbp->set */
	util_poolset_fdclose(set);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, DO_NOT_DELETE_PARTS);
	errno = oerrno;
	return NULL;
}
static inline
#endif
/*
 * pmemblk_createU -- create a block memory pool (UTF-8 path variant)
 *
 * Validates bsize (must be non-zero and fit in uint32_t), creates the
 * pool set, writes the pool descriptor, initializes the runtime and
 * applies 'mode' to the created files.  Returns the pool or NULL with
 * errno set; on failure the created part files are deleted.
 */
PMEMblkpool *
pmemblk_createU(const char *path, size_t bsize, size_t poolsize, mode_t mode)
{
	LOG(3, "path %s bsize %zu poolsize %zu mode %o",
			path, bsize, poolsize, mode);

	/* check if bsize is valid */
	if (bsize == 0) {
		ERR("Invalid block size %zu", bsize);
		errno = EINVAL;
		return NULL;
	}

	/* bsize is stored as a 32-bit field in the descriptor */
	if (bsize > UINT32_MAX) {
		ERR("Invalid block size %zu", bsize);
		errno = EINVAL;
		return NULL;
	}

	struct pool_set *set;

	if (util_pool_create(&set, path, poolsize, PMEMBLK_MIN_POOL,
			BLK_HDR_SIG, BLK_FORMAT_MAJOR, BLK_FORMAT_COMPAT,
			BLK_FORMAT_INCOMPAT, BLK_FORMAT_RO_COMPAT,
			NULL, REPLICAS_DISABLED) != 0) {
		LOG(2, "cannot create pool or pool set");
		return NULL;
	}

	ASSERT(set->nreplicas > 0);

	struct pool_replica *rep = set->replica[0];
	PMEMblkpool *pbp = rep->part[0].addr;

	/*
	 * The runtime part of the pool header (from 'addr' onward) is
	 * volatile state -- drop Valgrind's pmem tracking before writing.
	 */
	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
			sizeof(struct pmemblk) -
			((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));

	pbp->addr = pbp;
	pbp->size = rep->repsize;
	pbp->set = set;
	pbp->is_pmem = rep->is_pmem;
	pbp->is_dev_dax = rep->part[0].is_dev_dax;

	/* is_dev_dax implies is_pmem */
	ASSERT(!pbp->is_dev_dax || pbp->is_pmem);

	/* create pool descriptor */
	blk_descr_create(pbp, (uint32_t)bsize, set->zeroed);

	/* initialize runtime parts */
	if (blk_runtime_init(pbp, bsize, 0) != 0) {
		ERR("pool initialization failed");
		goto err;
	}

	if (util_poolset_chmod(set, mode))
		goto err;

	/* part fds are no longer needed; the set itself stays in pbp->set */
	util_poolset_fdclose(set);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_poolset_close(set, DELETE_CREATED_PARTS);
	errno = oerrno;
	return NULL;
}