/*
 * btt_init -- prepare a btt namespace for use, returning an opaque handle
 *
 * Returns handle on success, otherwise NULL/errno.
 *
 * XXX handle case where lbasize doesn't match lbasize found in valid arenas.
 * XXX check rawsize against size from valid arenas.
 * XXX what if write_layout produces something read_layout says is invalid?
 * XXX what if arenas have different nfree?
 */
struct btt *
btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
		int maxlane, void *ns, const struct ns_callback *ns_cbp)
{
	LOG(3, "rawsize %zu lbasize %u", rawsize, lbasize);

	if (rawsize < BTT_MIN_SIZE) {
		LOG(1, "rawsize smaller than BTT_MIN_SIZE %zu", BTT_MIN_SIZE);
		errno = EINVAL;
		return NULL;
	}

	struct btt *bttp = Malloc(sizeof (*bttp));

	if (bttp == NULL) {
		LOG(1, "!Malloc %zu bytes", sizeof (*bttp));
		return NULL;
	}

	memset(bttp, '\0', sizeof (*bttp));

	pthread_mutex_init(&bttp->layout_write_mutex, NULL);
	memcpy(bttp->parent_uuid, parent_uuid, BTTINFO_UUID_LEN);
	bttp->rawsize = rawsize;
	bttp->lbasize = lbasize;
	bttp->ns = ns;
	bttp->ns_cbp = ns_cbp;

	/*
	 * Load up layout, if it exists.
	 *
	 * Whether read_layout() finds a valid layout or not, it finishes
	 * updating these layout-related fields:
	 *	bttp->nfree
	 *	bttp->nlba
	 *	bttp->narena
	 * since these fields are used even before a valid layout is written.
	 */
	if (read_layout(bttp, 0) < 0) {
		btt_fini(bttp);		/* free up any allocations */
		return NULL;
	}

	bttp->nlane = bttp->nfree;

	/* maxlane, if provided, is an upper bound on nlane */
	if (maxlane && bttp->nlane > maxlane)
		bttp->nlane = maxlane;

	LOG(3, "success, bttp %p nlane %d", bttp, bttp->nlane);
	return bttp;
}
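/*
 * Usage sketch (illustrative only, not part of the library): how a caller
 * is expected to drive btt_init()/btt_nlane()/btt_fini() above.  The
 * namespace handle and callback table ("my_ns", "my_ns_cb") are
 * hypothetical placeholders for whatever the caller provides; per-lane
 * I/O is elided since its entry points are not shown in this excerpt.
 */
static int
btt_usage_sketch(uint64_t rawsize, uint32_t lbasize,
		uint8_t parent_uuid[], void *my_ns,
		const struct ns_callback *my_ns_cb)
{
	/* one lane per CPU is a sensible upper bound on concurrency */
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 1)
		ncpus = 1;

	struct btt *bttp = btt_init(rawsize, lbasize, parent_uuid,
			(int)ncpus, my_ns, my_ns_cb);
	if (bttp == NULL)
		return -1;	/* btt_init set errno and logged the cause */

	/* each concurrent thread must use its own lane in [0, nlane) */
	int nlane = btt_nlane(bttp);
	LOG(3, "btt ready, %d lanes available", nlane);

	/* ... per-lane reads/writes happen here ... */

	btt_fini(bttp);
	return 0;
}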
/*
 * pmemblk_unmap -- unmap a block memory pool
 */
void
pmemblk_unmap(PMEMblk *pbp)
{
	LOG(3, "pbp %p", pbp);

	btt_fini(pbp->bttp);
	if (pbp->locks) {
		for (int i = 0; i < pbp->nlane; i++)
			pthread_mutex_destroy(&pbp->locks[i]);
		Free((void *)pbp->locks);
	}

#ifdef DEBUG
	/* destroy debug lock */
	pthread_mutex_destroy(&pbp->write_lock);
#endif

	util_unmap(pbp->addr, pbp->size);
}
/*
 * pmemblk_close -- close a block memory pool
 */
void
pmemblk_close(PMEMblkpool *pbp)
{
	LOG(3, "pbp %p", pbp);

	btt_fini(pbp->bttp);
	if (pbp->locks) {
		for (unsigned i = 0; i < pbp->nlane; i++)
			pthread_mutex_destroy(&pbp->locks[i]);
		Free((void *)pbp->locks);
	}

#ifdef DEBUG
	/* destroy debug lock */
	pthread_mutex_destroy(&pbp->write_lock);
#endif

	util_poolset_close(pbp->set, DO_NOT_DELETE_PARTS);
}
/*
 * pmemblk_close -- close a block memory pool
 */
void
pmemblk_close(PMEMblkpool *pbp)
{
	LOG(3, "pbp %p", pbp);

	btt_fini(pbp->bttp);
	if (pbp->locks) {
		for (unsigned i = 0; i < pbp->nlane; i++)
			pthread_mutex_destroy(&pbp->locks[i]);
		Free((void *)pbp->locks);
	}

#ifdef DEBUG
	/* destroy debug lock */
	pthread_mutex_destroy(&pbp->write_lock);
#endif

	VALGRIND_REMOVE_PMEM_MAPPING(pbp->addr, pbp->size);
	util_unmap(pbp->addr, pbp->size);
}
/*
 * pmemblk_runtime_init -- (internal) initialize block memory pool runtime data
 */
static int
pmemblk_runtime_init(PMEMblkpool *pbp, size_t bsize, int rdonly, int is_pmem)
{
	LOG(3, "pbp %p bsize %zu rdonly %d is_pmem %d",
			pbp, bsize, rdonly, is_pmem);

	/* remove volatile part of header */
	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
			sizeof (struct pmemblk) -
			sizeof (struct pool_hdr) -
			sizeof (pbp->bsize) -
			sizeof (pbp->is_zeroed));

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	pbp->rdonly = rdonly;
	pbp->is_pmem = is_pmem;
	pbp->data = (char *)pbp->addr +
			roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN);
	ASSERT(((char *)pbp->addr + pbp->size) >= (char *)pbp->data);
	pbp->datasize = (size_t)
			(((char *)pbp->addr + pbp->size) - (char *)pbp->data);

	LOG(4, "data area %p data size %zu bsize %zu",
			pbp->data, pbp->datasize, bsize);

	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 1)
		ncpus = 1;

	ns_cb.ns_is_zeroed = pbp->is_zeroed;

	/* things freed by "goto err" if not NULL */
	struct btt *bttp = NULL;
	pthread_mutex_t *locks = NULL;

	bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.poolset_uuid,
			(unsigned)ncpus * 2, pbp, &ns_cb);

	if (bttp == NULL)
		goto err;	/* btt_init set errno, called LOG */

	pbp->bttp = bttp;

	pbp->nlane = btt_nlane(pbp->bttp);
	pbp->next_lane = 0;
	if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) {
		ERR("!Malloc for lane locks");
		goto err;
	}

	for (unsigned i = 0; i < pbp->nlane; i++)
		util_mutex_init(&locks[i], NULL);

	pbp->locks = locks;

#ifdef DEBUG
	/* initialize debug lock */
	util_mutex_init(&pbp->write_lock, NULL);
#endif

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use.  It is not considered an error if this fails.
	 */
	util_range_none(pbp->addr, sizeof (struct pool_hdr));

	/* the data area should be kept read-only for debug version */
	RANGE_RO(pbp->data, pbp->datasize);

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (locks)
		Free((void *)locks);
	if (bttp)
		btt_fini(bttp);
	errno = oerrno;
	return -1;
}
/*
 * pmemblk_map_common -- (internal) map a block memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * Passing in bsize == 0 means a valid pool header must exist (which
 * will supply the block size).
 */
static PMEMblk *
pmemblk_map_common(int fd, size_t bsize, int rdonly)
{
	LOG(3, "fd %d bsize %zu rdonly %d", fd, bsize, rdonly);

	/* things freed by "goto err" if not NULL */
	void *addr = NULL;
	struct btt *bttp = NULL;
	pthread_mutex_t *locks = NULL;

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		LOG(1, "!fstat");
		return NULL;
	}

	if (stbuf.st_size < PMEMBLK_MIN_POOL) {
		LOG(1, "size %zu smaller than %zu",
				(size_t)stbuf.st_size, PMEMBLK_MIN_POOL);
		errno = EINVAL;
		return NULL;
	}

	if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL)
		return NULL;	/* util_map() set errno, called LOG */

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, stbuf.st_size);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemblk *pbp = addr;

	struct pool_hdr hdr;
	memcpy(&hdr, &pbp->hdr, sizeof (hdr));

	if (util_convert_hdr(&hdr)) {
		/*
		 * valid header found
		 */
		if (strncmp(hdr.signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN)) {
			LOG(1, "wrong pool type: \"%s\"", hdr.signature);
			errno = EINVAL;
			goto err;
		}

		if (hdr.major != BLK_FORMAT_MAJOR) {
			LOG(1, "blk pool version %d (library expects %d)",
					hdr.major, BLK_FORMAT_MAJOR);
			errno = EINVAL;
			goto err;
		}

		size_t hdr_bsize = le32toh(pbp->bsize);
		if (bsize && bsize != hdr_bsize) {
			LOG(1, "wrong bsize (%zu), pool created with bsize %zu",
					bsize, hdr_bsize);
			errno = EINVAL;
			goto err;
		}
		bsize = hdr_bsize;
		LOG(3, "using block size from header: %zu", bsize);

		int retval = util_feature_check(&hdr, BLK_FORMAT_INCOMPAT,
				BLK_FORMAT_RO_COMPAT, BLK_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;
	} else {
		/*
		 * no valid header was found
		 */
		if (rdonly) {
			LOG(1, "read-only and no header found");
			errno = EROFS;
			goto err;
		}
		LOG(3, "creating new blk memory pool");

		struct pool_hdr *hdrp = &pbp->hdr;

		memset(hdrp, '\0', sizeof (*hdrp));
		strncpy(hdrp->signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(BLK_FORMAT_MAJOR);
		hdrp->compat_features = htole32(BLK_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(BLK_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(BLK_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);
		hdrp->crtime = htole64((uint64_t)time(NULL));
		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);
		hdrp->checksum = htole64(hdrp->checksum);

		/* store pool's header */
		libpmem_persist(is_pmem, hdrp, sizeof (*hdrp));

		/* create rest of required metadata */
		pbp->bsize = htole32(bsize);
		libpmem_persist(is_pmem, &pbp->bsize, sizeof (bsize));
	}

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	pbp->addr = addr;
	pbp->size = stbuf.st_size;
	pbp->rdonly = rdonly;
	pbp->is_pmem = is_pmem;
	pbp->data = (char *)addr +
			roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN);
	pbp->datasize = (size_t)
			(((char *)pbp->addr + pbp->size) - (char *)pbp->data);

	LOG(4, "data area %p data size %zu bsize %zu",
			pbp->data, pbp->datasize, bsize);

	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 1)
		ncpus = 1;

	bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.uuid,
			(int)ncpus, pbp, &ns_cb);

	if (bttp == NULL)
		goto err;	/* btt_init set errno, called LOG */

	pbp->bttp = bttp;

	pbp->nlane = btt_nlane(pbp->bttp);
	pbp->next_lane = 0;
	if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) {
		LOG(1, "!Malloc for lane locks");
		goto err;
	}

	for (int i = 0; i < pbp->nlane; i++)
		if (pthread_mutex_init(&locks[i], NULL) < 0) {
			LOG(1, "!pthread_mutex_init");
			goto err;
		}

	pbp->locks = locks;

#ifdef DEBUG
	/* initialize debug lock */
	if (pthread_mutex_init(&pbp->write_lock, NULL) < 0) {
		LOG(1, "!pthread_mutex_init");
		goto err;
	}
#endif

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use.  It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the data area should be kept read-only for debug version */
	RANGE_RO(pbp->data, pbp->datasize);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (locks)
		Free((void *)locks);
	if (bttp)
		btt_fini(bttp);
	util_unmap(addr, stbuf.st_size);
	errno = oerrno;
	return NULL;
}
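/*
 * Usage sketch (illustrative only): mapping a block pool over a file
 * descriptor.  This assumes the fd-based public wrapper pmemblk_map()
 * simply forwards to pmemblk_map_common() above with rdonly == 0, and
 * that pmemblk_unmap() is the matching teardown; "open_block_pool" is a
 * hypothetical helper and error handling is minimal.
 */
#include <fcntl.h>
#include <unistd.h>

static PMEMblk *
open_block_pool(const char *path, size_t bsize)
{
	int fd = open(path, O_RDWR);
	if (fd < 0)
		return NULL;

	/* bsize == 0 would require an existing, valid pool header */
	PMEMblk *pbp = pmemblk_map(fd, bsize);

	/* the mapping survives closing the descriptor */
	close(fd);
	return pbp;	/* caller eventually calls pmemblk_unmap(pbp) */
}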