Example 1
File: blk.c Project: Neuvenen/nvml
/*
 * pmemblk_runtime_init -- (internal) initialize block memory pool runtime data
 */
static int
pmemblk_runtime_init(PMEMblkpool *pbp, size_t bsize, int rdonly, int is_pmem)
{
	LOG(3, "pbp %p bsize %zu rdonly %d is_pmem %d",
			pbp, bsize, rdonly, is_pmem);

	/* remove volatile part of header */
	VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
			sizeof (struct pmemblk) -
			sizeof (struct pool_hdr) -
			sizeof (pbp->bsize) -
			sizeof (pbp->is_zeroed));

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	pbp->rdonly = rdonly;
	pbp->is_pmem = is_pmem;
	pbp->data = (char *)pbp->addr +
			roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN);
	ASSERT(((char *)pbp->addr + pbp->size) >= (char *)pbp->data);
	pbp->datasize = (size_t)
			(((char *)pbp->addr + pbp->size) - (char *)pbp->data);

	LOG(4, "data area %p data size %zu bsize %zu",
		pbp->data, pbp->datasize, bsize);

	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 1)
		ncpus = 1;

	ns_cb.ns_is_zeroed = pbp->is_zeroed;

	/* things freed by "goto err" if not NULL */
	struct btt *bttp = NULL;
	pthread_mutex_t *locks = NULL;

	bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.poolset_uuid,
			(unsigned)ncpus * 2, pbp, &ns_cb);

	if (bttp == NULL)
		goto err;	/* btt_init set errno, called LOG */

	pbp->bttp = bttp;

	pbp->nlane = btt_nlane(pbp->bttp);
	pbp->next_lane = 0;
	if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) {
		ERR("!Malloc for lane locks");
		goto err;
	}

	for (unsigned i = 0; i < pbp->nlane; i++)
		util_mutex_init(&locks[i], NULL);

	pbp->locks = locks;

#ifdef DEBUG
	/* initialize debug lock */
	util_mutex_init(&pbp->write_lock, NULL);
#endif

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use. It is not considered an error if this fails.
	 */
	util_range_none(pbp->addr, sizeof (struct pool_hdr));

	/* the data area should be kept read-only for debug version */
	RANGE_RO(pbp->data, pbp->datasize);

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (locks)
		Free((void *)locks);
	if (bttp)
		btt_fini(bttp);
	errno = oerrno;
	return -1;
}
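
pmemblk_runtime_init() runs internally when a block pool is opened or created. For context, a minimal caller-side sketch against the public libpmemblk API; the pool path and the 512-byte block size are hypothetical values for illustration:

#include <stdio.h>
#include <libpmemblk.h>

int
main(void)
{
	/* hypothetical pool file and block size */
	PMEMblkpool *pbp = pmemblk_open("/pmem/blk.pool", 512);
	if (pbp == NULL) {
		perror("pmemblk_open");
		return 1;
	}

	/* the runtime init above has already sized the usable data area */
	printf("pool holds %zu blocks\n", pmemblk_nblock(pbp));

	pmemblk_close(pbp);
	return 0;
}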
Example 2
File: log.c Project: harrybaa/nvml
/*
 * pmemlog_append -- add data to a log memory pool
 */
int
pmemlog_append(PMEMlogpool *plp, const void *buf, size_t count)
{
    int ret = 0;

    LOG(3, "plp %p buf %p count %zu", plp, buf, count);

    if (plp->rdonly) {
        ERR("can't append to read-only log");
        errno = EROFS;
        return -1;
    }

    if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
        ERR("!pthread_rwlock_wrlock");
        return -1;
    }

    /* get the current values */
    uint64_t end_offset = le64toh(plp->end_offset);
    uint64_t write_offset = le64toh(plp->write_offset);

    if (write_offset >= end_offset) {
        /* no space left */
        errno = ENOSPC;
        ERR("!pmemlog_append");
        ret = -1;
    } else {
        /* make sure we don't write past the available space */
        if (count > (end_offset - write_offset)) {
            errno = ENOSPC;
            ERR("!pmemlog_append");
            ret = -1;
        } else {
            char *data = plp->addr;

            /*
             * unprotect the log space range,
             * where the new data will be stored
             * (debug version only)
             */
            RANGE_RW(&data[write_offset], count);

            if (plp->is_pmem)
                pmem_memcpy_nodrain(&data[write_offset],
                                    buf, count);
            else
                memcpy(&data[write_offset], buf, count);

            /* protect the log space range (debug version only) */
            RANGE_RO(&data[write_offset], count);

            write_offset += count;
        }
    }

    /* persist the data and the metadata only if there was no error */
    if (ret == 0)
        pmemlog_persist(plp, write_offset);

    int oerrno = errno;
    if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
        ERR("!pthread_rwlock_unlock");
    errno = oerrno;

    return ret;
}
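
A minimal caller-side sketch for pmemlog_append() using the public libpmemlog API; the pool path is hypothetical, and the failures it reports are the EROFS/ENOSPC cases set above:

#include <stdio.h>
#include <libpmemlog.h>

int
main(void)
{
	/* hypothetical pool file */
	PMEMlogpool *plp = pmemlog_open("/pmem/log.pool");
	if (plp == NULL) {
		perror("pmemlog_open");
		return 1;
	}

	const char line[] = "hello, persistent log\n";
	if (pmemlog_append(plp, line, sizeof (line) - 1) < 0)
		perror("pmemlog_append");	/* EROFS or ENOSPC, as above */

	pmemlog_close(plp);
	return 0;
}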
Example 3
/*
 * pmemlog_map_common -- (internal) map a log memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 */
static PMEMlog *
pmemlog_map_common(int fd, int rdonly)
{
	LOG(3, "fd %d rdonly %d", fd, rdonly);

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		LOG(1, "!fstat");
		return NULL;
	}

	if (stbuf.st_size < PMEMLOG_MIN_POOL) {
		LOG(1, "size %lld smaller than %zu",
				(long long)stbuf.st_size, PMEMLOG_MIN_POOL);
		errno = EINVAL;
		return NULL;
	}

	void *addr;
	if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL)
		return NULL;	/* util_map() set errno, called LOG */

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, stbuf.st_size);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemlog *plp = addr;
	struct pool_hdr hdr;

	memcpy(&hdr, &plp->hdr, sizeof (hdr));

	if (util_convert_hdr(&hdr)) {
		/*
		 * valid header found
		 */
		if (strncmp(hdr.signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN)) {
			LOG(1, "wrong pool type: \"%s\"", hdr.signature);
			errno = EINVAL;
			goto err;
		}

		if (hdr.major != LOG_FORMAT_MAJOR) {
			LOG(1, "log pool version %d (library expects %d)",
				hdr.major, LOG_FORMAT_MAJOR);
			errno = EINVAL;
			goto err;
		}

		uint64_t hdr_start = le64toh(plp->start_offset);
		uint64_t hdr_end = le64toh(plp->end_offset);
		uint64_t hdr_write = le64toh(plp->write_offset);

		if ((hdr_start != roundup(sizeof (*plp),
					LOG_FORMAT_DATA_ALIGN)) ||
			(hdr_end != stbuf.st_size) || (hdr_start > hdr_end)) {
			LOG(1, "wrong start/end offsets (start: %ju end: %ju), "
				"pool size %lld",
				hdr_start, hdr_end, (long long)stbuf.st_size);
			errno = EINVAL;
			goto err;
		}

		if ((hdr_write > hdr_end) || (hdr_write < hdr_start)) {
			LOG(1, "wrong write offset "
				"(start: %ju end: %ju write: %ju)",
				hdr_start, hdr_end, hdr_write);
			errno = EINVAL;
			goto err;
		}

		LOG(3, "start: %ju, end: %ju, write: %ju",
			hdr_start, hdr_end, hdr_write);

		int retval = util_feature_check(&hdr, LOG_FORMAT_INCOMPAT,
							LOG_FORMAT_RO_COMPAT,
							LOG_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;
	} else {
		/*
		 * no valid header was found
		 */
		if (rdonly) {
			LOG(1, "read-only and no header found");
			errno = EROFS;
			goto err;
		}
		LOG(3, "creating new log memory pool");

		struct pool_hdr *hdrp = &plp->hdr;

		memset(hdrp, '\0', sizeof (*hdrp));
		strncpy(hdrp->signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(LOG_FORMAT_MAJOR);
		hdrp->compat_features = htole32(LOG_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(LOG_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(LOG_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);
		hdrp->crtime = htole64((uint64_t)time(NULL));
		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);
		hdrp->checksum = htole64(hdrp->checksum);

		/* store pool's header */
		libpmem_persist(is_pmem, hdrp, sizeof (*hdrp));

		/* create rest of required metadata */
		plp->start_offset = htole64(roundup(sizeof (*plp),
						LOG_FORMAT_DATA_ALIGN));
		plp->end_offset = htole64(stbuf.st_size);
		plp->write_offset = plp->start_offset;

		/* store non-volatile part of pool's descriptor */
		libpmem_persist(is_pmem, &plp->start_offset,
							3 * sizeof (uint64_t));
	}

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	plp->addr = addr;
	plp->size = stbuf.st_size;
	plp->rdonly = rdonly;
	plp->is_pmem = is_pmem;

	if ((plp->rwlockp = Malloc(sizeof (*plp->rwlockp))) == NULL) {
		LOG(1, "!Malloc for a RW lock");
		goto err;
	}

	if (pthread_rwlock_init(plp->rwlockp, NULL)) {
		LOG(1, "!pthread_rwlock_init");
		goto err_free;
	}

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use. It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the rest should be kept read-only (debug version only) */
	RANGE_RO(addr + sizeof (struct pool_hdr),
			stbuf.st_size - sizeof (struct pool_hdr));

	LOG(3, "plp %p", plp);
	return plp;

err_free:
	Free((void *)plp->rwlockp);
err:
	LOG(4, "error clean up");
	int oerrno = errno;
	util_unmap(addr, stbuf.st_size);
	errno = oerrno;
	return NULL;
}
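
The start/end/write validation above reduces to three invariants; here is a small sketch restating them, where the 4096-byte start offset and 2 MiB pool size are assumptions for illustration only:

#include <assert.h>
#include <stdint.h>

/* restatement of the header checks in pmemlog_map_common(), not library code */
static void
check_log_offsets(uint64_t start, uint64_t write, uint64_t end,
		uint64_t expected_start, uint64_t poolsize)
{
	assert(start == expected_start);	/* data begins past the descriptor */
	assert(end == poolsize);		/* log ends at end of file */
	assert(start <= write && write <= end);	/* cursor inside the data area */
}

int
main(void)
{
	/* freshly created pool: the write cursor sits at the start */
	check_log_offsets(4096, 4096, 2 << 20, 4096, 2 << 20);
	return 0;
}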
Example 4
/*
 * pmemlog_appendv -- add gathered data to a log memory pool
 */
int
pmemlog_appendv(PMEMlog *plp, const struct iovec *iov, int iovcnt)
{
	LOG(3, "plp %p iovec %p iovcnt %d", plp, iov, iovcnt);

	int ret = 0;	/* success */
	int i;

	if (plp->rdonly) {
		LOG(1, "can't append to read-only log");
		errno = EROFS;
		return -1;
	}

	if (pthread_rwlock_wrlock(plp->rwlockp)) {
		LOG(1, "!pthread_rwlock_wrlock");
		return -1;
	}

	/* get the current values */
	uint64_t end_offset = le64toh(plp->end_offset);
	uint64_t write_offset = le64toh(plp->write_offset);

	if (write_offset >= end_offset) {
		/* no space left */
		errno = ENOSPC;
		ret = -1;
	} else {
		char *data = plp->addr;
		uint64_t count = 0;
		char *buf;

		/* calculate required space */
		for (i = 0; i < iovcnt; ++i)
			count += iov[i].iov_len;

		/* check if there is enough free space */
		if (count > (end_offset - write_offset)) {
			errno = ENOSPC;
			ret = -1;
		} else {
			/* append the data */
			for (i = 0; i < iovcnt; ++i) {
				buf = iov[i].iov_base;
				count = iov[i].iov_len;

				/*
				 * unprotect the log space range,
				 * where the new data will be stored
				 * (debug version only)
				 */
				RANGE_RW(&data[write_offset], count);

				memcpy(&data[write_offset], buf, count);

				/*
				 * protect the log space range
				 * (debug version only)
				 */
				RANGE_RO(&data[write_offset], count);

				write_offset += count;
			}
		}
	}

	/* persist the data and the metadata only if there was no error */
	if (ret == 0)
		pmemlog_persist(plp, write_offset);

	int oerrno = errno;
	if (pthread_rwlock_unlock(plp->rwlockp))
		LOG(1, "!pthread_rwlock_unlock");
	errno = oerrno;

	return ret;
}
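
A gather-append sketch against the public API; newer libpmemlog spells the pool type PMEMlogpool, and the pool path is hypothetical:

#include <stdio.h>
#include <sys/uio.h>
#include <libpmemlog.h>

int
main(void)
{
	/* hypothetical pool file */
	PMEMlogpool *plp = pmemlog_open("/pmem/log.pool");
	if (plp == NULL) {
		perror("pmemlog_open");
		return 1;
	}

	/* two buffers land in the log as one contiguous record */
	struct iovec iov[2] = {
		{ .iov_base = "key=", .iov_len = 4 },
		{ .iov_base = "value\n", .iov_len = 6 },
	};
	if (pmemlog_appendv(plp, iov, 2) < 0)
		perror("pmemlog_appendv");	/* e.g. ENOSPC, as above */

	pmemlog_close(plp);
	return 0;
}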
Example 5
File: btt.c Project: mdalecki/nvml
/*
 * write_layout -- (internal) write out the initial btt metadata layout
 *
 * Called with write == 1 only once in the lifetime of a btt namespace, when
 * the first write happens.  The caller of this routine is responsible for
 * locking out multiple threads.  This routine doesn't read anything -- by the
 * time it is called, it is known there's no layout in the namespace and a new
 * layout should be written.
 *
 * Calling with write == 0 tells this routine to do the calculations for
 * bttp->narena and bttp->nlba, but not to write out any metadata.
 *
 * If successful, sets bttp->layout to 1 and returns 0.  Otherwise -1
 * is returned and errno is set, and bttp->layout remains 0 so that
 * later attempts to write will try again to create the layout.
 */
static int
write_layout(struct btt *bttp, int lane, int write)
{
	LOG(3, "bttp %p lane %d write %d", bttp, lane, write);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
	ASSERT(bttp->nfree);

	/*
	 * The number of arenas is the number of full arenas of
	 * size BTT_MAX_ARENA that fit into rawsize; if the
	 * remainder is at least BTT_MIN_SIZE, that adds one
	 * more arena.
	 */
	bttp->narena = bttp->rawsize / BTT_MAX_ARENA;
	if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
		bttp->narena++;
	LOG(4, "narena %u", bttp->narena);

	int flog_size = bttp->nfree * 2 * sizeof (struct btt_flog);
	flog_size = roundup(flog_size, BTT_ALIGNMENT);

	uint32_t internal_lbasize = bttp->lbasize;
	if (internal_lbasize < BTT_MIN_LBA)
		internal_lbasize = BTT_MIN_LBA;
	internal_lbasize =
		roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT);
	LOG(4, "adjusted internal_lbasize %u", internal_lbasize);

	uint64_t total_nlba = 0;
	uint64_t rawsize = bttp->rawsize;
	int arena_num = 0;
	off_t arena_off = 0;

	/*
	 * for each arena...
	 */
	while (rawsize >= BTT_MIN_SIZE) {
		LOG(4, "layout arena %u", arena_num);

		uint64_t arena_rawsize = rawsize;
		if (arena_rawsize > BTT_MAX_ARENA) {
			arena_rawsize = BTT_MAX_ARENA;
		}
		rawsize -= arena_rawsize;
		arena_num++;

		uint64_t arena_datasize = arena_rawsize;
		arena_datasize -= 2 * sizeof (struct btt_info);
		arena_datasize -= flog_size;

		/* allow for map alignment padding */
		uint64_t internal_nlba = (arena_datasize - BTT_ALIGNMENT) /
			(internal_lbasize + BTT_MAP_ENTRY_SIZE);
		uint64_t external_nlba = internal_nlba - bttp->nfree;

		LOG(4, "internal_nlba %zu external_nlba %zu",
				internal_nlba, external_nlba);

		total_nlba += external_nlba;

		/*
		 * The rest of the loop body calculates the metadata structures
		 * and lays them out for this arena.  So only continue if
		 * the write flag is set.
		 */
		if (!write)
			continue;

		uint64_t mapsize = roundup(external_nlba * BTT_MAP_ENTRY_SIZE,
							BTT_ALIGNMENT);
		arena_datasize -= mapsize;

		ASSERT(arena_datasize / internal_lbasize >= internal_nlba);

		/*
		 * Calculate offsets for the BTT info block.  These are
		 * all relative to the beginning of the arena.
		 */
		uint64_t nextoff;
		if (rawsize)
			nextoff = arena_rawsize;
		else
			nextoff = 0;
		uint64_t infooff = arena_rawsize - sizeof (struct btt_info);
		uint64_t flogoff = infooff - flog_size;
		uint64_t mapoff = flogoff - mapsize;
		uint64_t dataoff = sizeof (struct btt_info);

		LOG(4, "nextoff 0x%016lx", nextoff);
		LOG(4, "dataoff 0x%016lx", dataoff);
		LOG(4, "mapoff  0x%016lx", mapoff);
		LOG(4, "flogoff 0x%016lx", flogoff);
		LOG(4, "infooff 0x%016lx", infooff);

		ASSERTeq(arena_datasize, mapoff - dataoff);

		/* write out the initial map, identity style */
		off_t map_entry_off = arena_off + mapoff;
		uint32_t *mapp = NULL;
		int mlen = 0;
		int next_index = 0;
		int remaining = 0;
		for (int i = 0; i < external_nlba; i++) {
			if (remaining == 0) {
				/* flush previous mapped area */
				if (mapp != NULL) {
					/*
					 * Protect the memory again
					 * (debug version only).
					 * If (mapp != NULL) it had to be
					 * unprotected earlier.
					 */
					RANGE_RO(mapp, mlen);

					(*bttp->ns_cbp->nssync)(bttp->ns,
						lane, mapp, mlen);
				}
				/* request a mapping of remaining map area */
				mlen = (*bttp->ns_cbp->nsmap)(bttp->ns,
					lane, (void **)&mapp,
					(external_nlba - i) * sizeof (uint32_t),
					map_entry_off);

				if (mlen < 0)
					return -1;

				/* unprotect the memory (debug version only) */
				RANGE_RW(mapp, mlen);

				remaining = mlen;
				next_index = 0;
			}
			mapp[next_index++] = htole32(i | BTT_MAP_ENTRY_ZERO);
			remaining -= sizeof (uint32_t);
		}

		/* protect the memory again (debug version only) */
		RANGE_RO(mapp, mlen);

		/* flush previous mapped area */
		if (mapp != NULL)
			(*bttp->ns_cbp->nssync)(bttp->ns, lane, mapp, mlen);

		/* write out the initial flog */
		off_t flog_entry_off = arena_off + flogoff;
		uint32_t next_free_lba = external_nlba;
		for (int i = 0; i < bttp->nfree; i++) {
			struct btt_flog flog;
			flog.lba = 0;
			flog.old_map = flog.new_map =
				htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
			flog.seq = htole32(1);

			/*
			 * Write both btt_flog structs in the pair, writing
			 * the second one as all zeros.
			 */
			LOG(6, "flog[%d] entry off %zu initial %u + zero = %u",
					i, flog_entry_off, next_free_lba,
					next_free_lba | BTT_MAP_ENTRY_ZERO);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
					sizeof (flog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			LOG(6, "flog[%d] entry off %zu zeros",
					i, flog_entry_off);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
					sizeof (Zflog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			next_free_lba++;
		}

		/*
		 * Construct the BTT info block and write it out
		 * at both the beginning and end of the arena.
		 */
		struct btt_info info;
		memset(&info, '\0', sizeof (info));
		memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
		memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
		info.major = htole16(BTTINFO_MAJOR_VERSION);
		info.minor = htole16(BTTINFO_MINOR_VERSION);
		info.external_lbasize = htole32(bttp->lbasize);
		info.external_nlba = htole32(external_nlba);
		info.internal_lbasize = htole32(internal_lbasize);
		info.internal_nlba = htole32(internal_nlba);
		info.nfree = htole32(bttp->nfree);
		info.infosize = htole32(sizeof (info));
		info.nextoff = htole64(nextoff);
		info.dataoff = htole64(dataoff);
		info.mapoff = htole64(mapoff);
		info.flogoff = htole64(flogoff);
		info.infooff = htole64(infooff);

		util_checksum(&info, sizeof (info), &info.checksum, 1);

		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off) < 0)
			return -1;
		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off + nextoff) < 0)
			return -1;

		arena_off += nextoff;
	}

	ASSERTeq(bttp->narena, arena_num);

	bttp->nlba = total_nlba;

	if (write) {
		/*
		 * The layout is written now, so load up the arenas.
		 */
		return read_arenas(bttp, lane, bttp->narena);
	}

	return 0;
}
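
The arena-count rule at the top of write_layout() in worked form; the BTT_MAX_ARENA and BTT_MIN_SIZE values below are assumptions for illustration (the real definitions live in the library's BTT layout header):

#include <stdio.h>
#include <stdint.h>

#define BTT_MAX_ARENA	(1ULL << 39)		/* assumed: 512 GiB per arena */
#define BTT_MIN_SIZE	(16 * (1ULL << 20))	/* assumed: 16 MiB minimum */

static uint64_t
count_arenas(uint64_t rawsize)
{
	uint64_t narena = rawsize / BTT_MAX_ARENA;	/* full arenas */
	if (rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
		narena++;	/* remainder big enough for one more */
	return narena;
}

int
main(void)
{
	/* 1 TiB of raw space: two full arenas and no remainder arena */
	printf("%llu\n", (unsigned long long)count_arenas(1ULL << 40));
	return 0;
}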
Example 6
/*
 * pmemlog_appendv -- add gathered data to a log memory pool
 */
int
pmemlog_appendv(PMEMlogpool *plp, const struct iovec *iov, int iovcnt)
{
    LOG(3, "plp %p iovec %p iovcnt %d", plp, iov, iovcnt);

    int ret = 0;    /* success */
    int i;

    ASSERT(iovcnt > 0);

    if (plp->rdonly) {
        ERR("can't append to read-only log");
        errno = EROFS;
        return -1;
    }

    if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
        ERR("!pthread_rwlock_wrlock");
        return -1;
    }

    /* get the current values */
    uint64_t end_offset = le64toh(plp->end_offset);
    uint64_t write_offset = le64toh(plp->write_offset);

    if (write_offset >= end_offset) {
        /* no space left */
        errno = ENOSPC;
        ERR("!pmemlog_appendv");
        ret = -1;
        goto end;
    }

    char *data = plp->addr;
    uint64_t count = 0;
    char *buf;

    /* calculate required space */
    for (i = 0; i < iovcnt; ++i)
        count += iov[i].iov_len;

    /* check if there is enough free space */
    if (count > (end_offset - write_offset)) {
        errno = ENOSPC;
        ret = -1;
        goto end;
    }

    /* append the data */
    for (i = 0; i < iovcnt; ++i) {
        buf = iov[i].iov_base;
        count = iov[i].iov_len;

        /*
         * unprotect the log space range, where the new data will be
         * stored (debug version only)
         */
        RANGE_RW(&data[write_offset], count, plp->is_dax);

        if (plp->is_pmem)
            pmem_memcpy_nodrain(&data[write_offset], buf, count);
        else
            memcpy(&data[write_offset], buf, count);

        /*
         * protect the log space range (debug version only)
         */
        RANGE_RO(&data[write_offset], count, plp->is_dax);

        write_offset += count;
    }

    /* persist the data and the metadata */
    pmemlog_persist(plp, write_offset);

end:
    util_rwlock_unlock(plp->rwlockp);

    return ret;
}
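
Records appended this way can be read back with pmemlog_walk(); a minimal sketch, again with a hypothetical pool path (chunksize 0 asks for the whole used log in a single callback):

#include <stdio.h>
#include <libpmemlog.h>

/* print each delivered chunk; returning 0 would stop the walk early */
static int
dump_chunk(const void *buf, size_t len, void *arg)
{
	(void) arg;
	fwrite(buf, 1, len, stdout);
	return 1;	/* keep walking */
}

int
main(void)
{
	PMEMlogpool *plp = pmemlog_open("/pmem/log.pool"); /* hypothetical */
	if (plp == NULL) {
		perror("pmemlog_open");
		return 1;
	}

	pmemlog_walk(plp, 0, dump_chunk, NULL);

	pmemlog_close(plp);
	return 0;
}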
Example 7
File: blk.c Project: mdalecki/nvml
/*
 * pmemblk_map_common -- (internal) map a block memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * Passing in bsize == 0 means a valid pool header must exist (which
 * will supply the block size).
 */
static PMEMblk *
pmemblk_map_common(int fd, size_t bsize, int rdonly)
{
	LOG(3, "fd %d bsize %zu rdonly %d", fd, bsize, rdonly);

	/* things freed by "goto err" if not NULL */
	void *addr = NULL;
	struct btt *bttp = NULL;
	pthread_mutex_t *locks = NULL;

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		LOG(1, "!fstat");
		return NULL;
	}

	if (stbuf.st_size < PMEMBLK_MIN_POOL) {
		LOG(1, "size %zu smaller than %zu",
				stbuf.st_size, PMEMBLK_MIN_POOL);
		errno = EINVAL;
		return NULL;
	}

	if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL)
		return NULL;	/* util_map() set errno, called LOG */

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, stbuf.st_size);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemblk *pbp = addr;

	struct pool_hdr hdr;
	memcpy(&hdr, &pbp->hdr, sizeof (hdr));

	if (util_convert_hdr(&hdr)) {
		/*
		 * valid header found
		 */
		if (strncmp(hdr.signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN)) {
			LOG(1, "wrong pool type: \"%s\"", hdr.signature);

			errno = EINVAL;
			goto err;
		}

		if (hdr.major != BLK_FORMAT_MAJOR) {
			LOG(1, "blk pool version %d (library expects %d)",
				hdr.major, BLK_FORMAT_MAJOR);

			errno = EINVAL;
			goto err;
		}

		size_t hdr_bsize = le32toh(pbp->bsize);
		if (bsize && bsize != hdr_bsize) {
			LOG(1, "wrong bsize (%zu), pool created with bsize %zu",
					bsize, hdr_bsize);
			errno = EINVAL;
			goto err;
		}
		bsize = hdr_bsize;
		LOG(3, "using block size from header: %zu", bsize);

		int retval = util_feature_check(&hdr, BLK_FORMAT_INCOMPAT,
							BLK_FORMAT_RO_COMPAT,
							BLK_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;
	} else {
		/*
		 * no valid header was found
		 */
		if (rdonly) {
			LOG(1, "read-only and no header found");
			errno = EROFS;
			goto err;
		}
		LOG(3, "creating new blk memory pool");

		struct pool_hdr *hdrp = &pbp->hdr;

		memset(hdrp, '\0', sizeof (*hdrp));
		strncpy(hdrp->signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(BLK_FORMAT_MAJOR);
		hdrp->compat_features = htole32(BLK_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(BLK_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(BLK_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);
		hdrp->crtime = htole64((uint64_t)time(NULL));
		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);
		hdrp->checksum = htole64(hdrp->checksum);

		/* store pool's header */
		libpmem_persist(is_pmem, hdrp, sizeof (*hdrp));

		/* create rest of required metadata */
		pbp->bsize = htole32(bsize);
		libpmem_persist(is_pmem, &pbp->bsize, sizeof (bsize));
	}

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	pbp->addr = addr;
	pbp->size = stbuf.st_size;
	pbp->rdonly = rdonly;
	pbp->is_pmem = is_pmem;
	pbp->data = addr + roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN);
	pbp->datasize = (pbp->addr + pbp->size) - pbp->data;

	LOG(4, "data area %p data size %zu bsize %zu",
		pbp->data, pbp->datasize, bsize);

	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 1)
		ncpus = 1;

	bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.uuid,
			ncpus, pbp, &ns_cb);

	if (bttp == NULL)
		goto err;	/* btt_init set errno, called LOG */

	pbp->bttp = bttp;

	pbp->nlane = btt_nlane(pbp->bttp);
	pbp->next_lane = 0;
	if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) {
		LOG(1, "!Malloc for lane locks");
		goto err;
	}

	for (int i = 0; i < pbp->nlane; i++)
		if (pthread_mutex_init(&locks[i], NULL)) {
			LOG(1, "!pthread_mutex_init");
			goto err;
		}

	pbp->locks = locks;

#ifdef DEBUG
	/* initialize debug lock */
	if (pthread_mutex_init(&pbp->write_lock, NULL)) {
		LOG(1, "!pthread_mutex_init");
		goto err;
	}
#endif

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use. It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the data area should be kept read-only for debug version */
	RANGE_RO(pbp->data, pbp->datasize);

	LOG(3, "pbp %p", pbp);
	return pbp;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (locks)
		Free((void *)locks);
	if (bttp)
		btt_fini(bttp);
	util_unmap(addr, stbuf.st_size);
	errno = oerrno;
	return NULL;
}
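
In current libpmemblk the public entry points are pmemblk_create() and pmemblk_open(); a minimal round-trip sketch, with a hypothetical path and an arbitrary 512-byte block size:

#include <stdio.h>
#include <string.h>
#include <libpmemblk.h>

int
main(void)
{
	/* hypothetical path; minimum-sized pool of 512-byte blocks */
	PMEMblkpool *pbp = pmemblk_create("/pmem/blk.pool", 512,
			PMEMBLK_MIN_POOL, 0666);
	if (pbp == NULL) {
		perror("pmemblk_create");
		return 1;
	}

	char buf[512] = "block zero";

	/* the BTT layer initialized above makes each block write atomic */
	if (pmemblk_write(pbp, buf, 0) < 0)
		perror("pmemblk_write");
	if (pmemblk_read(pbp, buf, 0) < 0)
		perror("pmemblk_read");

	pmemblk_close(pbp);
	return 0;
}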