Example #1
File: log.c Project: harrybaa/nvml
/*
 * pmemlog_persist -- (internal) persist data, then metadata
 *
 * On entry, the write lock should be held.
 */
static void
pmemlog_persist(PMEMlogpool *plp, uint64_t new_write_offset)
{
    uint64_t old_write_offset = le64toh(plp->write_offset);
    size_t length = new_write_offset - old_write_offset;

    /* unprotect the log space range (debug version only) */
    RANGE_RW(plp->addr + old_write_offset, length);

    /* persist the data */
    if (plp->is_pmem)
        pmem_drain(); /* data already flushed */
    else
        pmem_msync(plp->addr + old_write_offset, length);

    /* protect the log space range (debug version only) */
    RANGE_RO(plp->addr + old_write_offset, length);

    /* unprotect the pool descriptor (debug version only) */
    RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

    /* write the metadata */
    plp->write_offset = htole64(new_write_offset);

    /* persist the metadata */
    if (plp->is_pmem)
        pmem_persist(&plp->write_offset, sizeof (plp->write_offset));
    else
        pmem_msync(&plp->write_offset, sizeof (plp->write_offset));

    /* set the write-protection again (debug version only) */
    RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);
}
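
The bare pmem_drain() above looks odd in isolation; it pairs with the pmem_memcpy_nodrain() calls in the append paths (examples #7 and #8), which flush the new data from the CPU caches but postpone the drain barrier so several copies can share a single drain. A minimal sketch of that idiom, using only documented libpmem calls (copy_two is a hypothetical helper):

#include <libpmem.h>

/* sketch: several copies to pmem sharing one drain barrier */
static void
copy_two(void *dst1, const void *src1, size_t len1,
		void *dst2, const void *src2, size_t len2)
{
	pmem_memcpy_nodrain(dst1, src1, len1);	/* flushed, not yet drained */
	pmem_memcpy_nodrain(dst2, src2, len2);	/* flushed, not yet drained */
	pmem_drain();	/* one barrier covers both copies */
}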
Example #2
File: blk.c Project: xguo/nvml
/*
 * nszero -- (internal) zero data in the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * zero the memory pool containing the BTT layout.
 */
static int
nszero(void *ns, unsigned lane, size_t count, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %ju", pbp, lane, count, off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	pmem_memset_persist(dest, 0, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

	return 0;
}
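
nszero() is one of the namespace callbacks that blk.c hands to btt_init(), alongside the nswrite(), nsmap() and nssync() entries visible in the other examples. A sketch of the wiring, with the member names taken from the ns_cbp calls in example #9 and everything else assumed:

/* sketch: namespace callback table passed to btt_init() (layout assumed) */
static struct ns_callback ns_cb = {
	.nsread = nsread,	/* assumed read counterpart to nswrite */
	.nswrite = nswrite,
	.nszero = nszero,
	.nsmap = nsmap,
	.nssync = nssync,
};

/* illustrative: bttp = btt_init(..., pbp, &ns_cb); */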
Example #3
File: log.c Project: harrybaa/nvml
/*
 * pmemlog_rewind -- discard all data, resetting a log memory pool to empty
 */
void
pmemlog_rewind(PMEMlogpool *plp)
{
    LOG(3, "plp %p", plp);

    if (plp->rdonly) {
        ERR("can't rewind read-only log");
        errno = EROFS;
        return;
    }

    if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
        ERR("!pthread_rwlock_wrlock");
        return;
    }

    /* unprotect the pool descriptor (debug version only) */
    RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

    plp->write_offset = plp->start_offset;
    if (plp->is_pmem)
        pmem_persist(&plp->write_offset, sizeof (uint64_t));
    else
        pmem_msync(&plp->write_offset, sizeof (uint64_t));

    /* set the write-protection again (debug version only) */
    RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

    if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
        ERR("!pthread_rwlock_unlock");
}
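
From application code the call is a one-liner; a minimal sketch against the public libpmemlog API (the pool path is illustrative):

#include <libpmemlog.h>

int
main(void)
{
	PMEMlogpool *plp = pmemlog_open("/mnt/pmem/mylog");
	if (plp == NULL)
		return 1;

	pmemlog_rewind(plp);	/* discard everything appended so far */

	pmemlog_close(plp);
	return 0;
}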
Example #4
File: blk.c Project: jebtang/nvml
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, int lane, const void *buf, size_t count, off_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %d count %zu off %lld",
			pbp, lane, count, (long long)off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%lld) past end of data area (%zu)",
				(long long)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	if ((errno = pthread_mutex_lock(&pbp->write_lock))) {
		ERR("!pthread_mutex_lock");
		return -1;
	}
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	if (pbp->is_pmem)
		pmem_memcpy_nodrain(dest, buf, count);
	else
		memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	if ((errno = pthread_mutex_unlock(&pbp->write_lock)))
		ERR("!pthread_mutex_unlock");
#endif

	if (pbp->is_pmem)
		pmem_drain();
	else
		pmem_msync(dest, count);

	return 0;
}
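
Every store in these examples is bracketed by RANGE_RW()/RANGE_RO(): in debug builds the pool mapping is kept read-only and only the range being written is briefly made writable, so stray stores fault immediately. The excerpts don't show the macro definitions; a plausible sketch, with the util_range_* helpers assumed:

/* sketch: assumed shape of the debug protection macros */
#ifdef DEBUG
#define RANGE_RW(addr, len) util_range_rw(addr, len)	/* mprotect to PROT_READ|PROT_WRITE */
#define RANGE_RO(addr, len) util_range_ro(addr, len)	/* mprotect to PROT_READ */
#else
#define RANGE_RW(addr, len) do {} while (0)	/* no-ops in release builds */
#define RANGE_RO(addr, len) do {} while (0)
#endif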
Example #5
File: blk.c Project: xguo/nvml
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, unsigned lane, const void *buf, size_t count,
		uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %ju", pbp, lane, count, off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	util_mutex_lock(&pbp->write_lock);
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	if (pbp->is_pmem)
		pmem_memcpy_nodrain(dest, buf, count);
	else
		memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	util_mutex_unlock(&pbp->write_lock);
#endif

	if (pbp->is_pmem)
		pmem_drain();
	else
		pmem_msync(dest, count);

	return 0;
}
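
Compared with example #4, this version hides the locking boilerplate behind util_mutex_lock()/util_mutex_unlock(). A sketch of what such a wrapper presumably does, reusing the errno idiom seen in example #4 (the FATAL-style abort is an assumption):

#include <errno.h>
#include <pthread.h>

/* sketch: assumed shape of the util_mutex_* wrappers */
static inline void
util_mutex_lock(pthread_mutex_t *mutexp)
{
	if ((errno = pthread_mutex_lock(mutexp)))
		FATAL("!pthread_mutex_lock");	/* assumed: no sane recovery */
}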
Example #6
File: blk.c Project: mdalecki/nvml
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, int lane, const void *buf, size_t count, off_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %d count %zu off %lld",
			pbp, lane, count, (long long)off);

	if (off + count > pbp->datasize) {
		LOG(1, "offset + count (%lld) past end of data area (%zu)",
				(long long)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	if (pthread_mutex_lock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_lock");
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	if (pthread_mutex_unlock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_unlock");
#endif

	libpmem_persist(pbp->is_pmem, dest, count);

	return 0;
}
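
This older fork still uses a combined libpmem_persist(is_pmem, ...) entry point instead of the split pmem_drain()/pmem_msync() calls of examples #4 and #5; presumably it folded the is_pmem branch into one call, roughly:

#include <libpmem.h>

/* sketch: assumed behavior of the legacy combined entry point */
static void
libpmem_persist_sketch(int is_pmem, void *addr, size_t len)
{
	if (is_pmem)
		pmem_persist(addr, len);	/* flush CPU caches and drain */
	else
		pmem_msync(addr, len);		/* msync(2)-based fallback */
}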
Example #7
File: log.c Project: harrybaa/nvml
/*
 * pmemlog_appendv -- add gathered data to a log memory pool
 */
int
pmemlog_appendv(PMEMlogpool *plp, const struct iovec *iov, int iovcnt)
{
    LOG(3, "plp %p iovec %p iovcnt %d", plp, iov, iovcnt);

    int ret = 0; /* success */
    int i;

    ASSERT(iovcnt > 0);

    if (plp->rdonly) {
        ERR("can't append to read-only log");
        errno = EROFS;
        return -1;
    }

    if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
        ERR("!pthread_rwlock_wrlock");
        return -1;
    }

    /* get the current values */
    uint64_t end_offset = le64toh(plp->end_offset);
    uint64_t write_offset = le64toh(plp->write_offset);

    if (write_offset >= end_offset) {
        /* no space left */
        errno = ENOSPC;
        ERR("!pmemlog_appendv");
        ret = -1;
    } else {
        char *data = plp->addr;
        uint64_t count = 0;
        char *buf;

        /* calculate required space */
        for (i = 0; i < iovcnt; ++i)
            count += iov[i].iov_len;

        /* check if there is enough free space */
        if (count > (end_offset - write_offset)) {
            errno = ENOSPC;
            ret = -1;
        } else {
            /* append the data */
            for (i = 0; i < iovcnt; ++i) {
                buf = iov[i].iov_base;
                count = iov[i].iov_len;

                /*
                 * unprotect the log space range,
                 * where the new data will be stored
                 * (debug version only)
                 */
                RANGE_RW(&data[write_offset], count);

                if (plp->is_pmem)
                    pmem_memcpy_nodrain(&data[write_offset],
                                        buf, count);
                else
                    memcpy(&data[write_offset], buf, count);

                /*
                 * protect the log space range
                 * (debug version only)
                 */
                RANGE_RO(&data[write_offset], count);

                write_offset += count;
            }
        }
    }

    /* persist the data and the metadata only if there was no error */
    if (ret == 0)
        pmemlog_persist(plp, write_offset);

    int oerrno = errno;
    if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
        ERR("!pthread_rwlock_unlock");
    errno = oerrno;

    return ret;
}
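
A caller-side sketch of gathering a header and payload into the log with a single space check (names are illustrative):

#include <sys/uio.h>
#include <libpmemlog.h>

/* sketch: two buffers appended in one call; fails with ENOSPC as a unit */
static int
append_record(PMEMlogpool *plp, void *hdr, size_t hdrlen,
		void *payload, size_t paylen)
{
	struct iovec iov[2] = {
		{ .iov_base = hdr, .iov_len = hdrlen },
		{ .iov_base = payload, .iov_len = paylen },
	};

	return pmemlog_appendv(plp, iov, 2);
}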
Example #8
File: log.c Project: harrybaa/nvml
/*
 * pmemlog_append -- add data to a log memory pool
 */
int
pmemlog_append(PMEMlogpool *plp, const void *buf, size_t count)
{
    int ret = 0;

    LOG(3, "plp %p buf %p count %zu", plp, buf, count);

    if (plp->rdonly) {
        ERR("can't append to read-only log");
        errno = EROFS;
        return -1;
    }

    if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
        ERR("!pthread_rwlock_wrlock");
        return -1;
    }

    /* get the current values */
    uint64_t end_offset = le64toh(plp->end_offset);
    uint64_t write_offset = le64toh(plp->write_offset);

    if (write_offset >= end_offset) {
        /* no space left */
        errno = ENOSPC;
        ERR("!pmemlog_append");
        ret = -1;
    } else {
        /* make sure we don't write past the available space */
        if (count > (end_offset - write_offset)) {
            errno = ENOSPC;
            ERR("!pmemlog_append");
            ret = -1;
        } else {
            char *data = plp->addr;

            /*
             * unprotect the log space range,
             * where the new data will be stored
             * (debug version only)
             */
            RANGE_RW(&data[write_offset], count);

            if (plp->is_pmem)
                pmem_memcpy_nodrain(&data[write_offset],
                                    buf, count);
            else
                memcpy(&data[write_offset], buf, count);

            /* protect the log space range (debug version only) */
            RANGE_RO(&data[write_offset], count);

            write_offset += count;
        }
    }

    /* persist the data and the metadata only if there was no error */
    if (ret == 0)
        pmemlog_persist(plp, write_offset);

    int oerrno = errno;
    if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
        ERR("!pthread_rwlock_unlock");
    errno = oerrno;

    return ret;
}
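
A matching caller-side sketch, using pmemlog_tell() and pmemlog_nbyte() from the public API to report usage afterwards (log_event is a hypothetical helper):

#include <stdio.h>
#include <libpmemlog.h>

/* sketch: append one record and report how full the log is */
static int
log_event(PMEMlogpool *plp, const void *msg, size_t len)
{
	if (pmemlog_append(plp, msg, len) < 0) {
		perror("pmemlog_append");	/* EROFS or ENOSPC per the code above */
		return -1;
	}

	printf("log holds %lld of %zu usable bytes\n",
			pmemlog_tell(plp), pmemlog_nbyte(plp));
	return 0;
}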
Example #9
File: btt.c Project: mdalecki/nvml
/*
 * write_layout -- (internal) write out the initial btt metadata layout
 *
 * Called with write == 1 only once in the lifetime of a btt namespace, when
 * the first write happens.  The caller of this routine is responsible for
 * locking out multiple threads.  This routine doesn't read anything -- by the
 * time it is called, it is known there's no layout in the namespace and a new
 * layout should be written.
 *
 * Calling with write == 0 tells this routine to do the calculations for
 * bttp->narena and bttp->nlba, but not to write out any metadata.
 *
 * If successful, sets bttp->layout to 1 and returns 0.  Otherwise -1
 * is returned and errno is set, and bttp->layout remains 0 so that
 * later attempts to write will try again to create the layout.
 */
static int
write_layout(struct btt *bttp, int lane, int write)
{
	LOG(3, "bttp %p lane %d write %d", bttp, lane, write);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
	ASSERT(bttp->nfree);

	/*
	 * The number of arenas is the number of full arenas of size
	 * BTT_MAX_ARENA that fit into rawsize, plus one more arena
	 * if the remainder is at least BTT_MIN_SIZE.
	 */
	bttp->narena = bttp->rawsize / BTT_MAX_ARENA;
	if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
		bttp->narena++;
	LOG(4, "narena %u", bttp->narena);

	int flog_size = bttp->nfree * 2 * sizeof (struct btt_flog);
	flog_size = roundup(flog_size, BTT_ALIGNMENT);

	uint32_t internal_lbasize = bttp->lbasize;
	if (internal_lbasize < BTT_MIN_LBA)
		internal_lbasize = BTT_MIN_LBA;
	internal_lbasize =
		roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT);
	LOG(4, "adjusted internal_lbasize %u", internal_lbasize);

	uint64_t total_nlba = 0;
	uint64_t rawsize = bttp->rawsize;
	int arena_num = 0;
	off_t arena_off = 0;

	/*
	 * for each arena...
	 */
	while (rawsize >= BTT_MIN_SIZE) {
		LOG(4, "layout arena %u", arena_num);

		uint64_t arena_rawsize = rawsize;
		if (arena_rawsize > BTT_MAX_ARENA) {
			arena_rawsize = BTT_MAX_ARENA;
		}
		rawsize -= arena_rawsize;
		arena_num++;

		uint64_t arena_datasize = arena_rawsize;
		arena_datasize -= 2 * sizeof (struct btt_info);
		arena_datasize -= flog_size;

		/* allow for map alignment padding */
		uint64_t internal_nlba = (arena_datasize - BTT_ALIGNMENT) /
			(internal_lbasize + BTT_MAP_ENTRY_SIZE);
		uint64_t external_nlba = internal_nlba - bttp->nfree;

		LOG(4, "internal_nlba %ju external_nlba %ju",
				internal_nlba, external_nlba);

		total_nlba += external_nlba;

		/*
		 * The rest of the loop body calculates the metadata
		 * structures and lays them out for this arena, so skip
		 * it unless the write flag is set.
		 */
		if (!write)
			continue;

		uint64_t mapsize = roundup(external_nlba * BTT_MAP_ENTRY_SIZE,
							BTT_ALIGNMENT);
		arena_datasize -= mapsize;

		ASSERT(arena_datasize / internal_lbasize >= internal_nlba);

		/*
		 * Calculate offsets for the BTT info block.  These are
		 * all relative to the beginning of the arena.
		 */
		uint64_t nextoff;
		if (rawsize)
			nextoff = arena_rawsize;
		else
			nextoff = 0;
		uint64_t infooff = arena_rawsize - sizeof (struct btt_info);
		uint64_t flogoff = infooff - flog_size;
		uint64_t mapoff = flogoff - mapsize;
		uint64_t dataoff = sizeof (struct btt_info);

		LOG(4, "nextoff 0x%016lx", nextoff);
		LOG(4, "dataoff 0x%016lx", dataoff);
		LOG(4, "mapoff  0x%016lx", mapoff);
		LOG(4, "flogoff 0x%016lx", flogoff);
		LOG(4, "infooff 0x%016lx", infooff);

		ASSERTeq(arena_datasize, mapoff - dataoff);

		/* write out the initial map, identity style */
		off_t map_entry_off = arena_off + mapoff;
		uint32_t *mapp = NULL;
		int mlen = 0;
		int next_index = 0;
		int remaining = 0;
		for (int i = 0; i < external_nlba; i++) {
			if (remaining == 0) {
				/* flush previous mapped area */
				if (mapp != NULL) {
					/*
					 * Protect the memory again
					 * (debug version only).
					 * If (mapp != NULL) it had to be
					 * unprotected earlier.
					 */
					RANGE_RO(mapp, mlen);

					(*bttp->ns_cbp->nssync)(bttp->ns,
						lane, mapp, mlen);
				}
				/* request a mapping of remaining map area */
				mlen = (*bttp->ns_cbp->nsmap)(bttp->ns,
					lane, (void **)&mapp,
					(external_nlba - i) * sizeof (uint32_t),
					map_entry_off);

				if (mlen < 0)
					return -1;

				/* unprotect the memory (debug version only) */
				RANGE_RW(mapp, mlen);

				remaining = mlen;
				next_index = 0;
			}
			mapp[next_index++] = htole32(i | BTT_MAP_ENTRY_ZERO);
			remaining -= sizeof (uint32_t);
		}

		/* protect the memory again (debug version only) */
		RANGE_RO(mapp, mlen);

		/* flush previous mapped area */
		if (mapp != NULL)
			(*bttp->ns_cbp->nssync)(bttp->ns, lane, mapp, mlen);

		/* write out the initial flog */
		off_t flog_entry_off = arena_off + flogoff;
		uint32_t next_free_lba = external_nlba;
		for (int i = 0; i < bttp->nfree; i++) {
			struct btt_flog flog;
			flog.lba = 0;
			flog.old_map = flog.new_map =
				htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
			flog.seq = htole32(1);

			/*
			 * Write both btt_flog structs in the pair, writing
			 * the second one as all zeros.
			 */
			LOG(6, "flog[%d] entry off %zu initial %u + zero = %u",
					i, flog_entry_off, next_free_lba,
					next_free_lba | BTT_MAP_ENTRY_ZERO);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
					sizeof (flog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			LOG(6, "flog[%d] entry off %zu zeros",
					i, flog_entry_off);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
					sizeof (Zflog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			next_free_lba++;
		}

		/*
		 * Construct the BTT info block and write it out
		 * at both the beginning and end of the arena.
		 */
		struct btt_info info;
		memset(&info, '\0', sizeof (info));
		memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
		memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
		info.major = htole16(BTTINFO_MAJOR_VERSION);
		info.minor = htole16(BTTINFO_MINOR_VERSION);
		info.external_lbasize = htole32(bttp->lbasize);
		info.external_nlba = htole32(external_nlba);
		info.internal_lbasize = htole32(internal_lbasize);
		info.internal_nlba = htole32(internal_nlba);
		info.nfree = htole32(bttp->nfree);
		info.infosize = htole32(sizeof (info));
		info.nextoff = htole64(nextoff);
		info.dataoff = htole64(dataoff);
		info.mapoff = htole64(mapoff);
		info.flogoff = htole64(flogoff);
		info.infooff = htole64(infooff);

		util_checksum(&info, sizeof (info), &info.checksum, 1);

		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off) < 0)
			return -1;
		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off + nextoff) < 0)
			return -1;

		arena_off += nextoff;
	}

	ASSERTeq(bttp->narena, arena_num);

	bttp->nlba = total_nlba;

	if (write) {
		/*
		 * The layout is written now, so load up the arenas.
		 */
		return read_arenas(bttp, lane, bttp->narena);
	}

	return 0;
}
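
For reference, the offsets computed above imply this on-media arena layout (reconstructed from the dataoff/mapoff/flogoff/infooff calculations, back to front from the end of the arena):

/*
 * sketch: arena layout implied by write_layout()
 *
 * arena_off                                          arena_off + arena_rawsize
 * |                                                                        |
 * +----------+--------------------------------+---------+--------+----------+
 * | btt_info | data (internal_lbasize blocks) |   map   |  flog  | btt_info |
 * +----------+--------------------------------+---------+--------+----------+
 * ^ dataoff = sizeof(info)                    ^ mapoff  ^ flogoff ^ infooff
 */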