/*
 * pmemlog_persist -- (internal) persist data, then metadata
 *
 * On entry, the write lock should be held.
 */
static void
pmemlog_persist(PMEMlogpool *plp, uint64_t new_write_offset)
{
	uint64_t old_write_offset = le64toh(plp->write_offset);
	size_t length = new_write_offset - old_write_offset;

	/* unprotect the log space range (debug version only) */
	RANGE_RW(plp->addr + old_write_offset, length);

	/* persist the data */
	if (plp->is_pmem)
		pmem_drain(); /* data already flushed */
	else
		pmem_msync(plp->addr + old_write_offset, length);

	/* protect the log space range (debug version only) */
	RANGE_RO(plp->addr + old_write_offset, length);

	/* unprotect the pool descriptor (debug version only) */
	RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	/* write the metadata */
	plp->write_offset = htole64(new_write_offset);

	/* persist the metadata */
	if (plp->is_pmem)
		pmem_persist(&plp->write_offset, sizeof (plp->write_offset));
	else
		pmem_msync(&plp->write_offset, sizeof (plp->write_offset));

	/* set the write-protection again (debug version only) */
	RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);
}
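/*
 * Example (not part of the library): a minimal sketch of the same
 * "data first, then the commit word" ordering that pmemlog_persist() relies
 * on, written against the public libpmem API (pmem_persist(3) and
 * pmem_msync(3)).  The rec_log layout and the append_record() helper are
 * hypothetical; the point is only that the data area becomes durable before
 * the metadata that makes it visible.
 */
#include <stdint.h>
#include <string.h>
#include <libpmem.h>

struct rec_log {
	uint64_t committed_len;	/* metadata: how many bytes are valid */
	char data[4096];	/* data area */
};

static void
append_record(struct rec_log *log, int is_pmem, const void *buf, size_t len)
{
	char *dst = log->data + log->committed_len;

	memcpy(dst, buf, len);

	/* step 1: make the new data durable before publishing it */
	if (is_pmem)
		pmem_persist(dst, len);
	else
		pmem_msync(dst, len);

	/* step 2: only then update and persist the commit word */
	log->committed_len += len;
	if (is_pmem)
		pmem_persist(&log->committed_len, sizeof (log->committed_len));
	else
		pmem_msync(&log->committed_len, sizeof (log->committed_len));
}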
/*
 * nszero -- (internal) zero data in the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * zero the memory pool containing the BTT layout.
 */
static int
nszero(void *ns, unsigned lane, size_t count, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %ju", pbp, lane, count, off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	pmem_memset_persist(dest, 0, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

	return 0;
}
/*
 * pmemlog_rewind -- discard all data, resetting a log memory pool to empty
 */
void
pmemlog_rewind(PMEMlogpool *plp)
{
	LOG(3, "plp %p", plp);

	if (plp->rdonly) {
		ERR("can't rewind read-only log");
		errno = EROFS;
		return;
	}

	if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
		ERR("!pthread_rwlock_wrlock");
		return;
	}

	/* unprotect the pool descriptor (debug version only) */
	RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	plp->write_offset = plp->start_offset;
	if (plp->is_pmem)
		pmem_persist(&plp->write_offset, sizeof (uint64_t));
	else
		pmem_msync(&plp->write_offset, sizeof (uint64_t));

	/* set the write-protection again (debug version only) */
	RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
		ERR("!pthread_rwlock_unlock");
}
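/*
 * Example (not part of the library): hedged caller-side sketch of the rewind
 * API.  The pool path is supplied by the caller; the calls used are the
 * documented libpmemlog entry points (pmemlog_open, pmemlog_append,
 * pmemlog_tell, pmemlog_rewind, pmemlog_close).
 */
#include <stdio.h>
#include <libpmemlog.h>

int
log_reset_example(const char *path)
{
	PMEMlogpool *plp = pmemlog_open(path);
	if (plp == NULL)
		return -1;

	if (pmemlog_append(plp, "hello", 5) < 0) {
		pmemlog_close(plp);
		return -1;
	}

	printf("used bytes before rewind: %lld\n", pmemlog_tell(plp));

	/* discard everything; the log is empty again afterwards */
	pmemlog_rewind(plp);
	printf("used bytes after rewind: %lld\n", pmemlog_tell(plp));

	pmemlog_close(plp);
	return 0;
}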
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, int lane, const void *buf, size_t count, off_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %d count %zu off %lld",
			pbp, lane, count, (long long)off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%lld) past end of data area (%zu)",
				(long long)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	if ((errno = pthread_mutex_lock(&pbp->write_lock))) {
		ERR("!pthread_mutex_lock");
		return -1;
	}
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	if (pbp->is_pmem)
		pmem_memcpy_nodrain(dest, buf, count);
	else
		memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	if ((errno = pthread_mutex_unlock(&pbp->write_lock)))
		ERR("!pthread_mutex_unlock");
#endif

	if (pbp->is_pmem)
		pmem_drain();
	else
		pmem_msync(dest, count);

	return 0;
}
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, unsigned lane, const void *buf, size_t count, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %ju", pbp, lane, count, off);

	if (off + count > pbp->datasize) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	util_mutex_lock(&pbp->write_lock);
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	if (pbp->is_pmem)
		pmem_memcpy_nodrain(dest, buf, count);
	else
		memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	util_mutex_unlock(&pbp->write_lock);
#endif

	if (pbp->is_pmem)
		pmem_drain();
	else
		pmem_msync(dest, count);

	return 0;
}
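/*
 * Example (not part of the library): a sketch of how callbacks like
 * nswrite()/nszero() above are handed to the btt module.  The struct layout
 * and field types shown here are assumptions inferred from the call sites in
 * write_layout() (bttp->ns_cbp->nswrite(...), ->nsmap(...), ->nssync(...)),
 * not the library's authoritative definitions.
 */
#include <stddef.h>
#include <stdint.h>

struct ns_callback_sketch {
	int (*nswrite)(void *ns, unsigned lane, const void *buf,
			size_t count, uint64_t off);
	int (*nszero)(void *ns, unsigned lane, size_t count, uint64_t off);
	int (*nsmap)(void *ns, unsigned lane, void **addrp,
			size_t len, uint64_t off);
	void (*nssync)(void *ns, unsigned lane, void *addr, size_t len);
};

/* hypothetical wiring, mirroring how libpmemblk registers its namespace */
static const struct ns_callback_sketch pmemblk_ns_cb = {
	.nswrite = nswrite,
	.nszero = nszero,
	/* .nsmap and .nssync would point at the matching pmemblk helpers */
};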
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static int
nswrite(void *ns, int lane, const void *buf, size_t count, off_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %d count %zu off %zu", pbp, lane, count, off);

	if (off + count >= pbp->datasize) {
		LOG(1, "offset + count (%zu) past end of data area (%zu)",
				off + count, pbp->datasize - 1);
		errno = EINVAL;
		return -1;
	}

	void *dest = pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	if (pthread_mutex_lock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_lock");
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	if (pthread_mutex_unlock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_unlock");
#endif

	libpmem_persist(pbp->is_pmem, dest, count);

	return 0;
}
/*
 * pmemlog_appendv -- add gathered data to a log memory pool
 */
int
pmemlog_appendv(PMEMlogpool *plp, const struct iovec *iov, int iovcnt)
{
	LOG(3, "plp %p iovec %p iovcnt %d", plp, iov, iovcnt);

	int ret = 0; /* success */
	int i;

	ASSERT(iovcnt > 0);

	if (plp->rdonly) {
		ERR("can't append to read-only log");
		errno = EROFS;
		return -1;
	}

	if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
		ERR("!pthread_rwlock_wrlock");
		return -1;
	}

	/* get the current values */
	uint64_t end_offset = le64toh(plp->end_offset);
	uint64_t write_offset = le64toh(plp->write_offset);

	if (write_offset >= end_offset) {
		/* no space left */
		errno = ENOSPC;
		ERR("!pmemlog_appendv");
		ret = -1;
	} else {
		char *data = plp->addr;
		uint64_t count = 0;
		char *buf;

		/* calculate required space */
		for (i = 0; i < iovcnt; ++i)
			count += iov[i].iov_len;

		/* check if there is enough free space */
		if (count > (end_offset - write_offset)) {
			errno = ENOSPC;
			ret = -1;
		} else {
			/* append the data */
			for (i = 0; i < iovcnt; ++i) {
				buf = iov[i].iov_base;
				count = iov[i].iov_len;

				/*
				 * unprotect the log space range,
				 * where the new data will be stored
				 * (debug version only)
				 */
				RANGE_RW(&data[write_offset], count);

				if (plp->is_pmem)
					pmem_memcpy_nodrain(
						&data[write_offset],
						buf, count);
				else
					memcpy(&data[write_offset],
						buf, count);

				/*
				 * protect the log space range
				 * (debug version only)
				 */
				RANGE_RO(&data[write_offset], count);

				write_offset += count;
			}
		}
	}

	/* persist the data and the metadata only if there was no error */
	if (ret == 0)
		pmemlog_persist(plp, write_offset);

	int oerrno = errno;
	if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
		ERR("!pthread_rwlock_unlock");
	errno = oerrno;

	return ret;
}
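/*
 * Example (not part of the library): hedged usage sketch of the gathered
 * append.  Two separate buffers land in the log in one call; since
 * pmemlog_persist() only advances write_offset after all pieces are flushed,
 * a crash mid-append leaves the log unchanged.  The pool handle is assumed
 * to have been opened with pmemlog_open() or pmemlog_create() already.
 */
#include <string.h>
#include <sys/uio.h>
#include <libpmemlog.h>

int
append_two_buffers(PMEMlogpool *plp, const char *hdr, const char *payload)
{
	struct iovec iov[2];

	iov[0].iov_base = (void *)hdr;
	iov[0].iov_len = strlen(hdr);
	iov[1].iov_base = (void *)payload;
	iov[1].iov_len = strlen(payload);

	/* returns 0 on success, -1 with errno set (e.g. ENOSPC) on failure */
	return pmemlog_appendv(plp, iov, 2);
}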
/*
 * pmemlog_append -- add data to a log memory pool
 */
int
pmemlog_append(PMEMlogpool *plp, const void *buf, size_t count)
{
	int ret = 0;

	LOG(3, "plp %p buf %p count %zu", plp, buf, count);

	if (plp->rdonly) {
		ERR("can't append to read-only log");
		errno = EROFS;
		return -1;
	}

	if ((errno = pthread_rwlock_wrlock(plp->rwlockp))) {
		ERR("!pthread_rwlock_wrlock");
		return -1;
	}

	/* get the current values */
	uint64_t end_offset = le64toh(plp->end_offset);
	uint64_t write_offset = le64toh(plp->write_offset);

	if (write_offset >= end_offset) {
		/* no space left */
		errno = ENOSPC;
		ERR("!pmemlog_append");
		ret = -1;
	} else {
		/* make sure we don't write past the available space */
		if (count > (end_offset - write_offset)) {
			errno = ENOSPC;
			ERR("!pmemlog_append");
			ret = -1;
		} else {
			char *data = plp->addr;

			/*
			 * unprotect the log space range,
			 * where the new data will be stored
			 * (debug version only)
			 */
			RANGE_RW(&data[write_offset], count);

			if (plp->is_pmem)
				pmem_memcpy_nodrain(&data[write_offset],
					buf, count);
			else
				memcpy(&data[write_offset], buf, count);

			/* protect the log space range (debug version only) */
			RANGE_RO(&data[write_offset], count);

			write_offset += count;
		}
	}

	/* persist the data and the metadata only if there was no error */
	if (ret == 0)
		pmemlog_persist(plp, write_offset);

	int oerrno = errno;
	if ((errno = pthread_rwlock_unlock(plp->rwlockp)))
		ERR("!pthread_rwlock_unlock");
	errno = oerrno;

	return ret;
}
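/*
 * Example (not part of the library): hedged caller-side sketch of
 * pmemlog_append(), including the ENOSPC case raised above.  The pool path
 * and mode are caller-supplied; pmemlog_create(), pmemlog_open() and
 * PMEMLOG_MIN_POOL come from the public libpmemlog header.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <libpmemlog.h>

int
append_or_report(const char *path, const void *buf, size_t len)
{
	PMEMlogpool *plp = pmemlog_create(path, PMEMLOG_MIN_POOL, 0666);
	if (plp == NULL)
		plp = pmemlog_open(path);	/* pool may already exist */
	if (plp == NULL)
		return -1;

	int ret = pmemlog_append(plp, buf, len);
	if (ret < 0 && errno == ENOSPC)
		fprintf(stderr, "log full: %s\n", strerror(errno));

	pmemlog_close(plp);
	return ret;
}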
/*
 * write_layout -- (internal) write out the initial btt metadata layout
 *
 * Called with write == 1 only once in the lifetime of a btt namespace, when
 * the first write happens.  The caller of this routine is responsible for
 * locking out multiple threads.  This routine doesn't read anything -- by the
 * time it is called, it is known there's no layout in the namespace and a new
 * layout should be written.
 *
 * Calling with write == 0 tells this routine to do the calculations for
 * bttp->narena and bttp->nlba, but don't write out any metadata.
 *
 * If successful, sets bttp->layout to 1 and returns 0.  Otherwise -1
 * is returned and errno is set, and bttp->layout remains 0 so that
 * later attempts to write will try again to create the layout.
 */
static int
write_layout(struct btt *bttp, int lane, int write)
{
	LOG(3, "bttp %p lane %d write %d", bttp, lane, write);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
	ASSERT(bttp->nfree);

	/*
	 * The number of arenas is the number of full arenas of
	 * size BTT_MAX_ARENA that fit into rawsize and then, if
	 * the remainder is at least BTT_MIN_SIZE in size, that
	 * adds one more arena (see the worked example after this function).
	 */
	bttp->narena = bttp->rawsize / BTT_MAX_ARENA;
	if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
		bttp->narena++;
	LOG(4, "narena %u", bttp->narena);

	int flog_size = bttp->nfree * 2 * sizeof (struct btt_flog);
	flog_size = roundup(flog_size, BTT_ALIGNMENT);

	uint32_t internal_lbasize = bttp->lbasize;
	if (internal_lbasize < BTT_MIN_LBA)
		internal_lbasize = BTT_MIN_LBA;
	internal_lbasize = roundup(internal_lbasize,
			BTT_INTERNAL_LBA_ALIGNMENT);
	LOG(4, "adjusted internal_lbasize %u", internal_lbasize);

	uint64_t total_nlba = 0;
	uint64_t rawsize = bttp->rawsize;
	int arena_num = 0;
	off_t arena_off = 0;

	/*
	 * for each arena...
	 */
	while (rawsize >= BTT_MIN_SIZE) {
		LOG(4, "layout arena %u", arena_num);

		uint64_t arena_rawsize = rawsize;
		if (arena_rawsize > BTT_MAX_ARENA) {
			arena_rawsize = BTT_MAX_ARENA;
		}
		rawsize -= arena_rawsize;
		arena_num++;

		uint64_t arena_datasize = arena_rawsize;
		arena_datasize -= 2 * sizeof (struct btt_info);
		arena_datasize -= flog_size;

		/* allow for map alignment padding */
		uint64_t internal_nlba = (arena_datasize - BTT_ALIGNMENT) /
				(internal_lbasize + BTT_MAP_ENTRY_SIZE);
		uint64_t external_nlba = internal_nlba - bttp->nfree;

		LOG(4, "internal_nlba %zu external_nlba %zu",
				internal_nlba, external_nlba);

		total_nlba += external_nlba;

		/*
		 * The rest of the loop body calculates the metadata
		 * structures and lays them out for this arena, so only
		 * continue if the write flag is set.
		 */
		if (!write)
			continue;

		uint64_t mapsize = roundup(external_nlba * BTT_MAP_ENTRY_SIZE,
				BTT_ALIGNMENT);
		arena_datasize -= mapsize;

		ASSERT(arena_datasize / internal_lbasize >= internal_nlba);

		/*
		 * Calculate offsets for the BTT info block.  These are
		 * all relative to the beginning of the arena.
		 */
		uint64_t nextoff;
		if (rawsize)
			nextoff = arena_rawsize;
		else
			nextoff = 0;
		uint64_t infooff = arena_rawsize - sizeof (struct btt_info);
		uint64_t flogoff = infooff - flog_size;
		uint64_t mapoff = flogoff - mapsize;
		uint64_t dataoff = sizeof (struct btt_info);

		LOG(4, "nextoff 0x%016lx", nextoff);
		LOG(4, "dataoff 0x%016lx", dataoff);
		LOG(4, "mapoff 0x%016lx", mapoff);
		LOG(4, "flogoff 0x%016lx", flogoff);
		LOG(4, "infooff 0x%016lx", infooff);

		ASSERTeq(arena_datasize, mapoff - dataoff);

		/* write out the initial map, identity style */
		off_t map_entry_off = arena_off + mapoff;
		uint32_t *mapp = NULL;
		int mlen = 0;
		int next_index = 0;
		int remaining = 0;
		for (int i = 0; i < external_nlba; i++) {
			if (remaining == 0) {
				/* flush previous mapped area */
				if (mapp != NULL) {
					/*
					 * Protect the memory again
					 * (debug version only).
					 * If (mapp != NULL) it had to be
					 * unprotected earlier.
					 */
					RANGE_RO(mapp, mlen);
					(*bttp->ns_cbp->nssync)(bttp->ns,
							lane, mapp, mlen);
				}
				/* request a mapping of remaining map area */
				mlen = (*bttp->ns_cbp->nsmap)(bttp->ns,
						lane, (void **)&mapp,
						(external_nlba - i) *
						sizeof (uint32_t),
						map_entry_off);
				if (mlen < 0)
					return -1;

				/* unprotect the memory (debug version only) */
				RANGE_RW(mapp, mlen);

				remaining = mlen;
				next_index = 0;
			}

			mapp[next_index++] = htole32(i | BTT_MAP_ENTRY_ZERO);
			remaining -= sizeof (uint32_t);
		}

		/* protect the memory again (debug version only) */
		RANGE_RO(mapp, mlen);

		/* flush previous mapped area */
		if (mapp != NULL)
			(*bttp->ns_cbp->nssync)(bttp->ns, lane, mapp, mlen);

		/* write out the initial flog */
		off_t flog_entry_off = arena_off + flogoff;
		uint32_t next_free_lba = external_nlba;
		for (int i = 0; i < bttp->nfree; i++) {
			struct btt_flog flog;
			flog.lba = 0;
			flog.old_map = flog.new_map =
				htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
			flog.seq = htole32(1);

			/*
			 * Write both btt_flog structs in the pair, writing
			 * the second one as all zeros.
			 */
			LOG(6, "flog[%d] entry off %zu initial %u + zero = %u",
					i, flog_entry_off, next_free_lba,
					next_free_lba | BTT_MAP_ENTRY_ZERO);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
					sizeof (flog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			LOG(6, "flog[%d] entry off %zu zeros",
					i, flog_entry_off);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
					sizeof (Zflog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			next_free_lba++;
		}

		/*
		 * Construct the BTT info block and write it out
		 * at both the beginning and end of the arena.
		 */
		struct btt_info info;
		memset(&info, '\0', sizeof (info));
		memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
		memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
		info.major = htole16(BTTINFO_MAJOR_VERSION);
		info.minor = htole16(BTTINFO_MINOR_VERSION);
		info.external_lbasize = htole32(bttp->lbasize);
		info.external_nlba = htole32(external_nlba);
		info.internal_lbasize = htole32(internal_lbasize);
		info.internal_nlba = htole32(internal_nlba);
		info.nfree = htole32(bttp->nfree);
		info.infosize = htole32(sizeof (info));
		info.nextoff = htole64(nextoff);
		info.dataoff = htole64(dataoff);
		info.mapoff = htole64(mapoff);
		info.flogoff = htole64(flogoff);
		info.infooff = htole64(infooff);

		util_checksum(&info, sizeof (info), &info.checksum, 1);

		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
				sizeof (info), arena_off) < 0)
			return -1;
		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
				sizeof (info), arena_off + nextoff) < 0)
			return -1;

		arena_off += nextoff;
	}

	ASSERTeq(bttp->narena, arena_num);

	bttp->nlba = total_nlba;

	if (write) {
		/*
		 * The layout is written now, so load up the arenas.
		 */
		return read_arenas(bttp, lane, bttp->narena);
	}

	return 0;
}
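/*
 * Example (not part of the library): the arena-count rule from the top of
 * write_layout(), worked through for one concrete size.  The constants here
 * are assumptions -- the BTT specification caps an arena at 512 GiB and the
 * minimum usable size is in the megabyte range -- the point is only the
 * divide-plus-remainder logic, not the exact values of BTT_MAX_ARENA and
 * BTT_MIN_SIZE.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_MAX_ARENA	(1ULL << 39)	/* assumed: 512 GiB per arena */
#define EX_MIN_SIZE	(16ULL << 20)	/* assumed: 16 MiB minimum */

int
main(void)
{
	/* 3 full arenas worth of space plus a 1 GiB remainder */
	uint64_t rawsize = 3 * EX_MAX_ARENA + (1ULL << 30);

	unsigned narena = rawsize / EX_MAX_ARENA;	/* 3 full arenas */
	if (rawsize % EX_MAX_ARENA >= EX_MIN_SIZE)
		narena++;	/* the 1 GiB remainder gets its own arena */

	printf("narena = %u\n", narena);	/* prints 4 */
	return 0;
}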