/*
 * pmemlog_persist -- (internal) persist data, then metadata
 *
 * On entry, the write lock should be held.
 */
static void
pmemlog_persist(PMEMlog *plp, uint64_t new_write_offset)
{
	/* the range just appended runs from the old offset to the new one */
	uint64_t prev_offset = le64toh(plp->write_offset);
	size_t data_len = new_write_offset - prev_offset;

	/* unprotect the log space range (debug version only) */
	RANGE_RW(plp->addr + prev_offset, data_len);

	/* make the appended data durable first... */
	libpmem_persist(plp->is_pmem, plp->addr + prev_offset, data_len);

	/* protect the log space range (debug version only) */
	RANGE_RO(plp->addr + prev_offset, data_len);

	/* unprotect the pool descriptor (debug version only) */
	RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	/* ...then publish the new write offset (stored little-endian) */
	plp->write_offset = htole64(new_write_offset);

	/* and make the metadata durable as well */
	libpmem_persist(plp->is_pmem, &plp->write_offset,
			sizeof (plp->write_offset));

	/* set the write-protection again (debug version only) */
	RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);
}
/*
 * pmemlog_rewind -- discard all data, resetting a log memory pool to empty
 */
void
pmemlog_rewind(PMEMlog *plp)
{
	LOG(3, "plp %p", plp);

	/* rewinding requires write access to the pool */
	if (plp->rdonly) {
		LOG(1, "can't rewind read-only log");
		errno = EROFS;
		return;
	}

	if (pthread_rwlock_wrlock(plp->rwlockp)) {
		LOG(1, "!pthread_rwlock_wrlock");
		return;
	}

	/* expose the pool descriptor for writing (debug version only) */
	RANGE_RW(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	/* an empty log is one whose write offset equals its start offset */
	plp->write_offset = plp->start_offset;
	libpmem_persist(plp->is_pmem, &plp->write_offset,
			sizeof (plp->write_offset));

	/* restore write-protection on the descriptor (debug version only) */
	RANGE_RO(plp->addr + sizeof (struct pool_hdr), LOG_FORMAT_DATA_ALIGN);

	if (pthread_rwlock_unlock(plp->rwlockp))
		LOG(1, "!pthread_rwlock_unlock");
}
/*
 * nssync -- (internal) flush changes made to a namespace range
 *
 * This is used in conjunction with the addresses handed out by
 * nsmap() above.  There's no need to sync things written via
 * nswrite() since those changes are flushed each time nswrite()
 * is called.
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 */
static void
nssync(void *ns, int lane, void *addr, size_t len)
{
	/* the opaque namespace handle is really our pmemblk pool */
	struct pmemblk *pbp = ns;

	LOG(12, "pbp %p lane %d addr %p len %zu", pbp, lane, addr, len);

	libpmem_persist(pbp->is_pmem, addr, len);
}
/*
 * nswrite -- (internal) write data to the namespace encapsulating the BTT
 *
 * This routine is provided to btt_init() to allow the btt module to
 * do I/O on the memory pool containing the BTT layout.
 *
 * Returns 0 on success; on error returns -1 with errno set (EINVAL
 * for an out-of-range write).
 */
static int
nswrite(void *ns, int lane, const void *buf, size_t count, off_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	/* off is a signed off_t -- %zu would be undefined behavior */
	LOG(13, "pbp %p lane %d count %zu off %lld", pbp, lane, count,
			(long long)off);

	/*
	 * A write of count bytes at offset off is valid as long as it
	 * ends at or before the end of the data area; the previous ">="
	 * test incorrectly rejected a write ending exactly at datasize.
	 * Also reject a negative offset before it is used in unsigned
	 * arithmetic.
	 */
	if (off < 0 || (size_t)off + count > pbp->datasize) {
		LOG(1, "offset + count (%zu) past end of data area (%zu)",
				(size_t)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = pbp->data + off;

#ifdef DEBUG
	/* grab debug write lock */
	if (pthread_mutex_lock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_lock");
#endif

	/* unprotect the memory (debug version only) */
	RANGE_RW(dest, count);

	memcpy(dest, buf, count);

	/* protect the memory again (debug version only) */
	RANGE_RO(dest, count);

#ifdef DEBUG
	/* release debug write lock */
	if (pthread_mutex_unlock(&pbp->write_lock))
		LOG(1, "!pthread_mutex_unlock");
#endif

	/* changes are flushed on every write -- see nssync() */
	libpmem_persist(pbp->is_pmem, dest, count);

	return 0;
}
/*
 * pmemlog_map_common -- (internal) map a log memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * On success returns the mapped pool with run-time state initialized;
 * on failure returns NULL with errno set (the mapping is undone).
 */
static PMEMlog *
pmemlog_map_common(int fd, int rdonly)
{
	LOG(3, "fd %d rdonly %d", fd, rdonly);

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		LOG(1, "!fstat");
		return NULL;
	}

	/* refuse files too small to hold the header plus any log space */
	if (stbuf.st_size < PMEMLOG_MIN_POOL) {
		LOG(1, "size %lld smaller than %zu",
				(long long)stbuf.st_size, PMEMLOG_MIN_POOL);
		errno = EINVAL;
		return NULL;
	}

	void *addr;
	if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL)
		return NULL;	/* util_map() set errno, called LOG */

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, stbuf.st_size);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemlog *plp = addr;

	/* work on a copy of the header so on-media bytes stay untouched */
	struct pool_hdr hdr;
	memcpy(&hdr, &plp->hdr, sizeof (hdr));

	if (util_convert_hdr(&hdr)) {
		/*
		 * valid header found -- verify it describes a log pool
		 * of a version this library understands
		 */
		if (strncmp(hdr.signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN)) {
			LOG(1, "wrong pool type: \"%s\"", hdr.signature);
			errno = EINVAL;
			goto err;
		}

		if (hdr.major != LOG_FORMAT_MAJOR) {
			LOG(1, "log pool version %d (library expects %d)",
					hdr.major, LOG_FORMAT_MAJOR);
			errno = EINVAL;
			goto err;
		}

		/* on-media offsets are little-endian; convert for checks */
		uint64_t hdr_start = le64toh(plp->start_offset);
		uint64_t hdr_end = le64toh(plp->end_offset);
		uint64_t hdr_write = le64toh(plp->write_offset);

		/* sanity-check the offsets against the actual file size */
		if ((hdr_start !=
				roundup(sizeof (*plp), LOG_FORMAT_DATA_ALIGN)) ||
				(hdr_end != stbuf.st_size) ||
				(hdr_start > hdr_end)) {
			LOG(1, "wrong start/end offsets "
				"(start: %ju end: %ju), pool size %lld",
				hdr_start, hdr_end, (long long)stbuf.st_size);
			errno = EINVAL;
			goto err;
		}

		/* the write offset must fall inside [start, end] */
		if ((hdr_write > hdr_end) || (hdr_write < hdr_start)) {
			LOG(1, "wrong write offset "
				"(start: %ju end: %ju write: %ju)",
				hdr_start, hdr_end, hdr_write);
			errno = EINVAL;
			goto err;
		}

		LOG(3, "start: %ju, end: %ju, write: %ju",
			hdr_start, hdr_end, hdr_write);

		/* unknown incompat features fail; unknown ro-compat
		 * features force the pool read-only */
		int retval = util_feature_check(&hdr, LOG_FORMAT_INCOMPAT,
				LOG_FORMAT_RO_COMPAT, LOG_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;
	} else {
		/*
		 * no valid header was found -- create a fresh pool,
		 * which requires write access
		 */
		if (rdonly) {
			LOG(1, "read-only and no header found");
			errno = EROFS;
			goto err;
		}
		LOG(3, "creating new log memory pool");

		struct pool_hdr *hdrp = &plp->hdr;

		/* build the header in place, all fields little-endian */
		memset(hdrp, '\0', sizeof (*hdrp));
		strncpy(hdrp->signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(LOG_FORMAT_MAJOR);
		hdrp->compat_features = htole32(LOG_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(LOG_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(LOG_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);
		hdrp->crtime = htole64((uint64_t)time(NULL));
		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);
		hdrp->checksum = htole64(hdrp->checksum);

		/* store pool's header */
		libpmem_persist(is_pmem, hdrp, sizeof (*hdrp));

		/* create rest of required metadata */
		plp->start_offset = htole64(roundup(sizeof (*plp),
						LOG_FORMAT_DATA_ALIGN));
		plp->end_offset = htole64(stbuf.st_size);
		plp->write_offset = plp->start_offset;	/* empty log */

		/* store non-volatile part of pool's descriptor */
		libpmem_persist(is_pmem, &plp->start_offset,
				3 * sizeof (uint64_t));
	}

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	plp->addr = addr;
	plp->size = stbuf.st_size;
	plp->rdonly = rdonly;
	plp->is_pmem = is_pmem;

	if ((plp->rwlockp = Malloc(sizeof (*plp->rwlockp))) == NULL) {
		LOG(1, "!Malloc for a RW lock");
		goto err;
	}

	if (pthread_rwlock_init(plp->rwlockp, NULL)) {
		LOG(1, "!pthread_rwlock_init");
		goto err_free;
	}

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use.  It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the rest should be kept read-only (debug version only) */
	RANGE_RO(addr + sizeof (struct pool_hdr),
			stbuf.st_size - sizeof (struct pool_hdr));

	LOG(3, "plp %p", plp);
	return plp;

err_free:
	Free((void *)plp->rwlockp);
err:
	/* preserve errno across the unmap during cleanup */
	LOG(4, "error clean up");
	int oerrno = errno;
	util_unmap(addr, stbuf.st_size);
	errno = oerrno;
	return NULL;
}
/* * pmemblk_map_common -- (internal) map a block memory pool * * This routine does all the work, but takes a rdonly flag so internal * calls can map a read-only pool if required. * * Passing in bsize == 0 means a valid pool header must exist (which * will supply the block size). */ static PMEMblk * pmemblk_map_common(int fd, size_t bsize, int rdonly) { LOG(3, "fd %d bsize %zu rdonly %d", fd, bsize, rdonly); /* things free by "goto err" if not NULL */ void *addr = NULL; struct btt *bttp = NULL; pthread_mutex_t *locks = NULL; struct stat stbuf; if (fstat(fd, &stbuf) < 0) { LOG(1, "!fstat"); return NULL; } if (stbuf.st_size < PMEMBLK_MIN_POOL) { LOG(1, "size %zu smaller than %zu", stbuf.st_size, PMEMBLK_MIN_POOL); errno = EINVAL; return NULL; } if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL) return NULL; /* util_map() set errno, called LOG */ /* check if the mapped region is located in persistent memory */ int is_pmem = pmem_is_pmem(addr, stbuf.st_size); /* opaque info lives at the beginning of mapped memory pool */ struct pmemblk *pbp = addr; struct pool_hdr hdr; memcpy(&hdr, &pbp->hdr, sizeof (hdr)); if (util_convert_hdr(&hdr)) { /* * valid header found */ if (strncmp(hdr.signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN)) { LOG(1, "wrong pool type: \"%s\"", hdr.signature); errno = EINVAL; goto err; } if (hdr.major != BLK_FORMAT_MAJOR) { LOG(1, "blk pool version %d (library expects %d)", hdr.major, BLK_FORMAT_MAJOR); errno = EINVAL; goto err; } size_t hdr_bsize = le32toh(pbp->bsize); if (bsize && bsize != hdr_bsize) { LOG(1, "wrong bsize (%zu), pool created with bsize %zu", bsize, hdr_bsize); errno = EINVAL; goto err; } bsize = hdr_bsize; LOG(3, "using block size from header: %zu", bsize); int retval = util_feature_check(&hdr, BLK_FORMAT_INCOMPAT, BLK_FORMAT_RO_COMPAT, BLK_FORMAT_COMPAT); if (retval < 0) goto err; else if (retval == 0) rdonly = 1; } else { /* * no valid header was found */ if (rdonly) { LOG(1, "read-only and no header found"); errno = EROFS; goto 
err; } LOG(3, "creating new blk memory pool"); struct pool_hdr *hdrp = &pbp->hdr; memset(hdrp, '\0', sizeof (*hdrp)); strncpy(hdrp->signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN); hdrp->major = htole32(BLK_FORMAT_MAJOR); hdrp->compat_features = htole32(BLK_FORMAT_COMPAT); hdrp->incompat_features = htole32(BLK_FORMAT_INCOMPAT); hdrp->ro_compat_features = htole32(BLK_FORMAT_RO_COMPAT); uuid_generate(hdrp->uuid); hdrp->crtime = htole64((uint64_t)time(NULL)); util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1); hdrp->checksum = htole64(hdrp->checksum); /* store pool's header */ libpmem_persist(is_pmem, hdrp, sizeof (*hdrp)); /* create rest of required metadata */ pbp->bsize = htole32(bsize); libpmem_persist(is_pmem, &pbp->bsize, sizeof (bsize)); } /* * Use some of the memory pool area for run-time info. This * run-time state is never loaded from the file, it is always * created here, so no need to worry about byte-order. */ pbp->addr = addr; pbp->size = stbuf.st_size; pbp->rdonly = rdonly; pbp->is_pmem = is_pmem; pbp->data = addr + roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN); pbp->datasize = (pbp->addr + pbp->size) - pbp->data; LOG(4, "data area %p data size %zu bsize %zu", pbp->data, pbp->datasize, bsize); int ncpus = sysconf(_SC_NPROCESSORS_ONLN); if (ncpus < 1) ncpus = 1; bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.uuid, ncpus, pbp, &ns_cb); if (bttp == NULL) goto err; /* btt_init set errno, called LOG */ pbp->bttp = bttp; pbp->nlane = btt_nlane(pbp->bttp); pbp->next_lane = 0; if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) { LOG(1, "!Malloc for lane locks"); goto err; } for (int i = 0; i < pbp->nlane; i++) if (pthread_mutex_init(&locks[i], NULL) < 0) { LOG(1, "!pthread_mutex_init"); goto err; } pbp->locks = locks; #ifdef DEBUG /* initialize debug lock */ if (pthread_mutex_init(&pbp->write_lock, NULL) < 0) { LOG(1, "!pthread_mutex_init"); goto err; } #endif /* * If possible, turn off all permissions on the pool header page. 
* * The prototype PMFS doesn't allow this when large pages are in * use not it is not considered an error if this fails. */ util_range_none(addr, sizeof (struct pool_hdr)); /* the data area should be kept read-only for debug version */ RANGE_RO(pbp->data, pbp->datasize); LOG(3, "pbp %p", pbp); return pbp; err: LOG(4, "error clean up"); int oerrno = errno; if (locks) Free((void *)locks); if (bttp) btt_fini(bttp); util_unmap(addr, stbuf.st_size); errno = oerrno; return NULL; }