/* * util_header_check -- (internal) validate header of a single pool set file */ static int util_header_check(struct pool_set *set, unsigned repidx, unsigned partidx, const char *sig, uint32_t major, uint32_t compat, uint32_t incompat, uint32_t ro_compat) { LOG(3, "set %p repidx %u partidx %u sig %s major %u " "compat %#x incompat %#x ro_comapt %#x", set, repidx, partidx, sig, major, compat, incompat, ro_compat); struct pool_replica *rep = set->replica[repidx]; /* opaque info lives at the beginning of mapped memory pool */ struct pool_hdr *hdrp = rep->part[partidx].hdr; struct pool_hdr hdr; memcpy(&hdr, hdrp, sizeof (hdr)); if (!util_convert_hdr(&hdr)) { errno = EINVAL; return -1; } /* valid header found */ if (strncmp(hdr.signature, sig, POOL_HDR_SIG_LEN)) { ERR("wrong pool type: \"%s\"", hdr.signature); errno = EINVAL; return -1; } if (hdr.major != major) { ERR("pool version %d (library expects %d)", hdr.major, major); errno = EINVAL; return -1; } if (util_check_arch_flags(&hdr.arch_flags)) { ERR("wrong architecture flags"); errno = EINVAL; return -1; } /* check pool set UUID */ if (memcmp(HDR(REP(set, 0), 0)->poolset_uuid, hdr.poolset_uuid, POOL_HDR_UUID_LEN)) { ERR("wrong pool set UUID"); errno = EINVAL; return -1; } /* check pool set linkage */ if (memcmp(HDR(rep, partidx - 1)->uuid, hdr.prev_part_uuid, POOL_HDR_UUID_LEN) || memcmp(HDR(rep, partidx + 1)->uuid, hdr.next_part_uuid, POOL_HDR_UUID_LEN)) { ERR("wrong part UUID"); errno = EINVAL; return -1; } /* check format version */ if (HDR(rep, 0)->major != hdrp->major) { ERR("incompatible pool format"); errno = EINVAL; return -1; } /* check compatibility features */ if (HDR(rep, 0)->compat_features != hdrp->compat_features || HDR(rep, 0)->incompat_features != hdrp->incompat_features || HDR(rep, 0)->ro_compat_features != hdrp->ro_compat_features) { ERR("incompatible feature flags"); errno = EINVAL; return -1; } rep->part[partidx].rdonly = 0; int retval = util_feature_check(&hdr, incompat, ro_compat, compat); if 
(retval < 0) return -1; else if (retval == 0) rep->part[partidx].rdonly = 1; return 0; }
/* * pmemlog_map_common -- (internal) map a log memory pool * * This routine does all the work, but takes a rdonly flag so internal * calls can map a read-only pool if required. */ static PMEMlog * pmemlog_map_common(int fd, int rdonly) { LOG(3, "fd %d rdonly %d", fd, rdonly); struct stat stbuf; if (fstat(fd, &stbuf) < 0) { LOG(1, "!fstat"); return NULL; } if (stbuf.st_size < PMEMLOG_MIN_POOL) { LOG(1, "size %lld smaller than %zu", (long long)stbuf.st_size, PMEMLOG_MIN_POOL); errno = EINVAL; return NULL; } void *addr; if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL) return NULL; /* util_map() set errno, called LOG */ /* check if the mapped region is located in persistent memory */ int is_pmem = pmem_is_pmem(addr, stbuf.st_size); /* opaque info lives at the beginning of mapped memory pool */ struct pmemlog *plp = addr; struct pool_hdr hdr; memcpy(&hdr, &plp->hdr, sizeof (hdr)); if (util_convert_hdr(&hdr)) { /* * valid header found */ if (strncmp(hdr.signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN)) { LOG(1, "wrong pool type: \"%s\"", hdr.signature); errno = EINVAL; goto err; } if (hdr.major != LOG_FORMAT_MAJOR) { LOG(1, "log pool version %d (library expects %d)", hdr.major, LOG_FORMAT_MAJOR); errno = EINVAL; goto err; } uint64_t hdr_start = le64toh(plp->start_offset); uint64_t hdr_end = le64toh(plp->end_offset); uint64_t hdr_write = le64toh(plp->write_offset); if ((hdr_start != roundup(sizeof (*plp), LOG_FORMAT_DATA_ALIGN)) || (hdr_end != stbuf.st_size) || (hdr_start > hdr_end)) { LOG(1, "wrong start/end offsets (start: %ju end: %ju), " "pool size %lld", hdr_start, hdr_end, (long long)stbuf.st_size); errno = EINVAL; goto err; } if ((hdr_write > hdr_end) || (hdr_write < hdr_start)) { LOG(1, "wrong write offset " "(start: %ju end: %ju write: %ju)", hdr_start, hdr_end, hdr_write); errno = EINVAL; goto err; } LOG(3, "start: %ju, end: %ju, write: %ju", hdr_start, hdr_end, hdr_write); int retval = util_feature_check(&hdr, LOG_FORMAT_INCOMPAT, LOG_FORMAT_RO_COMPAT, 
LOG_FORMAT_COMPAT); if (retval < 0) goto err; else if (retval == 0) rdonly = 1; } else { /* * no valid header was found */ if (rdonly) { LOG(1, "read-only and no header found"); errno = EROFS; goto err; } LOG(3, "creating new log memory pool"); struct pool_hdr *hdrp = &plp->hdr; memset(hdrp, '\0', sizeof (*hdrp)); strncpy(hdrp->signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN); hdrp->major = htole32(LOG_FORMAT_MAJOR); hdrp->compat_features = htole32(LOG_FORMAT_COMPAT); hdrp->incompat_features = htole32(LOG_FORMAT_INCOMPAT); hdrp->ro_compat_features = htole32(LOG_FORMAT_RO_COMPAT); uuid_generate(hdrp->uuid); hdrp->crtime = htole64((uint64_t)time(NULL)); util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1); hdrp->checksum = htole64(hdrp->checksum); /* store pool's header */ libpmem_persist(is_pmem, hdrp, sizeof (*hdrp)); /* create rest of required metadata */ plp->start_offset = htole64(roundup(sizeof (*plp), LOG_FORMAT_DATA_ALIGN)); plp->end_offset = htole64(stbuf.st_size); plp->write_offset = plp->start_offset; /* store non-volatile part of pool's descriptor */ libpmem_persist(is_pmem, &plp->start_offset, 3 * sizeof (uint64_t)); } /* * Use some of the memory pool area for run-time info. This * run-time state is never loaded from the file, it is always * created here, so no need to worry about byte-order. */ plp->addr = addr; plp->size = stbuf.st_size; plp->rdonly = rdonly; plp->is_pmem = is_pmem; if ((plp->rwlockp = Malloc(sizeof (*plp->rwlockp))) == NULL) { LOG(1, "!Malloc for a RW lock"); goto err; } if (pthread_rwlock_init(plp->rwlockp, NULL)) { LOG(1, "!pthread_rwlock_init"); goto err_free; } /* * If possible, turn off all permissions on the pool header page. * * The prototype PMFS doesn't allow this when large pages are in * use. It is not considered an error if this fails. 
*/ util_range_none(addr, sizeof (struct pool_hdr)); /* the rest should be kept read-only (debug version only) */ RANGE_RO(addr + sizeof (struct pool_hdr), stbuf.st_size - sizeof (struct pool_hdr)); LOG(3, "plp %p", plp); return plp; err_free: Free((void *)plp->rwlockp); err: LOG(4, "error clean up"); int oerrno = errno; util_unmap(addr, stbuf.st_size); errno = oerrno; return NULL; }
/*
 * pmemlog_map_common -- (internal) map a log memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * If empty flag is set, the file is assumed to be a new memory pool, and
 * a new pool header is created.  Otherwise, a valid header must exist.
 *
 * Takes ownership of fd (closes it in all paths).  Returns the run-time
 * pool handle, or NULL with errno set.
 */
static PMEMlogpool *
pmemlog_map_common(int fd, size_t poolsize, int rdonly, int empty)
{
	LOG(3, "fd %d poolsize %zu rdonly %d empty %d",
			fd, poolsize, rdonly, empty);

	void *addr;
	if ((addr = util_map(fd, poolsize, rdonly)) == NULL) {
		(void) close(fd);
		return NULL;	/* util_map() set errno, called LOG */
	}

	VALGRIND_REGISTER_PMEM_MAPPING(addr, poolsize);
	VALGRIND_REGISTER_PMEM_FILE(fd, addr, poolsize, 0);

	/* fd no longer needed once the file is mapped and registered */
	(void) close(fd);

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, poolsize);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemlog *plp = addr;

	if (!empty) {
		/* validate against a local copy; conversion is in-place */
		struct pool_hdr hdr;

		memcpy(&hdr, &plp->hdr, sizeof (hdr));
		if (!util_convert_hdr(&hdr)) {
			errno = EINVAL;
			goto err;
		}

		/*
		 * valid header found
		 */
		if (strncmp(hdr.signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN)) {
			ERR("wrong pool type: \"%s\"", hdr.signature);
			errno = EINVAL;
			goto err;
		}

		if (hdr.major != LOG_FORMAT_MAJOR) {
			ERR("log pool version %d (library expects %d)",
				hdr.major, LOG_FORMAT_MAJOR);
			errno = EINVAL;
			goto err;
		}

		/*
		 * XXX - pools sets / replicas
		 * Single-part, single-replica pools link every UUID field
		 * back to the pool's own UUID; anything else is rejected.
		 */
		if (memcmp(hdr.uuid, hdr.prev_part_uuid,
					POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.next_part_uuid,
					POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.prev_repl_uuid,
					POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.next_repl_uuid,
					POOL_HDR_UUID_LEN)) {
			ERR("wrong UUID");
			errno = EINVAL;
			goto err;
		}

		/* on-media log descriptor offsets are little-endian */
		uint64_t hdr_start = le64toh(plp->start_offset);
		uint64_t hdr_end = le64toh(plp->end_offset);
		uint64_t hdr_write = le64toh(plp->write_offset);

		if ((hdr_start != roundup(sizeof (*plp),
				LOG_FORMAT_DATA_ALIGN)) ||
				(hdr_end != poolsize) ||
				(hdr_start > hdr_end)) {
			ERR("wrong start/end offsets (start: %ju end: %ju), "
				"pool size %zu",
				hdr_start, hdr_end, poolsize);
			errno = EINVAL;
			goto err;
		}

		if ((hdr_write > hdr_end) || (hdr_write < hdr_start)) {
			ERR("wrong write offset "
				"(start: %ju end: %ju write: %ju)",
				hdr_start, hdr_end, hdr_write);
			errno = EINVAL;
			goto err;
		}

		LOG(3, "start: %ju, end: %ju, write: %ju",
			hdr_start, hdr_end, hdr_write);

		int retval = util_feature_check(&hdr, LOG_FORMAT_INCOMPAT,
				LOG_FORMAT_RO_COMPAT, LOG_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;	/* unknown ro-compat features */
	} else {
		LOG(3, "creating new log memory pool");

		ASSERTeq(rdonly, 0);

		struct pool_hdr *hdrp = &plp->hdr;

		/* check if the pool header is all zero */
		if (!util_is_zeroed(hdrp, sizeof (*hdrp))) {
			ERR("Non-empty file detected");
			errno = EINVAL;
			goto err;
		}

		/*
		 * create required metadata first -- the descriptor is
		 * persisted before the header so a valid header never
		 * points at an unwritten descriptor after a crash
		 */
		plp->start_offset = htole64(roundup(sizeof (*plp),
						LOG_FORMAT_DATA_ALIGN));
		plp->end_offset = htole64(poolsize);
		plp->write_offset = plp->start_offset;

		/* store non-volatile part of pool's descriptor */
		pmem_msync(&plp->start_offset, 3 * sizeof (uint64_t));

		/* create pool header */
		strncpy(hdrp->signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(LOG_FORMAT_MAJOR);
		hdrp->compat_features = htole32(LOG_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(LOG_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(LOG_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);

		/* XXX - pools sets / replicas: all links point at self */
		uuid_generate(hdrp->poolset_uuid);
		memcpy(hdrp->prev_part_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_part_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->prev_repl_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_repl_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);

		hdrp->crtime = htole64((uint64_t)time(NULL));

		if (util_get_arch_flags(&hdrp->arch_flags)) {
			ERR("Reading architecture flags failed\n");
			errno = EINVAL;
			goto err;
		}

		/* arch flags are stored little-endian as well */
		hdrp->arch_flags.alignment_desc =
			htole64(hdrp->arch_flags.alignment_desc);
		hdrp->arch_flags.e_machine =
			htole16(hdrp->arch_flags.e_machine);

		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);

		/* store pool's header */
		pmem_msync(hdrp, sizeof (*hdrp));
	}

	/*
	 * remove volatile part of header -- presumably the run-time fields
	 * between the on-media header and the three descriptor offsets;
	 * NOTE(review): confirm the size arithmetic matches struct pmemlog
	 * layout.
	 */
	VALGRIND_REMOVE_PMEM_MAPPING(&plp->addr,
			sizeof (struct pmemlog) -
			sizeof (struct pool_hdr) -
			3 * sizeof (uint64_t));

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	plp->addr = addr;
	plp->size = poolsize;
	plp->rdonly = rdonly;
	plp->is_pmem = is_pmem;

	if ((plp->rwlockp = Malloc(sizeof (*plp->rwlockp))) == NULL) {
		ERR("!Malloc for a RW lock");
		goto err;
	}

	/* pthread return code goes into errno for the "!..." message */
	if ((errno = pthread_rwlock_init(plp->rwlockp, NULL))) {
		ERR("!pthread_rwlock_init");
		goto err_free;
	}

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use.  It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the rest should be kept read-only (debug version only) */
	RANGE_RO(addr + sizeof (struct pool_hdr),
			poolsize - sizeof (struct pool_hdr));

	LOG(3, "plp %p", plp);
	return plp;

err_free:
	Free((void *)plp->rwlockp);
err:
	LOG(4, "error clean up");
	int oerrno = errno;
	VALGRIND_REMOVE_PMEM_MAPPING(addr, poolsize);
	util_unmap(addr, poolsize);
	errno = oerrno;
	return NULL;
}
/* * pmemblk_map_common -- (internal) map a block memory pool * * This routine does all the work, but takes a rdonly flag so internal * calls can map a read-only pool if required. * * Passing in bsize == 0 means a valid pool header must exist (which * will supply the block size). */ static PMEMblk * pmemblk_map_common(int fd, size_t bsize, int rdonly) { LOG(3, "fd %d bsize %zu rdonly %d", fd, bsize, rdonly); /* things free by "goto err" if not NULL */ void *addr = NULL; struct btt *bttp = NULL; pthread_mutex_t *locks = NULL; struct stat stbuf; if (fstat(fd, &stbuf) < 0) { LOG(1, "!fstat"); return NULL; } if (stbuf.st_size < PMEMBLK_MIN_POOL) { LOG(1, "size %zu smaller than %zu", stbuf.st_size, PMEMBLK_MIN_POOL); errno = EINVAL; return NULL; } if ((addr = util_map(fd, stbuf.st_size, rdonly)) == NULL) return NULL; /* util_map() set errno, called LOG */ /* check if the mapped region is located in persistent memory */ int is_pmem = pmem_is_pmem(addr, stbuf.st_size); /* opaque info lives at the beginning of mapped memory pool */ struct pmemblk *pbp = addr; struct pool_hdr hdr; memcpy(&hdr, &pbp->hdr, sizeof (hdr)); if (util_convert_hdr(&hdr)) { /* * valid header found */ if (strncmp(hdr.signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN)) { LOG(1, "wrong pool type: \"%s\"", hdr.signature); errno = EINVAL; goto err; } if (hdr.major != BLK_FORMAT_MAJOR) { LOG(1, "blk pool version %d (library expects %d)", hdr.major, BLK_FORMAT_MAJOR); errno = EINVAL; goto err; } size_t hdr_bsize = le32toh(pbp->bsize); if (bsize && bsize != hdr_bsize) { LOG(1, "wrong bsize (%zu), pool created with bsize %zu", bsize, hdr_bsize); errno = EINVAL; goto err; } bsize = hdr_bsize; LOG(3, "using block size from header: %zu", bsize); int retval = util_feature_check(&hdr, BLK_FORMAT_INCOMPAT, BLK_FORMAT_RO_COMPAT, BLK_FORMAT_COMPAT); if (retval < 0) goto err; else if (retval == 0) rdonly = 1; } else { /* * no valid header was found */ if (rdonly) { LOG(1, "read-only and no header found"); errno = EROFS; goto 
err; } LOG(3, "creating new blk memory pool"); struct pool_hdr *hdrp = &pbp->hdr; memset(hdrp, '\0', sizeof (*hdrp)); strncpy(hdrp->signature, BLK_HDR_SIG, POOL_HDR_SIG_LEN); hdrp->major = htole32(BLK_FORMAT_MAJOR); hdrp->compat_features = htole32(BLK_FORMAT_COMPAT); hdrp->incompat_features = htole32(BLK_FORMAT_INCOMPAT); hdrp->ro_compat_features = htole32(BLK_FORMAT_RO_COMPAT); uuid_generate(hdrp->uuid); hdrp->crtime = htole64((uint64_t)time(NULL)); util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1); hdrp->checksum = htole64(hdrp->checksum); /* store pool's header */ libpmem_persist(is_pmem, hdrp, sizeof (*hdrp)); /* create rest of required metadata */ pbp->bsize = htole32(bsize); libpmem_persist(is_pmem, &pbp->bsize, sizeof (bsize)); } /* * Use some of the memory pool area for run-time info. This * run-time state is never loaded from the file, it is always * created here, so no need to worry about byte-order. */ pbp->addr = addr; pbp->size = stbuf.st_size; pbp->rdonly = rdonly; pbp->is_pmem = is_pmem; pbp->data = addr + roundup(sizeof (*pbp), BLK_FORMAT_DATA_ALIGN); pbp->datasize = (pbp->addr + pbp->size) - pbp->data; LOG(4, "data area %p data size %zu bsize %zu", pbp->data, pbp->datasize, bsize); int ncpus = sysconf(_SC_NPROCESSORS_ONLN); if (ncpus < 1) ncpus = 1; bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.uuid, ncpus, pbp, &ns_cb); if (bttp == NULL) goto err; /* btt_init set errno, called LOG */ pbp->bttp = bttp; pbp->nlane = btt_nlane(pbp->bttp); pbp->next_lane = 0; if ((locks = Malloc(pbp->nlane * sizeof (*locks))) == NULL) { LOG(1, "!Malloc for lane locks"); goto err; } for (int i = 0; i < pbp->nlane; i++) if (pthread_mutex_init(&locks[i], NULL) < 0) { LOG(1, "!pthread_mutex_init"); goto err; } pbp->locks = locks; #ifdef DEBUG /* initialize debug lock */ if (pthread_mutex_init(&pbp->write_lock, NULL) < 0) { LOG(1, "!pthread_mutex_init"); goto err; } #endif /* * If possible, turn off all permissions on the pool header page. 
* * The prototype PMFS doesn't allow this when large pages are in * use not it is not considered an error if this fails. */ util_range_none(addr, sizeof (struct pool_hdr)); /* the data area should be kept read-only for debug version */ RANGE_RO(pbp->data, pbp->datasize); LOG(3, "pbp %p", pbp); return pbp; err: LOG(4, "error clean up"); int oerrno = errno; if (locks) Free((void *)locks); if (bttp) btt_fini(bttp); util_unmap(addr, stbuf.st_size); errno = oerrno; return NULL; }