/*
 * util_map_part -- (internal) map a part of a pool set
 */
static int
util_map_part(struct pool_set_part *part, void *addr, size_t size,
	off_t offset, int flags)
{
	LOG(3, "part %p addr %p size %zu offset %ju flags %d",
		part, addr, size, offset, flags);

	ASSERTeq((uintptr_t)addr % Pagesize, 0);
	ASSERTeq(offset % Pagesize, 0);
	ASSERTeq(size % Pagesize, 0);

	part->size = size ? size : (part->filesize & ~(Pagesize - 1)) - offset;

	part->addr = mmap(addr, part->size,
		PROT_READ|PROT_WRITE, flags, part->fd, offset);

	if (part->addr == MAP_FAILED) {
		ERR("!mmap: %s", part->path);
		return -1;
	}

	if (addr != NULL && (flags & MAP_FIXED) && part->addr != addr) {
		ERR("!mmap: %s", part->path);
		munmap(addr, size);
		return -1;
	}

	VALGRIND_REGISTER_PMEM_MAPPING(part->addr, part->size);
	VALGRIND_REGISTER_PMEM_FILE(part->fd, part->addr, part->size, offset);

	return 0;
}
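/*
 * A minimal standalone sketch (not part of the library) of the part-size
 * arithmetic used above when 'size' is 0: the part's file size is rounded
 * down to a page boundary and the mapping offset is subtracted.  The
 * 'page_size' constant and the example values are hypothetical.
 */
#include <stdio.h>
#include <stddef.h>

int
main(void)
{
	size_t page_size = 4096;
	size_t filesize = 10 * 1024 * 1024 + 123;	/* not page-aligned */
	size_t offset = 8192;				/* page-aligned */

	/* same expression as util_map_part() uses when size == 0 */
	size_t part_size = (filesize & ~(page_size - 1)) - offset;

	/* prints 10477568: 10485883 rounded down to 10485760, minus 8192 */
	printf("mapping %zu bytes\n", part_size);
	return 0;
}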
/*
 * util_map_hdr -- (internal) map a header of a pool set
 */
static int
util_map_hdr(struct pool_set_part *part, size_t size, off_t offset, int flags)
{
	LOG(3, "part %p size %zu offset %ju flags %d",
		part, size, offset, flags);

	ASSERTne(size, 0);
	ASSERTeq(size % Pagesize, 0);
	ASSERTeq(offset % Pagesize, 0);

	part->hdrsize = size;
	part->hdr = mmap(NULL, part->hdrsize,
		PROT_READ|PROT_WRITE, flags, part->fd, offset);

	if (part->hdr == MAP_FAILED) {
		ERR("!mmap: %s", part->path);
		return -1;
	}

	VALGRIND_REGISTER_PMEM_MAPPING(part->hdr, part->hdrsize);
	VALGRIND_REGISTER_PMEM_FILE(part->fd, part->hdr, part->hdrsize, offset);

	return 0;
}
/*
 * pmem_map -- map the entire file for read/write access
 */
void *
pmem_map(int fd)
{
	LOG(3, "fd %d", fd);

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		ERR("!fstat");
		return NULL;
	}

	void *addr;
	if ((addr = util_map(fd, stbuf.st_size, 0)) == NULL)
		return NULL;	/* util_map() set errno, called LOG */

	LOG(3, "returning %p", addr);

	VALGRIND_REGISTER_PMEM_MAPPING(addr, stbuf.st_size);
	VALGRIND_REGISTER_PMEM_FILE(fd, addr, stbuf.st_size, 0);

	return addr;
}
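/*
 * Hedged usage sketch for pmem_map() above (error handling trimmed).
 * The path "/pmem/myfile" is hypothetical; pmem_is_pmem(), pmem_persist()
 * and pmem_msync() are the library's detection and flush primitives.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <libpmem.h>

static void
pmem_map_example(void)
{
	int fd = open("/pmem/myfile", O_RDWR);	/* hypothetical path */
	if (fd < 0)
		return;

	struct stat stbuf;
	if (fstat(fd, &stbuf) < 0) {
		close(fd);
		return;
	}

	void *pmp = pmem_map(fd);
	close(fd);	/* fd is no longer needed once mapped */
	if (pmp == NULL)
		return;

	strcpy(pmp, "hello, persistent memory");

	/* flush to persistence - true pmem vs. msync fallback */
	if (pmem_is_pmem(pmp, stbuf.st_size))
		pmem_persist(pmp, stbuf.st_size);
	else
		pmem_msync(pmp, stbuf.st_size);

	munmap(pmp, stbuf.st_size);
}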
/*
 * util_map_hdr -- map a header of a pool set
 */
int
util_map_hdr(struct pool_set_part *part, int flags)
{
	LOG(3, "part %p flags %d", part, flags);

	COMPILE_ERROR_ON(POOL_HDR_SIZE == 0);
	ASSERTeq(POOL_HDR_SIZE % Pagesize, 0);

	void *hdrp = mmap(NULL, POOL_HDR_SIZE,
		PROT_READ|PROT_WRITE, flags, part->fd, 0);

	if (hdrp == MAP_FAILED) {
		ERR("!mmap: %s", part->path);
		return -1;
	}

	part->hdrsize = POOL_HDR_SIZE;
	part->hdr = hdrp;

	VALGRIND_REGISTER_PMEM_MAPPING(part->hdr, part->hdrsize);
	VALGRIND_REGISTER_PMEM_FILE(part->fd, part->hdr, part->hdrsize, 0);

	return 0;
}
/*
 * util_replica_open -- (internal) open a memory pool replica
 */
static int
util_replica_open(struct pool_set *set, unsigned repidx, int flags,
	size_t hdrsize)
{
	LOG(3, "set %p repidx %u flags %d hdrsize %zu",
		set, repidx, flags, hdrsize);

	struct pool_replica *rep = set->replica[repidx];

	rep->repsize -= (rep->nparts - 1) * hdrsize;

	/* determine a hint address for mmap() */
	void *addr = util_map_hint(rep->repsize); /* XXX - randomize */
	if (addr == NULL) {
		ERR("cannot find a contiguous region of given size");
		return -1;
	}

	/* map the first part and reserve space for remaining parts */
	if (util_map_part(&rep->part[0], addr, rep->repsize, 0, flags) != 0) {
		LOG(2, "pool mapping failed - part #0");
		return -1;
	}

	VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr, rep->part[0].size);
	VALGRIND_REGISTER_PMEM_FILE(rep->part[0].fd,
		rep->part[0].addr, rep->part[0].size, 0);

	/* map all headers - don't care about the address */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_map_hdr(&rep->part[p], hdrsize, 0, flags) != 0) {
			LOG(2, "header mapping failed - part #%u", p);
			goto err;
		}
	}

	size_t mapsize = rep->part[0].filesize & ~(Pagesize - 1);
	addr = (char *)rep->part[0].addr + mapsize;

	/*
	 * map the remaining parts of the usable pool space
	 * (4K-aligned)
	 */
	for (unsigned p = 1; p < rep->nparts; p++) {
		/* map data part */
		if (util_map_part(&rep->part[p], addr, 0, hdrsize,
				flags | MAP_FIXED) != 0) {
			LOG(2, "usable space mapping failed - part #%u", p);
			goto err;
		}

		VALGRIND_REGISTER_PMEM_FILE(rep->part[p].fd,
			rep->part[p].addr, rep->part[p].size, hdrsize);

		mapsize += rep->part[p].size;
		addr = (char *)addr + rep->part[p].size;
	}

	rep->is_pmem = pmem_is_pmem(rep->part[0].addr, rep->part[0].size);

	ASSERTeq(mapsize, rep->repsize);

	/* calculate pool size - choose the smallest replica size */
	if (rep->repsize < set->poolsize)
		set->poolsize = rep->repsize;

	LOG(3, "replica addr %p", rep->part[0].addr);
	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	for (unsigned p = 0; p < rep->nparts; p++)
		util_unmap_hdr(&rep->part[p]);
	util_unmap_part(&rep->part[0]);
	errno = oerrno;
	return -1;
}
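/*
 * Resulting address-space layout for one replica, as established by
 * util_replica_open() above (the util_replica_create() variants later in
 * this file build the same layout).  Part 0 is mapped at the hint address
 * with a reservation covering the whole repsize; each subsequent part's
 * usable space is then mapped with MAP_FIXED directly after the previous
 * one, so the pool appears as a single contiguous region:
 *
 *   addr
 *   +-------------------+------------------+------------------+
 *   | part 0 (hdr+data) | part 1 data      | part 2 data      |
 *   +-------------------+------------------+------------------+
 *   |<----------------------- repsize ----------------------->|
 *
 * Part headers (hdrsize each) are mapped separately at arbitrary
 * addresses via util_map_hdr() and are not part of this region.
 */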
/*
 * pmemlog_map_common -- (internal) map a log memory pool
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * If empty flag is set, the file is assumed to be a new memory pool, and
 * a new pool header is created.  Otherwise, a valid header must exist.
 */
static PMEMlogpool *
pmemlog_map_common(int fd, size_t poolsize, int rdonly, int empty)
{
	LOG(3, "fd %d poolsize %zu rdonly %d empty %d",
		fd, poolsize, rdonly, empty);

	void *addr;
	if ((addr = util_map(fd, poolsize, rdonly)) == NULL) {
		(void) close(fd);
		return NULL;	/* util_map() set errno, called LOG */
	}

	VALGRIND_REGISTER_PMEM_MAPPING(addr, poolsize);
	VALGRIND_REGISTER_PMEM_FILE(fd, addr, poolsize, 0);

	(void) close(fd);

	/* check if the mapped region is located in persistent memory */
	int is_pmem = pmem_is_pmem(addr, poolsize);

	/* opaque info lives at the beginning of mapped memory pool */
	struct pmemlog *plp = addr;

	if (!empty) {
		struct pool_hdr hdr;

		memcpy(&hdr, &plp->hdr, sizeof (hdr));

		if (!util_convert_hdr(&hdr)) {
			errno = EINVAL;
			goto err;
		}

		/*
		 * valid header found
		 */
		if (strncmp(hdr.signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN)) {
			ERR("wrong pool type: \"%s\"", hdr.signature);
			errno = EINVAL;
			goto err;
		}

		if (hdr.major != LOG_FORMAT_MAJOR) {
			ERR("log pool version %d (library expects %d)",
				hdr.major, LOG_FORMAT_MAJOR);
			errno = EINVAL;
			goto err;
		}

		/* XXX - pools sets / replicas */
		if (memcmp(hdr.uuid, hdr.prev_part_uuid, POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.next_part_uuid, POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.prev_repl_uuid, POOL_HDR_UUID_LEN) ||
		    memcmp(hdr.uuid, hdr.next_repl_uuid, POOL_HDR_UUID_LEN)) {
			ERR("wrong UUID");
			errno = EINVAL;
			goto err;
		}

		uint64_t hdr_start = le64toh(plp->start_offset);
		uint64_t hdr_end = le64toh(plp->end_offset);
		uint64_t hdr_write = le64toh(plp->write_offset);

		if ((hdr_start != roundup(sizeof (*plp),
					LOG_FORMAT_DATA_ALIGN)) ||
				(hdr_end != poolsize) ||
				(hdr_start > hdr_end)) {
			ERR("wrong start/end offsets (start: %ju end: %ju), "
				"pool size %zu",
				hdr_start, hdr_end, poolsize);
			errno = EINVAL;
			goto err;
		}

		if ((hdr_write > hdr_end) || (hdr_write < hdr_start)) {
			ERR("wrong write offset "
				"(start: %ju end: %ju write: %ju)",
				hdr_start, hdr_end, hdr_write);
			errno = EINVAL;
			goto err;
		}

		LOG(3, "start: %ju, end: %ju, write: %ju",
			hdr_start, hdr_end, hdr_write);

		int retval = util_feature_check(&hdr, LOG_FORMAT_INCOMPAT,
				LOG_FORMAT_RO_COMPAT, LOG_FORMAT_COMPAT);
		if (retval < 0)
			goto err;
		else if (retval == 0)
			rdonly = 1;
	} else {
		LOG(3, "creating new log memory pool");

		ASSERTeq(rdonly, 0);

		struct pool_hdr *hdrp = &plp->hdr;

		/* check if the pool header is all zero */
		if (!util_is_zeroed(hdrp, sizeof (*hdrp))) {
			ERR("Non-empty file detected");
			errno = EINVAL;
			goto err;
		}

		/* create required metadata first */
		plp->start_offset = htole64(roundup(sizeof (*plp),
						LOG_FORMAT_DATA_ALIGN));
		plp->end_offset = htole64(poolsize);
		plp->write_offset = plp->start_offset;

		/* store non-volatile part of pool's descriptor */
		pmem_msync(&plp->start_offset, 3 * sizeof (uint64_t));

		/* create pool header */
		strncpy(hdrp->signature, LOG_HDR_SIG, POOL_HDR_SIG_LEN);
		hdrp->major = htole32(LOG_FORMAT_MAJOR);
		hdrp->compat_features = htole32(LOG_FORMAT_COMPAT);
		hdrp->incompat_features = htole32(LOG_FORMAT_INCOMPAT);
		hdrp->ro_compat_features = htole32(LOG_FORMAT_RO_COMPAT);
		uuid_generate(hdrp->uuid);
		/* XXX - pools sets / replicas */
		uuid_generate(hdrp->poolset_uuid);
		memcpy(hdrp->prev_part_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_part_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->prev_repl_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_repl_uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
		hdrp->crtime = htole64((uint64_t)time(NULL));

		if (util_get_arch_flags(&hdrp->arch_flags)) {
			ERR("Reading architecture flags failed");
			errno = EINVAL;
			goto err;
		}

		hdrp->arch_flags.alignment_desc =
			htole64(hdrp->arch_flags.alignment_desc);
		hdrp->arch_flags.e_machine =
			htole16(hdrp->arch_flags.e_machine);

		util_checksum(hdrp, sizeof (*hdrp), &hdrp->checksum, 1);

		/* store pool's header */
		pmem_msync(hdrp, sizeof (*hdrp));
	}

	/* remove volatile part of header */
	VALGRIND_REMOVE_PMEM_MAPPING(&plp->addr,
			sizeof (struct pmemlog) -
			sizeof (struct pool_hdr) -
			3 * sizeof (uint64_t));

	/*
	 * Use some of the memory pool area for run-time info.  This
	 * run-time state is never loaded from the file, it is always
	 * created here, so no need to worry about byte-order.
	 */
	plp->addr = addr;
	plp->size = poolsize;
	plp->rdonly = rdonly;
	plp->is_pmem = is_pmem;

	if ((plp->rwlockp = Malloc(sizeof (*plp->rwlockp))) == NULL) {
		ERR("!Malloc for a RW lock");
		goto err;
	}

	if ((errno = pthread_rwlock_init(plp->rwlockp, NULL))) {
		ERR("!pthread_rwlock_init");
		goto err_free;
	}

	/*
	 * If possible, turn off all permissions on the pool header page.
	 *
	 * The prototype PMFS doesn't allow this when large pages are in
	 * use.  It is not considered an error if this fails.
	 */
	util_range_none(addr, sizeof (struct pool_hdr));

	/* the rest should be kept read-only (debug version only) */
	RANGE_RO(addr + sizeof (struct pool_hdr),
			poolsize - sizeof (struct pool_hdr));

	LOG(3, "plp %p", plp);
	return plp;

err_free:
	Free((void *)plp->rwlockp);
err:
	LOG(4, "error clean up");
	int oerrno = errno;
	VALGRIND_REMOVE_PMEM_MAPPING(addr, poolsize);
	util_unmap(addr, poolsize);
	errno = oerrno;
	return NULL;
}
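/*
 * Hedged usage sketch for the log pool mapped above: the public
 * pmemlog_create()/pmemlog_append() entry points end up in
 * pmemlog_map_common().  The path and pool size below are hypothetical
 * (libpmemlog enforces a minimum pool size of PMEMLOG_MIN_POOL, 2 MB).
 */
#include <stdio.h>
#include <string.h>
#include <libpmemlog.h>

static void
pmemlog_example(void)
{
	/* hypothetical path; 2 MB is the minimum pool size */
	PMEMlogpool *plp = pmemlog_create("/pmem/logfile",
			2 * 1024 * 1024, 0666);
	if (plp == NULL)
		return;

	const char msg[] = "hello, log pool";
	if (pmemlog_append(plp, msg, strlen(msg)) < 0)
		perror("pmemlog_append");

	/* current write offset within the usable log space */
	printf("write offset: %lld\n", (long long)pmemlog_tell(plp));

	pmemlog_close(plp);
}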
/*
 * util_replica_create -- (internal) create a new memory pool replica
 */
static int
util_replica_create(struct pool_set *set, unsigned repidx, int flags,
	const char *sig, uint32_t major, uint32_t compat, uint32_t incompat,
	uint32_t ro_compat, const unsigned char *prev_repl_uuid,
	const unsigned char *next_repl_uuid, const unsigned char *arch_flags)
{
	LOG(3, "set %p repidx %u flags %d sig %.8s major %u "
		"compat %#x incompat %#x ro_compat %#x "
		"prev_repl_uuid %p next_repl_uuid %p arch_flags %p",
		set, repidx, flags, sig, major, compat, incompat, ro_compat,
		prev_repl_uuid, next_repl_uuid, arch_flags);

	struct pool_replica *rep = set->replica[repidx];

	/* determine a hint address for mmap() */
	void *addr = util_map_hint(rep->repsize, 0);
	if (addr == MAP_FAILED) {
		ERR("cannot find a contiguous region of given size");
		return -1;
	}

	/* map the first part and reserve space for remaining parts */
	/* XXX investigate this idea of reserving space on Windows */
	if (util_map_part(&rep->part[0], addr, rep->repsize, 0, flags) != 0) {
		LOG(2, "pool mapping failed - part #0");
		return -1;
	}

	VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr, rep->part[0].size);
	VALGRIND_REGISTER_PMEM_FILE(rep->part[0].fd,
		rep->part[0].addr, rep->part[0].size, 0);

	/* map all headers - don't care about the address */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_map_hdr(&rep->part[p], flags) != 0) {
			LOG(2, "header mapping failed - part #%u", p);
			goto err;
		}
	}

	/* create headers, set UUID's */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_header_create(set, repidx, p, sig, major,
				compat, incompat, ro_compat,
				prev_repl_uuid, next_repl_uuid,
				arch_flags) != 0) {
			LOG(2, "header creation failed - part #%u", p);
			goto err;
		}
	}

	/* unmap all headers */
	for (unsigned p = 0; p < rep->nparts; p++)
		util_unmap_hdr(&rep->part[p]);

	set->zeroed &= rep->part[0].created;

	size_t mapsize = rep->part[0].filesize & ~(Pagesize - 1);
	addr = (char *)rep->part[0].addr + mapsize;

	/*
	 * map the remaining parts of the usable pool space (4K-aligned)
	 */
	for (unsigned p = 1; p < rep->nparts; p++) {
		/* map data part */
		if (util_map_part(&rep->part[p], addr, 0, POOL_HDR_SIZE,
				flags | MAP_FIXED) != 0) {
			LOG(2, "usable space mapping failed - part #%u", p);
			goto err;
		}

		VALGRIND_REGISTER_PMEM_FILE(rep->part[p].fd,
			rep->part[p].addr, rep->part[p].size, POOL_HDR_SIZE);

		mapsize += rep->part[p].size;
		set->zeroed &= rep->part[p].created;
		addr = (char *)addr + rep->part[p].size;
	}

	rep->is_pmem = pmem_is_pmem(rep->part[0].addr, rep->part[0].size);

	ASSERTeq(mapsize, rep->repsize);

	LOG(3, "replica addr %p", rep->part[0].addr);
	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	for (unsigned p = 0; p < rep->nparts; p++)
		util_unmap_hdr(&rep->part[p]);
	util_unmap_part(&rep->part[0]);
	errno = oerrno;
	return -1;
}
/*
 * util_replica_create -- (internal) create a new memory pool replica
 */
static int
util_replica_create(struct pool_set *set, unsigned repidx, int flags,
	size_t hdrsize, const char *sig, uint32_t major, uint32_t compat,
	uint32_t incompat, uint32_t ro_compat)
{
	LOG(3, "set %p repidx %u flags %d hdrsize %zu sig %s major %u "
		"compat %#x incompat %#x ro_compat %#x",
		set, repidx, flags, hdrsize, sig, major, compat, incompat,
		ro_compat);

	struct pool_replica *rep = set->replica[repidx];

	rep->repsize -= (rep->nparts - 1) * hdrsize;

	/* determine a hint address for mmap() */
	void *addr = util_map_hint(rep->repsize); /* XXX - randomize */
	if (addr == NULL) {
		ERR("cannot find a contiguous region of given size");
		return -1;
	}

	/* map the first part and reserve space for remaining parts */
	if (util_map_part(&rep->part[0], addr, rep->repsize, 0, flags) != 0) {
		LOG(2, "pool mapping failed - part #0");
		return -1;
	}

	VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr, rep->part[0].size);
	VALGRIND_REGISTER_PMEM_FILE(rep->part[0].fd,
		rep->part[0].addr, rep->part[0].size, 0);

	/* map all the remaining headers - don't care about the address */
	for (unsigned p = 1; p < rep->nparts; p++) {
		if (util_map_part(&rep->part[p], NULL,
				hdrsize, 0, flags) != 0) {
			LOG(2, "header mapping failed - part #%u", p);
			goto err;
		}
		VALGRIND_REGISTER_PMEM_FILE(rep->part[p].fd,
			rep->part[p].addr, rep->part[p].size, 0);
	}

	/* create headers, set UUID's */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_header_create(set, repidx, p, sig, major,
				compat, incompat, ro_compat) != 0) {
			LOG(2, "header creation failed - part #%u", p);
			goto err;
		}
	}

	set->zeroed &= rep->part[0].created;

	size_t mapsize = rep->part[0].filesize & ~(Pagesize - 1);
	addr = (char *)rep->part[0].addr + mapsize;

	/*
	 * unmap headers; map the remaining parts of the usable pool space
	 * (4K-aligned)
	 */
	for (unsigned p = 1; p < rep->nparts; p++) {
		/* unmap header */
		if (util_unmap_part(&rep->part[p]) != 0) {
			LOG(2, "header unmapping failed - part #%u", p);
		}

		/* map data part */
		if (util_map_part(&rep->part[p], addr, 0, hdrsize,
				flags | MAP_FIXED) != 0) {
			LOG(2, "usable space mapping failed - part #%u", p);
			goto err;
		}

		VALGRIND_REGISTER_PMEM_FILE(rep->part[p].fd,
			rep->part[p].addr, rep->part[p].size, hdrsize);

		mapsize += rep->part[p].size;
		set->zeroed &= rep->part[p].created;
		addr = (char *)addr + rep->part[p].size;
	}

	rep->is_pmem = pmem_is_pmem(rep->part[0].addr, rep->part[0].size);

	ASSERTeq(mapsize, rep->repsize);

	/* calculate pool size - choose the smallest replica size */
	if (rep->repsize < set->poolsize)
		set->poolsize = rep->repsize;

	LOG(3, "replica addr %p", rep->part[0].addr);
	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	VALGRIND_REMOVE_PMEM_MAPPING(rep->part[0].addr, rep->part[0].size);
	util_unmap(rep->part[0].addr, rep->part[0].size);
	errno = oerrno;
	return -1;
}
static inline
#endif
void *
pmem_map_fileU(const char *path, size_t len, int flags,
	mode_t mode, size_t *mapped_lenp, int *is_pmemp)
{
	LOG(3, "path \"%s\" size %zu flags %x mode %o mapped_lenp %p "
		"is_pmemp %p", path, len, flags, mode, mapped_lenp, is_pmemp);

	int oerrno;
	int fd;
	int open_flags = O_RDWR;
	int delete_on_err = 0;
	int file_type = util_file_get_type(path);
#ifdef _WIN32
	open_flags |= O_BINARY;
#endif

	if (file_type == OTHER_ERROR)
		return NULL;

	if (flags & ~(PMEM_FILE_ALL_FLAGS)) {
		ERR("invalid flag specified %x", flags);
		errno = EINVAL;
		return NULL;
	}

	if (file_type == TYPE_DEVDAX) {
		if (flags & ~(PMEM_DAX_VALID_FLAGS)) {
			ERR("flag unsupported for Device DAX %x", flags);
			errno = EINVAL;
			return NULL;
		} else {
			/* we are ignoring all of the flags */
			flags = 0;
			ssize_t actual_len = util_file_get_size(path);
			if (actual_len < 0) {
				ERR("unable to read Device DAX size");
				errno = EINVAL;
				return NULL;
			}
			if (len != 0 && len != (size_t)actual_len) {
				ERR("Device DAX length must be either 0 or "
					"the exact size of the device: %zu",
					actual_len);
				errno = EINVAL;
				return NULL;
			}
			len = 0;
		}
	}

	if (flags & PMEM_FILE_CREATE) {
		if ((os_off_t)len < 0) {
			ERR("invalid file length %zu", len);
			errno = EINVAL;
			return NULL;
		}
		open_flags |= O_CREAT;
	}

	if (flags & PMEM_FILE_EXCL)
		open_flags |= O_EXCL;

	if ((len != 0) && !(flags & PMEM_FILE_CREATE)) {
		ERR("non-zero 'len' not allowed without PMEM_FILE_CREATE");
		errno = EINVAL;
		return NULL;
	}

	if ((len == 0) && (flags & PMEM_FILE_CREATE)) {
		ERR("zero 'len' not allowed with PMEM_FILE_CREATE");
		errno = EINVAL;
		return NULL;
	}

	if ((flags & PMEM_FILE_TMPFILE) && !(flags & PMEM_FILE_CREATE)) {
		ERR("PMEM_FILE_TMPFILE not allowed without PMEM_FILE_CREATE");
		errno = EINVAL;
		return NULL;
	}

	if (flags & PMEM_FILE_TMPFILE) {
		if ((fd = util_tmpfile(path,
				OS_DIR_SEP_STR"pmem.XXXXXX",
				open_flags & O_EXCL)) < 0) {
			LOG(2, "failed to create temporary file at \"%s\"",
				path);
			return NULL;
		}
	} else {
		if ((fd = os_open(path, open_flags, mode)) < 0) {
			ERR("!open %s", path);
			return NULL;
		}
		if ((flags & PMEM_FILE_CREATE) && (flags & PMEM_FILE_EXCL))
			delete_on_err = 1;
	}

	if (flags & PMEM_FILE_CREATE) {
		/*
		 * Always set length of file to 'len'.
		 * (May either extend or truncate existing file.)
		 */
		if (os_ftruncate(fd, (os_off_t)len) != 0) {
			ERR("!ftruncate");
			goto err;
		}
		if ((flags & PMEM_FILE_SPARSE) == 0) {
			if ((errno = os_posix_fallocate(fd, 0,
					(os_off_t)len)) != 0) {
				ERR("!posix_fallocate");
				goto err;
			}
		}
	} else {
		ssize_t actual_size = util_file_get_size(path);
		if (actual_size < 0) {
			ERR("stat %s: negative size", path);
			errno = EINVAL;
			goto err;
		}
		len = (size_t)actual_size;
	}

	void *addr = pmem_map_register(fd, len, path,
			file_type == TYPE_DEVDAX);
	if (addr == NULL)
		goto err;

	if (mapped_lenp != NULL)
		*mapped_lenp = len;

	if (is_pmemp != NULL)
		*is_pmemp = pmem_is_pmem(addr, len);

	LOG(3, "returning %p", addr);

	VALGRIND_REGISTER_PMEM_MAPPING(addr, len);
	VALGRIND_REGISTER_PMEM_FILE(fd, addr, len, 0);

	(void) os_close(fd);

	return addr;

err:
	oerrno = errno;
	(void) os_close(fd);
	if (delete_on_err)
		(void) os_unlink(path);
	errno = oerrno;
	return NULL;
}
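/*
 * Hedged usage sketch for the mapping routine above, via the public
 * pmem_map_file() wrapper (on Windows the UTF-8/UTF-16 variants resolve
 * to pmem_map_fileU/pmem_map_fileW).  Path and length are hypothetical.
 */
#include <string.h>
#include <libpmem.h>

static void
pmem_map_file_example(void)
{
	size_t mapped_len;
	int is_pmem;

	/* create (or resize) a 4 MB pool file at a hypothetical path */
	char *addr = pmem_map_file("/pmem/myfile", 4 * 1024 * 1024,
			PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem);
	if (addr == NULL)
		return;

	strcpy(addr, "hello");

	/* flush to persistence - true pmem vs. msync fallback */
	if (is_pmem)
		pmem_persist(addr, mapped_len);
	else
		pmem_msync(addr, mapped_len);

	pmem_unmap(addr, mapped_len);
}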