Exemple #1
 * list_insert_new -- allocate and insert element to oob and user lists
 * pop         - pmemobj pool handle
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head, must be locked if not NULL
 * dest        - destination on user list
 * before      - insert before/after destination on user list
 * size        - size of allocation, will be increased by OBJ_OOB_SIZE
 * constructor - object's constructor
 * arg         - argument for object's constructor
 * oidp        - pointer to target object ID
static int
list_insert_new(PMEMobjpool *pop,
	size_t pe_offset, struct list_head *user_head, PMEMoid dest, int before,
	size_t size, int (*constructor)(void *ctx, void *ptr,
	size_t usable_size, void *arg), void *arg, PMEMoid *oidp)
	LOG(3, NULL);
	ASSERT(user_head != NULL);

	int ret;

	struct lane_section *lane_section;

#ifdef DEBUG
	int r = pmemobj_mutex_assert_locked(pop, &user_head->lock);
	ASSERTeq(r, 0);

	lane_hold(pop, &lane_section, LANE_SECTION_LIST);

	ASSERTne(lane_section, NULL);
	ASSERTne(lane_section->layout, NULL);

	/* increase allocation size by oob header size */
	size += OBJ_OOB_SIZE;
	struct lane_list_layout *section =
		(struct lane_list_layout *)lane_section->layout;
	struct redo_log *redo = section->redo;
	size_t redo_index = 0;
	uint64_t sec_off_off = OBJ_PTR_TO_OFF(pop, &section->obj_offset);

	if (constructor) {
		if ((ret = pmalloc_construct(pop,
				&section->obj_offset, size,
				constructor, arg))) {
			goto err_pmalloc;
	} else {
		ret = pmalloc(pop, &section->obj_offset, size);
		if (ret) {
			goto err_pmalloc;

	uint64_t obj_doffset = section->obj_offset;

	ASSERT((ssize_t)pe_offset >= 0);

	dest = list_get_dest(pop, user_head, dest,
		(ssize_t)pe_offset, before);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				obj_doffset + pe_offset);

	struct list_entry *dest_entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				dest.off + pe_offset);

	struct list_args_insert args = {
		.dest = dest,
		.dest_entry_ptr = dest_entry_ptr,
		.head = user_head,
		.before = before,

	struct list_args_common args_common = {
		.obj_doffset = obj_doffset,
		.entry_ptr = entry_ptr,
		.pe_offset = (ssize_t)pe_offset,

	uint64_t next_offset;
	uint64_t prev_offset;

	/* insert element to user list */
	redo_index = list_insert_user(pop,
		redo, redo_index, &args, &args_common,
		&next_offset, &prev_offset);

	/* don't need to use redo log for filling new element */
	list_fill_entry_persist(pop, entry_ptr,
			next_offset, prev_offset);

	if (oidp != NULL) {
		if (OBJ_PTR_IS_VALID(pop, oidp))
			redo_index = list_set_oid_redo_log(pop, redo,
					redo_index, oidp, obj_doffset, 0);
		else {
			oidp->off = obj_doffset;
			oidp->pool_uuid_lo = pop->uuid_lo;

	/* clear the obj_offset in lane section */
	redo_log_store_last(pop->redo, redo, redo_index, sec_off_off, 0);

	redo_log_process(pop->redo, redo, REDO_NUM_ENTRIES);

	ret = 0;


	return ret;

 * list_insert_new_user -- allocate and insert element to oob and user lists
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head
 * dest        - destination on user list
 * before      - insert before/after destination on user list
 * size        - size of allocation, will be increased by OBJ_OOB_SIZE
 * constructor - object's constructor
 * arg         - argument for object's constructor
 * oidp        - pointer to target object ID
list_insert_new_user(PMEMobjpool *pop,
	size_t pe_offset, struct list_head *user_head, PMEMoid dest, int before,
	size_t size, int (*constructor)(void *ctx, void *ptr,
	size_t usable_size, void *arg), void *arg, PMEMoid *oidp)
	int ret;
	if ((ret = pmemobj_mutex_lock(pop, &user_head->lock))) {
		LOG(2, "pmemobj_mutex_lock failed");
		return ret;

	ret = list_insert_new(pop, pe_offset, user_head,
			dest, before, size, constructor, arg, oidp);

	pmemobj_mutex_unlock_nofail(pop, &user_head->lock);

	return ret;
Exemple #2
 * list_insert -- insert object to a single list
 * pop          - pmemobj handle
 * pe_offset    - offset to list entry on user list relative to user data
 * head         - list head
 * dest         - destination object ID
 * before       - before/after destination
 * oid          - target object ID
list_insert(PMEMobjpool *pop,
	ssize_t pe_offset, struct list_head *head,
	PMEMoid dest, int before,
	PMEMoid oid)
	LOG(3, NULL);
	ASSERTne(head, NULL);

	int ret;

	struct lane_section *lane_section;

	lane_hold(pop, &lane_section, LANE_SECTION_LIST);

	if ((ret = pmemobj_mutex_lock(pop, &head->lock))) {
		LOG(2, "pmemobj_mutex_lock failed");
		goto err;

	ASSERTne(lane_section, NULL);
	ASSERTne(lane_section->layout, NULL);

	struct lane_list_layout *section =
		(struct lane_list_layout *)lane_section->layout;
	struct redo_log *redo = section->redo;
	size_t redo_index = 0;

	dest = list_get_dest(pop, head, dest, pe_offset, before);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
			(uintptr_t)((ssize_t)oid.off + pe_offset));

	struct list_entry *dest_entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
			(uintptr_t)((ssize_t)dest.off + pe_offset));

	struct list_args_insert args = {
		.dest = dest,
		.dest_entry_ptr = dest_entry_ptr,
		.head = head,
		.before = before,

	struct list_args_common args_common = {
		.obj_doffset = oid.off,
		.entry_ptr = entry_ptr,
		.pe_offset = (ssize_t)pe_offset,

	uint64_t next_offset;
	uint64_t prev_offset;

	/* insert element to user list */
	redo_index = list_insert_user(pop, redo, redo_index,
			&args, &args_common, &next_offset, &prev_offset);

	/* fill entry of existing element using redo log */
	redo_index = list_fill_entry_redo_log(pop, redo, redo_index,
			&args_common, next_offset, prev_offset, 1);

	redo_log_set_last(pop->redo, redo, redo_index - 1);

	redo_log_process(pop->redo, redo, REDO_NUM_ENTRIES);

	pmemobj_mutex_unlock_nofail(pop, &head->lock);

	return ret;

 * list_remove_free -- remove from two lists and free an object
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head, *must* be locked if not NULL
 * oidp        - pointer to target object ID
static void
list_remove_free(PMEMobjpool *pop, size_t pe_offset,
	struct list_head *user_head, PMEMoid *oidp)
	LOG(3, NULL);
	ASSERT(user_head != NULL);

#ifdef DEBUG
	int r = pmemobj_mutex_assert_locked(pop, &user_head->lock);
	ASSERTeq(r, 0);

	struct lane_section *lane_section;

	lane_hold(pop, &lane_section, LANE_SECTION_LIST);

	ASSERTne(lane_section, NULL);
	ASSERTne(lane_section->layout, NULL);

	struct lane_list_layout *section =
		(struct lane_list_layout *)lane_section->layout;
	uint64_t sec_off_off = OBJ_PTR_TO_OFF(pop, &section->obj_offset);
	struct redo_log *redo = section->redo;
	size_t redo_index = 0;

	uint64_t obj_doffset = oidp->off;

	ASSERT((ssize_t)pe_offset >= 0);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				obj_doffset + pe_offset);

	struct list_args_remove args = {
		.pe_offset = (ssize_t)pe_offset,
		.head = user_head,
		.entry_ptr = entry_ptr,
		.obj_doffset = obj_doffset

	/* remove from user list */
	redo_index = list_remove_single(pop, redo, redo_index, &args);

	/* clear the oid */
	if (OBJ_PTR_IS_VALID(pop, oidp))
		redo_index = list_set_oid_redo_log(pop, redo, redo_index,
				oidp, 0, 1);
		oidp->off = 0;

	redo_log_store_last(pop->redo, redo, redo_index, sec_off_off,

	redo_log_process(pop->redo, redo, REDO_NUM_ENTRIES);

	 * Don't need to fill next and prev offsets of removing element
	 * because the element is freed.
	pfree(pop, &section->obj_offset);


 * list_remove_free_user -- remove from two lists and free an object
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head
 * oidp        - pointer to target object ID
list_remove_free_user(PMEMobjpool *pop, size_t pe_offset,
	struct list_head *user_head, PMEMoid *oidp)
	LOG(3, NULL);

	int ret;
	if ((ret = pmemobj_mutex_lock(pop, &user_head->lock))) {
		LOG(2, "pmemobj_mutex_lock failed");
		return ret;

	list_remove_free(pop, pe_offset, user_head, oidp);

	pmemobj_mutex_unlock_nofail(pop, &user_head->lock);

	return 0;
Exemple #3
 * CTL_WRITE_HANDLER(proto) -- creates a new allocation class
static int
CTL_WRITE_HANDLER(desc)(void *ctx,
	enum ctl_query_source source, void *arg, struct ctl_indexes *indexes)
	PMEMobjpool *pop = ctx;
	uint8_t id;
	struct alloc_class_collection *ac = heap_alloc_classes(&pop->heap);
	struct pobj_alloc_class_desc *p = arg;

	if (p->unit_size <= 0 || p->unit_size > PMEMOBJ_MAX_ALLOC_SIZE ||
		p->units_per_block <= 0) {
		errno = EINVAL;
		return -1;

	if (p->alignment != 0 && p->unit_size % p->alignment != 0) {
		ERR("unit size must be evenly divisible by alignment");
		errno = EINVAL;
		return -1;

	if (p->alignment > (MEGABYTE * 2)) {
		ERR("alignment cannot be larger than 2 megabytes");
		errno = EINVAL;
		return -1;

	enum header_type lib_htype = MAX_HEADER_TYPES;
	switch (p->header_type) {
			lib_htype = HEADER_LEGACY;
			lib_htype = HEADER_COMPACT;
			lib_htype = HEADER_NONE;
			ERR("invalid header type");
			errno = EINVAL;
			return -1;

	if (SLIST_EMPTY(indexes)) {
		if (alloc_class_find_first_free_slot(ac, &id) != 0) {
			ERR("no available free allocation class identifier");
			errno = EINVAL;
			return -1;
	} else {
		struct ctl_index *idx = SLIST_FIRST(indexes);
		ASSERTeq(strcmp(idx->name, "class_id"), 0);

		if (idx->value < 0 || idx->value >= MAX_ALLOCATION_CLASSES) {
			ERR("class id outside of the allowed range");
			errno = ERANGE;
			return -1;

		id = (uint8_t)idx->value;

		if (alloc_class_reserve(ac, id) != 0) {
			ERR("attempted to overwrite an allocation class");
			errno = EEXIST;
			return -1;

	size_t runsize_bytes =
		CHUNK_ALIGN_UP((p->units_per_block * p->unit_size) +

	/* aligning the buffer might require up to 'alignment' bytes */
	if (p->alignment != 0)
		runsize_bytes += p->alignment;

	uint32_t size_idx = (uint32_t)(runsize_bytes / CHUNKSIZE);
	if (size_idx > UINT16_MAX)
		size_idx = UINT16_MAX;

	struct alloc_class *c = alloc_class_new(id,
		heap_alloc_classes(&pop->heap), CLASS_RUN,
		lib_htype, p->unit_size, p->alignment, size_idx);
	if (c == NULL) {
		errno = EINVAL;
		return -1;

	if (heap_create_alloc_class_buckets(&pop->heap, c) != 0) {
		alloc_class_delete(ac, c);
		return -1;

	p->class_id = c->id;
	p->units_per_block = c->run.nallocs;

	return 0;
Exemple #4
 * status_answer_push -- (internal) push single answer to answers queue
static void
status_answer_push(struct check_data *data, struct check_status *st)
	TAILQ_INSERT_TAIL(&data->answers, st, next);
	POBJ_FOREACH_TYPE(pop, iter_c) {
		ASSERTeq(D_RO(iter_c)->value, TEST_VALUE);
Exemple #6
 * heap_vg_open -- notifies Valgrind about heap layout
heap_vg_open(struct palloc_heap *heap, object_callback cb,
	void *arg, int objects)
	ASSERTne(cb, NULL);
	VALGRIND_DO_MAKE_MEM_UNDEFINED(heap->layout, heap->size);

	struct heap_layout *layout = heap->layout;

	VALGRIND_DO_MAKE_MEM_DEFINED(&layout->header, sizeof(layout->header));

	unsigned zones = heap_max_zone(heap->size);

	struct memory_block m = MEMORY_BLOCK_NONE;
	for (unsigned i = 0; i < zones; ++i) {
		struct zone *z = ZID_TO_ZONE(layout, i);
		uint32_t chunks;
		m.zone_id = i;
		m.chunk_id = 0;

		VALGRIND_DO_MAKE_MEM_DEFINED(&z->header, sizeof(z->header));

		if (z->header.magic != ZONE_HEADER_MAGIC)

		chunks = z->header.size_idx;

		for (uint32_t c = 0; c < chunks; ) {
			struct chunk_header *hdr = &z->chunk_headers[c];
			m.chunk_id = c;

			VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));

			m.size_idx = hdr->size_idx;
			heap_vg_open_chunk(heap, cb, arg, objects, &m);
			m.block_off = 0;

			ASSERT(hdr->size_idx > 0);

			if (hdr->type == CHUNK_TYPE_RUN) {
				 * Mark run data headers as defined.
				for (unsigned j = 1; j < hdr->size_idx; ++j) {
					struct chunk_header *data_hdr =
						&z->chunk_headers[c + j];
						sizeof(struct chunk_header));
			} else {
				 * Mark unused chunk headers as not accessible.
					&z->chunk_headers[c + 1],
					(hdr->size_idx - 1) *
					sizeof(struct chunk_header));

			c += hdr->size_idx;

		/* mark all unused chunk headers after last as not accessible */
			(MAX_CHUNK - chunks) * sizeof(struct chunk_header));
Exemple #7
 * list_insert -- insert object to a single list
 * pop          - pmemobj handle
 * pe_offset    - offset to list entry on user list relative to user data
 * head         - list head
 * dest         - destination object ID
 * before       - before/after destination
 * oid          - target object ID
list_insert(PMEMobjpool *pop,
	ssize_t pe_offset, struct list_head *head,
	PMEMoid dest, int before,
	PMEMoid oid)
	LOG(3, NULL);
	ASSERTne(head, NULL);

	struct lane *lane;
	lane_hold(pop, &lane);

	int ret;

	if ((ret = pmemobj_mutex_lock(pop, &head->lock))) {
		errno = ret;
		LOG(2, "pmemobj_mutex_lock failed");
		ret = -1;
		goto err;

	struct operation_context *ctx = lane->external;

	dest = list_get_dest(pop, head, dest, pe_offset, before);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
			(uintptr_t)((ssize_t)oid.off + pe_offset));

	struct list_entry *dest_entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
			(uintptr_t)((ssize_t)dest.off + pe_offset));

	struct list_args_insert args = {
		.dest = dest,
		.dest_entry_ptr = dest_entry_ptr,
		.head = head,
		.before = before,

	struct list_args_common args_common = {
		.obj_doffset = oid.off,
		.entry_ptr = entry_ptr,
		.pe_offset = (ssize_t)pe_offset,

	uint64_t next_offset;
	uint64_t prev_offset;

	/* insert element to user list */
	list_insert_user(pop, ctx,
			&args, &args_common, &next_offset, &prev_offset);

	/* fill entry of existing element using redo log */
	list_fill_entry_redo_log(pop, ctx,
			&args_common, next_offset, prev_offset, 1);


	pmemobj_mutex_unlock_nofail(pop, &head->lock);

	ASSERT(ret == 0 || ret == -1);
	return ret;

 * list_remove_free -- remove from two lists and free an object
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head, *must* be locked if not NULL
 * oidp        - pointer to target object ID
static void
list_remove_free(PMEMobjpool *pop, size_t pe_offset,
	struct list_head *user_head, PMEMoid *oidp)
	LOG(3, NULL);
	ASSERT(user_head != NULL);

#ifdef DEBUG
	int r = pmemobj_mutex_assert_locked(pop, &user_head->lock);
	ASSERTeq(r, 0);

	struct lane *lane;
	lane_hold(pop, &lane);
	struct operation_context *ctx = lane->external;

	struct pobj_action deferred;
	palloc_defer_free(&pop->heap, oidp->off, &deferred);
	uint64_t obj_doffset = oidp->off;

	ASSERT((ssize_t)pe_offset >= 0);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				obj_doffset + pe_offset);

	struct list_args_remove args = {
		.pe_offset = (ssize_t)pe_offset,
		.head = user_head,
		.entry_ptr = entry_ptr,
		.obj_doffset = obj_doffset

	/* remove from user list */
	list_remove_single(pop, ctx, &args);

	/* clear the oid */
	if (OBJ_PTR_IS_VALID(pop, oidp))
		list_set_oid_redo_log(pop, ctx, oidp, 0, 1);
		oidp->off = 0;

	palloc_publish(&pop->heap, &deferred, 1, ctx);


 * list_remove_free_user -- remove from two lists and free an object
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head
 * oidp        - pointer to target object ID
list_remove_free_user(PMEMobjpool *pop, size_t pe_offset,
	struct list_head *user_head, PMEMoid *oidp)
	LOG(3, NULL);

	int ret;
	if ((ret = pmemobj_mutex_lock(pop, &user_head->lock))) {
		errno = ret;
		LOG(2, "pmemobj_mutex_lock failed");
		return -1;

	list_remove_free(pop, pe_offset, user_head, oidp);

	pmemobj_mutex_unlock_nofail(pop, &user_head->lock);

	return 0;
 * pmemalloc_init -- setup a Persistent Memory pool for use
 * Inputs:
 *	path -- path to the file which will contain the memory pool
 *	        If the file doesn't exist, it is created.  If it exists,
 *	        the state of the memory pool is initialized from the file.
 *	size -- size of the memory pool in bytes
 *		The size is only used when creating the memory pool
 *		the first time (the file created will be extended to
 *		that size).  The smallest size allowed is 1 meg.  The
 *		largest size allowed is whatever the underlying file
 *		system allows as a max file size.
 * Outputs:
 * 	An opaque memory pool handle is returned on success.  That
 * 	handle must be passed in to most of the other pmem routines.
 * 	On error, NULL is returned and errno is set.
 * This function must be called before any other pmem functions.
void *
pmemalloc_init(const char *path, size_t size)
	void *pmp;
	int err;
	int fd = -1;
	struct stat stbuf;

	DEBUG("path=%s size=0x%lx", path, size);

	if (stat(path, &stbuf) < 0) {
		struct clump cl = { 0 };
		struct pool_header hdr = { 0 };
		size_t lastclumpoff;

		if (errno != ENOENT)
			goto out;

		 * file didn't exist, we're creating a new memory pool
		if (size < PMEM_MIN_POOL_SIZE) {
			DEBUG("size %lu too small (must be at least %lu)",
					size, PMEM_MIN_POOL_SIZE);
			errno = EINVAL;
			goto out;

		ASSERTeq(sizeof(cl), PMEM_CHUNK_SIZE);
		ASSERTeq(sizeof(hdr), PMEM_PAGE_SIZE);

		if ((fd = open(path, O_CREAT|O_RDWR, 0666)) < 0)
			goto out;

		if ((errno = posix_fallocate(fd, 0, size)) != 0)
			goto out;

		 * location of last clump is calculated by rounding the file
		 * size down to a multiple of 64, and then subtracting off
		 * another 64 to hold the struct clump.  the last clump is
		 * indicated by a size of zero (so no write is necessary
		 * here since the file is initially zeros).
		lastclumpoff =
			(size & ~(PMEM_CHUNK_SIZE - 1)) - PMEM_CHUNK_SIZE;

		 * create the first clump to cover the entire pool
		cl.size = lastclumpoff - PMEM_CLUMP_OFFSET;
		if (pwrite(fd, &cl, sizeof(cl), PMEM_CLUMP_OFFSET) < 0)
			goto out;
		DEBUG("[0x%lx] created clump, size 0x%lx",
				PMEM_CLUMP_OFFSET, cl.size);

		 * write the pool header
		strcpy(hdr.signature, PMEM_SIGNATURE);
		hdr.totalsize = size;
        pthread_mutex_init( &hdr.pool_lock, NULL );
        pthread_mutex_init( &hdr.activation_lock, NULL );
		if (pwrite(fd, &hdr, sizeof(hdr), PMEM_HDR_OFFSET) < 0)
			goto out;

		if (fsync(fd) < 0)
			goto out;

	} else {
		if ((fd = open(path, O_RDWR)) < 0)
			goto out;
		size = stbuf.st_size;

		/* XXX handle recovery case 1 described below */

	 * map the file
	if ((pmp = pmem_map(fd, size)) == NULL)
		goto out;

	 * scan pool for recovery work, five kinds:
	 * 	1. pmem pool file isn't even fully set up
	 * 	2. RESERVED clumps that need to be freed
	 * 	3. ACTIVATING clumps that need to be ACTIVE
	 * 	4. FREEING clumps that need to be freed
	 * 	5. adjacent free clumps that need to be coalesced

	DEBUG("return pmp 0x%lx", pmp);
	return pmp;

	err = errno;
	if (fd != -1)
	errno = err;
	return NULL;
Exemple #9
 * memblock_validate_offset -- checks the state of any arbitrary offset within
 *	the heap.
 * This function traverses an entire zone, so use with caution.
enum memblock_state
memblock_validate_offset(struct palloc_heap *heap, uint64_t off)
	struct memory_block m = MEMORY_BLOCK_NONE;
	m.heap = heap;

	off -= HEAP_PTR_TO_OFF(heap, &heap->layout->zone0);
	m.zone_id = (uint32_t)(off / ZONE_MAX_SIZE);

	off -= (ZONE_MAX_SIZE * m.zone_id) + sizeof(struct zone);
	m.chunk_id = (uint32_t)(off / CHUNKSIZE);

	struct zone *z = ZID_TO_ZONE(heap->layout, m.zone_id);
	struct chunk_header *hdr = &z->chunk_headers[m.chunk_id];

	if (hdr->type == CHUNK_TYPE_RUN_DATA)
		m.chunk_id -= hdr->size_idx;

	off -= CHUNKSIZE * m.chunk_id;

	for (uint32_t i = 0; i < z->header.size_idx; ) {
		hdr = &z->chunk_headers[i];
		if (i + hdr->size_idx > m.chunk_id && i < m.chunk_id) {
			return MEMBLOCK_STATE_UNKNOWN; /* invalid chunk */
		} else if (m.chunk_id == i) {
		i += hdr->size_idx;
	ASSERTne(hdr, NULL);

	m.header_type = memblock_header_type(&m);

	if (hdr->type != CHUNK_TYPE_RUN) {
		if (header_type_to_size[m.header_type] != off)
		else if (hdr->type == CHUNK_TYPE_USED)
		else if (hdr->type == CHUNK_TYPE_FREE)
			return MEMBLOCK_FREE;

	if (header_type_to_size[m.header_type] > off)

	off -= header_type_to_size[m.header_type];

	m.type = off != 0 ? MEMORY_BLOCK_RUN : MEMORY_BLOCK_HUGE;
#ifdef DEBUG
	enum memory_block_type t = memblock_detect_type(&m, heap->layout);
	ASSERTeq(t, m.type);
	m.m_ops = &mb_ops[m.type];

	uint64_t unit_size = m.m_ops->block_size(&m);

	if (off != 0) { /* run */
		off -= RUN_METASIZE;
		m.block_off = (uint16_t)(off / unit_size);
		off -= m.block_off * unit_size;

	m.size_idx = CALC_SIZE_IDX(unit_size,

	ASSERTeq(off, 0);

	return m.m_ops->get_state(&m);
Exemple #10
 * list_insert_new -- allocate and insert element to oob and user lists
 * pop         - pmemobj pool handle
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head, must be locked if not NULL
 * dest        - destination on user list
 * before      - insert before/after destination on user list
 * size        - size of allocation, will be increased by OBJ_OOB_SIZE
 * constructor - object's constructor
 * arg         - argument for object's constructor
 * oidp        - pointer to target object ID
static int
list_insert_new(PMEMobjpool *pop,
	size_t pe_offset, struct list_head *user_head, PMEMoid dest, int before,
	size_t size, uint64_t type_num, int (*constructor)(void *ctx, void *ptr,
	size_t usable_size, void *arg), void *arg, PMEMoid *oidp)
	LOG(3, NULL);
	ASSERT(user_head != NULL);

	int ret;

#ifdef DEBUG
	int r = pmemobj_mutex_assert_locked(pop, &user_head->lock);
	ASSERTeq(r, 0);
	struct lane *lane;
	lane_hold(pop, &lane);

	struct pobj_action reserved;
	if (palloc_reserve(&pop->heap, size, constructor, arg,
		type_num, 0, 0, &reserved) != 0) {
		ret = -1;
		goto err_pmalloc;
	uint64_t obj_doffset = reserved.heap.offset;

	struct operation_context *ctx = lane->external;

	ASSERT((ssize_t)pe_offset >= 0);

	dest = list_get_dest(pop, user_head, dest,
		(ssize_t)pe_offset, before);

	struct list_entry *entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				obj_doffset + pe_offset);

	struct list_entry *dest_entry_ptr =
		(struct list_entry *)OBJ_OFF_TO_PTR(pop,
				dest.off + pe_offset);

	struct list_args_insert args = {
		.dest = dest,
		.dest_entry_ptr = dest_entry_ptr,
		.head = user_head,
		.before = before,

	struct list_args_common args_common = {
		.obj_doffset = obj_doffset,
		.entry_ptr = entry_ptr,
		.pe_offset = (ssize_t)pe_offset,

	uint64_t next_offset;
	uint64_t prev_offset;

	/* insert element to user list */
		ctx, &args, &args_common,
		&next_offset, &prev_offset);

	/* don't need to use redo log for filling new element */
	list_fill_entry_persist(pop, entry_ptr,
			next_offset, prev_offset);

	if (oidp != NULL) {
		if (OBJ_PTR_IS_VALID(pop, oidp)) {
			list_set_oid_redo_log(pop, ctx,
				oidp, obj_doffset, 0);
		} else {
			oidp->off = obj_doffset;
			oidp->pool_uuid_lo = pop->uuid_lo;

	palloc_publish(&pop->heap, &reserved, 1, ctx);

	ret = 0;


	ASSERT(ret == 0 || ret == -1);
	return ret;

 * list_insert_new_user -- allocate and insert element to oob and user lists
 * pop         - pmemobj pool handle
 * oob_head    - oob list head
 * pe_offset   - offset to list entry on user list relative to user data
 * user_head   - user list head
 * dest        - destination on user list
 * before      - insert before/after destination on user list
 * size        - size of allocation, will be increased by OBJ_OOB_SIZE
 * constructor - object's constructor
 * arg         - argument for object's constructor
 * oidp        - pointer to target object ID
list_insert_new_user(PMEMobjpool *pop,
	size_t pe_offset, struct list_head *user_head, PMEMoid dest, int before,
	size_t size, uint64_t type_num, int (*constructor)(void *ctx, void *ptr,
	size_t usable_size, void *arg), void *arg, PMEMoid *oidp)
	int ret;
	if ((ret = pmemobj_mutex_lock(pop, &user_head->lock))) {
		errno = ret;
		LOG(2, "pmemobj_mutex_lock failed");
		return -1;

	ret = list_insert_new(pop, pe_offset, user_head,
			dest, before, size, type_num, constructor, arg, oidp);

	pmemobj_mutex_unlock_nofail(pop, &user_head->lock);

	ASSERT(ret == 0 || ret == -1);
	return ret;
Exemple #11
static inline
pmempool_rmU(const char *path, int flags)
	LOG(3, "path %s flags %x", path, flags);
	int ret;

	if (flags & ~PMEMPOOL_RM_ALL_FLAGS) {
		ERR("invalid flags specified");
		errno = EINVAL;
		return -1;

	int is_poolset = util_is_poolset_file(path);
	if (is_poolset < 0) {
		os_stat_t buff;
		ret = os_stat(path, &buff);
		if (!ret) {
			if (S_ISDIR(buff.st_mode)) {
				errno = EISDIR;
				ERR("removing file failed");
				return -1;
		ERR_F(flags, "removing file failed");
		if (CHECK_FLAG(flags, FORCE))
			return 0;

		return -1;

	if (!is_poolset) {
		LOG(2, "%s: not a poolset file", path);
		return rm_local(path, flags, 0);

	LOG(2, "%s: poolset file", path);

	/* fill up pool_set structure */
	struct pool_set *set = NULL;
	int fd = os_open(path, O_RDONLY);
	if (fd == -1 || util_poolset_parse(&set, path, fd)) {
		ERR_F(flags, "parsing poolset file failed");
		if (fd != -1)
		if (CHECK_FLAG(flags, FORCE))
			return 0;
		return -1;

	if (set->remote) {
		/* ignore error - it will be handled in rm_remote() */
		(void) util_remote_load();


	struct cb_args args;
	args.flags = flags;
	args.error = 0;
	ret = util_poolset_foreach_part(path, rm_cb, &args);
	if (ret == -1) {
		ERR_F(flags, "parsing poolset file failed");
		if (CHECK_FLAG(flags, FORCE))
			return 0;

		return ret;

	ASSERTeq(ret, 0);

	if (args.error)
		return args.error;

		ret = rm_local(path, flags, 0);
		if (ret) {
			ERR_F(flags, "removing pool set file failed");
		} else {
			LOG(3, "%s: removed", path);

		if (CHECK_FLAG(flags, FORCE))
			return 0;

		return ret;

	return 0;
Exemple #12
main(int argc, char *argv[])
	START(argc, argv, "obj_direct");

	if (argc != 3)
		FATAL("usage: %s [directory] [# of pools]", argv[0]);

	int npools = atoi(argv[2]);
	const char *dir = argv[1];
	int r;

	PMEMobjpool *pops[npools];

	char path[MAX_PATH_LEN];
	for (int i = 0; i < npools; ++i) {
		snprintf(path, MAX_PATH_LEN, "%s/testfile%d", dir, i);
		pops[i] = pmemobj_create(path, LAYOUT_NAME, PMEMOBJ_MIN_POOL,

		if (pops[i] == NULL)

	PMEMoid oids[npools];
	PMEMoid tmpoids[npools];

	oids[0] = OID_NULL;
	ASSERTeq(pmemobj_direct(oids[0]), NULL);

	for (int i = 0; i < npools; ++i) {
		oids[i] = (PMEMoid) {pops[i]->uuid_lo, 0};
		ASSERTeq(pmemobj_direct(oids[i]), NULL);

		uint64_t off = pops[i]->heap_offset;
		oids[i] = (PMEMoid) {pops[i]->uuid_lo, off};
		ASSERTeq((char *)pmemobj_direct(oids[i]) - off,
			(char *)pops[i]);

		r = pmemobj_alloc(pops[i], &tmpoids[i], 100, 1, NULL, NULL);
		ASSERTeq(r, 0);

	r = pmemobj_alloc(pops[0], &thread_oid, 100, 2, NULL, NULL);
	ASSERTeq(r, 0);
	ASSERTne(pmemobj_direct(thread_oid), NULL);


	pthread_t t;
	pthread_create(&t, NULL, test_worker, NULL);

	/* wait for the thread to perform the first direct */
	while (flag)

	for (int i = 0; i < npools; ++i) {
		ASSERTne(pmemobj_direct(tmpoids[i]), NULL);


		ASSERTeq(pmemobj_direct(tmpoids[i]), NULL);
		ASSERTeq(pmemobj_direct(oids[i]), NULL);

	pthread_join(t, NULL);

Exemple #13
 * util_replica_open -- (internal) open a memory pool replica
static int
util_replica_open(struct pool_set *set, unsigned repidx, int flags,
	size_t hdrsize)
	LOG(3, "set %p repidx %u flags %d hdrsize %zu\n",
		set, repidx, flags, hdrsize);

	struct pool_replica *rep = set->replica[repidx];

	rep->repsize -= (rep->nparts - 1) * hdrsize;

	/* determine a hint address for mmap() */
	void *addr = util_map_hint(rep->repsize, 0);
	if (addr == NULL) {
		ERR("cannot find a contiguous region of given size");
		return -1;

	/* map the first part and reserve space for remaining parts */
	if (util_map_part(&rep->part[0], addr, rep->repsize, 0, flags) != 0) {
		LOG(2, "pool mapping failed - part #0");
		return -1;

	VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr, rep->part[0].size);
				rep->part[0].addr, rep->part[0].size, 0);

	/* map all headers - don't care about the address */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_map_hdr(&rep->part[p], hdrsize, flags) != 0) {
			LOG(2, "header mapping failed - part #%d", p);
			goto err;

	size_t mapsize = rep->part[0].filesize & ~(Pagesize - 1);
	addr = (char *)rep->part[0].addr + mapsize;

	 * map the remaining parts of the usable pool space
	 * (4K-aligned)
	for (unsigned p = 1; p < rep->nparts; p++) {
		/* map data part */
		if (util_map_part(&rep->part[p], addr, 0, hdrsize,
				flags | MAP_FIXED) != 0) {
			LOG(2, "usable space mapping failed - part #%d", p);
			goto err;

			rep->part[p].addr, rep->part[p].size, hdrsize);

		mapsize += rep->part[p].size;
		addr = (char *)addr + rep->part[p].size;

	rep->is_pmem = pmem_is_pmem(rep->part[0].addr, rep->part[0].size);

	ASSERTeq(mapsize, rep->repsize);

	/* calculate pool size - choose the smallest replica size */
	if (rep->repsize < set->poolsize)
		set->poolsize = rep->repsize;

	LOG(3, "replica addr %p", rep->part[0].addr);

	return 0;
	LOG(4, "error clean up");
	int oerrno = errno;
	for (unsigned p = 0; p < rep->nparts; p++)
	errno = oerrno;
	return -1;
Exemple #14
 * os_badblocks_get -- returns 0 and bad blocks in the 'bbs' array
 *                     (that has to be pre-allocated)
 *                     or -1 in case of an error
os_badblocks_get(const char *file, struct badblocks *bbs)
	LOG(3, "file %s badblocks %p", file, bbs);

	ASSERTne(bbs, NULL);

	VEC(bbsvec, struct bad_block) bbv = VEC_INITIALIZER;
	struct extents *exts = NULL;
	long extents = 0;

	unsigned long long bb_beg;
	unsigned long long bb_end;
	unsigned long long bb_len;
	unsigned long long bb_off;
	unsigned long long ext_beg;
	unsigned long long ext_end;
	unsigned long long not_block_aligned;

	int bb_found = -1; /* -1 means an error */

	memset(bbs, 0, sizeof(*bbs));

	if (os_dimm_files_namespace_badblocks(file, bbs)) {
		LOG(1, "checking the file for bad blocks failed -- '%s'", file);
		goto error_free_all;

	if (bbs->bb_cnt == 0) {
		bb_found = 0;
		goto exit_free_all;

	exts = Zalloc(sizeof(struct extents));
	if (exts == NULL) {
		goto error_free_all;

	extents = os_extents_count(file, exts);
	if (extents < 0) {
		LOG(1, "counting file's extents failed -- '%s'", file);
		goto error_free_all;

	if (extents == 0) {
		/* dax device has no extents */
		bb_found = (int)bbs->bb_cnt;

		for (unsigned b = 0; b < bbs->bb_cnt; b++) {
			LOG(4, "bad block found: offset: %llu, length: %u",

		goto exit_free_all;

	exts->extents = Zalloc(exts->extents_count * sizeof(struct extent));
	if (exts->extents == NULL) {
		goto error_free_all;

	if (os_extents_get(file, exts)) {
		LOG(1, "getting file's extents failed -- '%s'", file);
		goto error_free_all;

	bb_found = 0;

	for (unsigned b = 0; b < bbs->bb_cnt; b++) {

		bb_beg = bbs->bbv[b].offset;
		bb_end = bb_beg + bbs->bbv[b].length - 1;

		for (unsigned e = 0; e < exts->extents_count; e++) {

			ext_beg = exts->extents[e].offset_physical;
			ext_end = ext_beg + exts->extents[e].length - 1;

			/* check if the bad block overlaps with file's extent */
			if (bb_beg > ext_end || ext_beg > bb_end)


			bb_beg = (bb_beg > ext_beg) ? bb_beg : ext_beg;
			bb_end = (bb_end < ext_end) ? bb_end : ext_end;
			bb_len = bb_end - bb_beg + 1;
			bb_off = bb_beg + exts->extents[e].offset_logical
					- exts->extents[e].offset_physical;

				"bad block found: physical offset: %llu, length: %llu",
				bb_beg, bb_len);

			/* check if offset is block-aligned */
			not_block_aligned = bb_off & (exts->blksize - 1);
			if (not_block_aligned) {
				bb_off -= not_block_aligned;
				bb_len += not_block_aligned;

			/* check if length is block-aligned */
			bb_len = ALIGN_UP(bb_len, exts->blksize);

				"bad block found: logical offset: %llu, length: %llu",
				bb_off, bb_len);

			 * Form a new bad block structure with offset and length
			 * expressed in bytes and offset relative
			 * to the beginning of the file.
			struct bad_block bb;
			bb.offset = bb_off;
			bb.length = (unsigned)(bb_len);
			/* unknown healthy replica */
			bb.nhealthy = NO_HEALTHY_REPLICA;

			/* add the new bad block to the vector */
			if (VEC_PUSH_BACK(&bbv, bb)) {
				bb_found = -1;
				goto error_free_all;

	bbs->bbv = NULL;
	bbs->bb_cnt = 0;

	if (exts) {

	if (extents > 0 && bb_found > 0) {
		bbs->bbv = VEC_ARR(&bbv);
		bbs->bb_cnt = (unsigned)VEC_SIZE(&bbv);

		LOG(10, "number of bad blocks detected: %u", bbs->bb_cnt);

		/* sanity check */
		ASSERTeq((unsigned)bb_found, bbs->bb_cnt);

	return (bb_found >= 0) ? 0 : -1;
Exemple #15
 * pool_params_parse -- parse pool type, file size and block size
/*
 * pool_params_parse -- fill *params with what can be learned about the pool
 * found at ppc->path
 *
 * ppc    - check context; ppc->path and ppc->args.pool_type are read
 * params - output; type, is_poolset, mode and size are written, plus
 *          blk.bsize or obj.layout depending on the detected type
 * check  - non-zero: map the poolset with pool_set_map(); zero: build it
 *          with util_poolset_create_set() and open it with
 *          util_pool_open_nocheck()
 *
 * Returns 0 on success, -1 on failure and 1 when the declared pool type
 * does not match the detected one.
 *
 * NOTE(review): this listing has lost its braces, comment delimiters, the
 * out_unmap/out_close labels and several statements (for example the close
 * that the "Need to close the poolset" text refers to) -- confirm details
 * against the complete source.
 */
static int
pool_params_parse(const PMEMpoolcheck *ppc, struct pool_params *params,
	int check)
	LOG(3, NULL);
	/* BTT handling is selected only by the type the caller declared */
	int is_btt = ppc->args.pool_type == PMEMPOOL_POOL_TYPE_BTT;

	params->type = POOL_TYPE_UNKNOWN;
	params->is_poolset = util_is_poolset_file(ppc->path) == 1;

	int fd = util_file_open(ppc->path, NULL, 0, O_RDONLY);
	if (fd < 0)
		return -1;

	int ret = 0;

	util_stat_t stat_buf;
	ret = util_fstat(fd, &stat_buf);
	if (ret)
		goto out_close;

	ASSERT(stat_buf.st_size >= 0);

	params->mode = stat_buf.st_mode;

	/* three cases follow: poolset file, declared BTT device, single file */
	struct pool_set *set;
	void *addr;
	if (params->is_poolset) {
		 * Need to close the poolset because it will be opened with
		 * flock in the following instructions.
		fd = -1;

		if (check) {
			if (pool_set_map(ppc->path, &set, 1))
				return -1;
		} else {
			ret = util_poolset_create_set(&set, ppc->path, 0, 0);
			if (ret < 0) {
				LOG(2, "cannot open pool set -- '%s'",
				return -1;
			if (set->remote) {
				ERR("poolsets with remote replicas are not "
				return -1;
			if (util_pool_open_nocheck(set, 1))
				return -1;

		/* the header is later read from the first part of replica 0 */
		params->size = set->poolsize;
		addr = set->replica[0]->part[0].addr;
	} else if (is_btt) {
		params->size = (size_t)stat_buf.st_size;
#ifndef _WIN32
		/*
		 * NOTE(review): the mode is tested with a bitwise AND against
		 * S_IFBLK; POSIX provides S_ISBLK() for this test -- confirm
		 * that no other file type sharing those bits can reach here.
		 * NOTE(review): as visible here, ret is still 0 when the ioctl
		 * fails, so the failure would be reported as success -- confirm.
		 */
		if (params->mode & S_IFBLK)
			if (ioctl(fd, BLKGETSIZE64, &params->size)) {
				goto out_close;
		addr = NULL;
	} else {
		/* plain file: map it privately and read-only */
		params->size = (size_t)stat_buf.st_size;
		addr = mmap(NULL, (uint64_t)stat_buf.st_size, PROT_READ,
			MAP_PRIVATE, fd, 0);
		if (addr == MAP_FAILED) {
			ret = -1;
			goto out_close;

	/* stop processing for BTT device */
	if (is_btt) {
		params->type = POOL_TYPE_BTT;
		params->is_part = false;
		goto out_close;

	/* work on a private copy of the header, not on the mapping itself */
	struct pool_hdr hdr;
	memcpy(&hdr, addr, sizeof(hdr));
	pool_params_from_header(params, &hdr);

	if (ppc->args.pool_type != PMEMPOOL_POOL_TYPE_DETECT) {
		enum pool_type declared_type =
		/* any detected type bit outside the declared type is a mismatch */
		if ((params->type & ~declared_type) != 0) {
			ERR("declared pool type does not match");
			ret = 1;
			goto out_unmap;

	/* pull the type-specific parameters out of a copy of the pool header */
	if (params->type == POOL_TYPE_BLK) {
		struct pmemblk pbp;
		memcpy(&pbp, addr, sizeof(pbp));
		params->blk.bsize = le32toh(pbp.bsize);
	} else if (params->type == POOL_TYPE_OBJ) {
		struct pmemobjpool pop;
		memcpy(&pop, addr, sizeof(pop));
		memcpy(params->obj.layout, pop.layout,

	/* release what was acquired above: the poolset or the private mapping */
	if (params->is_poolset) {
		ASSERTeq(fd, -1);
		ASSERTne(addr, NULL);
		util_poolset_close(set, 0);
	} else if (!is_btt) {
		ASSERTne(fd, -1);
		ASSERTne(addr, NULL);
		munmap(addr, params->size);
	if (fd != -1)
	return ret;
Exemple #16
 * backup_poolset_requirements -- (internal) check backup requirements
/*
 * backup_poolset_requirements -- (internal) check backup requirements
 *
 * Verifies that the poolset described by ppc->backup_path can receive a
 * backup of the source poolset:
 *   - source and destination each have exactly one replica,
 *   - both replicas have the same number of parts,
 *   - every destination part has the same declared size as its source part,
 *   - a destination part file that already exists has the source part's size.
 *
 * ppc - check context; backup_path and pool->set_file->poolset are read,
 *       result is set to CHECK_RESULT_ERROR on the error path
 * loc - location; loc->set receives the parsed backup poolset and loc->step
 *       may be set to CHECK_STEP_COMPLETE
 *
 * Returns 0, the value of check_questions_sequence_validate(), or the value
 * of CHECK_ERR() on failure.
 *
 * NOTE(review): this listing has lost its braces, the err/err_poolset
 * labels and the heads of several multi-line macro calls (only their string
 * arguments remain) -- confirm details against the complete source.
 */
static int
backup_poolset_requirements(PMEMpoolcheck *ppc, location *loc)
	LOG(3, "backup_path %s", ppc->backup_path);

	if (ppc->pool->set_file->poolset->nreplicas > 1) {
			"backup of a poolset with multiple replicas is not supported");
		goto err;

	if (pool_set_parse(&loc->set, ppc->backup_path)) {
		CHECK_INFO(ppc, "invalid poolset backup file: %s",
		goto err;

	/* from here on loc->set exists, hence the separate err_poolset target */
	if (loc->set->nreplicas > 1) {
			"backup to a poolset with multiple replicas is not supported");
		goto err_poolset;

	ASSERTeq(loc->set->nreplicas, 1);
	struct pool_replica *srep = ppc->pool->set_file->poolset->replica[0];
	struct pool_replica *drep = loc->set->replica[0];
	if (srep->nparts != drep->nparts) {
			"number of part files in the backup poolset must match number of part files in the source poolset");
		goto err_poolset;

	/* compare the replicas part by part */
	int overwrite_required = 0;
	for (unsigned p = 0; p < srep->nparts; p++) {
		if (srep->part[p].filesize != drep->part[p].filesize) {
				"size of the part %u of the backup poolset does not match source poolset",
			goto err_poolset;

		/* a missing destination file (ENOENT) is not an error */
		if (os_access(drep->part[p].path, F_OK)) {
			if (errno == ENOENT) {
				errno = 0;
			} else {
					"unable to access the part of the destination poolset: %s",
				goto err_poolset;

		/* NOTE(review): overwrite_required is an int assigned `true` */
		overwrite_required = true;

		if ((size_t)util_file_get_size(drep->part[p].path) !=
				srep->part[p].filesize) {
				"destination of the backup part does not match size of the source part file: %s",
			goto err_poolset;

		/*
		 * NOTE(review): the condition guarding this early successful
		 * return is not visible in this listing -- confirm.
		 */
		loc->step = CHECK_STEP_COMPLETE;
		return 0;

	/* existing destination files need the user's consent to be overwritten */
	if (overwrite_required) {
			"part files of the destination poolset of the backup already exist.|"
			"Do you want to overwrite them?");

	return check_questions_sequence_validate(ppc);

	/* error path: record the failure in the check context */
	ppc->result = CHECK_RESULT_ERROR;
	return CHECK_ERR(ppc, "unable to backup poolset");
Exemple #17
 * lane_boot -- initializes all lanes
/*
 * lane_boot -- initializes all lanes
 *
 * Allocates pop->lanes and pop->lane_locks (pop->nlanes entries each) and
 * calls lane_init() for every lane, passing a mutex attribute object that
 * was set to PTHREAD_MUTEX_RECURSIVE.
 *
 * pop - pool handle; pop->lanes must be NULL on entry
 *
 * Returns 0 on success, otherwise the error code stored in err.
 *
 * NOTE(review): this listing has lost the return type, its braces, the
 * error_* labels and some statements (presumably the calls releasing the
 * two arrays on the unwind path) -- confirm against the complete source.
 */
lane_boot(PMEMobjpool *pop)
	ASSERTeq(pop->lanes, NULL);

	int err;

	pthread_mutexattr_t lock_attr;
	if ((err = pthread_mutexattr_init(&lock_attr)) != 0) {
		goto error_lanes_malloc;

	if ((err = pthread_mutexattr_settype(
			&lock_attr, PTHREAD_MUTEX_RECURSIVE)) != 0) {
		goto error_lanes_malloc;

	pop->lanes = Malloc(sizeof (struct lane) * pop->nlanes);
	if (pop->lanes == NULL) {
		err = ENOMEM;
		ERR("!Malloc of volatile lanes");
		goto error_lanes_malloc;

	pop->lane_locks = Malloc(sizeof (pthread_mutex_t) * pop->nlanes);
	if (pop->lane_locks == NULL) {
		err = ENOMEM;
		ERR("!Malloc for lane locks");
		goto error_lock_malloc;

	/* add lanes to pmemcheck ignored list */
	VALGRIND_ADD_TO_GLOBAL_TX_IGNORE((char *)pop + pop->lanes_offset,
		(sizeof (struct lane_layout) * pop->nlanes));

	/* i is declared outside the loop because the unwind code below reads it */
	uint64_t i;
	for (i = 0; i < pop->nlanes; ++i) {
		struct lane_layout *layout = lane_get_layout(pop, i);

		if ((err = lane_init(pop, &pop->lanes[i], layout,
				&pop->lane_locks[i], &lock_attr)) != 0) {
			goto error_lane_init;

	/*
	 * NOTE(review): as visible here, err is not assigned when destroying
	 * the attribute object fails, so the error path would return the 0
	 * left by the last lane_init() -- confirm.
	 */
	if (pthread_mutexattr_destroy(&lock_attr) != 0) {
		goto error_mutexattr_destroy;

	return 0;

	/*
	 * Unwind: destroy lanes [0, i) in reverse order.  The loop counts
	 * down to 1 and indexes with i - 1 because i is unsigned.
	 */
	for (; i >= 1; --i)
		lane_destroy(pop, &pop->lanes[i - 1]);

	pop->lane_locks = NULL;

	pop->lanes = NULL;

	if (pthread_mutexattr_destroy(&lock_attr) != 0)

	return err;
Exemple #18
 * pmemalloc_check -- check the consistency of a pmem pool
 * Inputs:
 *	path -- path to the file which contains the memory pool
 * The current state of the pmem pool is printed.  This routine does
 * not make any changes to the pmem pool (maps it read-only, in fact).
 * It is not necessary to call pmemalloc_init() before calling this.
/*
 * pmemalloc_check -- check the consistency of a pmem pool and print a summary
 *
 * path - file containing the pool; it is opened O_RDONLY and mapped
 *        PROT_READ, so the pool is not modified
 *
 * The routine validates the file size, the header signature and the section
 * size arithmetic, then walks the clump list (terminated by a clump whose
 * size field is zero), gathering per-state statistics in stats[].
 * Failures are reported through FATAL()/FATALSYS().
 *
 * NOTE(review): this listing has lost its braces, comment delimiters, the
 * case labels of the switch and several statements, and it is cut off inside
 * the final printf() -- confirm details against the complete source.
 */
void pmemalloc_check(const char *path)
	void *pmp;
	int fd;
	struct stat stbuf;
	struct clump *clp;
	struct clump *lastclp;
	struct pool_header *hdrp;
	size_t clumptotal;
	 * stats we keep for each type of memory:
	 * 	stats[PMEM_STATE_FREE] for free clumps
	 * 	stats[PMEM_STATE_RESERVED] for reserved clumps
	 * 	stats[PMEM_STATE_ACTIVATING] for activating clumps
	 * 	stats[PMEM_STATE_ACTIVE] for active clumps
	 * 	stats[PMEM_STATE_FREEING] for freeing clumps
	 * 	stats[PMEM_STATE_UNUSED] for overall totals
	struct {
		size_t largest;
		size_t smallest;
		size_t bytes;
		unsigned count;
	} stats[PMEM_STATE_UNUSED + 1] = { 0 };
	const char *names[] = {
	int i;

	DEBUG("path=%s", path);

	if ((fd = open(path, O_RDONLY)) < 0)
		FATALSYS("%s", path);

	if (fstat(fd, &stbuf) < 0)

	DEBUG("file size 0x%lx", stbuf.st_size);

	if (stbuf.st_size < PMEM_MIN_POOL_SIZE)
		FATAL("size %lu too small (must be at least %lu)",
					stbuf.st_size, PMEM_MIN_POOL_SIZE);

	/* read-only shared mapping of the whole file */
	if ((pmp = mmap(NULL, stbuf.st_size, PROT_READ, MAP_SHARED,
					fd, 0)) == MAP_FAILED)
	DEBUG("pmp %lx", pmp);


	/* PMEM() is applied to pool-relative offsets to obtain usable pointers */
	hdrp = PMEM(pmp, (struct pool_header *)PMEM_HDR_OFFSET);
	DEBUG("   hdrp 0x%lx (off 0x%lx)", hdrp, OFF(pmp, hdrp));

	if (strcmp(hdrp->signature, PMEM_SIGNATURE))
		FATAL("failed signature check");
	DEBUG("signature check passed");

	clp = PMEM(pmp, (struct clump *)PMEM_CLUMP_OFFSET);
	 * location of last clump is calculated by rounding the file
	 * size down to a multiple of 64, and then subtracting off
	 * another 64 to hold the struct clump.  the last clump is
	 * indicated by a size of zero.
	lastclp = PMEM(pmp, (struct clump *)
		(stbuf.st_size & ~(PMEM_CHUNK_SIZE - 1)) - PMEM_CHUNK_SIZE);
	DEBUG("    clp 0x%lx (off 0x%lx)", clp, OFF(pmp, clp));
	DEBUG("lastclp 0x%lx (off 0x%lx)", lastclp, OFF(pmp, lastclp));

	/* bytes between the first clump and the terminating clump */
	clumptotal = (uintptr_t)lastclp - (uintptr_t)clp;

	DEBUG("expected clumptotal: %lu", clumptotal);

	 * check that:
	 *   the overhead size (stuff up to CLUMP_OFFSET)
	 * + clumptotal
	 * + last clump marker (CHUNK_SIZE)
	 * + any bytes we rounded off the end
	 * = file size
	if (PMEM_CLUMP_OFFSET + clumptotal + 
		(stbuf.st_size & (PMEM_CHUNK_SIZE - 1)) + PMEM_CHUNK_SIZE
		== stbuf.st_size) {
		DEBUG("section sizes correctly add up to file size");
	} else {
		FATAL("CLUMP_OFFSET %d + clumptotal %lu + rounded %d + "
				"CHUNK_SIZE %d = %lu, (not st_size %lu)",
				PMEM_CLUMP_OFFSET, clumptotal,
				(stbuf.st_size & (PMEM_CHUNK_SIZE - 1)),
				PMEM_CLUMP_OFFSET + clumptotal +
				(stbuf.st_size & (PMEM_CHUNK_SIZE - 1)) +

	if (clp->size == 0)
		FATAL("no clumps found");

	/* walk the clump list until the zero-size terminator */
	while (clp->size) {
		/* the size field carries the state in the PMEM_STATE_MASK bits */
		size_t sz = clp->size & ~PMEM_STATE_MASK;
		int state = clp->size & PMEM_STATE_MASK;

		DEBUG("[%u]clump size 0x%lx state %d",
				OFF(pmp, clp), sz, state);
		DEBUG("on: 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx",
			clp->on[0].off, clp->on[0].ptr_,
			clp->on[1].off, clp->on[1].ptr_,
			clp->on[2].off, clp->on[2].ptr_);

		/* overall totals are kept in the PMEM_STATE_UNUSED slot */
		if (sz > stats[PMEM_STATE_UNUSED].largest)
			stats[PMEM_STATE_UNUSED].largest = sz;
		if (stats[PMEM_STATE_UNUSED].smallest == 0 ||
		    sz < stats[PMEM_STATE_UNUSED].smallest)
			stats[PMEM_STATE_UNUSED].smallest = sz;
		stats[PMEM_STATE_UNUSED].bytes += sz;

		/*
		 * NOTE(review): the case labels are missing from this listing;
		 * judging by the messages there is one arm per state plus a
		 * FATAL() arm for unknown states -- confirm.
		 */
		switch (state) {
			DEBUG("clump state: free");
			ASSERTeq(clp->on[0].off, 0);
			ASSERTeq(clp->on[1].off, 0);
			ASSERTeq(clp->on[2].off, 0);

			DEBUG("clump state: reserved");

			DEBUG("clump state: activating");

			DEBUG("clump state: active");
			ASSERTeq(clp->on[0].off, 0);
			ASSERTeq(clp->on[1].off, 0);
			ASSERTeq(clp->on[2].off, 0);

			DEBUG("clump state: freeing");

			FATAL("unknown clump state: %d", state);

		/* per-state statistics */
		if (sz > stats[state].largest)
			stats[state].largest = sz;
		if (stats[state].smallest == 0 ||
		    sz < stats[state].smallest)
			stats[state].smallest = sz;
		stats[state].bytes += sz;

		/* clumps are contiguous: the next one starts sz bytes further */
		clp = (struct clump *)((uintptr_t)clp + sz);
		DEBUG("next clp 0x%lx, offset 0x%lx", clp, OFF(pmp, clp));

	/* the walk must end exactly on the terminating clump */
	if (clp == lastclp)
		DEBUG("all clump space accounted for");
		FATAL("clump list stopped at %lx instead of %lx", clp, lastclp);

	if (munmap(pmp, stbuf.st_size) < 0)

	 * print the report
	/*
	 * NOTE(review): clumptotal is a size_t but is printed with %d below;
	 * the conversion specifier does not match the argument type.
	 */
	printf("Summary of pmem pool:\n");
	printf("File size: %lu, %d allocatable bytes in pool\n\n",
			stbuf.st_size, clumptotal);
	printf("     State      Bytes     Clumps    Largest   Smallest\n");
	for (i = 0; i < PMEM_STATE_UNUSED + 1; i++) {
		printf("%10s %10d %10d %10d %10d\n",
Exemple #19
 * write_layout -- (internal) write out the initial btt metadata layout
 * Called with write == 1 only once in the life time of a btt namespace, when
 * the first write happens.  The caller of this routine is responsible for
 * locking out multiple threads.  This routine doesn't read anything -- by the
 * time it is called, it is known there's no layout in the namespace and a new
 * layout should be written.
 * Calling with write == 0 tells this routine to do the calculations for
 * bttp->narena and bttp->nlba, but don't write out any metadata.
 * If successful, sets bttp->layout to 1 and returns 0.  Otherwise -1
 * is returned and errno is set, and bttp->layout remains 0 so that
 * later attempts to write will try again to create the layout.
/*
 * write_layout -- (internal) compute and optionally write the initial btt
 * metadata layout
 *
 * bttp  - btt handle; rawsize, nfree, lbasize and parent_uuid are read,
 *         narena and nlba are written
 * lane  - lane number handed to every namespace callback
 * write - zero: only compute bttp->narena and bttp->nlba;
 *         non-zero: also write map, flog and info blocks for each arena and
 *         finish by calling read_arenas()
 *
 * Returns 0 on success (or the value of read_arenas() when writing) and -1
 * when a namespace callback fails.
 *
 * NOTE(review): this listing has lost its braces, comment delimiters and a
 * number of statements (for example whatever increments bttp->narena and
 * arena_num, and the body of `if (!write)`) -- confirm details against the
 * complete source.
 */
static int
write_layout(struct btt *bttp, int lane, int write)
	LOG(3, "bttp %p lane %d write %d", bttp, lane, write);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);

	 * The number of arenas is the number of full arena of
	 * size BTT_MAX_ARENA that fit into rawsize and then, if
	 * the remainder is at least BTT_MIN_SIZE in size, then
	 * that adds one more arena.
	bttp->narena = bttp->rawsize / BTT_MAX_ARENA;
	if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
	LOG(4, "narena %u", bttp->narena);

	/* flog size: two btt_flog structs per free block, BTT_ALIGNMENT-rounded */
	int flog_size = bttp->nfree * 2 * sizeof (struct btt_flog);
	flog_size = roundup(flog_size, BTT_ALIGNMENT);

	/* internal LBA size: at least BTT_MIN_LBA, rounded up for alignment */
	uint32_t internal_lbasize = bttp->lbasize;
	if (internal_lbasize < BTT_MIN_LBA)
		internal_lbasize = BTT_MIN_LBA;
	internal_lbasize =
		roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT);
	LOG(4, "adjusted internal_lbasize %u", internal_lbasize);

	uint64_t total_nlba = 0;
	uint64_t rawsize = bttp->rawsize;
	int arena_num = 0;
	off_t arena_off = 0;

	 * for each arena...
	while (rawsize >= BTT_MIN_SIZE) {
		LOG(4, "layout arena %u", arena_num);

		/* this arena takes at most BTT_MAX_ARENA bytes of what is left */
		uint64_t arena_rawsize = rawsize;
		if (arena_rawsize > BTT_MAX_ARENA) {
			arena_rawsize = BTT_MAX_ARENA;
		rawsize -= arena_rawsize;

		/* data space: the arena minus two info blocks and the flog */
		uint64_t arena_datasize = arena_rawsize;
		arena_datasize -= 2 * sizeof (struct btt_info);
		arena_datasize -= flog_size;

		/* allow for map alignment padding */
		uint64_t internal_nlba = (arena_datasize - BTT_ALIGNMENT) /
			(internal_lbasize + BTT_MAP_ENTRY_SIZE);
		/* nfree internal blocks are held back from the external count */
		uint64_t external_nlba = internal_nlba - bttp->nfree;

		LOG(4, "internal_nlba %zu external_nlba %zu",
				internal_nlba, external_nlba);

		total_nlba += external_nlba;

		 * The rest of the loop body calculates metadata structures
		 * and lays it out for this arena.  So only continue if
		 * the write flag is set.
		if (!write)

		uint64_t mapsize = roundup(external_nlba * BTT_MAP_ENTRY_SIZE,
		arena_datasize -= mapsize;

		ASSERT(arena_datasize / internal_lbasize >= internal_nlba);

		 * Calculate offsets for the BTT info block.  These are
		 * all relative to the beginning of the arena.
		uint64_t nextoff;
		if (rawsize)
			nextoff = arena_rawsize;
			nextoff = 0;
		/* arena order: info, data, map, flog, trailing info */
		uint64_t infooff = arena_rawsize - sizeof (struct btt_info);
		uint64_t flogoff = infooff - flog_size;
		uint64_t mapoff = flogoff - mapsize;
		uint64_t dataoff = sizeof (struct btt_info);

		LOG(4, "nextoff 0x%016lx", nextoff);
		LOG(4, "dataoff 0x%016lx", dataoff);
		LOG(4, "mapoff  0x%016lx", mapoff);
		LOG(4, "flogoff 0x%016lx", flogoff);
		LOG(4, "infooff 0x%016lx", infooff);

		ASSERTeq(arena_datasize, mapoff - dataoff);

		/* write out the initial map, identity style */
		off_t map_entry_off = arena_off + mapoff;
		uint32_t *mapp = NULL;
		int mlen = 0;
		int next_index = 0;
		int remaining = 0;
		/*
		 * NOTE(review): i is an int compared against the uint64_t
		 * external_nlba -- confirm external_nlba always fits in an int.
		 */
		for (int i = 0; i < external_nlba; i++) {
			/* the current mapping is used up: sync it, map the next */
			if (remaining == 0) {
				/* flush previous mapped area */
				if (mapp != NULL) {
					 * Protect the memory again
					 * (debug version only).
					 * If (mapp != NULL) it had to be
					 * unprotected earlier.
					RANGE_RO(mapp, mlen);

						lane, mapp, mlen);
				/* request a mapping of remaining map area */
				mlen = (*bttp->ns_cbp->nsmap)(bttp->ns,
					lane, (void **)&mapp,
					(external_nlba - i) * sizeof (uint32_t),

				if (mlen < 0)
					return -1;

				/* unprotect the memory (debug version only) */
				RANGE_RW(mapp, mlen);

				remaining = mlen;
				next_index = 0;
			/* identity entry: LBA i maps to block i, flagged as zero */
			mapp[next_index++] = htole32(i | BTT_MAP_ENTRY_ZERO);
			remaining -= sizeof (uint32_t);

		/* protect the memory again (debug version only) */
		RANGE_RO(mapp, mlen);

		/* flush previous mapped area */
		if (mapp != NULL)
			(*bttp->ns_cbp->nssync)(bttp->ns, lane, mapp, mlen);

		/* write out the initial flog */
		off_t flog_entry_off = arena_off + flogoff;
		/*
		 * NOTE(review): the uint64_t external_nlba is narrowed to
		 * uint32_t here, and no statement advancing next_free_lba is
		 * visible in this listing -- confirm.
		 */
		uint32_t next_free_lba = external_nlba;
		for (int i = 0; i < bttp->nfree; i++) {
			struct btt_flog flog;
			flog.lba = 0;
			flog.old_map = flog.new_map =
				htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
			flog.seq = htole32(1);

			 * Write both btt_flog structs in the pair, writing
			 * the second one as all zeros.
			LOG(6, "flog[%d] entry off %zu initial %u + zero = %u",
					i, flog_entry_off, next_free_lba,
					next_free_lba | BTT_MAP_ENTRY_ZERO);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
					sizeof (flog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof (flog);

			LOG(6, "flog[%d] entry off %zu zeros",
					i, flog_entry_off);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
					sizeof (Zflog), flog_entry_off) < 0)
				return -1;
			/* presumably Zflog has the same size as flog -- confirm */
			flog_entry_off += sizeof (flog);


		 * Construct the BTT info block and write it out
		 * at both the beginning and end of the arena.
		/* multi-byte fields are stored little-endian (htole16/32/64) */
		struct btt_info info;
		memset(&info, '\0', sizeof (info));
		memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
		memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
		info.major = htole16(BTTINFO_MAJOR_VERSION);
		info.minor = htole16(BTTINFO_MINOR_VERSION);
		info.external_lbasize = htole32(bttp->lbasize);
		info.external_nlba = htole32(external_nlba);
		info.internal_lbasize = htole32(internal_lbasize);
		info.internal_nlba = htole32(internal_nlba);
		info.nfree = htole32(bttp->nfree);
		info.infosize = htole32(sizeof (info));
		info.nextoff = htole64(nextoff);
		info.dataoff = htole64(dataoff);
		info.mapoff = htole64(mapoff);
		info.flogoff = htole64(flogoff);
		info.infooff = htole64(infooff);

		util_checksum(&info, sizeof (info), &info.checksum, 1);

		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off) < 0)
			return -1;
		/*
		 * NOTE(review): infooff is computed above as the offset of the
		 * trailing info block, yet this second write uses nextoff; when
		 * nextoff is 0 both writes target arena_off -- confirm.
		 */
		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
					sizeof (info), arena_off + nextoff) < 0)
			return -1;

		arena_off += nextoff;

	ASSERTeq(bttp->narena, arena_num);

	bttp->nlba = total_nlba;

	if (write) {
		 * The layout is written now, so load up the arenas.
		return read_arenas(bttp, lane, bttp->narena);

	return 0;
/*
 * main -- "vmem_aligned_alloc" test driver
 *
 * For every power-of-two alignment from 2 bytes to 4 MB a pool is created
 * (inside mem_pool when no directory argument is given, otherwise in that
 * directory) and up to MAX_ALLOCS ints are allocated with
 * vmem_aligned_alloc().  Each returned pointer is written, read back and
 * checked for alignment; the custom allocator counters are checked after the
 * allocation loop.
 *
 * Usage: <prog> [directory]
 *
 * NOTE(review): this listing has lost the return type, its braces and
 * several statements (the bodies of the `vmp == NULL` and `ptr == NULL`
 * tests, and presumably the pool teardown and test epilogue) -- confirm
 * against the complete source.
 */
main(int argc, char *argv[])
	const int test_value = 123456;
	char *dir = NULL;
	VMEM *vmp;
	size_t alignment;
	unsigned i;
	int *ptr;

	START(argc, argv, "vmem_aligned_alloc");

	if (argc == 2) {
		dir = argv[1];
	} else if (argc > 2) {
		FATAL("usage: %s [directory]", argv[0]);

	/* use custom alloc functions to check for memory leaks */
	vmem_set_funcs(malloc_custom, free_custom,
		realloc_custom, strdup_custom, NULL);

	/* test with address alignment from 2B to 4MB */
	for (alignment = 2; alignment <= 4 * 1024 * 1024; alignment *= 2) {

		/* the counter is reset so it can be checked per alignment below */
		custom_alloc_calls = 0;
		if (dir == NULL) {
			vmp = vmem_pool_create_in_region(mem_pool,
			if (vmp == NULL)
		} else {
			vmp = vmem_pool_create(dir, VMEM_MIN_POOL);
			if (vmp == NULL)

		for (i = 0; i < MAX_ALLOCS; ++i) {
			ptr = vmem_aligned_alloc(vmp, alignment, sizeof (int));

			/* at least one allocation must succeed */
			ASSERT(i != 0 || ptr != NULL);
			if (ptr == NULL)

			/* ptr should be usable */
			*ptr = test_value;
			ASSERTeq(*ptr, test_value);

			/* check for correct address alignment */
			/* alignment is a power of two, so (alignment - 1) is a mask */
			ASSERTeq((uintptr_t)(ptr) & (alignment - 1), 0);

			/* check that pointer came from mem_pool */
			if (dir == NULL) {
				ASSERTrange(ptr, mem_pool, VMEM_MIN_POOL);


		/* check memory leaks */
		ASSERTne(custom_alloc_calls, 0);
		ASSERTeq(custom_allocs, 0);

Exemple #21
/*
 * main -- "vmem_freespace" test driver
 *
 * Creates a pool (inside mem_pool when no directory argument is given,
 * otherwise in that directory), then:
 *   1. allocates 128-byte blocks until vmem_malloc() fails, checking that
 *      vmem_pool_freespace() never grows,
 *   2. frees every block, checking that the free space never shrinks,
 *   3. checks that the final free space exceeds 90% of the initial free
 *      space reduced by 4 MB.
 *
 * Usage: <prog> [directory]
 *
 * NOTE(review): this listing has lost the return type, its braces, comment
 * delimiters and several statements (the mem_pool allocation, the bodies of
 * the `vmp == NULL` tests and presumably the test epilogue) -- confirm
 * against the complete source.
 */
main(int argc, char *argv[])
	char *dir = NULL;
	VMEM *vmp;
	START(argc, argv, "vmem_freespace");

	if (argc == 2) {
		dir = argv[1];
	} else if (argc > 2) {
		FATAL("usage: %s [directory]", argv[0]);

	if (dir == NULL) {
		/* allocate memory for function vmem_pool_create_in_region() */

		vmp = vmem_pool_create_in_region(mem_pool, VMEM_MIN_POOL);
		if (vmp == NULL)
	} else {
		vmp = vmem_pool_create(dir, VMEM_MIN_POOL);
		if (vmp == NULL)

	size_t total_space = vmem_pool_freespace(vmp);
	size_t free_space = total_space;

	/* allocate all memory */
	/*
	 * The blocks form a singly linked list: the first word of each block
	 * stores the address of the previously allocated one.
	 */
	void *prev = NULL;
	void **next;
	while ((next = vmem_malloc(vmp, 128)) != NULL) {
		*next = prev;
		prev = next;
		size_t space = vmem_pool_freespace(vmp);
		/* free space can only decrease */
		ASSERT(space <= free_space);
		free_space = space;

	ASSERTne(prev, NULL);
	/* for small allocations use all memory */
	ASSERTeq(free_space, 0);

	/* walk the list backwards, reading each link before freeing its block */
	while (prev != NULL) {
		void **act = prev;
		prev = *act;
		vmem_free(vmp, act);
		size_t space = vmem_pool_freespace(vmp);
		/* free space can only increase */
		ASSERT(space >= free_space);
		free_space = space;

	free_space = vmem_pool_freespace(vmp);

	 * Depending on the distance of the 'mem_pool' from the
	 * chunk alignment (4MB) a different size of free memory
	 * will be wasted on base_alloc inside jemalloc.
	 * Rest of the internal data should not waste more than 10% of space.
	ASSERT(free_space > ((total_space - 4L * MB) * 9) / 10);



Exemple #22
 * heap_chunk_init -- (internal) writes chunk header
static void
heap_chunk_init(struct palloc_heap *heap, struct chunk_header *hdr,
	uint16_t type, uint32_t size_idx)
	struct chunk_header nhdr = {
		.type = type,
		.flags = 0,
		.size_idx = size_idx

	*hdr = nhdr; /* write the entire header (8 bytes) at once */
	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));

	heap_chunk_write_footer(hdr, size_idx);

 * heap_zone_init -- (internal) writes zone's first chunk and header
/*
 * heap_zone_init -- (internal) writes zone's first chunk and header
 *
 * heap    - heap that owns the zone; layout, rt->max_zone and p_ops are read
 * zone_id - index of the zone to initialize
 *
 * The first chunk header is initialized as CHUNK_TYPE_FREE with the zone's
 * size index, after which the zone header is written and persisted.
 *
 * NOTE(review): this listing has lost its braces, the remaining arguments
 * of get_zone_size_idx() and the end of the zone_header initializer --
 * confirm against the complete source.
 */
static void
heap_zone_init(struct palloc_heap *heap, uint32_t zone_id)
	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
	uint32_t size_idx = get_zone_size_idx(zone_id, heap->rt->max_zone,

	/* the first chunk header is written as free, with the zone's size index */
	heap_chunk_init(heap, &z->chunk_headers[0], CHUNK_TYPE_FREE, size_idx);

	struct zone_header nhdr = {
		.size_idx = size_idx,
	z->header = nhdr;  /* write the entire header (8 bytes) at once */
	pmemops_persist(&heap->p_ops, &z->header, sizeof(z->header));

 * heap_run_init -- (internal) creates a run based on a chunk
/*
 * heap_run_init -- (internal) creates a run based on a chunk
 *
 * heap - heap that owns the chunk; layout, run_id and p_ops are read
 * b    - bucket whose allocation class (which must be CLASS_RUN) supplies
 *        the unit size, bitmap parameters and header type
 * m    - memory block naming the zone, chunk and size index of the run
 *
 * Order of operations as visible here: the run's block size and bitmap are
 * written and persisted, the headers of chunks 1..size_idx-1 are rewritten
 * as CHUNK_TYPE_RUN_DATA, and the first chunk header is switched from
 * CHUNK_TYPE_FREE to CHUNK_TYPE_RUN last.
 *
 * NOTE(review): this listing has lost its braces and the head or tail of
 * several calls -- confirm details against the complete source.
 */
static void
heap_run_init(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m)
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	struct zone *z = ZID_TO_ZONE(heap->layout, m->zone_id);

	/* the chunk at m->chunk_id is reinterpreted as the run structure */
	struct chunk_run *run = (struct chunk_run *)&z->chunks[m->chunk_id];
	ASSERTne(m->size_idx, 0);
	size_t runsize = SIZEOF_RUN(run, m->size_idx);


	/* add/remove chunk_run and chunk_header to valgrind transaction */
	VALGRIND_ADD_TO_TX(run, runsize);
	run->block_size = c->unit_size;
	pmemops_persist(&heap->p_ops, &run->block_size,

	/* set all the bits */
	memset(run->bitmap, 0xFF, sizeof(run->bitmap));

	unsigned nval = c->run.bitmap_nval;
	ASSERT(nval > 0);
	/* clear only the bits available for allocations from this bucket */
	memset(run->bitmap, 0, sizeof(uint64_t) * (nval - 1));
	/* the last used bitmap word comes precomputed from the class */
	run->bitmap[nval - 1] = c->run.bitmap_lastval;

	run->incarnation_claim = heap->run_id;


	pmemops_persist(&heap->p_ops, run->bitmap, sizeof(run->bitmap));

	struct chunk_header run_data_hdr;
	run_data_hdr.type = CHUNK_TYPE_RUN_DATA;
	run_data_hdr.flags = 0;

	/* chunks after the first get RUN_DATA headers; size_idx holds their index */
	struct chunk_header *data_hdr;
	for (unsigned i = 1; i < m->size_idx; ++i) {
		data_hdr = &z->chunk_headers[m->chunk_id + i];
		VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr));
		VALGRIND_ADD_TO_TX(data_hdr, sizeof(*data_hdr));
		run_data_hdr.size_idx = i;
		*data_hdr = run_data_hdr;
		VALGRIND_REMOVE_FROM_TX(data_hdr, sizeof(*data_hdr));
		&z->chunk_headers[m->chunk_id + 1],
		sizeof(struct chunk_header) * (m->size_idx - 1));

	struct chunk_header *hdr = &z->chunk_headers[m->chunk_id];
	ASSERT(hdr->type == CHUNK_TYPE_FREE);

	/* the new header is built in a local and stored with one assignment */
	VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
	struct chunk_header run_hdr;
	run_hdr.size_idx = hdr->size_idx;
	run_hdr.type = CHUNK_TYPE_RUN;
	run_hdr.flags = header_type_to_flag[c->header_type];
	*hdr = run_hdr;
	VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));

	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));

 * heap_run_insert -- (internal) inserts and splits a block of memory into a run
static void
heap_run_insert(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m, uint32_t size_idx, uint16_t block_off)
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	ASSERT(size_idx <= BITS_PER_VALUE);
	ASSERT(block_off + size_idx <= c->run.bitmap_nallocs);

	uint32_t unit_max = c->run.unit_max;
	struct memory_block nm = *m;
	nm.size_idx = unit_max - (block_off % unit_max);
	nm.block_off = block_off;
	if (nm.size_idx > size_idx)
		nm.size_idx = size_idx;

	do {
		bucket_insert_block(b, &nm);
		ASSERT(nm.size_idx <= UINT16_MAX);
		ASSERT(nm.block_off + nm.size_idx <= UINT16_MAX);
		nm.block_off = (uint16_t)(nm.block_off + (uint16_t)nm.size_idx);
		size_idx -= nm.size_idx;
		nm.size_idx = size_idx > unit_max ? unit_max : size_idx;
	} while (size_idx != 0);

 * heap_process_run_metadata -- (internal) parses the run bitmap
/*
 * heap_process_run_metadata -- (internal) parses the run bitmap
 *
 * heap - heap that owns the run; heap->layout is read
 * b    - bucket receiving the blocks; its class must be CLASS_RUN
 * m    - memory block naming the zone and chunk of the run
 *
 * Scans the first c->run.bitmap_nval 64-bit words of the run bitmap and
 * passes ranges of units to heap_run_insert().  Returns the number of units
 * accumulated in inserted_blocks.
 *
 * NOTE(review): this listing has lost its braces, the bodies of several
 * branches (presumably the statements that grow block_size_idx and the
 * continue/break statements) and one argument of two heap_run_insert()
 * calls -- confirm details against the complete source.
 */
static uint32_t
heap_process_run_metadata(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m)
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	uint16_t block_off = 0;
	uint16_t block_size_idx = 0;
	uint32_t inserted_blocks = 0;

	struct zone *z = ZID_TO_ZONE(heap->layout, m->zone_id);
	struct chunk_run *run = (struct chunk_run *)&z->chunks[m->chunk_id];

	for (unsigned i = 0; i < c->run.bitmap_nval; ++i) {
		uint64_t v = run->bitmap[i];
		/* unit offset of bit 0 of this bitmap word */
		block_off = (uint16_t)(BITS_PER_VALUE * i);
		/* an all-zero word is inserted as one BITS_PER_VALUE-unit range */
		if (v == 0) {
			heap_run_insert(heap, b, m, BITS_PER_VALUE, block_off);
			inserted_blocks += BITS_PER_VALUE;
		} else if (v == UINT64_MAX) {

		/* mixed word: examine it bit by bit */
		for (unsigned j = 0; j < BITS_PER_VALUE; ++j) {
			if (BIT_IS_CLR(v, j)) {
			} else if (block_size_idx != 0) {
				ASSERT(block_off >= block_size_idx);

				/* the pending range starts block_size_idx units back */
				heap_run_insert(heap, b, m,
					(uint16_t)(block_off - block_size_idx));
				inserted_blocks += block_size_idx;
				block_size_idx = 0;

			if ((block_off++) == c->run.bitmap_nallocs) {

		/* a range still pending when the word ends is inserted here */
		if (block_size_idx != 0) {
			ASSERT(block_off >= block_size_idx);

			heap_run_insert(heap, b, m,
					(uint16_t)(block_off - block_size_idx));
			inserted_blocks += block_size_idx;
			block_size_idx = 0;

	return inserted_blocks;

 * heap_create_run -- (internal) initializes a new run on an existing free chunk
/*
 * heap_create_run -- (internal) initializes a new run on an existing free chunk
 *
 * heap - heap that owns the chunk
 * b    - bucket the run is created for
 * m    - memory block describing the chunk
 *
 * Three steps, in this order: heap_run_init() writes the run, then
 * memblock_rebuild_state() is called on m, then heap_process_run_metadata()
 * is called for the bucket.  The count returned by
 * heap_process_run_metadata() is not used.
 */
static void
heap_create_run(struct palloc_heap *heap, struct bucket *b,
	struct memory_block *m)
{
	heap_run_init(heap, b, m);
	memblock_rebuild_state(heap, m);
	heap_process_run_metadata(heap, b, m);
}
Exemple #23
 * alloc_class_collection_new -- creates a new collection of allocation classes
/*
 * alloc_class_collection_new -- creates a new collection of allocation classes
 *
 * Steps visible in this listing:
 *   1. zero-allocate the collection and set its defaults,
 *   2. allocate the by-allocation-size map and create the by-unit-size
 *      critnib,
 *   3. create one CLASS_HUGE class and one predefined CLASS_RUN class with
 *      MIN_UNIT_SIZE units, and point the first map slots at the latter,
 *   4. generate further classes from the categories[] table,
 *   5. take the highest occupied slot of ac->aclasses and derive
 *      ac->last_run_max_size from that class,
 *   6. in DEBUG builds, check that every run class sits in the slot named by
 *      its id and is found again by alloc_class_by_run().
 *
 * Returns the new collection, or NULL on failure.
 *
 * NOTE(review): this listing has lost the line carrying the function name
 * and parameter list, its braces, comment delimiters, the #endif, the error
 * label and whatever cleanup precedes the final `return NULL` -- confirm
 * against the complete source.
 */
struct alloc_class_collection *
	LOG(10, NULL);

	struct alloc_class_collection *ac = Zalloc(sizeof(*ac));
	if (ac == NULL)
		return NULL;

	ac->granularity = ALLOC_BLOCK_SIZE;
	ac->last_run_max_size = MAX_RUN_SIZE;
	ac->fail_on_missing_class = 0;
	ac->autogenerate_on_missing_class = 1;

	/* one map byte per granularity step up to MAX_RUN_SIZE, plus one */
	size_t maps_size = (MAX_RUN_SIZE / ac->granularity) + 1;

	if ((ac->class_map_by_alloc_size = Malloc(maps_size)) == NULL)
		goto error;
	if ((ac->class_map_by_unit_size = critnib_new()) == NULL)
		goto error;

	/* presumably 0xFF marks a slot with no class assigned -- confirm */
	memset(ac->class_map_by_alloc_size, 0xFF, maps_size);

	if (alloc_class_new(-1, ac, CLASS_HUGE, HEADER_COMPACT,
		CHUNKSIZE, 0, 1) == NULL)
		goto error;

	struct alloc_class *predefined_class =
		alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT,
			MIN_UNIT_SIZE, 0, 1);
	if (predefined_class == NULL)
		goto error;

	/* slots below FIRST_GENERATED_CLASS_SIZE use the predefined class */
	for (size_t i = 0; i < FIRST_GENERATED_CLASS_SIZE / ac->granularity;
		++i) {
		ac->class_map_by_alloc_size[i] = predefined_class->id;

	 * Based on the defined categories, a set of allocation classes is
	 * created. The unit size of those classes is depended on the category
	 * initial size and step.
	size_t granularity_mask = ALLOC_BLOCK_SIZE_GEN - 1;
	/* category c starts one generation step above category c - 1's size */
	for (int c = 1; c < MAX_ALLOC_CATEGORIES; ++c) {
		size_t n = categories[c - 1].size + ALLOC_BLOCK_SIZE_GEN;
		do {
			if (alloc_class_find_or_create(ac, n) == NULL)
				goto error;

			/*
			 * The step is n times the category's step factor,
			 * rounded up to a whole byte and then up to a multiple
			 * of ALLOC_BLOCK_SIZE_GEN.
			 */
			float stepf = (float)n * categories[c].step;
			size_t stepi = (size_t)stepf;
			stepi = (stepf - (float)stepi < FLT_EPSILON) ?
				stepi : stepi + 1;

			n += (stepi + (granularity_mask)) & ~granularity_mask;
		} while (n <= categories[c].size);

	 * Find the largest alloc class and use it's unit size as run allocation
	 * threshold.
	uint8_t largest_aclass_slot;
	for (largest_aclass_slot = MAX_ALLOCATION_CLASSES - 1;
			largest_aclass_slot > 0 &&
			ac->aclasses[largest_aclass_slot] == NULL;
			--largest_aclass_slot) {
		/* intentional NOP */

	struct alloc_class *c = ac->aclasses[largest_aclass_slot];

	 * The actual run might contain less unit blocks than the theoretical
	 * unit max variable. This may be the case for very large unit sizes.
	size_t real_unit_max = c->run.nallocs < RUN_UNIT_MAX_ALLOC ?
		c->run.nallocs : RUN_UNIT_MAX_ALLOC;

	size_t theoretical_run_max_size = c->unit_size * real_unit_max;

	ac->last_run_max_size = MAX_RUN_SIZE > theoretical_run_max_size ?
		theoretical_run_max_size : MAX_RUN_SIZE;

#ifdef DEBUG
	 * Verify that each bucket's unit size points back to the bucket by the
	 * bucket map. This must be true for the default allocation classes,
	 * otherwise duplicate buckets will be created.
	for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
		struct alloc_class *c = ac->aclasses[i];

		if (c != NULL && c->type == CLASS_RUN) {
			ASSERTeq(i, c->id);
			ASSERTeq(alloc_class_by_run(ac, c->unit_size,
				c->flags, c->run.size_idx), c);

	return ac;


	return NULL;