/*
 * alloc_prep_block -- (internal) prepares a memory block for allocation
 *
 * Once the block is fully reserved and it's guaranteed that no one else will
 * be able to write to this memory region, it is safe to write the allocation
 * header and call the object construction function.
 *
 * Because at this stage the memory block is only reserved in the transient
 * state, there is no need to worry about fail-safety of this method; in case
 * of a crash the memory will return to the free blocks collection.
 */
static int
alloc_prep_block(struct palloc_heap *heap, struct memory_block m,
	palloc_constr constructor, void *arg, uint64_t *offset_value)
{
	void *block_data = heap_get_block_data(heap, m);
	void *userdatap = (char *)block_data + ALLOC_OFF;

	uint64_t unit_size = MEMBLOCK_OPS(AUTO, &m)->
		block_size(&m, heap->layout);

	uint64_t real_size = unit_size * m.size_idx;

	ASSERT((uint64_t)block_data % ALLOC_BLOCK_SIZE == 0);
	ASSERT((uint64_t)userdatap % ALLOC_BLOCK_SIZE == 0);

	/* mark everything (including headers) as accessible */
	VALGRIND_DO_MAKE_MEM_UNDEFINED(block_data, real_size);
	/* mark space as allocated */
	VALGRIND_DO_MEMPOOL_ALLOC(heap->layout, userdatap,
		real_size - ALLOC_OFF);

	alloc_write_header(heap, block_data, m, real_size);

	int ret;
	if (constructor != NULL &&
		(ret = constructor(heap->base, userdatap,
			real_size - ALLOC_OFF, arg)) != 0) {

		/*
		 * If canceled, revert the block back to the free state in vg
		 * machinery. Because the free operation is only performed on
		 * the user data, the allocation header is made inaccessible
		 * in a separate call.
		 */
		VALGRIND_DO_MEMPOOL_FREE(heap->layout, userdatap);
		VALGRIND_DO_MAKE_MEM_NOACCESS(block_data, ALLOC_OFF);

		/*
		 * During this method there are several stores to pmem that are
		 * not immediately flushed, and in case of a cancellation those
		 * stores are no longer relevant anyway.
		 */
		VALGRIND_SET_CLEAN(block_data, ALLOC_OFF);

		return ret;
	}

	/* flushes both the alloc and oob headers */
	pmemops_persist(&heap->p_ops, block_data, ALLOC_OFF);

	/*
	 * To avoid determining the user data pointer twice, this method is
	 * also responsible for calculating the offset of the object in the
	 * pool; that offset is later used to set the destination pointer
	 * provided by the caller.
	 */
	*offset_value = PMALLOC_PTR_TO_OFF(heap, userdatap);

	return 0;
}
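/*
 * Illustrative sketch (not part of the original source): how a caller might
 * turn the *offset_value produced by alloc_prep_block() back into a usable
 * pointer. It assumes PMALLOC_OFF_TO_PTR()/PMALLOC_PTR_TO_OFF() are inverses
 * that subtract/add the pool base address, as they are used elsewhere in this
 * file; the helper name is hypothetical.
 */
static void *
example_resolve_offset(struct palloc_heap *heap, uint64_t offset_value)
{
	/* offset_value was computed as PMALLOC_PTR_TO_OFF(heap, userdatap) */
	void *userdatap = PMALLOC_OFF_TO_PTR(heap, offset_value);

	/* the allocation header sits ALLOC_OFF bytes before the user data */
	void *block_data = (char *)userdatap - ALLOC_OFF;
	(void) block_data;

	return userdatap;
}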
/*
 * persist_alloc -- (internal) performs a persistent allocation of the
 *	memory block previously reserved by volatile bucket
 */
static int
persist_alloc(PMEMobjpool *pop, struct lane_section *lane,
	struct memory_block m, uint64_t real_size, uint64_t *off,
	void (*constructor)(PMEMobjpool *pop, void *ptr, void *arg),
	void *arg, uint64_t data_off)
{
	int err;

#ifdef DEBUG
	if (heap_block_is_allocated(pop, m)) {
		ERR("heap corruption");
		ASSERT(0);
	}
#endif /* DEBUG */

	uint64_t op_result = 0;

	void *block_data = heap_get_block_data(pop, m);
	void *datap = (char *)block_data + sizeof (struct allocation_header);
	void *userdatap = (char *)datap + data_off;

	ASSERT((uint64_t)block_data % _POBJ_CL_ALIGNMENT == 0);

	/* mark everything (including headers) as accessible */
	VALGRIND_DO_MAKE_MEM_UNDEFINED(pop, block_data, real_size);
	/* mark space as allocated */
	VALGRIND_DO_MEMPOOL_ALLOC(pop, userdatap,
		real_size - sizeof (struct allocation_header) - data_off);

	alloc_write_header(pop, block_data, m.chunk_id, m.zone_id, real_size);

	if (constructor != NULL)
		constructor(pop, userdatap, arg);

	if ((err = heap_lock_if_run(pop, m)) != 0) {
		VALGRIND_DO_MEMPOOL_FREE(pop, userdatap);
		return err;
	}

	void *hdr = heap_get_block_header(pop, m, HEAP_OP_ALLOC, &op_result);

	struct allocator_lane_section *sec =
		(struct allocator_lane_section *)lane->layout;

	redo_log_store(pop, sec->redo, ALLOC_OP_REDO_PTR_OFFSET,
		pop_offset(pop, off), pop_offset(pop, datap));
	redo_log_store_last(pop, sec->redo, ALLOC_OP_REDO_HEADER,
		pop_offset(pop, hdr), op_result);

	redo_log_process(pop, sec->redo, MAX_ALLOC_OP_REDO);

	if (heap_unlock_if_run(pop, m) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	return err;
}
/*
 * bucket_vg_mark_noaccess -- (internal) marks memory block as no access for vg
 */
static void
bucket_vg_mark_noaccess(struct palloc_heap *heap, struct block_container *bc,
	struct memory_block m)
{
	if (On_valgrind) {
		size_t rsize = m.size_idx * bc->unit_size;
		void *block_data = heap_get_block_data(heap, m);
		VALGRIND_MAKE_MEM_NOACCESS(block_data, rsize);
	}
}
/*
 * run_block_offset -- calculates the block offset based on the number of bytes
 *	between the beginning of the chunk and the allocation data.
 *
 * Because the block offset is not represented in bytes but in 'unit size',
 * the number of bytes must also be divided by the chunk's block size.
 * A non-zero remainder would mean that either the caller provided an incorrect
 * pointer or the allocation algorithm created an invalid allocation block.
 */
static uint16_t
run_block_offset(struct memory_block *m, struct palloc_heap *heap, void *ptr)
{
	size_t block_size = MEMBLOCK_OPS(RUN, &m)->block_size(m, heap->layout);

	void *data = heap_get_block_data(heap, *m);
	uintptr_t diff = (uintptr_t)ptr - (uintptr_t)data;
	ASSERT(diff <= RUNSIZE);
	ASSERT((size_t)diff / block_size <= UINT16_MAX);
	ASSERT(diff % block_size == 0);

	uint16_t block_off = (uint16_t)((size_t)diff / block_size);

	return block_off;
}
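/*
 * Worked example (illustrative, numbers chosen arbitrarily): for a run with
 * a 128-byte unit size, a pointer that lies 1024 bytes past the start of the
 * block data gives diff == 1024; 1024 % 128 == 0 satisfies the assert, and
 * block_off == 1024 / 128 == 8, i.e. the allocation starts at the 8th unit
 * of the run.
 */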
/*
 * calc_block_offset -- (internal) calculates the block offset of allocation
 */
static uint16_t
calc_block_offset(PMEMobjpool *pop, struct bucket *b,
	struct allocation_header *alloc)
{
	uint16_t block_off = 0;
	if (bucket_is_small(b)) {
		struct memory_block m = {alloc->chunk_id, alloc->zone_id, 0, 0};
		void *data = heap_get_block_data(pop, m);
		uintptr_t diff = (uintptr_t)alloc - (uintptr_t)data;
		block_off = diff / bucket_unit_size(b);
		ASSERT(diff % bucket_unit_size(b) == 0);
	}

	return block_off;
}
/*
 * calc_block_offset -- (internal) calculates the block offset of allocation
 */
static uint16_t
calc_block_offset(PMEMobjpool *pop, struct allocation_header *alloc,
	size_t unit_size)
{
	uint16_t block_off = 0;
	if (unit_size != CHUNKSIZE) {
		struct memory_block m = {alloc->chunk_id, alloc->zone_id, 0, 0};
		void *data = heap_get_block_data(pop, m);
		uintptr_t diff = (uintptr_t)alloc - (uintptr_t)data;
		ASSERT(diff <= RUNSIZE);
		ASSERT((size_t)diff / unit_size <= UINT16_MAX);
		ASSERT(diff % unit_size == 0);
		block_off = (uint16_t)((size_t)diff / unit_size);
	}

	return block_off;
}
/*
 * bucket_insert_block -- inserts a new memory block into the container
 */
int
bucket_insert_block(PMEMobjpool *pop, struct bucket *b, struct memory_block m)
{
	ASSERT(m.chunk_id < MAX_CHUNK);
	ASSERT(m.zone_id < UINT16_MAX);
	ASSERT(m.size_idx != 0);

#ifdef USE_VG_MEMCHECK
	if (On_valgrind) {
		size_t rsize = m.size_idx * bucket_unit_size(b);
		void *block_data = heap_get_block_data(pop, m);
		VALGRIND_DO_MAKE_MEM_NOACCESS(pop, block_data, rsize);
	}
#endif

	uint64_t key = CHUNK_KEY_PACK(m.zone_id, m.chunk_id, m.block_off,
			m.size_idx);

	return ctree_insert(b->tree, key, 0);
}
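/*
 * Illustrative sketch (not part of the original source): how a free block
 * might be handed back to a bucket. Field values are arbitrary; designated
 * initializers are used so the example does not depend on field order.
 */
static int
example_return_free_block(PMEMobjpool *pop, struct bucket *b)
{
	struct memory_block m = {
		.chunk_id = 10,		/* chunk within the zone */
		.zone_id = 0,		/* first zone of the heap */
		.size_idx = 4,		/* four allocation units */
		.block_off = 16		/* starting unit within the run */
	};

	/* packs the ids into a ctree key and stores it in b->tree */
	return bucket_insert_block(pop, b, m);
}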
/*
 * prealloc_construct -- resizes an existing memory block with a constructor
 *
 * The block offset is written persistently into the off variable, but only
 * after the constructor function has been called.
 *
 * If successful, the function returns zero. Otherwise, an error number is
 * returned.
 */
int
prealloc_construct(PMEMobjpool *pop, uint64_t *off, size_t size,
	void (*constructor)(PMEMobjpool *pop, void *ptr, void *arg),
	void *arg, uint64_t data_off)
{
	if (size <= pmalloc_usable_size(pop, *off))
		return 0;

	size_t sizeh = size + sizeof (struct allocation_header);

	int err = 0;

	struct allocation_header *alloc = alloc_get_header(pop, *off);
	struct bucket *b = heap_get_best_bucket(pop, alloc->size);

	uint32_t add_size_idx = bucket_calc_units(b, sizeh - alloc->size);
	uint32_t new_size_idx = bucket_calc_units(b, sizeh);
	uint64_t real_size = new_size_idx * bucket_unit_size(b);

	struct memory_block cnt = get_mblock_from_alloc(pop, b, alloc);

	if ((err = heap_lock_if_run(pop, cnt)) != 0)
		return err;

	struct memory_block next = {0};
	if ((err = heap_get_adjacent_free_block(pop, &next, cnt, 0)) != 0)
		goto error;

	if (next.size_idx < add_size_idx) {
		err = ENOMEM;
		goto error;
	}

	if ((err = heap_get_exact_block(pop, b, &next, add_size_idx)) != 0)
		goto error;

	struct memory_block *blocks[2] = {&cnt, &next};

	uint64_t op_result;
	void *hdr;
	struct memory_block m = heap_coalesce(pop, blocks, 2, HEAP_OP_ALLOC,
			&hdr, &op_result);

	void *block_data = heap_get_block_data(pop, m);
	void *datap = block_data + sizeof (struct allocation_header);
	if (constructor != NULL)
		constructor(pop, datap + data_off, arg);

	struct lane_section *lane;
	if ((err = lane_hold(pop, &lane, LANE_SECTION_ALLOCATOR)) != 0)
		goto error;

	struct allocator_lane_section *sec =
		(struct allocator_lane_section *)lane->layout;

	redo_log_store(pop, sec->redo, ALLOC_OP_REDO_PTR_OFFSET,
		pop_offset(pop, &alloc->size), real_size);
	redo_log_store_last(pop, sec->redo, ALLOC_OP_REDO_HEADER,
		pop_offset(pop, hdr), op_result);

	redo_log_process(pop, sec->redo, MAX_ALLOC_OP_REDO);

	if (lane_release(pop) != 0) {
		ERR("Failed to release the lane");
		ASSERT(0);
	}

	if (heap_unlock_if_run(pop, cnt) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	return 0;

error:
	if (heap_unlock_if_run(pop, cnt) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	return err;
}
/*
 * pmalloc_construct -- allocates a new block of memory with a constructor
 *
 * The block offset is written persistently into the off variable, but only
 * after the constructor function has been called.
 *
 * If successful, the function returns zero. Otherwise, an error number is
 * returned.
 */
int
pmalloc_construct(PMEMobjpool *pop, uint64_t *off, size_t size,
	void (*constructor)(PMEMobjpool *pop, void *ptr, void *arg),
	void *arg, uint64_t data_off)
{
	size_t sizeh = size + sizeof (struct allocation_header);

	struct bucket *b = heap_get_best_bucket(pop, sizeh);

	int err = 0;
	uint32_t units = bucket_calc_units(b, sizeh);

	struct memory_block m = {0, 0, units, 0};

	if ((err = heap_get_bestfit_block(pop, b, &m)) != 0)
		return err;

	uint64_t op_result = 0;

	void *block_data = heap_get_block_data(pop, m);
	void *datap = block_data + sizeof (struct allocation_header);

	ASSERT((uint64_t)block_data % _POBJ_CL_ALIGNMENT == 0);

	uint64_t real_size = bucket_unit_size(b) * units;

	alloc_write_header(pop, block_data, m.chunk_id, m.zone_id, real_size);

	if (constructor != NULL)
		constructor(pop, datap + data_off, arg);

	if ((err = heap_lock_if_run(pop, m)) != 0)
		return err;

	void *hdr = heap_get_block_header(pop, m, HEAP_OP_ALLOC, &op_result);

	struct lane_section *lane;
	if ((err = lane_hold(pop, &lane, LANE_SECTION_ALLOCATOR)) != 0)
		goto err_lane_hold;

	struct allocator_lane_section *sec =
		(struct allocator_lane_section *)lane->layout;

	redo_log_store(pop, sec->redo, ALLOC_OP_REDO_PTR_OFFSET,
		pop_offset(pop, off), pop_offset(pop, datap));
	redo_log_store_last(pop, sec->redo, ALLOC_OP_REDO_HEADER,
		pop_offset(pop, hdr), op_result);

	redo_log_process(pop, sec->redo, MAX_ALLOC_OP_REDO);

	if (lane_release(pop) != 0) {
		ERR("Failed to release the lane");
		ASSERT(0);
	}

	if (heap_unlock_if_run(pop, m) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	return 0;

err_lane_hold:
	if (heap_unlock_if_run(pop, m) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	if (bucket_insert_block(b, m) != 0) {
		ERR("Failed to recover heap volatile state");
		ASSERT(0);
	}

	return err;
}
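/*
 * Illustrative usage sketch (assumption, not part of the original source):
 * allocating a small structure through pmalloc_construct(). The type, the
 * callback and the caller names are hypothetical; standard headers and the
 * allocator's internal headers are assumed to be included. pmemobj_persist()
 * is used on the assumption that the constructor is expected to persist the
 * data it writes.
 */
struct example_buf {
	uint64_t len;
	char data[64];
};

static void
example_buf_init(PMEMobjpool *pop, void *ptr, void *arg)
{
	struct example_buf *buf = ptr;

	buf->len = *(uint64_t *)arg;
	memset(buf->data, 0, sizeof(buf->data));
	pmemobj_persist(pop, buf, sizeof(*buf));
}

static int
example_alloc_buf(PMEMobjpool *pop, uint64_t *off)
{
	uint64_t data_len = 64;	/* matches the size of example_buf.data */

	/* data_off == 0: the constructor gets the pointer just past the */
	/* allocation header */
	return pmalloc_construct(pop, off, sizeof(struct example_buf),
			example_buf_init, &data_len, 0);
}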
/*
 * prealloc_construct -- resizes an existing memory block with a constructor
 *
 * The block offset is written persistently into the off variable, but only
 * after the constructor function has been called.
 *
 * If successful, the function returns zero. Otherwise, an error number is
 * returned.
 */
int
prealloc_construct(PMEMobjpool *pop, uint64_t *off, size_t size,
	void (*constructor)(PMEMobjpool *pop, void *ptr,
	size_t usable_size, void *arg), void *arg, uint64_t data_off)
{
	if (size <= pmalloc_usable_size(pop, *off))
		return 0;

	size_t sizeh = size + sizeof (struct allocation_header);

	int err;

	struct allocation_header *alloc = alloc_get_header(pop, *off);

	struct lane_section *lane;
	lane_hold(pop, &lane, LANE_SECTION_ALLOCATOR);

	struct bucket *b = heap_get_best_bucket(pop, alloc->size);

	uint32_t add_size_idx = b->calc_units(b, sizeh - alloc->size);
	uint32_t new_size_idx = b->calc_units(b, sizeh);
	uint64_t real_size = new_size_idx * b->unit_size;

	struct memory_block cnt = get_mblock_from_alloc(pop, alloc);

	heap_lock_if_run(pop, cnt);

	struct memory_block next = {0, 0, 0, 0};
	if ((err = heap_get_adjacent_free_block(pop, b, &next, cnt, 0)) != 0)
		goto out;

	if (next.size_idx < add_size_idx) {
		err = ENOMEM;
		goto out;
	}

	if ((err = heap_get_exact_block(pop, b, &next, add_size_idx)) != 0)
		goto out;

	struct memory_block *blocks[2] = {&cnt, &next};

	uint64_t op_result;
	void *hdr;
	struct memory_block m = heap_coalesce(pop, blocks, 2, HEAP_OP_ALLOC,
			&hdr, &op_result);

	void *block_data = heap_get_block_data(pop, m);
	void *datap = (char *)block_data + sizeof (struct allocation_header);
	void *userdatap = (char *)datap + data_off;

	/* mark new part as accessible and undefined */
	VALGRIND_DO_MAKE_MEM_UNDEFINED(pop, (char *)block_data + alloc->size,
		real_size - alloc->size);
	/* resize allocated space */
	VALGRIND_DO_MEMPOOL_CHANGE(pop, userdatap, userdatap,
		real_size - sizeof (struct allocation_header) - data_off);

	if (constructor != NULL)
		constructor(pop, userdatap,
			real_size - sizeof (struct allocation_header) -
			data_off, arg);

	struct allocator_lane_section *sec =
		(struct allocator_lane_section *)lane->layout;

	redo_log_store(pop, sec->redo, ALLOC_OP_REDO_PTR_OFFSET,
		pop_offset(pop, &alloc->size), real_size);
	redo_log_store_last(pop, sec->redo, ALLOC_OP_REDO_HEADER,
		pop_offset(pop, hdr), op_result);

	redo_log_process(pop, sec->redo, MAX_ALLOC_OP_REDO);

out:
	heap_unlock_if_run(pop, cnt);
	lane_release(pop);

	return err;
}
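/*
 * Illustrative sketch (assumption, not part of the original source): growing
 * an existing allocation with the usable_size-aware constructor variant shown
 * above. The constructor receives the new usable size and, in this sketch,
 * initializes only the newly gained tail; the helper names and the way the
 * old usable size is passed through 'arg' are hypothetical.
 */
static void
example_grow_init(PMEMobjpool *pop, void *ptr, size_t usable_size, void *arg)
{
	size_t old_usable = *(size_t *)arg;

	/* zero and persist only the bytes gained by the resize */
	if (usable_size > old_usable) {
		memset((char *)ptr + old_usable, 0, usable_size - old_usable);
		pmemobj_persist(pop, (char *)ptr + old_usable,
				usable_size - old_usable);
	}
}

static int
example_grow(PMEMobjpool *pop, uint64_t *off, size_t new_size,
	size_t old_usable)
{
	return prealloc_construct(pop, off, new_size, example_grow_init,
			&old_usable, 0);
}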
/*
 * palloc_operation -- persistent memory operation. Takes a NULL pointer
 *	or an existing memory block and modifies it to occupy, at least, 'size'
 *	number of bytes.
 *
 * The malloc, free and realloc routines are implemented in the context of this
 * common operation, which encompasses all of the functionality usually done
 * separately in those methods.
 *
 * The first thing that needs to be done is determining which memory blocks
 * will be affected by the operation - this varies depending on whether the
 * operation will need to modify or free an existing block and/or allocate
 * a new one.
 *
 * Simplified allocation process flow is as follows:
 *	- reserve a new block in the transient heap
 *	- prepare the new block
 *	- create redo log of required modifications
 *		- chunk metadata
 *		- offset of the new object
 *	- commit and process the redo log
 *
 * And similarly, the deallocation process:
 *	- create redo log of required modifications
 *		- reverse the chunk metadata back to the 'free' state
 *		- set the destination of the object offset to zero
 *	- commit and process the redo log
 *	- return the memory block back to the free blocks transient heap
 *
 * Reallocation is a combination of the above, with one additional step of
 * copying the old content in between.
 *
 * An illustrative usage sketch covering these three cases follows this
 * function.
 */
int
palloc_operation(struct palloc_heap *heap,
	uint64_t off, uint64_t *dest_off, size_t size,
	palloc_constr constructor, void *arg,
	struct operation_context *ctx)
{
	struct bucket *b = NULL;
	struct allocation_header *alloc = NULL;
	struct memory_block existing_block = {0, 0, 0, 0};
	struct memory_block new_block = {0, 0, 0, 0};
	struct memory_block reclaimed_block = {0, 0, 0, 0};

	int ret = 0;

	/*
	 * These two locks are responsible for protecting the metadata for the
	 * persistent representation of a chunk. Depending on the operation and
	 * the type of a chunk, they might be NULL.
	 */
	pthread_mutex_t *existing_block_lock = NULL;
	pthread_mutex_t *new_block_lock = NULL;

	size_t sizeh = size + sizeof(struct allocation_header);

	/*
	 * The offset of an existing block can be nonzero, which means this
	 * operation is either free or a realloc - either way the offset of the
	 * object needs to be translated into the structure that all of the
	 * heap methods operate on.
	 */
	if (off != 0) {
		alloc = ALLOC_GET_HEADER(heap, off);
		existing_block = get_mblock_from_alloc(heap, alloc);

		/*
		 * This lock must be held until the operation is processed
		 * successfully, because other threads might operate on the
		 * same bitmap value.
		 */
		existing_block_lock = MEMBLOCK_OPS(AUTO, &existing_block)->
			get_lock(&existing_block, heap);
		if (existing_block_lock != NULL)
			util_mutex_lock(existing_block_lock);

#ifdef DEBUG
		if (MEMBLOCK_OPS(AUTO,
			&existing_block)->get_state(&existing_block, heap) !=
				MEMBLOCK_ALLOCATED) {
			ERR("Double free or heap corruption");
			ASSERT(0);
		}
#endif /* DEBUG */

		/*
		 * The memory block must return back to the originating bucket,
		 * otherwise coalescing of neighbouring blocks will be rendered
		 * impossible.
		 *
		 * If the block was allocated in a different incarnation of the
		 * heap (i.e. the application was restarted) and the chunk from
		 * which the allocation comes was not yet processed, the
		 * originating bucket does not exist and all of the otherwise
		 * necessary volatile heap modifications won't be performed for
		 * this memory block.
		 */
		b = heap_get_chunk_bucket(heap,
			alloc->chunk_id, alloc->zone_id);
	}

	/* if allocation or reallocation, reserve new memory */
	if (size != 0) {
		/* reallocation to exactly the same size, which is a no-op */
		if (alloc != NULL && alloc->size == sizeh)
			goto out;

		errno = alloc_reserve_block(heap, &new_block, sizeh);
		if (errno != 0) {
			ret = -1;
			goto out;
		}
	}

	/*
	 * The offset value which is to be written to the destination pointer
	 * provided by the caller.
	 */
	uint64_t offset_value = 0;

	/* lock and persistently free the existing memory block */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
		/*
		 * This method will insert new entries into the operation
		 * context which will, after processing, update the chunk
		 * metadata to 'free' - it also takes care of all the necessary
		 * coalescing of blocks.
		 * Even though the transient state of the heap is used during
		 * this method to locate neighbouring blocks, it isn't
		 * modified.
		 *
		 * The reclaimed block is the coalesced memory block that the
		 * free resulted in; to prevent a volatile memory leak it needs
		 * to be inserted into the corresponding bucket.
		 */
		reclaimed_block =
			heap_free_block(heap, b, existing_block, ctx);
		offset_value = 0;
	}

	if (!MEMORY_BLOCK_IS_EMPTY(new_block)) {
		if (alloc_prep_block(heap, new_block, constructor,
				arg, &offset_value) != 0) {
			/*
			 * Constructor returned non-zero value, which means
			 * the memory block reservation has to be rolled back.
			 */
			struct bucket *new_bucket = heap_get_chunk_bucket(heap,
				new_block.chunk_id, new_block.zone_id);
			ASSERTne(new_bucket, NULL);

			/*
			 * Omitting the context in this method results in
			 * coalescing of blocks without affecting the
			 * persistent heap state.
			 */
			new_block = heap_free_block(heap, new_bucket,
					new_block, NULL);
			CNT_OP(new_bucket, insert, heap, new_block);

			if (new_bucket->type == BUCKET_RUN)
				heap_degrade_run_if_empty(heap,
					new_bucket, new_block);

			errno = ECANCELED;
			ret = -1;
			goto out;
		}

		/*
		 * This lock must be held for the duration between the creation
		 * of the allocation metadata updates in the operation context
		 * and the operation processing. This is because a different
		 * thread might operate on the same 8-byte value of the run
		 * bitmap and override the allocation performed by this thread.
		 */
		new_block_lock = MEMBLOCK_OPS(AUTO, &new_block)->
			get_lock(&new_block, heap);

		/* the locks might be identical in the case of realloc */
		if (new_block_lock == existing_block_lock)
			new_block_lock = NULL;

		if (new_block_lock != NULL)
			util_mutex_lock(new_block_lock);

#ifdef DEBUG
		if (MEMBLOCK_OPS(AUTO,
			&new_block)->get_state(&new_block, heap) !=
				MEMBLOCK_FREE) {
			ERR("Double free or heap corruption");
			ASSERT(0);
		}
#endif /* DEBUG */

		/*
		 * The actual required metadata modifications are chunk-type
		 * dependent, but it is always a modification of a single
		 * 8-byte value - either modification of a few bits in a bitmap
		 * or changing a chunk type from free to used.
		 */
		MEMBLOCK_OPS(AUTO, &new_block)->prep_hdr(&new_block, heap,
				MEMBLOCK_ALLOCATED, ctx);
	}

	/* not in-place realloc */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block) &&
		!MEMORY_BLOCK_IS_EMPTY(new_block)) {
		size_t old_size = alloc->size;
		size_t to_cpy = old_size > sizeh ? sizeh : old_size;
		VALGRIND_ADD_TO_TX(PMALLOC_OFF_TO_PTR(heap, offset_value),
			to_cpy - ALLOC_OFF);
		pmemops_memcpy_persist(&heap->p_ops,
			PMALLOC_OFF_TO_PTR(heap, offset_value),
			PMALLOC_OFF_TO_PTR(heap, off),
			to_cpy - ALLOC_OFF);
		VALGRIND_REMOVE_FROM_TX(PMALLOC_OFF_TO_PTR(heap, offset_value),
			to_cpy - ALLOC_OFF);
	}

	/*
	 * If the caller provided a destination value to update, it needs to be
	 * modified atomically alongside the heap metadata, and so the
	 * operation context must be used.
	 * The actual offset value depends on the operation type.
	 */
	if (dest_off != NULL)
		operation_add_entry(ctx, dest_off, offset_value,
			OPERATION_SET);

	operation_process(ctx);

	/*
	 * After the operation succeeded, the persistent state is all in order
	 * but in some cases it might not be in sync with its transient
	 * representation.
	 */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
		VALGRIND_DO_MEMPOOL_FREE(heap->layout,
			(char *)heap_get_block_data(heap, existing_block)
			+ ALLOC_OFF);

		/* we might have been operating on an inactive run */
		if (b != NULL) {
			/*
			 * Even though the initial condition is to check
			 * whether the existing block exists, it's important to
			 * use the 'reclaimed block' - it is the coalesced one
			 * and reflects the current persistent heap state,
			 * whereas the existing block reflects the state from
			 * before this operation started.
			 */
			CNT_OP(b, insert, heap, reclaimed_block);

			/*
			 * Degrading of a run means turning it back into a
			 * chunk in case it's no longer needed.
			 * It might be tempting to defer this operation until
			 * such time that the chunk is actually needed, but
			 * right now the decision is to keep the persistent
			 * heap state as clean as possible - and that means not
			 * leaving unused data around.
			 */
			if (b->type == BUCKET_RUN)
				heap_degrade_run_if_empty(heap, b,
					reclaimed_block);
		}
	}

out:
	if (new_block_lock != NULL)
		util_mutex_unlock(new_block_lock);

	if (existing_block_lock != NULL)
		util_mutex_unlock(existing_block_lock);

	return ret;
}
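/*
 * Illustrative usage sketch (assumption, not part of the original source):
 * how malloc-, realloc- and free-style wrappers could map onto the single
 * palloc_operation() entry point described above. Setup and recycling of the
 * operation context (ctx) is assumed to happen elsewhere; the wrapper names
 * are hypothetical.
 */
static int
example_palloc(struct palloc_heap *heap, uint64_t *dest_off, size_t size,
	struct operation_context *ctx)
{
	/* off == 0: nothing to free, only reserve and allocate a new block */
	return palloc_operation(heap, 0, dest_off, size, NULL, NULL, ctx);
}

static int
example_prealloc(struct palloc_heap *heap, uint64_t *dest_off, size_t size,
	struct operation_context *ctx)
{
	/* off != 0 and size != 0: free the old block, allocate a new one, */
	/* and copy over the old content */
	return palloc_operation(heap, *dest_off, dest_off, size,
			NULL, NULL, ctx);
}

static int
example_pfree(struct palloc_heap *heap, uint64_t *dest_off,
	struct operation_context *ctx)
{
	/* size == 0: only the deallocation path runs, *dest_off is zeroed */
	return palloc_operation(heap, *dest_off, dest_off, 0,
			NULL, NULL, ctx);
}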