/*
 * list_insert_after -- (internal) insert element at offset after an element
 */
static size_t
list_insert_after(PMEMobjpool *pop,
        struct operation_context *ctx,
        struct list_args_insert *args,
        struct list_args_common *args_common,
        uint64_t *next_offset,
        uint64_t *prev_offset)
{
        LOG(15, NULL);

        /* current->next = dest->next and current->prev = dest */
        *next_offset = args->dest_entry_ptr->pe_next.off;
        *prev_offset = args->dest.off;

        /* dest->next = current and dest->next->prev = current */
        uint64_t dest_next_off = args->dest.off + NEXT_OFF;
        u64_add_offset(&dest_next_off, args_common->pe_offset);
        uint64_t dest_next_prev_off = args->dest_entry_ptr->pe_next.off +
                PREV_OFF;
        u64_add_offset(&dest_next_prev_off, args_common->pe_offset);

        void *dest_next_ptr = (char *)pop + dest_next_off;
        void *dest_next_prev_ptr = (char *)pop + dest_next_prev_off;

        operation_add_entry(ctx, dest_next_ptr, args_common->obj_doffset,
                        ULOG_OPERATION_SET);
        operation_add_entry(ctx, dest_next_prev_ptr, args_common->obj_doffset,
                        ULOG_OPERATION_SET);

        return 0;
}
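/*
 * Illustration (not part of the original source): how the redo-log target
 * addresses above are computed. Every field address is pool base + object
 * offset + pe_offset (position of the list entry inside the object) +
 * field offset. The toy types and helper below are plausible stand-ins,
 * not the real layout:
 */
#include <stddef.h>
#include <stdint.h>

struct toy_oid { uint64_t pool_uuid_lo; uint64_t off; };
struct toy_list_entry { struct toy_oid pe_next; struct toy_oid pe_prev; };

/* counterparts of NEXT_OFF/PREV_OFF: where the .off word of each link sits */
#define TOY_NEXT_OFF (offsetof(struct toy_list_entry, pe_next) + \
                offsetof(struct toy_oid, off))
#define TOY_PREV_OFF (offsetof(struct toy_list_entry, pe_prev) + \
                offsetof(struct toy_oid, off))

/*
 * e.g. toy_entry_field(pool, dest_off, pe_off, TOY_NEXT_OFF) addresses
 * dest->next.off, just like dest_next_ptr above
 */
static uint64_t *
toy_entry_field(void *pool_base, uint64_t obj_off, uint64_t pe_offset,
        uint64_t field_off)
{
        return (uint64_t *)((char *)pool_base + obj_off + pe_offset +
                        field_off);
}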
/*
 * palloc_mem_action_exec -- executes a single memory action (set, and, or)
 */
static void
palloc_mem_action_exec(struct palloc_heap *heap,
        const struct pobj_action_internal *act,
        struct operation_context *ctx)
{
        operation_add_entry(ctx, act->ptr, act->value, OPERATION_SET);
}
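/*
 * Usage sketch (an assumption: the public action API of recent libpmemobj,
 * where pmemobj_set_value() stages exactly this kind of 8-byte set action).
 * A reservation and a pointer update are published together:
 */
#include <libpmemobj.h>

static int
toy_reserve_and_link(PMEMobjpool *pop, uint64_t *dest_off)
{
        struct pobj_action acts[2];

        PMEMoid oid = pmemobj_reserve(pop, &acts[0], 128, 0);
        if (OID_IS_NULL(oid))
                return -1;

        /* stage "*dest_off = oid.off" as a single 8-byte set action */
        pmemobj_set_value(pop, &acts[1], dest_off, oid.off);

        /* both actions become persistent together, or not at all */
        pmemobj_publish(pop, acts, 2);

        return 0;
}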
/*
 * list_update_head -- (internal) update pe_first entry in list head
 */
static size_t
list_update_head(PMEMobjpool *pop,
        struct operation_context *ctx,
        struct list_head *head, uint64_t first_offset)
{
        LOG(15, NULL);

        operation_add_entry(ctx, &head->pe_first.off, first_offset,
                        ULOG_OPERATION_SET);

        if (head->pe_first.pool_uuid_lo == 0) {
                operation_add_entry(ctx, &head->pe_first.pool_uuid_lo,
                                pop->uuid_lo, ULOG_OPERATION_SET);
        }

        return 0;
}
/*
 * list_fill_entry_redo_log -- (internal) fill new entry using redo log
 *
 * Used to update entry in existing object.
 */
static size_t
list_fill_entry_redo_log(PMEMobjpool *pop,
        struct operation_context *ctx,
        struct list_args_common *args,
        uint64_t next_offset, uint64_t prev_offset, int set_uuid)
{
        LOG(15, NULL);

        struct pmem_ops *ops = &pop->p_ops;

        ASSERTne(args->entry_ptr, NULL);
        ASSERTne(args->obj_doffset, 0);

        if (set_uuid) {
                VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_next.pool_uuid_lo),
                        sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
                VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_prev.pool_uuid_lo),
                        sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));
                /* don't need to fill pool uuid using redo log */
                args->entry_ptr->pe_next.pool_uuid_lo = pop->uuid_lo;
                args->entry_ptr->pe_prev.pool_uuid_lo = pop->uuid_lo;
                VALGRIND_REMOVE_FROM_TX(
                        &(args->entry_ptr->pe_next.pool_uuid_lo),
                        sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
                VALGRIND_REMOVE_FROM_TX(
                        &(args->entry_ptr->pe_prev.pool_uuid_lo),
                        sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));

                pmemops_persist(ops, args->entry_ptr,
                        sizeof(*args->entry_ptr));
        } else {
                ASSERTeq(args->entry_ptr->pe_next.pool_uuid_lo, pop->uuid_lo);
                ASSERTeq(args->entry_ptr->pe_prev.pool_uuid_lo, pop->uuid_lo);
        }

        /* set current->next and current->prev using redo log */
        uint64_t next_off_off = args->obj_doffset + NEXT_OFF;
        uint64_t prev_off_off = args->obj_doffset + PREV_OFF;
        u64_add_offset(&next_off_off, args->pe_offset);
        u64_add_offset(&prev_off_off, args->pe_offset);
        void *next_ptr = (char *)pop + next_off_off;
        void *prev_ptr = (char *)pop + prev_off_off;

        operation_add_entry(ctx, next_ptr, next_offset, ULOG_OPERATION_SET);
        operation_add_entry(ctx, prev_ptr, prev_offset, ULOG_OPERATION_SET);

        return 0;
}
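/*
 * Context for the split above (illustration, not source code): the uuid
 * words are stored directly and persisted because the entry is not yet
 * reachable from the list, so no atomicity is needed; next/prev must go
 * through the redo log so they land together with the rest of the list
 * operation. A toy volatile model of that redo-log contract:
 */
#include <stddef.h>
#include <stdint.h>

struct toy_log_entry { uint64_t *ptr; uint64_t value; };

/* staged entries stay invisible until the whole log is processed... */
static void
toy_log_process(struct toy_log_entry *log, size_t nentries)
{
        /* ...then every deferred store is applied as one recoverable step */
        for (size_t i = 0; i < nentries; ++i)
                *log[i].ptr = log[i].value;
}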
/*
 * run_prep_operation_hdr -- prepares the new value for a select few bytes of
 * a run bitmap that will be set after the operation concludes.
 *
 * It's VERY important to keep in mind that the particular value of the
 * bitmap this method is modifying must not be changed after this function
 * is called and before the operation is processed.
 */
static void
run_prep_operation_hdr(const struct memory_block *m, enum memblock_state op,
        struct operation_context *ctx)
{
        struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id);
        struct chunk_run *r = (struct chunk_run *)&z->chunks[m->chunk_id];

        ASSERT(m->size_idx <= BITS_PER_VALUE);

        /*
         * Free blocks are represented by clear bits and used blocks by set
         * bits - which is the reverse of the commonly used scheme.
         *
         * Here a bit mask is prepared that flips the bits that represent the
         * memory block provided by the caller - because both the size index
         * and the block offset are tied 1:1 to the bitmap, this operation is
         * relatively simple.
         */
        uint64_t bmask;
        if (m->size_idx == BITS_PER_VALUE) {
                ASSERTeq(m->block_off % BITS_PER_VALUE, 0);
                bmask = UINT64_MAX;
        } else {
                bmask = ((1ULL << m->size_idx) - 1ULL) <<
                                (m->block_off % BITS_PER_VALUE);
        }

        /*
         * The run bitmap is composed of several 8 byte values, so a proper
         * element of the bitmap array must be selected.
         */
        int bpos = m->block_off / BITS_PER_VALUE;

        /* the bit mask is applied immediately by the add entry operations */
        if (op == MEMBLOCK_ALLOCATED) {
                operation_add_entry(ctx, &r->bitmap[bpos],
                        bmask, OPERATION_OR);
        } else if (op == MEMBLOCK_FREE) {
                operation_add_entry(ctx, &r->bitmap[bpos],
                        ~bmask, OPERATION_AND);
        } else {
                ASSERT(0);
        }
}
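/*
 * Worked example (illustrative, not from the source): for a block at
 * block_off = 68 spanning size_idx = 3 units, with BITS_PER_VALUE = 64:
 *
 *      bpos  = 68 / 64 = 1                 -> second uint64_t of the bitmap
 *      shift = 68 % 64 = 4
 *      bmask = ((1 << 3) - 1) << 4 = 0x70  -> bits 4..6
 *
 * Allocation ORs 0x70 into bitmap[1]; free ANDs it with ~0x70.
 */
#include <assert.h>
#include <stdint.h>

#define TOY_BITS_PER_VALUE 64

static uint64_t
toy_run_mask(uint32_t block_off, uint32_t size_idx)
{
        if (size_idx == TOY_BITS_PER_VALUE)
                return UINT64_MAX;

        return ((1ULL << size_idx) - 1ULL) <<
                        (block_off % TOY_BITS_PER_VALUE);
}

int
main(void)
{
        assert(toy_run_mask(68, 3) == 0x70);
        assert(toy_run_mask(0, 64) == UINT64_MAX);
        return 0;
}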
/*
 * operation_add_entries -- adds new entries to the current operation
 */
void
operation_add_entries(struct operation_context *ctx,
        struct operation_entry *entries, size_t nentries)
{
        for (size_t i = 0; i < nentries; ++i) {
                operation_add_entry(ctx, entries[i].ptr, entries[i].value,
                                entries[i].type);
        }
}
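/*
 * Usage sketch (hypothetical call site; it assumes only the ptr/value/type
 * fields that the loop above dereferences). Two 8-byte updates staged in
 * one call:
 */
static void
toy_stage_pair(struct operation_context *ctx, uint64_t *a, uint64_t *b)
{
        struct operation_entry e[2] = {
                { .ptr = a, .value = 42, .type = OPERATION_SET },
                { .ptr = b, .value = 1ULL << 7, .type = OPERATION_OR },
        };

        /* nothing is applied yet - processing the operation does that */
        operation_add_entries(ctx, e, 2);
}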
/*
 * huge_prep_operation_hdr -- prepares the new value of a chunk header that
 * will be set after the operation concludes.
 */
static void
huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op,
        struct operation_context *ctx)
{
        struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);

        /*
         * Depending on the operation that needs to be performed a new chunk
         * header needs to be prepared with the new chunk state.
         */
        uint64_t val = chunk_get_chunk_hdr_value(
                op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE,
                hdr->flags,
                m->size_idx);

        if (ctx == NULL) {
                util_atomic_store_explicit64((uint64_t *)hdr, val,
                        memory_order_relaxed);
                pmemops_persist(&m->heap->p_ops, hdr, sizeof(*hdr));
        } else {
                operation_add_entry(ctx, hdr, val, ULOG_OPERATION_SET);
        }

        VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1,
                (hdr->size_idx - 1) * sizeof(struct chunk_header));

        /*
         * In the case of chunks larger than one unit the footer must be
         * created immediately AFTER the persistent state is safely updated.
         */
        if (m->size_idx == 1)
                return;

        struct chunk_header *footer = hdr + m->size_idx - 1;
        VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer));

        val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx);

        /*
         * It's only safe to write the footer AFTER the persistent part of
         * the operation has been successfully processed because the footer
         * pointer might point to a currently valid persistent state
         * of a different chunk.
         * The footer entry change is updated as transient because it will
         * be recreated at heap boot regardless - it's just needed for runtime
         * operations.
         */
        if (ctx == NULL) {
                util_atomic_store_explicit64((uint64_t *)footer, val,
                        memory_order_relaxed);
                VALGRIND_SET_CLEAN(footer, sizeof(*footer));
        } else {
                operation_add_typed_entry(ctx, footer, val,
                        ULOG_OPERATION_SET, LOG_TRANSIENT);
        }
}
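/*
 * Sketch (the layout is an assumption inferred from the code above, not
 * taken from the source): chunk_get_chunk_hdr_value() plausibly packs
 * type, flags and size_idx into one 8-byte word, which is what lets the
 * header be replaced with a single atomic store or a single redo-log
 * entry:
 */
#include <stdint.h>
#include <string.h>

struct toy_chunk_header {
        uint16_t type;
        uint16_t flags;
        uint32_t size_idx;
};      /* 8 bytes total: fits one store, one log entry */

static uint64_t
toy_chunk_hdr_value(uint16_t type, uint16_t flags, uint32_t size_idx)
{
        struct toy_chunk_header hdr = { type, flags, size_idx };
        uint64_t val;

        memcpy(&val, &hdr, sizeof(val));
        return val;
}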
/*
 * list_set_oid_redo_log -- (internal) set PMEMoid value using redo log
 */
static size_t
list_set_oid_redo_log(PMEMobjpool *pop,
        struct operation_context *ctx,
        PMEMoid *oidp, uint64_t obj_doffset, int oidp_inited)
{
        ASSERT(OBJ_PTR_IS_VALID(pop, oidp));

        if (!oidp_inited || oidp->pool_uuid_lo != pop->uuid_lo) {
                if (oidp_inited)
                        ASSERTeq(oidp->pool_uuid_lo, 0);

                operation_add_entry(ctx, &oidp->pool_uuid_lo, pop->uuid_lo,
                                ULOG_OPERATION_SET);
        }

        operation_add_entry(ctx, &oidp->off, obj_doffset,
                        ULOG_OPERATION_SET);

        return 0;
}
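/*
 * Rationale sketch (illustration, not source code): a PMEMoid is two
 * 8-byte words, so updating it with plain stores could be torn by a crash
 * in between - a new offset next to a stale uuid, or the reverse. Routing
 * both words through one redo log makes the pair recoverable as a unit;
 * the uuid entry is staged only when the value actually changes, which
 * saves a log entry on the common path (the real code above additionally
 * handles a possibly-uninitialized destination via oidp_inited):
 */
static void
toy_oid_update(struct operation_context *ctx, PMEMoid *oidp,
        uint64_t pool_uuid_lo, uint64_t obj_off)
{
        if (oidp->pool_uuid_lo != pool_uuid_lo)  /* skip redundant entry */
                operation_add_entry(ctx, &oidp->pool_uuid_lo, pool_uuid_lo,
                                ULOG_OPERATION_SET);

        operation_add_entry(ctx, &oidp->off, obj_off, ULOG_OPERATION_SET);
}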
/*
 * list_remove_single -- (internal) remove element from single list
 */
static size_t
list_remove_single(PMEMobjpool *pop,
        struct operation_context *ctx,
        struct list_args_remove *args)
{
        LOG(15, NULL);

        if (args->entry_ptr->pe_next.off == args->obj_doffset) {
                /* only one element on list */
                ASSERTeq(args->head->pe_first.off, args->obj_doffset);
                ASSERTeq(args->entry_ptr->pe_prev.off, args->obj_doffset);

                return list_update_head(pop, ctx, args->head, 0);
        } else {
                /* set next->prev = prev and prev->next = next */
                uint64_t next_off = args->entry_ptr->pe_next.off;
                uint64_t next_prev_off = next_off + PREV_OFF;
                u64_add_offset(&next_prev_off, args->pe_offset);
                uint64_t prev_off = args->entry_ptr->pe_prev.off;
                uint64_t prev_next_off = prev_off + NEXT_OFF;
                u64_add_offset(&prev_next_off, args->pe_offset);

                /*
                 * prev_ptr addresses the next element's prev field;
                 * next_ptr addresses the previous element's next field
                 */
                void *prev_ptr = (char *)pop + next_prev_off;
                void *next_ptr = (char *)pop + prev_next_off;

                operation_add_entry(ctx, prev_ptr, prev_off,
                                ULOG_OPERATION_SET);
                operation_add_entry(ctx, next_ptr, next_off,
                                ULOG_OPERATION_SET);

                if (args->head->pe_first.off == args->obj_doffset) {
                        /* the element being removed is the first one */
                        return list_update_head(pop, ctx,
                                        args->head, next_off);
                } else {
                        return 0;
                }
        }
}
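/*
 * Volatile analogue (illustration only): the same unlink on an ordinary
 * circular doubly-linked list. The persistent version above performs
 * exactly these pointer writes, except each one is staged in the redo log
 * instead of being applied directly:
 */
#include <stddef.h>

struct toy_node { struct toy_node *next, *prev; };

static void
toy_unlink(struct toy_node **first, struct toy_node *n)
{
        if (n->next == n) {             /* sole element on the list */
                *first = NULL;
                return;
        }

        n->next->prev = n->prev;        /* matches: *prev_ptr = prev_off */
        n->prev->next = n->next;        /* matches: *next_ptr = next_off */

        if (*first == n)                /* unlinking the head element */
                *first = n->next;
}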
/*
 * huge_prep_operation_hdr -- prepares the new value of a chunk header that
 * will be set after the operation concludes.
 */
static void
huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op,
        struct operation_context *ctx)
{
        struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id);
        struct chunk_header *hdr = &z->chunk_headers[m->chunk_id];

        /*
         * Depending on the operation that needs to be performed a new chunk
         * header needs to be prepared with the new chunk state.
         */
        uint64_t val = chunk_get_chunk_hdr_value(
                op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE,
                hdr->flags,
                m->size_idx);

        operation_add_entry(ctx, hdr, val, OPERATION_SET);

        VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1,
                (hdr->size_idx - 1) * sizeof(struct chunk_header));

        /*
         * In the case of chunks larger than one unit the footer must be
         * created immediately AFTER the persistent state is safely updated.
         */
        if (m->size_idx == 1)
                return;

        struct chunk_header *footer = hdr + m->size_idx - 1;
        VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer));

        val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx);

        /*
         * It's only safe to write the footer AFTER the persistent part of
         * the operation has been successfully processed because the footer
         * pointer might point to a currently valid persistent state
         * of a different chunk.
         * The footer entry change is updated as transient because it will
         * be recreated at heap boot regardless - it's just needed for runtime
         * operations.
         */
        operation_add_typed_entry(ctx, footer, val,
                OPERATION_SET, ENTRY_TRANSIENT);
}
/*
 * palloc_operation -- persistent memory operation. Takes a NULL pointer
 * or an existing memory block and modifies it to occupy, at least, 'size'
 * number of bytes.
 *
 * The malloc, free and realloc routines are implemented in the context of this
 * common operation which encompasses all of the functionality usually done
 * separately in those methods.
 *
 * The first thing that needs to be done is determining which memory blocks
 * will be affected by the operation - this varies depending on whether the
 * operation will need to modify or free an existing block and/or allocate
 * a new one.
 *
 * Simplified allocation process flow is as follows:
 *      - reserve a new block in the transient heap
 *      - prepare the new block
 *      - create redo log of required modifications
 *              - chunk metadata
 *              - offset of the new object
 *      - commit and process the redo log
 *
 * And similarly, the deallocation process:
 *      - create redo log of required modifications
 *              - reverse the chunk metadata back to the 'free' state
 *              - set the destination of the object offset to zero
 *      - commit and process the redo log
 * There's an important distinction in the deallocation process - it does not
 * return the memory block to the transient container. That is done once no
 * more memory is available.
 *
 * Reallocation is a combination of the above, with one additional step
 * of copying the old content.
 */
int
palloc_operation(struct palloc_heap *heap,
        uint64_t off, uint64_t *dest_off, size_t size,
        palloc_constr constructor, void *arg,
        uint64_t extra_field, uint16_t object_flags, uint16_t class_id,
        struct operation_context *ctx)
{
        struct pobj_action_internal alloc =
                OBJ_HEAP_ACTION_INITIALIZER(0, MEMBLOCK_ALLOCATED);
        struct pobj_action_internal dealloc =
                OBJ_HEAP_ACTION_INITIALIZER(off, MEMBLOCK_FREE);

        size_t user_size = 0;

        int nops = 0;
        struct pobj_action_internal ops[2];

        if (dealloc.offset != 0) {
                dealloc.m = memblock_from_offset(heap, dealloc.offset);
                user_size = dealloc.m.m_ops->get_user_size(&dealloc.m);
                if (user_size == size)
                        return 0;
        }

        if (size != 0) {
                if (palloc_reservation_create(heap, size, constructor, arg,
                        extra_field, object_flags, class_id, &alloc) != 0)
                        return -1;

                ops[nops++] = alloc;
        }

        /*
         * The offset of an existing block can be nonzero which means this
         * operation is either free or a realloc - either way the offset of
         * the object needs to be translated into a memory block, which is
         * a structure that all of the heap methods expect.
         */
        if (dealloc.offset != 0) {
                /* realloc */
                if (!MEMORY_BLOCK_IS_NONE(alloc.m)) {
                        size_t old_size = user_size;
                        size_t to_cpy = old_size > size ? size : old_size;
                        VALGRIND_ADD_TO_TX(
                                HEAP_OFF_TO_PTR(heap, alloc.offset),
                                to_cpy);
                        pmemops_memcpy(&heap->p_ops,
                                HEAP_OFF_TO_PTR(heap, alloc.offset),
                                HEAP_OFF_TO_PTR(heap, off),
                                to_cpy,
                                0);
                        VALGRIND_REMOVE_FROM_TX(
                                HEAP_OFF_TO_PTR(heap, alloc.offset),
                                to_cpy);
                }

                dealloc.lock = dealloc.m.m_ops->get_lock(&dealloc.m);
                ops[nops++] = dealloc;
        }

        /*
         * If the caller provided a destination value to update, it needs to
         * be modified atomically alongside the heap metadata, and so the
         * operation context must be used.
         * The actual offset value depends on the operation type, but the
         * alloc.offset variable is used because it's 0 in the case of free,
         * and valid otherwise.
         */
        if (dest_off)
                operation_add_entry(ctx, dest_off, alloc.offset,
                        OPERATION_SET);

        palloc_exec_actions(heap, ctx, ops, nops);

        return 0;
}
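/*
 * Call shapes (hypothetical wrappers; the argument lists follow the
 * signature above, and the operation context is assumed to come from the
 * caller). The one entry point covers all three allocator routines:
 */
static int
toy_pmalloc(struct palloc_heap *heap, uint64_t *off, size_t size,
        struct operation_context *ctx)
{
        /* off == 0: pure allocation */
        return palloc_operation(heap, 0, off, size, NULL, NULL, 0, 0, 0, ctx);
}

static int
toy_pfree(struct palloc_heap *heap, uint64_t *off,
        struct operation_context *ctx)
{
        /* size == 0: pure deallocation, *off is zeroed in the same redo log */
        return palloc_operation(heap, *off, off, 0, NULL, NULL, 0, 0, 0, ctx);
}

static int
toy_prealloc(struct palloc_heap *heap, uint64_t *off, size_t size,
        struct operation_context *ctx)
{
        /* both nonzero: allocate, copy, free - committed as one operation */
        return palloc_operation(heap, *off, off, size, NULL, NULL, 0, 0, 0,
                ctx);
}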
/*
 * palloc_operation -- persistent memory operation. Takes a NULL pointer
 * or an existing memory block and modifies it to occupy, at least, 'size'
 * number of bytes.
 *
 * The malloc, free and realloc routines are implemented in the context of this
 * common operation which encompasses all of the functionality usually done
 * separately in those methods.
 *
 * The first thing that needs to be done is determining which memory blocks
 * will be affected by the operation - this varies depending on whether the
 * operation will need to modify or free an existing block and/or allocate
 * a new one.
 *
 * Simplified allocation process flow is as follows:
 *      - reserve a new block in the transient heap
 *      - prepare the new block
 *      - create redo log of required modifications
 *              - chunk metadata
 *              - offset of the new object
 *      - commit and process the redo log
 *
 * And similarly, the deallocation process:
 *      - create redo log of required modifications
 *              - reverse the chunk metadata back to the 'free' state
 *              - set the destination of the object offset to zero
 *      - commit and process the redo log
 *      - return the memory block back to the free blocks transient heap
 *
 * Reallocation is a combination of the above, with one additional step
 * of copying the old content in the meantime.
 */
int
palloc_operation(struct palloc_heap *heap,
        uint64_t off, uint64_t *dest_off, size_t size,
        palloc_constr constructor, void *arg,
        struct operation_context *ctx)
{
        struct bucket *b = NULL;
        struct allocation_header *alloc = NULL;
        struct memory_block existing_block = {0, 0, 0, 0};
        struct memory_block new_block = {0, 0, 0, 0};
        struct memory_block reclaimed_block = {0, 0, 0, 0};

        int ret = 0;

        /*
         * These two locks are responsible for protecting the metadata for the
         * persistent representation of a chunk. Depending on the operation
         * and the type of a chunk, they might be NULL.
         */
        pthread_mutex_t *existing_block_lock = NULL;
        pthread_mutex_t *new_block_lock = NULL;

        size_t sizeh = size + sizeof(struct allocation_header);

        /*
         * The offset of an existing block can be nonzero which means this
         * operation is either free or a realloc - either way the offset of
         * the object needs to be translated into a structure that all of
         * the heap methods operate on.
         */
        if (off != 0) {
                alloc = ALLOC_GET_HEADER(heap, off);
                existing_block = get_mblock_from_alloc(heap, alloc);

                /*
                 * This lock must be held until the operation is processed
                 * successfully, because other threads might operate on the
                 * same bitmap value.
                 */
                existing_block_lock = MEMBLOCK_OPS(AUTO, &existing_block)->
                        get_lock(&existing_block, heap);
                if (existing_block_lock != NULL)
                        util_mutex_lock(existing_block_lock);

#ifdef DEBUG
                if (MEMBLOCK_OPS(AUTO, &existing_block)->
                        get_state(&existing_block, heap) !=
                                MEMBLOCK_ALLOCATED) {
                        ERR("Double free or heap corruption");
                        ASSERT(0);
                }
#endif /* DEBUG */

                /*
                 * The memory block must be returned to the originating
                 * bucket, otherwise coalescing of neighbouring blocks will
                 * be rendered impossible.
                 *
                 * If the block was allocated in a different incarnation of
                 * the heap (i.e. the application was restarted) and the
                 * chunk from which the allocation comes was not yet
                 * processed, the originating bucket does not exist and all
                 * of the otherwise necessary volatile heap modifications
                 * won't be performed for this memory block.
                 */
                b = heap_get_chunk_bucket(heap,
                        alloc->chunk_id, alloc->zone_id);
        }

        /* if allocation or reallocation, reserve new memory */
        if (size != 0) {
                /* reallocation to exactly the same size, which is a no-op */
                if (alloc != NULL && alloc->size == sizeh)
                        goto out;

                errno = alloc_reserve_block(heap, &new_block, sizeh);
                if (errno != 0) {
                        ret = -1;
                        goto out;
                }
        }

        /*
         * The offset value which is to be written to the destination pointer
         * provided by the caller.
         */
        uint64_t offset_value = 0;

        /* lock and persistently free the existing memory block */
        if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
                /*
                 * This method will insert new entries into the operation
                 * context which will, after processing, update the chunk
                 * metadata to 'free' - it also takes care of all the
                 * necessary coalescing of blocks.
                 * Even though the transient state of the heap is used during
                 * this method to locate neighbouring blocks, it isn't
                 * modified.
                 *
                 * The reclaimed block is the coalesced memory block that the
                 * free resulted in; to prevent a volatile memory leak it
                 * needs to be inserted into the corresponding bucket.
                 */
                reclaimed_block = heap_free_block(heap, b,
                        existing_block, ctx);
                offset_value = 0;
        }

        if (!MEMORY_BLOCK_IS_EMPTY(new_block)) {
                if (alloc_prep_block(heap, new_block, constructor,
                                arg, &offset_value) != 0) {
                        /*
                         * Constructor returned non-zero value which means
                         * the memory block reservation has to be rolled back.
                         */
                        struct bucket *new_bucket = heap_get_chunk_bucket(
                                heap, new_block.chunk_id, new_block.zone_id);
                        ASSERTne(new_bucket, NULL);

                        /*
                         * Omitting the context in this method results in
                         * coalescing of blocks without affecting the
                         * persistent heap state.
                         */
                        new_block = heap_free_block(heap, new_bucket,
                                new_block, NULL);
                        CNT_OP(new_bucket, insert, heap, new_block);

                        if (new_bucket->type == BUCKET_RUN)
                                heap_degrade_run_if_empty(heap,
                                        new_bucket, new_block);

                        errno = ECANCELED;
                        ret = -1;
                        goto out;
                }

                /*
                 * This lock must be held for the duration between the
                 * creation of the allocation metadata updates in the
                 * operation context and the operation processing. This is
                 * because a different thread might operate on the same
                 * 8-byte value of the run bitmap and override allocation
                 * performed by this thread.
                 */
                new_block_lock = MEMBLOCK_OPS(AUTO, &new_block)->
                        get_lock(&new_block, heap);

                /* the locks might be identical in the case of realloc */
                if (new_block_lock == existing_block_lock)
                        new_block_lock = NULL;

                if (new_block_lock != NULL)
                        util_mutex_lock(new_block_lock);

#ifdef DEBUG
                if (MEMBLOCK_OPS(AUTO, &new_block)->
                        get_state(&new_block, heap) != MEMBLOCK_FREE) {
                        ERR("Double free or heap corruption");
                        ASSERT(0);
                }
#endif /* DEBUG */

                /*
                 * The actual required metadata modifications are chunk-type
                 * dependent, but it always is a modification of a single
                 * 8 byte value - either modification of a few bits in a
                 * bitmap or changing a chunk type from free to used.
                 */
                MEMBLOCK_OPS(AUTO, &new_block)->prep_hdr(&new_block,
                        heap, MEMBLOCK_ALLOCATED, ctx);
        }

        /* not in-place realloc */
        if (!MEMORY_BLOCK_IS_EMPTY(existing_block) &&
                !MEMORY_BLOCK_IS_EMPTY(new_block)) {
                size_t old_size = alloc->size;
                size_t to_cpy = old_size > sizeh ? sizeh : old_size;
                VALGRIND_ADD_TO_TX(
                        PMALLOC_OFF_TO_PTR(heap, offset_value),
                        to_cpy - ALLOC_OFF);
                pmemops_memcpy_persist(&heap->p_ops,
                        PMALLOC_OFF_TO_PTR(heap, offset_value),
                        PMALLOC_OFF_TO_PTR(heap, off),
                        to_cpy - ALLOC_OFF);
                VALGRIND_REMOVE_FROM_TX(
                        PMALLOC_OFF_TO_PTR(heap, offset_value),
                        to_cpy - ALLOC_OFF);
        }

        /*
         * If the caller provided a destination value to update, it needs to
         * be modified atomically alongside the heap metadata, and so the
         * operation context must be used.
         * The actual offset value depends on the operation type.
         */
        if (dest_off != NULL)
                operation_add_entry(ctx, dest_off, offset_value,
                        OPERATION_SET);

        operation_process(ctx);

        /*
         * After the operation succeeded, the persistent state is all in
         * order but in some cases it might not be in sync with its
         * transient representation.
         */
        if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
                VALGRIND_DO_MEMPOOL_FREE(heap->layout,
                        (char *)heap_get_block_data(heap, existing_block)
                        + ALLOC_OFF);

                /* we might have been operating on inactive run */
                if (b != NULL) {
                        /*
                         * Even though the initial condition is to check
                         * whether the existing block exists it's important
                         * to use the 'reclaimed block' - it is the coalesced
                         * one and reflects the current persistent heap
                         * state, whereas the existing block reflects the
                         * state from before this operation started.
                         */
                        CNT_OP(b, insert, heap, reclaimed_block);

                        /*
                         * Degrading of a run means turning it back into
                         * a chunk in case it's no longer needed.
                         * It might be tempting to defer this operation until
                         * such time that the chunk is actually needed, but
                         * right now the decision is to keep the persistent
                         * heap state as clean as possible - and that means
                         * not leaving unused data around.
                         */
                        if (b->type == BUCKET_RUN)
                                heap_degrade_run_if_empty(heap, b,
                                        reclaimed_block);
                }
        }

out:
        if (new_block_lock != NULL)
                util_mutex_unlock(new_block_lock);

        if (existing_block_lock != NULL)
                util_mutex_unlock(existing_block_lock);

        return ret;
}
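/*
 * Worked detail (illustrative, not from the source): the realloc copy
 * above moves user data only. Both sizes include the allocation header,
 * so the byte count is trimmed by ALLOC_OFF. A toy computation, assuming
 * a 64-byte header:
 */
#include <assert.h>
#include <stddef.h>

#define TOY_ALLOC_OFF 64        /* assumed header size, for illustration */

static size_t
toy_realloc_copy_size(size_t old_sizeh, size_t new_sizeh)
{
        size_t to_cpy = old_sizeh > new_sizeh ? new_sizeh : old_sizeh;
        return to_cpy - TOY_ALLOC_OFF;  /* user bytes, header excluded */
}

int
main(void)
{
        /* shrinking 1024+64 -> 512+64: copy 512 user bytes */
        assert(toy_realloc_copy_size(1088, 576) == 512);
        /* growing 256+64 -> 1024+64: copy the old 256 user bytes */
        assert(toy_realloc_copy_size(320, 1088) == 256);
        return 0;
}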