/*
 * redo_log_process -- (internal) process redo log entries
 */
void
redo_log_process(PMEMobjpool *pop, struct redo_log *redo, size_t nentries)
{
	LOG(15, "redo %p nentries %zu", redo, nentries);

#ifdef DEBUG
	ASSERTeq(redo_log_check(pop, redo, nentries), 0);
#endif

	uint64_t *val;
	while ((redo->offset & REDO_FINISH_FLAG) == 0) {
		val = (uint64_t *)((uintptr_t)pop->addr + redo->offset);
		VALGRIND_ADD_TO_TX(val, sizeof (*val));
		*val = redo->value;
		VALGRIND_REMOVE_FROM_TX(val, sizeof (*val));

		pop->flush(pop, val, sizeof (uint64_t));

		redo++;
	}

	uint64_t offset = redo->offset & REDO_FLAG_MASK;
	val = (uint64_t *)((uintptr_t)pop->addr + offset);
	VALGRIND_ADD_TO_TX(val, sizeof (*val));
	*val = redo->value;
	VALGRIND_REMOVE_FROM_TX(val, sizeof (*val));

	pop->persist(pop, val, sizeof (uint64_t));

	redo->offset = 0;
	pop->persist(pop, &redo->offset, sizeof (redo->offset));
}
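/*
 * The layout the routine above assumes: each entry is an (offset, value)
 * pair, and the last entry of a committed log has REDO_FINISH_FLAG set in
 * its offset field. Below is a minimal, self-contained model of that walk
 * over plain volatile memory (invented names, not the library's actual
 * types or flag values):
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_FINISH_FLAG ((uint64_t)1)	/* lowest bit marks the last entry */
#define MODEL_FLAG_MASK (~MODEL_FINISH_FLAG)

struct model_entry {
	uint64_t offset;	/* destination, relative to the pool base */
	uint64_t value;		/* 8-byte value to store there */
};

static void
model_process(uint8_t *base, struct model_entry *e)
{
	/* apply regular entries until the finish flag is encountered */
	while ((e->offset & MODEL_FINISH_FLAG) == 0) {
		*(uint64_t *)(base + e->offset) = e->value;
		e++;
	}
	/* last entry: mask the flag off before using the offset */
	*(uint64_t *)(base + (e->offset & MODEL_FLAG_MASK)) = e->value;
}

int
main(void)
{
	uint64_t pool[4] = {0};
	struct model_entry log[2] = {
		{0 * sizeof (uint64_t), 42},	/* regular entry */
		{(2 * sizeof (uint64_t)) | MODEL_FINISH_FLAG, 7},	/* last */
	};
	model_process((uint8_t *)pool, log);
	printf("%llu %llu\n", (unsigned long long)pool[0],
	    (unsigned long long)pool[2]);	/* prints: 42 7 */
	return 0;
}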
/*
 * constructor_zrealloc_root -- (internal) constructor for pmemobj_root
 */
static void
constructor_zrealloc_root(PMEMobjpool *pop, void *ptr,
	size_t usable_size, void *arg)
{
	LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);

	ASSERTne(ptr, NULL);
	ASSERTne(arg, NULL);

	struct carg_realloc *carg = arg;

	VALGRIND_ADD_TO_TX(OOB_HEADER_FROM_PTR(ptr),
		usable_size + OBJ_OOB_SIZE);

	constructor_realloc(pop, ptr, usable_size, arg);

	/* activate the padding redzone */
	VALGRIND_DO_MAKE_MEM_NOACCESS(pop,
		&OOB_HEADER_FROM_PTR(ptr)->data.padding,
		sizeof (OOB_HEADER_FROM_PTR(ptr)->data.padding));

	if (carg->constructor)
		carg->constructor(pop, ptr, carg->arg);

	VALGRIND_REMOVE_FROM_TX(OOB_HEADER_FROM_PTR(ptr),
		carg->new_size + OBJ_OOB_SIZE);
}
/*
 * constructor_alloc_root -- (internal) constructor for obj_alloc_root
 */
static void
constructor_alloc_root(PMEMobjpool *pop, void *ptr, void *arg)
{
	LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);

	ASSERTne(ptr, NULL);
	ASSERTne(arg, NULL);

	struct oob_header *ro = OOB_HEADER_FROM_PTR(ptr);
	struct carg_root *carg = arg;

	/* temporarily add atomic root allocation to pmemcheck transaction */
	VALGRIND_ADD_TO_TX(ro, OBJ_OOB_SIZE + carg->size);

	if (carg->constructor)
		carg->constructor(pop, ptr, carg->arg);
	else
		pop->memset_persist(pop, ptr, 0, carg->size);

	ro->data.internal_type = TYPE_ALLOCATED;
	ro->data.user_type = POBJ_ROOT_TYPE_NUM;
	ro->size = carg->size;

	VALGRIND_REMOVE_FROM_TX(ro, OBJ_OOB_SIZE + carg->size);

	pop->persist(pop, &ro->size,
		/* there's no padding between these, so we can add sizes */
		sizeof (ro->size) + sizeof (ro->data.internal_type) +
		sizeof (ro->data.user_type));

	VALGRIND_DO_MAKE_MEM_NOACCESS(pop, &ro->data.padding,
		sizeof (ro->data.padding));
}
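/*
 * The single persist call above covers three adjacent fields, which is only
 * valid because the struct layout has no padding between them. A small
 * self-contained check of that kind of assumption (hypothetical field names
 * mirroring the ones above, not the library's actual layout):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct model_oob_data {
	uint8_t internal_type;
	uint8_t user_type;
};

struct model_oob_header {
	uint64_t size;
	struct model_oob_data data;
};

int
main(void)
{
	/* the three fields are contiguous iff the offsets add up exactly */
	assert(offsetof(struct model_oob_header, data.internal_type) ==
	    offsetof(struct model_oob_header, size) + sizeof(uint64_t));
	assert(offsetof(struct model_oob_header, data.user_type) ==
	    offsetof(struct model_oob_header, data.internal_type) +
	    sizeof(uint8_t));
	return 0;
}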
/*
 * operation_process -- processes registered operations
 *
 * The order of processing is important: persistent, transient.
 * This is because the transient entries that reside on persistent memory
 * might require a write to a location that is currently occupied by a valid
 * persistent state but becomes a transient state after the operation is
 * processed.
 */
void
operation_process(struct operation_context *ctx)
{
	struct operation_entry *e;

	/*
	 * If there's exactly one persistent entry there's no need to involve
	 * the redo log. We can simply assign the value, the operation will be
	 * atomic.
	 */
	if (ctx->nentries[ENTRY_PERSISTENT] == 1) {
		e = &ctx->entries[ENTRY_PERSISTENT][0];

		VALGRIND_ADD_TO_TX(e->ptr, sizeof(uint64_t));
		PM_EQU((*e->ptr), (e->value));
		pmemobj_persist(ctx->pop, e->ptr, sizeof(uint64_t));
		VALGRIND_REMOVE_FROM_TX(e->ptr, sizeof(uint64_t));
	} else if (ctx->nentries[ENTRY_PERSISTENT] != 0) {
		operation_process_persistent_redo(ctx);
	}

	for (size_t i = 0; i < ctx->nentries[ENTRY_TRANSIENT]; ++i) {
		e = &ctx->entries[ENTRY_TRANSIENT][i];
		PM_EQU((*e->ptr), (e->value));

		/*
		 * Just in case that the entry was transient but in reality
		 * the variable is on persistent memory. This is true for
		 * chunk footers.
		 */
		VALGRIND_SET_CLEAN(e->ptr, sizeof(e->value));
	}
}
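/*
 * Why the single-entry fast path above is safe: one aligned 8-byte store
 * becomes persistent atomically, so no logging is needed for it to be
 * recoverable. With two or more destinations a crash could persist only a
 * subset of the stores, hence the redo log. A volatile-memory sketch of
 * that decision (invented names, logging reduced to a comment):
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct model_op_entry {
	uint64_t *ptr;
	uint64_t value;
};

static void
model_op_process(struct model_op_entry *entries, size_t n)
{
	if (n == 1) {
		/* a single aligned 8-byte store is failure-atomic */
		*entries[0].ptr = entries[0].value;
		return;
	}

	/*
	 * n > 1: a real implementation would first write and persist
	 * (ptr, value) pairs into a redo log and only then apply them,
	 * so recovery can replay the log after a crash.
	 */
	for (size_t i = 0; i < n; ++i)
		*entries[i].ptr = entries[i].value;
}

int
main(void)
{
	uint64_t a = 0, b = 0;
	struct model_op_entry e[2] = { {&a, 1}, {&b, 2} };
	model_op_process(e, 2);
	printf("%llu %llu\n", (unsigned long long)a, (unsigned long long)b);
	return 0;
}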
/*
 * list_fill_entry_redo_log -- (internal) fill new entry using redo log
 *
 * Used to update entry in existing object.
 */
static size_t
list_fill_entry_redo_log(PMEMobjpool *pop,
	struct redo_log *redo, size_t redo_index,
	struct list_args_common *args,
	uint64_t next_offset, uint64_t prev_offset, int set_uuid)
{
	LOG(15, NULL);
	struct pmem_ops *ops = &pop->p_ops;

	ASSERTne(args->entry_ptr, NULL);
	ASSERTne(args->obj_doffset, 0);

	if (set_uuid) {
		VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_next.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
		VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_prev.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));
		/* don't need to fill pool uuid using redo log */
		args->entry_ptr->pe_next.pool_uuid_lo = pop->uuid_lo;
		args->entry_ptr->pe_prev.pool_uuid_lo = pop->uuid_lo;
		VALGRIND_REMOVE_FROM_TX(
			&(args->entry_ptr->pe_next.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
		VALGRIND_REMOVE_FROM_TX(
			&(args->entry_ptr->pe_prev.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));

		pmemops_persist(ops, args->entry_ptr,
			sizeof(*args->entry_ptr));
	} else {
		ASSERTeq(args->entry_ptr->pe_next.pool_uuid_lo, pop->uuid_lo);
		ASSERTeq(args->entry_ptr->pe_prev.pool_uuid_lo, pop->uuid_lo);
	}

	/* set current->next and current->prev using redo log */
	uint64_t next_off_off = args->obj_doffset + NEXT_OFF;
	uint64_t prev_off_off = args->obj_doffset + PREV_OFF;
	u64_add_offset(&next_off_off, args->pe_offset);
	u64_add_offset(&prev_off_off, args->pe_offset);

	redo_log_store(pop->redo, redo, redo_index + 0,
		next_off_off, next_offset);
	redo_log_store(pop->redo, redo, redo_index + 1,
		prev_off_off, prev_offset);

	return redo_index + 2;
}
/*
 * list_fill_entry_redo_log -- (internal) fill new entry using redo log
 *
 * Used to update entry in existing object.
 */
static size_t
list_fill_entry_redo_log(PMEMobjpool *pop,
	struct operation_context *ctx,
	struct list_args_common *args,
	uint64_t next_offset, uint64_t prev_offset, int set_uuid)
{
	LOG(15, NULL);
	struct pmem_ops *ops = &pop->p_ops;

	ASSERTne(args->entry_ptr, NULL);
	ASSERTne(args->obj_doffset, 0);

	if (set_uuid) {
		VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_next.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
		VALGRIND_ADD_TO_TX(&(args->entry_ptr->pe_prev.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));
		/* don't need to fill pool uuid using redo log */
		args->entry_ptr->pe_next.pool_uuid_lo = pop->uuid_lo;
		args->entry_ptr->pe_prev.pool_uuid_lo = pop->uuid_lo;
		VALGRIND_REMOVE_FROM_TX(
			&(args->entry_ptr->pe_next.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_next.pool_uuid_lo));
		VALGRIND_REMOVE_FROM_TX(
			&(args->entry_ptr->pe_prev.pool_uuid_lo),
			sizeof(args->entry_ptr->pe_prev.pool_uuid_lo));

		pmemops_persist(ops, args->entry_ptr,
			sizeof(*args->entry_ptr));
	} else {
		ASSERTeq(args->entry_ptr->pe_next.pool_uuid_lo, pop->uuid_lo);
		ASSERTeq(args->entry_ptr->pe_prev.pool_uuid_lo, pop->uuid_lo);
	}

	/* set current->next and current->prev using redo log */
	uint64_t next_off_off = args->obj_doffset + NEXT_OFF;
	uint64_t prev_off_off = args->obj_doffset + PREV_OFF;
	u64_add_offset(&next_off_off, args->pe_offset);
	u64_add_offset(&prev_off_off, args->pe_offset);
	void *next_ptr = (char *)pop + next_off_off;
	void *prev_ptr = (char *)pop + prev_off_off;

	operation_add_entry(ctx, next_ptr, next_offset, ULOG_OPERATION_SET);
	operation_add_entry(ctx, prev_ptr, prev_offset, ULOG_OPERATION_SET);

	return 0;
}
/*
 * alloc_write_header -- (internal) creates allocation header
 */
static void
alloc_write_header(struct palloc_heap *heap, struct allocation_header *alloc,
	struct memory_block m, uint64_t size)
{
	VALGRIND_ADD_TO_TX(alloc, sizeof(*alloc));
	alloc->chunk_id = m.chunk_id;
	alloc->size = size;
	alloc->zone_id = m.zone_id;
	VALGRIND_REMOVE_FROM_TX(alloc, sizeof(*alloc));
}
/*
 * alloc_write_header -- (internal) creates allocation header
 */
void
alloc_write_header(PMEMobjpool *pop, struct allocation_header *alloc,
	uint32_t chunk_id, uint32_t zone_id, uint64_t size)
{
	VALGRIND_ADD_TO_TX(alloc, sizeof (*alloc));
	alloc->chunk_id = chunk_id;
	alloc->size = size;
	alloc->zone_id = zone_id;
	VALGRIND_REMOVE_FROM_TX(alloc, sizeof (*alloc));
	pop->persist(alloc, sizeof (*alloc));
}
/*
 * memblock_header_compact_write --
 *	(internal) writes a compact header of an object
 */
static void
memblock_header_compact_write(const struct memory_block *m,
	size_t size, uint64_t extra, uint16_t flags)
{
	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr));

	VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
	hdr->size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);
	hdr->extra = extra;
	VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));
}
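/*
 * The compact header packs the object's size and its flags into one 64-bit
 * word: flags live above ALLOC_HDR_SIZE_SHIFT, the size below it. A
 * self-contained sketch of that encoding (the shift value of 48 is an
 * assumption for illustration, not necessarily the library's constant):
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_SIZE_SHIFT 48	/* assumed position of the flag bits */

int
main(void)
{
	uint64_t size = 256;
	uint16_t flags = 0x3;

	uint64_t packed = size | ((uint64_t)flags << MODEL_SIZE_SHIFT);

	/* decode: mask the flag bits back off to recover the size */
	uint64_t dec_size = packed & (((uint64_t)1 << MODEL_SIZE_SHIFT) - 1);
	uint16_t dec_flags = (uint16_t)(packed >> MODEL_SIZE_SHIFT);

	printf("size %llu flags %x\n",
	    (unsigned long long)dec_size, dec_flags);	/* size 256 flags 3 */
	return 0;
}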
/*
 * huge_ensure_header_type -- checks the header type of a chunk and modifies
 *	it if necessary. This is fail-safe atomic.
 */
static void
huge_ensure_header_type(const struct memory_block *m,
	enum header_type t)
{
	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
	ASSERTeq(hdr->type, CHUNK_TYPE_FREE);

	if ((hdr->flags & header_type_to_flag[t]) == 0) {
		VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
		uint16_t f = ((uint16_t)header_type_to_flag[t]);
		hdr->flags |= f;
		pmemops_persist(&m->heap->p_ops, hdr, sizeof(*hdr));
		VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));
	}
}
/*
 * container_ravl_insert_block -- (internal) inserts a new memory block
 *	into the container
 */
static int
container_ravl_insert_block(struct block_container *bc,
	const struct memory_block *m)
{
	struct block_container_ravl *c =
		(struct block_container_ravl *)bc;

	struct memory_block *e = m->m_ops->get_user_data(m);
	VALGRIND_DO_MAKE_MEM_DEFINED(e, sizeof(*e));
	VALGRIND_ADD_TO_TX(e, sizeof(*e));
	*e = *m;
	VALGRIND_SET_CLEAN(e, sizeof(*e));
	VALGRIND_REMOVE_FROM_TX(e, sizeof(*e));

	return ravl_insert(c->tree, e);
}
/*
 * list_fill_entry_persist -- (internal) fill new entry using persist function
 *
 * Used for newly allocated objects.
 */
static void
list_fill_entry_persist(PMEMobjpool *pop, struct list_entry *entry_ptr,
	uint64_t next_offset, uint64_t prev_offset)
{
	LOG(15, NULL);

	VALGRIND_ADD_TO_TX(entry_ptr, sizeof(*entry_ptr));
	entry_ptr->pe_next.pool_uuid_lo = pop->uuid_lo;
	entry_ptr->pe_next.off = next_offset;

	entry_ptr->pe_prev.pool_uuid_lo = pop->uuid_lo;
	entry_ptr->pe_prev.off = prev_offset;
	VALGRIND_REMOVE_FROM_TX(entry_ptr, sizeof(*entry_ptr));

	pmemops_persist(&pop->p_ops, entry_ptr, sizeof(*entry_ptr));
}
/*
 * huge_ensure_header_type -- checks the header type of a chunk and modifies
 *	it if necessary. This is fail-safe atomic.
 */
static void
huge_ensure_header_type(const struct memory_block *m,
	enum header_type t)
{
	struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id);
	struct chunk_header *hdr = &z->chunk_headers[m->chunk_id];
	ASSERTeq(hdr->type, CHUNK_TYPE_FREE);

	if ((hdr->flags & header_type_to_flag[t]) == 0) {
		VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
		uint16_t f = ((uint16_t)header_type_to_flag[t]);
		hdr->flags |= f;
		pmemops_persist(&m->heap->p_ops, hdr, sizeof(*hdr));
		VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));
	}
}
/*
 * memblock_header_legacy_write --
 *	(internal) writes a legacy header of an object
 */
static void
memblock_header_legacy_write(const struct memory_block *m,
	size_t size, uint64_t extra, uint16_t flags)
{
	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr));

	VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
	hdr->size = size;
	hdr->type_num = extra;
	hdr->root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);
	VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));

	/* unused fields of the legacy headers are used as a red zone */
	VALGRIND_DO_MAKE_MEM_NOACCESS(hdr->unused, sizeof(hdr->unused));
}
/*
 * pvector_array_constr -- (internal) constructor of a new vector array.
 *
 * The vectors MUST be zeroed because non-zero array elements are treated as
 * vector values.
 */
static int
pvector_array_constr(void *ctx, void *ptr, size_t usable_size, void *arg)
{
	PMEMobjpool *pop = ctx;

	/*
	 * Vectors are used as transaction logs, valgrind shouldn't warn about
	 * storing things inside of them.
	 * This memory range is removed from tx when the array is freed as a
	 * result of pop_back or when the transaction itself ends.
	 */
	VALGRIND_ADD_TO_TX(ptr, usable_size);
	pmemops_memset_persist(&pop->p_ops, ptr, 0, usable_size);

	return 0;
}
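/*
 * Zeroing matters because, per the comment above, a non-zero slot is
 * indistinguishable from a live value: there is no separate length field to
 * consult after a crash. A minimal volatile-memory sketch of that
 * convention (invented helper; the real pvector layout may differ):
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* count live entries in a zero-initialized array: stop at the first zero */
static size_t
model_vector_size(const uint64_t *arr, size_t capacity)
{
	size_t n = 0;
	while (n < capacity && arr[n] != 0)
		n++;
	return n;
}

int
main(void)
{
	uint64_t arr[8] = {0};	/* freshly "allocated": must start zeroed */
	arr[0] = 100;		/* push_back */
	arr[1] = 200;		/* push_back */
	printf("%zu\n", model_vector_size(arr, 8));	/* prints 2 */
	return 0;
}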
/*
 * constructor_zrealloc_root -- (internal) constructor for pmemobj_root
 */
static void
constructor_zrealloc_root(PMEMobjpool *pop, void *ptr, void *arg)
{
	LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);

	ASSERTne(ptr, NULL);
	ASSERTne(arg, NULL);

	struct carg_realloc *carg = arg;

	VALGRIND_ADD_TO_TX(OOB_HEADER_FROM_PTR(ptr),
		carg->new_size + OBJ_OOB_SIZE);
	constructor_zrealloc(pop, ptr, arg);

	if (carg->constructor)
		carg->constructor(pop, ptr, carg->arg);

	VALGRIND_REMOVE_FROM_TX(OOB_HEADER_FROM_PTR(ptr),
		carg->new_size + OBJ_OOB_SIZE);
}
/*
 * memblock_header_legacy_write --
 *	(internal) writes a legacy header of an object
 */
static void
memblock_header_legacy_write(const struct memory_block *m,
	size_t size, uint64_t extra, uint16_t flags)
{
	struct allocation_header_legacy hdr;
	hdr.size = size;
	hdr.type_num = extra;
	hdr.root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);

	struct allocation_header_legacy *hdrp = m->m_ops->get_real_data(m);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp));

	VALGRIND_ADD_TO_TX(hdrp, sizeof(*hdrp));
	pmemops_memcpy(&m->heap->p_ops, hdrp, &hdr,
		sizeof(hdr), /* legacy header is 64 bytes in size */
		PMEMOBJ_F_MEM_WC | PMEMOBJ_F_MEM_NODRAIN | PMEMOBJ_F_RELAXED);
	VALGRIND_REMOVE_FROM_TX(hdrp, sizeof(*hdrp));

	/* unused fields of the legacy headers are used as a red zone */
	VALGRIND_DO_MAKE_MEM_NOACCESS(hdrp->unused, sizeof(hdrp->unused));
}
/*
 * memblock_header_compact_write --
 *	(internal) writes a compact header of an object
 */
static void
memblock_header_compact_write(const struct memory_block *m,
	size_t size, uint64_t extra, uint16_t flags)
{
	COMPILE_ERROR_ON(ALLOC_HDR_COMPACT_SIZE > CACHELINE_SIZE);

	struct {
		struct allocation_header_compact hdr;
		uint8_t padding[CACHELINE_SIZE - ALLOC_HDR_COMPACT_SIZE];
	} padded;

	padded.hdr.size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);
	padded.hdr.extra = extra;

	struct allocation_header_compact *hdrp = m->m_ops->get_real_data(m);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp));

	/*
	 * If possible, write the entire header with a single memcpy; this
	 * allows the copy implementation to avoid a cache miss on a partial
	 * cache-line write.
	 */
	size_t hdr_size = ALLOC_HDR_COMPACT_SIZE;
	if ((uintptr_t)hdrp % CACHELINE_SIZE == 0 && size >= sizeof(padded))
		hdr_size = sizeof(padded);

	VALGRIND_ADD_TO_TX(hdrp, hdr_size);

	pmemops_memcpy(&m->heap->p_ops, hdrp, &padded, hdr_size,
		PMEMOBJ_F_MEM_WC | PMEMOBJ_F_MEM_NODRAIN | PMEMOBJ_F_RELAXED);
	VALGRIND_DO_MAKE_MEM_UNDEFINED((char *)hdrp + ALLOC_HDR_COMPACT_SIZE,
		hdr_size - ALLOC_HDR_COMPACT_SIZE);

	VALGRIND_REMOVE_FROM_TX(hdrp, hdr_size);
}
/*
 * heap_chunk_init -- (internal) writes chunk header
 */
static void
heap_chunk_init(PMEMobjpool *pop, struct chunk_header *hdr,
	uint16_t type, uint32_t size_idx)
{
	struct chunk_header nhdr = {
		.type = type,
		.flags = 0,
		.size_idx = size_idx
	};

	*hdr = nhdr; /* write the entire header (8 bytes) at once */

	pop->persist(hdr, sizeof (*hdr));

	heap_chunk_write_footer(hdr, size_idx);
}

/*
 * heap_zone_init -- (internal) writes zone's first chunk and header
 */
static void
heap_zone_init(PMEMobjpool *pop, uint32_t zone_id)
{
	struct zone *z = &pop->heap->layout->zones[zone_id];
	uint32_t size_idx = get_zone_size_idx(zone_id, pop->heap->max_zone,
			pop->heap_size);

	heap_chunk_init(pop, &z->chunk_headers[0], CHUNK_TYPE_FREE, size_idx);

	struct zone_header nhdr = {
		.size_idx = size_idx,
		.magic = ZONE_HEADER_MAGIC,
	};
	z->header = nhdr; /* write the entire header (8 bytes) at once */
	pop->persist(&z->header, sizeof (z->header));
}

/*
 * heap_init_run -- (internal) creates a run based on a chunk
 */
static void
heap_init_run(PMEMobjpool *pop, struct bucket *b, struct chunk_header *hdr,
	struct chunk_run *run)
{
	/* add/remove chunk_run and chunk_header to valgrind transaction */
	VALGRIND_ADD_TO_TX(run, sizeof (*run));
	run->block_size = bucket_unit_size(b);
	pop->persist(&run->block_size, sizeof (run->block_size));

	ASSERT(hdr->type == CHUNK_TYPE_FREE);

	/* set all the bits */
	memset(run->bitmap, 0xFF, sizeof (run->bitmap));

	/* clear only the bits available for allocations from this bucket */
	memset(run->bitmap, 0,
		sizeof (uint64_t) * (bucket_bitmap_nval(b) - 1));
	run->bitmap[bucket_bitmap_nval(b) - 1] = bucket_bitmap_lastval(b);
	VALGRIND_REMOVE_FROM_TX(run, sizeof (*run));

	pop->persist(run->bitmap, sizeof (run->bitmap));

	VALGRIND_ADD_TO_TX(hdr, sizeof (*hdr));
	hdr->type = CHUNK_TYPE_RUN;
	VALGRIND_REMOVE_FROM_TX(hdr, sizeof (*hdr));

	pop->persist(hdr, sizeof (*hdr));
}
/*
 * palloc_operation -- persistent memory operation. Takes a NULL pointer
 *	or an existing memory block and modifies it to occupy, at least,
 *	'size' number of bytes.
 *
 * The malloc, free and realloc routines are implemented in the context of
 * this common operation which encompasses all of the functionality usually
 * done separately in those methods.
 *
 * The first thing that needs to be done is determining which memory blocks
 * will be affected by the operation - this varies depending on whether the
 * operation will need to modify or free an existing block and/or allocate
 * a new one.
 *
 * Simplified allocation process flow is as follows:
 *	- reserve a new block in the transient heap
 *	- prepare the new block
 *	- create redo log of required modifications
 *		- chunk metadata
 *		- offset of the new object
 *	- commit and process the redo log
 *
 * And similarly, the deallocation process:
 *	- create redo log of required modifications
 *		- reverse the chunk metadata back to the 'free' state
 *		- set the destination of the object offset to zero
 *	- commit and process the redo log
 * There's an important distinction in the deallocation process - it does not
 * return the memory block to the transient container. That is done once no
 * more memory is available.
 *
 * Reallocation is a combination of the above, with one additional step
 * of copying the old content.
 */
int
palloc_operation(struct palloc_heap *heap,
	uint64_t off, uint64_t *dest_off, size_t size,
	palloc_constr constructor, void *arg,
	uint64_t extra_field, uint16_t object_flags, uint16_t class_id,
	struct operation_context *ctx)
{
	struct pobj_action_internal alloc =
		OBJ_HEAP_ACTION_INITIALIZER(0, MEMBLOCK_ALLOCATED);
	struct pobj_action_internal dealloc =
		OBJ_HEAP_ACTION_INITIALIZER(off, MEMBLOCK_FREE);

	size_t user_size = 0;

	int nops = 0;
	struct pobj_action_internal ops[2];

	if (dealloc.offset != 0) {
		dealloc.m = memblock_from_offset(heap, dealloc.offset);
		user_size = dealloc.m.m_ops->get_user_size(&dealloc.m);
		if (user_size == size)
			return 0;
	}

	if (size != 0) {
		if (palloc_reservation_create(heap, size, constructor, arg,
			extra_field, object_flags, class_id, &alloc) != 0)
			return -1;

		ops[nops++] = alloc;
	}

	/*
	 * The offset of an existing block can be nonzero, which means this
	 * operation is either a free or a realloc - either way the offset of
	 * the object needs to be translated into a memory block, which is
	 * a structure that all of the heap methods expect.
	 */
	if (dealloc.offset != 0) {
		/* realloc */
		if (!MEMORY_BLOCK_IS_NONE(alloc.m)) {
			size_t old_size = user_size;
			size_t to_cpy = old_size > size ? size : old_size;
			VALGRIND_ADD_TO_TX(
				HEAP_OFF_TO_PTR(heap, alloc.offset),
				to_cpy);
			pmemops_memcpy(&heap->p_ops,
				HEAP_OFF_TO_PTR(heap, alloc.offset),
				HEAP_OFF_TO_PTR(heap, off),
				to_cpy,
				0);
			VALGRIND_REMOVE_FROM_TX(
				HEAP_OFF_TO_PTR(heap, alloc.offset),
				to_cpy);
		}

		dealloc.lock = dealloc.m.m_ops->get_lock(&dealloc.m);
		ops[nops++] = dealloc;
	}

	/*
	 * If the caller provided a destination value to update, it needs to
	 * be modified atomically alongside the heap metadata, and so the
	 * operation context must be used.
	 * The actual offset value depends on the operation type, but the
	 * alloc.offset variable is used because it's 0 in the case of free,
	 * and valid otherwise.
	 */
	if (dest_off)
		operation_add_entry(ctx, dest_off, alloc.offset,
			OPERATION_SET);

	palloc_exec_actions(heap, ctx, ops, nops);

	return 0;
}
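/*
 * As the comment above says, malloc/free/realloc all reduce to calls of
 * this one routine. Hypothetical wrappers illustrating the mapping (these
 * names and the NULL constructor arguments are assumptions for the sketch,
 * not the library's actual entry points):
 */
static int
model_pmalloc(struct palloc_heap *heap, uint64_t *off, size_t size,
	struct operation_context *ctx)
{
	/* off == 0 on input: nothing to free, only allocate */
	return palloc_operation(heap, 0, off, size, NULL, NULL,
		0, 0, 0, ctx);
}

static int
model_pfree(struct palloc_heap *heap, uint64_t *off,
	struct operation_context *ctx)
{
	/* size == 0: free the existing block and zero *off atomically */
	return palloc_operation(heap, *off, off, 0, NULL, NULL,
		0, 0, 0, ctx);
}

static int
model_prealloc(struct palloc_heap *heap, uint64_t *off, size_t size,
	struct operation_context *ctx)
{
	/* both off and size non-zero: free + allocate + copy */
	return palloc_operation(heap, *off, off, size, NULL, NULL,
		0, 0, 0, ctx);
}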
/*
 * heap_degrade_run_if_empty -- makes a chunk out of an empty run
 */
int
heap_degrade_run_if_empty(PMEMobjpool *pop, struct bucket *b,
	struct memory_block m)
{
	struct zone *z = &pop->heap->layout->zones[m.zone_id];
	struct chunk_header *hdr = &z->chunk_headers[m.chunk_id];
	ASSERT(hdr->type == CHUNK_TYPE_RUN);

	struct chunk_run *run = (struct chunk_run *)&z->chunks[m.chunk_id];

	int err = 0;
	if ((err = pthread_mutex_lock(heap_get_run_lock(pop, m))) != 0)
		return err;

	int i;
	for (i = 0; i < bucket_bitmap_nval(b) - 1; ++i)
		if (run->bitmap[i] != 0)
			goto out;

	if (run->bitmap[i] != bucket_bitmap_lastval(b))
		goto out;

	m.block_off = 0;
	m.size_idx = RUN_UNIT_MAX;
	uint32_t size_idx_sum = 0;
	while (size_idx_sum != bucket_bitmap_nallocs(b)) {
		if (bucket_get_rm_block_exact(b, m) != 0) {
			ERR("persistent and volatile state mismatched");
			ASSERT(0);
		}

		size_idx_sum += m.size_idx;

		m.block_off += RUN_UNIT_MAX;
		if (m.block_off + RUN_UNIT_MAX > bucket_bitmap_nallocs(b))
			m.size_idx = bucket_bitmap_nallocs(b) - m.block_off;
		else
			m.size_idx = RUN_UNIT_MAX;
	}

	struct bucket *defb = pop->heap->buckets[DEFAULT_BUCKET];
	if ((err = bucket_lock(defb)) != 0) {
		ERR("Failed to lock default bucket");
		ASSERT(0);
	}

	m.block_off = 0;
	m.size_idx = 1;
	heap_chunk_init(pop, hdr, CHUNK_TYPE_FREE, m.size_idx);

	uint64_t *mhdr;
	uint64_t op_result;
	struct memory_block fm =
			heap_free_block(pop, defb, m, &mhdr, &op_result);
	VALGRIND_ADD_TO_TX(mhdr, sizeof (*mhdr));
	*mhdr = op_result;
	VALGRIND_REMOVE_FROM_TX(mhdr, sizeof (*mhdr));
	pop->persist(mhdr, sizeof (*mhdr));

	if ((err = bucket_insert_block(defb, fm)) != 0) {
		ERR("Failed to update heap volatile state");
	}

	bucket_unlock(defb);

out:
	if (pthread_mutex_unlock(heap_get_run_lock(pop, m)) != 0) {
		ERR("Failed to release run lock");
		ASSERT(0);
	}

	return err;
}
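/*
 * The emptiness test above relies on the bitmap convention used at run
 * initialization: bits that never correspond to allocatable units are
 * pre-set to 1, so a fully free run has all-zero values except the last,
 * which must equal the precomputed "lastval" mask. A standalone sketch of
 * just that check:
 */
#include <stdint.h>
#include <stdio.h>

static int
model_run_is_empty(const uint64_t *bitmap, unsigned nval, uint64_t lastval)
{
	for (unsigned i = 0; i < nval - 1; ++i)
		if (bitmap[i] != 0)
			return 0;	/* some unit is still allocated */
	return bitmap[nval - 1] == lastval;
}

int
main(void)
{
	/*
	 * say 70 allocatable units across 2 values: the last value has only
	 * 70 % 64 = 6 usable low bits; the other 58 bits stay pre-set
	 */
	uint64_t lastval = UINT64_MAX << (70 % 64);
	uint64_t bitmap[2] = {0, lastval};
	printf("%d\n", model_run_is_empty(bitmap, 2, lastval));	/* prints 1 */
	return 0;
}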
/*
 * memblock_huge_init -- initializes a new huge memory block
 */
struct memory_block
memblock_huge_init(struct palloc_heap *heap,
	uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx)
{
	struct memory_block m = MEMORY_BLOCK_NONE;
	m.chunk_id = chunk_id;
	m.zone_id = zone_id;
	m.size_idx = size_idx;
	m.heap = heap;

	struct chunk_header nhdr = {
		.type = CHUNK_TYPE_FREE,
		.flags = 0,
		.size_idx = size_idx
	};

	struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr));
	VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr));

	*hdr = nhdr; /* write the entire header (8 bytes) at once */

	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));

	huge_write_footer(hdr, size_idx);

	memblock_rebuild_state(heap, &m);

	return m;
}

/*
 * memblock_run_init -- initializes a new run memory block
 */
struct memory_block
memblock_run_init(struct palloc_heap *heap,
	uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx,
	uint16_t flags, uint64_t unit_size, uint64_t alignment)
{
	ASSERTne(size_idx, 0);

	struct memory_block m = MEMORY_BLOCK_NONE;
	m.chunk_id = chunk_id;
	m.zone_id = zone_id;
	m.size_idx = size_idx;
	m.heap = heap;

	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);

	struct chunk_run *run = heap_get_chunk_run(heap, &m);
	size_t runsize = SIZEOF_RUN(run, size_idx);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize);

	/* add/remove chunk_run and chunk_header to valgrind transaction */
	VALGRIND_ADD_TO_TX(run, runsize);
	run->hdr.block_size = unit_size;
	run->hdr.alignment = alignment;

	struct run_bitmap b;
	memblock_run_bitmap(&size_idx, flags, unit_size, alignment,
		run->content, &b);

	size_t bitmap_size = b.size;

	/* set all the bits */
	memset(b.values, 0xFF, bitmap_size);

	/* clear only the bits available for allocations from this bucket */
	memset(b.values, 0, sizeof(*b.values) * (b.nvalues - 1));

	unsigned trailing_bits = b.nbits % RUN_BITS_PER_VALUE;
	uint64_t last_value = UINT64_MAX << trailing_bits;
	b.values[b.nvalues - 1] = last_value;

	VALGRIND_REMOVE_FROM_TX(run, runsize);

	pmemops_flush(&heap->p_ops, run,
		sizeof(struct chunk_run_header) +
		bitmap_size);

	struct chunk_header run_data_hdr;
	run_data_hdr.type = CHUNK_TYPE_RUN_DATA;
	run_data_hdr.flags = 0;

	VALGRIND_ADD_TO_TX(&z->chunk_headers[chunk_id],
		sizeof(struct chunk_header) * size_idx);

	struct chunk_header *data_hdr;
	for (unsigned i = 1; i < size_idx; ++i) {
		data_hdr = &z->chunk_headers[chunk_id + i];
		VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr));
		VALGRIND_ANNOTATE_NEW_MEMORY(data_hdr, sizeof(*data_hdr));
		run_data_hdr.size_idx = i;
		*data_hdr = run_data_hdr;
	}
	pmemops_persist(&heap->p_ops,
		&z->chunk_headers[chunk_id + 1],
		sizeof(struct chunk_header) * (size_idx - 1));

	struct chunk_header *hdr = &z->chunk_headers[chunk_id];
	ASSERT(hdr->type == CHUNK_TYPE_FREE);

	VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr));

	struct chunk_header run_hdr;
	run_hdr.size_idx = hdr->size_idx;
	run_hdr.type = CHUNK_TYPE_RUN;
	run_hdr.flags = flags;
	*hdr = run_hdr;
	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));

	VALGRIND_REMOVE_FROM_TX(&z->chunk_headers[chunk_id],
		sizeof(struct chunk_header) * size_idx);

	memblock_rebuild_state(heap, &m);

	return m;
}
/*
 * heap_chunk_init -- (internal) writes chunk header
 */
static void
heap_chunk_init(struct palloc_heap *heap, struct chunk_header *hdr,
	uint16_t type, uint32_t size_idx)
{
	struct chunk_header nhdr = {
		.type = type,
		.flags = 0,
		.size_idx = size_idx
	};
	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr));

	*hdr = nhdr; /* write the entire header (8 bytes) at once */

	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));

	heap_chunk_write_footer(hdr, size_idx);
}

/*
 * heap_zone_init -- (internal) writes zone's first chunk and header
 */
static void
heap_zone_init(struct palloc_heap *heap, uint32_t zone_id)
{
	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
	uint32_t size_idx = get_zone_size_idx(zone_id, heap->rt->max_zone,
			heap->size);

	heap_chunk_init(heap, &z->chunk_headers[0], CHUNK_TYPE_FREE,
		size_idx);

	struct zone_header nhdr = {
		.size_idx = size_idx,
		.magic = ZONE_HEADER_MAGIC,
	};
	z->header = nhdr; /* write the entire header (8 bytes) at once */
	pmemops_persist(&heap->p_ops, &z->header, sizeof(z->header));
}

/*
 * heap_run_init -- (internal) creates a run based on a chunk
 */
static void
heap_run_init(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m)
{
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	struct zone *z = ZID_TO_ZONE(heap->layout, m->zone_id);

	struct chunk_run *run = (struct chunk_run *)&z->chunks[m->chunk_id];
	ASSERTne(m->size_idx, 0);
	size_t runsize = SIZEOF_RUN(run, m->size_idx);

	VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize);

	/* add/remove chunk_run and chunk_header to valgrind transaction */
	VALGRIND_ADD_TO_TX(run, runsize);
	run->block_size = c->unit_size;
	pmemops_persist(&heap->p_ops, &run->block_size,
			sizeof(run->block_size));

	/* set all the bits */
	memset(run->bitmap, 0xFF, sizeof(run->bitmap));

	unsigned nval = c->run.bitmap_nval;
	ASSERT(nval > 0);
	/* clear only the bits available for allocations from this bucket */
	memset(run->bitmap, 0, sizeof(uint64_t) * (nval - 1));
	run->bitmap[nval - 1] = c->run.bitmap_lastval;

	run->incarnation_claim = heap->run_id;
	VALGRIND_SET_CLEAN(&run->incarnation_claim,
		sizeof(run->incarnation_claim));
	VALGRIND_REMOVE_FROM_TX(run, runsize);

	pmemops_persist(&heap->p_ops, run->bitmap, sizeof(run->bitmap));

	struct chunk_header run_data_hdr;
	run_data_hdr.type = CHUNK_TYPE_RUN_DATA;
	run_data_hdr.flags = 0;

	struct chunk_header *data_hdr;
	for (unsigned i = 1; i < m->size_idx; ++i) {
		data_hdr = &z->chunk_headers[m->chunk_id + i];
		VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr));
		VALGRIND_ADD_TO_TX(data_hdr, sizeof(*data_hdr));
		run_data_hdr.size_idx = i;
		*data_hdr = run_data_hdr;
		VALGRIND_REMOVE_FROM_TX(data_hdr, sizeof(*data_hdr));
	}
	pmemops_persist(&heap->p_ops,
		&z->chunk_headers[m->chunk_id + 1],
		sizeof(struct chunk_header) * (m->size_idx - 1));

	struct chunk_header *hdr = &z->chunk_headers[m->chunk_id];
	ASSERT(hdr->type == CHUNK_TYPE_FREE);

	VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
	struct chunk_header run_hdr;
	run_hdr.size_idx = hdr->size_idx;
	run_hdr.type = CHUNK_TYPE_RUN;
	run_hdr.flags = header_type_to_flag[c->header_type];
	*hdr = run_hdr;
	VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));

	pmemops_persist(&heap->p_ops, hdr, sizeof(*hdr));
}

/*
 * heap_run_insert -- (internal) inserts and splits a block of memory into
 *	a run
 */
static void
heap_run_insert(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m, uint32_t size_idx, uint16_t block_off)
{
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	ASSERT(size_idx <= BITS_PER_VALUE);
	ASSERT(block_off + size_idx <= c->run.bitmap_nallocs);

	uint32_t unit_max = c->run.unit_max;
	struct memory_block nm = *m;
	nm.size_idx = unit_max - (block_off % unit_max);
	nm.block_off = block_off;
	if (nm.size_idx > size_idx)
		nm.size_idx = size_idx;

	do {
		bucket_insert_block(b, &nm);
		ASSERT(nm.size_idx <= UINT16_MAX);
		ASSERT(nm.block_off + nm.size_idx <= UINT16_MAX);
		nm.block_off =
			(uint16_t)(nm.block_off + (uint16_t)nm.size_idx);
		size_idx -= nm.size_idx;
		nm.size_idx = size_idx > unit_max ? unit_max : size_idx;
	} while (size_idx != 0);
}

/*
 * heap_process_run_metadata -- (internal) parses the run bitmap
 */
static uint32_t
heap_process_run_metadata(struct palloc_heap *heap, struct bucket *b,
	const struct memory_block *m)
{
	struct alloc_class *c = b->aclass;
	ASSERTeq(c->type, CLASS_RUN);

	uint16_t block_off = 0;
	uint16_t block_size_idx = 0;
	uint32_t inserted_blocks = 0;

	struct zone *z = ZID_TO_ZONE(heap->layout, m->zone_id);
	struct chunk_run *run = (struct chunk_run *)&z->chunks[m->chunk_id];

	for (unsigned i = 0; i < c->run.bitmap_nval; ++i) {
		ASSERT(i < MAX_BITMAP_VALUES);
		uint64_t v = run->bitmap[i];
		ASSERT(BITS_PER_VALUE * i <= UINT16_MAX);
		block_off = (uint16_t)(BITS_PER_VALUE * i);
		if (v == 0) {
			heap_run_insert(heap, b, m, BITS_PER_VALUE,
				block_off);
			inserted_blocks += BITS_PER_VALUE;
			continue;
		} else if (v == UINT64_MAX) {
			continue;
		}

		for (unsigned j = 0; j < BITS_PER_VALUE; ++j) {
			if (BIT_IS_CLR(v, j)) {
				block_size_idx++;
			} else if (block_size_idx != 0) {
				ASSERT(block_off >= block_size_idx);

				heap_run_insert(heap, b, m,
					block_size_idx,
					(uint16_t)(block_off -
						block_size_idx));
				inserted_blocks += block_size_idx;
				block_size_idx = 0;
			}

			if ((block_off++) == c->run.bitmap_nallocs) {
				i = MAX_BITMAP_VALUES;
				break;
			}
		}

		if (block_size_idx != 0) {
			ASSERT(block_off >= block_size_idx);

			heap_run_insert(heap, b, m,
				block_size_idx,
				(uint16_t)(block_off - block_size_idx));
			inserted_blocks += block_size_idx;
			block_size_idx = 0;
		}
	}

	return inserted_blocks;
}

/*
 * heap_create_run -- (internal) initializes a new run on an existing
 *	free chunk
 */
static void
heap_create_run(struct palloc_heap *heap, struct bucket *b,
	struct memory_block *m)
{
	heap_run_init(heap, b, m);
	memblock_rebuild_state(heap, m);
	heap_process_run_metadata(heap, b, m);
}
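/*
 * heap_run_insert splits a free range into pieces no larger than unit_max,
 * sizing the first piece so that every subsequent piece starts at a
 * unit_max-aligned offset. A standalone sketch of just that splitting
 * arithmetic (prints the pieces instead of inserting them into a bucket):
 */
#include <stdint.h>
#include <stdio.h>

static void
model_split(uint32_t block_off, uint32_t size_idx, uint32_t unit_max)
{
	uint32_t off = block_off;
	/* first piece ends at the next unit_max boundary */
	uint32_t sz = unit_max - (off % unit_max);
	if (sz > size_idx)
		sz = size_idx;

	do {
		printf("insert off %u size %u\n", off, sz);
		off += sz;
		size_idx -= sz;
		sz = size_idx > unit_max ? unit_max : size_idx;
	} while (size_idx != 0);
}

int
main(void)
{
	/* 20 units starting at offset 5, unit_max 8:
	 * pieces (5,3) (8,8) (16,8) (24,1) */
	model_split(5, 20, 8);
	return 0;
}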
/*
 * palloc_operation -- persistent memory operation. Takes a NULL pointer
 *	or an existing memory block and modifies it to occupy, at least,
 *	'size' number of bytes.
 *
 * The malloc, free and realloc routines are implemented in the context of
 * this common operation which encompasses all of the functionality usually
 * done separately in those methods.
 *
 * The first thing that needs to be done is determining which memory blocks
 * will be affected by the operation - this varies depending on whether the
 * operation will need to modify or free an existing block and/or allocate
 * a new one.
 *
 * Simplified allocation process flow is as follows:
 *	- reserve a new block in the transient heap
 *	- prepare the new block
 *	- create redo log of required modifications
 *		- chunk metadata
 *		- offset of the new object
 *	- commit and process the redo log
 *
 * And similarly, the deallocation process:
 *	- create redo log of required modifications
 *		- reverse the chunk metadata back to the 'free' state
 *		- set the destination of the object offset to zero
 *	- commit and process the redo log
 *	- return the memory block back to the free blocks transient heap
 *
 * Reallocation is a combination of the above, with one additional step
 * of copying the old content in the meantime.
 */
int
palloc_operation(struct palloc_heap *heap,
	uint64_t off, uint64_t *dest_off, size_t size,
	palloc_constr constructor, void *arg,
	struct operation_context *ctx)
{
	struct bucket *b = NULL;
	struct allocation_header *alloc = NULL;
	struct memory_block existing_block = {0, 0, 0, 0};
	struct memory_block new_block = {0, 0, 0, 0};
	struct memory_block reclaimed_block = {0, 0, 0, 0};

	int ret = 0;

	/*
	 * These two locks are responsible for protecting the metadata for the
	 * persistent representation of a chunk. Depending on the operation
	 * and the type of a chunk, they might be NULL.
	 */
	pthread_mutex_t *existing_block_lock = NULL;
	pthread_mutex_t *new_block_lock = NULL;

	size_t sizeh = size + sizeof(struct allocation_header);

	/*
	 * The offset of an existing block can be nonzero, which means this
	 * operation is either a free or a realloc - either way the offset of
	 * the object needs to be translated into a structure that all of the
	 * heap methods operate on.
	 */
	if (off != 0) {
		alloc = ALLOC_GET_HEADER(heap, off);
		existing_block = get_mblock_from_alloc(heap, alloc);

		/*
		 * This lock must be held until the operation is processed
		 * successfully, because other threads might operate on the
		 * same bitmap value.
		 */
		existing_block_lock = MEMBLOCK_OPS(AUTO,
			&existing_block)->get_lock(&existing_block, heap);
		if (existing_block_lock != NULL)
			util_mutex_lock(existing_block_lock);

#ifdef DEBUG
		if (MEMBLOCK_OPS(AUTO,
			&existing_block)->get_state(&existing_block, heap) !=
				MEMBLOCK_ALLOCATED) {
			ERR("Double free or heap corruption");
			ASSERT(0);
		}
#endif /* DEBUG */

		/*
		 * The memory block must return back to the originating
		 * bucket, otherwise coalescing of neighbouring blocks will be
		 * rendered impossible.
		 *
		 * If the block was allocated in a different incarnation of
		 * the heap (i.e. the application was restarted) and the chunk
		 * from which the allocation comes from was not yet processed,
		 * the originating bucket does not exist and all of the
		 * otherwise necessary volatile heap modifications won't be
		 * performed for this memory block.
		 */
		b = heap_get_chunk_bucket(heap,
			alloc->chunk_id, alloc->zone_id);
	}

	/* if allocation or reallocation, reserve new memory */
	if (size != 0) {
		/* reallocation to exactly the same size, which is a no-op */
		if (alloc != NULL && alloc->size == sizeh)
			goto out;

		errno = alloc_reserve_block(heap, &new_block, sizeh);
		if (errno != 0) {
			ret = -1;
			goto out;
		}
	}

	/*
	 * The offset value which is to be written to the destination pointer
	 * provided by the caller.
	 */
	uint64_t offset_value = 0;

	/* lock and persistently free the existing memory block */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
		/*
		 * This method will insert new entries into the operation
		 * context which will, after processing, update the chunk
		 * metadata to 'free' - it also takes care of all the
		 * necessary coalescing of blocks.
		 * Even though the transient state of the heap is used during
		 * this method to locate neighbouring blocks, it isn't
		 * modified.
		 *
		 * The reclaimed block is the coalesced memory block that the
		 * free resulted in; to prevent a volatile memory leak it
		 * needs to be inserted into the corresponding bucket.
		 */
		reclaimed_block = heap_free_block(heap, b,
			existing_block, ctx);
		offset_value = 0;
	}

	if (!MEMORY_BLOCK_IS_EMPTY(new_block)) {
		if (alloc_prep_block(heap, new_block, constructor,
			arg, &offset_value) != 0) {
			/*
			 * The constructor returned a non-zero value which
			 * means the memory block reservation has to be rolled
			 * back.
			 */
			struct bucket *new_bucket = heap_get_chunk_bucket(
				heap,
				new_block.chunk_id, new_block.zone_id);
			ASSERTne(new_bucket, NULL);

			/*
			 * Omitting the context in this method results in
			 * coalescing of blocks without affecting the
			 * persistent heap state.
			 */
			new_block = heap_free_block(heap, new_bucket,
				new_block, NULL);
			CNT_OP(new_bucket, insert, heap, new_block);

			if (new_bucket->type == BUCKET_RUN)
				heap_degrade_run_if_empty(heap,
					new_bucket, new_block);

			errno = ECANCELED;
			ret = -1;
			goto out;
		}

		/*
		 * This lock must be held for the duration between the
		 * creation of the allocation metadata updates in the
		 * operation context and the operation processing. This is
		 * because a different thread might operate on the same 8-byte
		 * value of the run bitmap and override allocation performed
		 * by this thread.
		 */
		new_block_lock = MEMBLOCK_OPS(AUTO,
			&new_block)->get_lock(&new_block, heap);

		/* the locks might be identical in the case of realloc */
		if (new_block_lock == existing_block_lock)
			new_block_lock = NULL;

		if (new_block_lock != NULL)
			util_mutex_lock(new_block_lock);

#ifdef DEBUG
		if (MEMBLOCK_OPS(AUTO,
			&new_block)->get_state(&new_block, heap) !=
				MEMBLOCK_FREE) {
			ERR("Double free or heap corruption");
			ASSERT(0);
		}
#endif /* DEBUG */

		/*
		 * The actual required metadata modifications are chunk-type
		 * dependent, but it always is a modification of a single
		 * 8-byte value - either modification of few bits in a bitmap
		 * or changing a chunk type from free to used.
		 */
		MEMBLOCK_OPS(AUTO, &new_block)->prep_hdr(&new_block,
			heap, MEMBLOCK_ALLOCATED, ctx);
	}

	/* not in-place realloc */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block) &&
		!MEMORY_BLOCK_IS_EMPTY(new_block)) {
		size_t old_size = alloc->size;
		size_t to_cpy = old_size > sizeh ? sizeh : old_size;
		VALGRIND_ADD_TO_TX(
			PMALLOC_OFF_TO_PTR(heap, offset_value),
			to_cpy - ALLOC_OFF);
		pmemops_memcpy_persist(&heap->p_ops,
			PMALLOC_OFF_TO_PTR(heap, offset_value),
			PMALLOC_OFF_TO_PTR(heap, off),
			to_cpy - ALLOC_OFF);
		VALGRIND_REMOVE_FROM_TX(
			PMALLOC_OFF_TO_PTR(heap, offset_value),
			to_cpy - ALLOC_OFF);
	}

	/*
	 * If the caller provided a destination value to update, it needs to
	 * be modified atomically alongside the heap metadata, and so the
	 * operation context must be used.
	 * The actual offset value depends on the operation type.
	 */
	if (dest_off != NULL)
		operation_add_entry(ctx, dest_off, offset_value,
			OPERATION_SET);

	operation_process(ctx);

	/*
	 * After the operation succeeded, the persistent state is all in order
	 * but in some cases it might not be in sync with its transient
	 * representation.
	 */
	if (!MEMORY_BLOCK_IS_EMPTY(existing_block)) {
		VALGRIND_DO_MEMPOOL_FREE(heap->layout,
			(char *)heap_get_block_data(heap, existing_block)
			+ ALLOC_OFF);

		/* we might have been operating on inactive run */
		if (b != NULL) {
			/*
			 * Even though the initial condition is to check
			 * whether the existing block exists, it's important
			 * to use the 'reclaimed block' - it is the coalesced
			 * one and reflects the current persistent heap state,
			 * whereas the existing block reflects the state from
			 * before this operation started.
			 */
			CNT_OP(b, insert, heap, reclaimed_block);

			/*
			 * Degrading of a run means turning it back into
			 * a chunk in case it's no longer needed.
			 * It might be tempting to defer this operation until
			 * such time that the chunk is actually needed, but
			 * right now the decision is to keep the persistent
			 * heap state as clean as possible - and that means
			 * not leaving unused data around.
			 */
			if (b->type == BUCKET_RUN)
				heap_degrade_run_if_empty(heap, b,
					reclaimed_block);
		}
	}

out:
	if (new_block_lock != NULL)
		util_mutex_unlock(new_block_lock);

	if (existing_block_lock != NULL)
		util_mutex_unlock(existing_block_lock);

	return ret;
}