/*
 * Rebalances the children of the node on top of the shadow spine around
 * the child selected by 'key'.
 *
 * If the current node has exactly one entry, the single child's contents
 * are copied over the current node and one reference on the child is
 * dropped.  Otherwise the selected child is rebalanced with its only
 * neighbour (when it is the first or last entry) or with both neighbours.
 *
 * Returns 0 on success, -ENODATA when lower_bound() yields a negative
 * index for 'key', or the error reported by a failing helper.
 */
static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	uint32_t child_entries;
	struct btree_node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));

		r = dm_tm_unlock(info->tm, child);
		if (r)
			return r;

		/*
		 * 'child' must not be dereferenced once it has been
		 * unlocked, so drop the reference via the location it was
		 * locked by rather than asking the block for it.
		 */
		dm_tm_dec(info->tm, b);
		return 0;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	/*
	 * The entry count itself is not used below; the call is kept
	 * because it read-locks (and so validates) the child and may fail.
	 */
	r = get_nr_entries(info->tm, value64(n, i), &child_entries);
	if (r)
		return r;

	has_left_sibling = i > 0;
	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

	if (!has_left_sibling)
		r = rebalance2(s, info, vt, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, vt, i - 1);

	else
		r = rebalance3(s, info, vt, i - 1);

	return r;
}
/*
 * Walks one level of the hierarchy down to the leaf for 'key', shadowing
 * and rebalancing nodes on the way.  On reaching a leaf the result of
 * do_leaf() is returned; '*index' is passed to it to fill in.  The caller
 * must then call delete_at() to actually remove the entry at that index.
 *
 * Returns a non-zero error from shadow_step() (negative) or
 * rebalance_children() if the walk cannot continue.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned *index)
{
	int slot = *index;
	int err;
	struct btree_node *node;

	while (1) {
		err = shadow_step(s, root, vt);
		if (err < 0)
			return err;

		/*
		 * The spine op does not update the parent for us, so point
		 * the parent's entry at the freshly shadowed node by hand.
		 */
		if (shadow_has_parent(s)) {
			__le64 loc_le =
				cpu_to_le64(dm_block_location(shadow_current(s)));
			struct btree_node *parent_node =
				dm_block_data(shadow_parent(s));

			memcpy(value_ptr(parent_node, slot), &loc_le,
			       sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			return do_leaf(node, key, index);

		err = rebalance_children(s, info, vt, key);
		if (err)
			return err;

		/* Rebalancing may have turned the current node into a leaf. */
		node = dm_block_data(shadow_current(s));
		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			return do_leaf(node, key, index);

		/*
		 * rebalance_children() returns -ENODATA when lower_bound()
		 * is negative, so the index is valid here.
		 */
		slot = lower_bound(node, key);
		root = value64(node, slot);
	}
}
/*
 * Rebalances the two adjacent children at 'left_index' and
 * 'left_index + 1' of the node on top of the spine.
 *
 * Both children are set up with init_child(), handed to __rebalance2()
 * and then released with exit_child().  The first error encountered is
 * returned; children already initialised are still released.
 */
static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned left_index)
{
	struct btree_node *parent = dm_block_data(shadow_current(s));
	struct child left, right;
	int err;

	err = init_child(info, vt, parent, left_index, &left);
	if (err)
		return err;

	err = init_child(info, vt, parent, left_index + 1, &right);
	if (err) {
		exit_child(info, &left);
		return err;
	}

	__rebalance2(info, parent, &left, &right);

	err = exit_child(info, &left);
	if (!err)
		return exit_child(info, &right);

	/* Releasing 'left' failed: still release 'right', report the first error. */
	exit_child(info, &right);
	return err;
}
static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, struct btree_node *parent, unsigned index, struct child *result) { int r, inc; dm_block_t root; result->index = index; root = value64(parent, index); r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, &result->block, &inc); if (r) return r; result->n = dm_block_data(result->block); if (inc) inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); return 0; }
/*
 * Creates an empty btree: a new block is obtained, zeroed and formatted
 * as a leaf node with no entries, and its location is stored in '*root'.
 *
 * Returns the negative error from new_block(), otherwise the result of
 * unlocking the new block.
 */
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	struct dm_block *blk;
	struct btree_node *node;
	size_t bs;
	uint32_t capacity;
	int err;

	err = new_block(info, &blk);
	if (err < 0)
		return err;

	bs = dm_bm_block_size(dm_tm_get_bm(info->tm));
	capacity = calc_max_entries(info->value_type.size, bs);

	node = dm_block_data(blk);
	memset(node, 0, bs);

	node->header.flags = cpu_to_le32(LEAF_NODE);
	node->header.nr_entries = cpu_to_le32(0);
	node->header.max_entries = cpu_to_le32(capacity);
	node->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(blk);

	return unlock_block(info, blk);
}
/*
 * Validator check for an array block.
 *
 * Returns -ENOTBLK if the block number stored in the block differs from
 * the block's actual location, -EILSEQ if the stored checksum does not
 * match the one computed from 'max_entries' onwards, and 0 otherwise.
 */
static int array_block_check(struct dm_block_validator *v,
			     struct dm_block *b,
			     size_t size_of_block)
{
	struct array_block *ablock = dm_block_data(b);
	__le32 computed;

	if (dm_block_location(b) != le64_to_cpu(ablock->blocknr)) {
		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
			    (unsigned long long) le64_to_cpu(ablock->blocknr),
			    (unsigned long long) dm_block_location(b));
		return -ENOTBLK;
	}

	computed = cpu_to_le32(dm_bm_checksum(&ablock->max_entries,
					      size_of_block - sizeof(__le32),
					      CSUM_XOR));
	if (computed == ablock->csum)
		return 0;

	DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
		    (unsigned) le32_to_cpu(computed),
		    (unsigned) le32_to_cpu(ablock->csum));
	return -EILSEQ;
}
/*
 * Visits every entry reachable from 'block'.  Entries of internal nodes
 * are followed recursively; for every other entry 'fn' is called with the
 * cpu-endian key and a pointer to the value.  The walk stops at the first
 * non-zero return, which is passed back to the caller.
 *
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	struct dm_block *blk;
	struct btree_node *bn;
	unsigned idx, count;
	uint64_t key;
	int err;

	err = bn_read_lock(info, block, &blk);
	if (err)
		return err;

	bn = dm_block_data(blk);
	count = le32_to_cpu(bn->header.nr_entries);

	for (idx = 0; idx < count; idx++) {
		if (!(le32_to_cpu(bn->header.flags) & INTERNAL_NODE)) {
			key = le64_to_cpu(*key_ptr(bn, idx));
			err = fn(context, &key, value_ptr(bn, idx));
		} else {
			err = walk_node(info, value64(bn, idx), fn, context);
		}

		if (err)
			break;
	}

	/* The node is unlocked on every path once it has been locked. */
	dm_tm_unlock(info->tm, blk);
	return err;
}
/*
 * Validator check for the thin superblock.
 *
 * Verifies, in order, the stored block number (-ENOTBLK on mismatch),
 * the magic number (-EILSEQ) and the checksum computed from 'flags'
 * onwards (-EILSEQ).  Returns 0 when all three match.
 */
static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *sb = dm_block_data(b);
	__le32 computed;

	if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: "
		      "wanted %llu", le64_to_cpu(sb->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(sb->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: "
		      "wanted %llu", le64_to_cpu(sb->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	computed = cpu_to_le32(dm_bm_checksum(&sb->flags,
					      block_size - sizeof(__le32),
					      SUPERBLOCK_CSUM_XOR));
	if (computed == sb->csum)
		return 0;

	DMERR("sb_check failed: csum %u: wanted %u",
	      le32_to_cpu(computed), le32_to_cpu(sb->csum));
	return -EILSEQ;
}
/*
 * Returns the node data of the block held in the last occupied slot of
 * the read-only spine (nodes[count - 1]).  Calling this on a spine with
 * count == 0 trips BUG_ON().
 */
struct node *ro_node(struct ro_spine *s)
{
	BUG_ON(!s->count);

	return dm_block_data(s->nodes[s->count - 1]);
}
/*
 * Validator hook run before an array block is written: stamps the block
 * with its own location and then stores the checksum computed from
 * 'max_entries' onwards.
 */
static void array_block_prepare_for_write(struct dm_block_validator *v,
					  struct dm_block *b,
					  size_t size_of_block)
{
	struct array_block *ablock = dm_block_data(b);
	size_t covered = size_of_block - sizeof(__le32);

	/* The location is written first, before the checksum is taken. */
	ablock->blocknr = cpu_to_le64(dm_block_location(b));
	ablock->csum = cpu_to_le32(dm_bm_checksum(&ablock->max_entries,
						  covered, CSUM_XOR));
}
/*
 * Validator hook run before the thin superblock is written: stamps the
 * block with its own location and then stores the checksum computed from
 * 'flags' onwards.
 */
static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *sb = dm_block_data(b);
	size_t covered = block_size - sizeof(__le32);

	/* The location is written first, before the checksum is taken. */
	sb->blocknr = cpu_to_le64(dm_block_location(b));
	sb->csum = cpu_to_le32(dm_bm_checksum(&sb->flags, covered,
					      SUPERBLOCK_CSUM_XOR));
}
/*
 * Read locks block 'b' using the array validator and exposes its data as
 * an array block through '*ab'.  On success the caller owns the lock and
 * must unlock '*block' when finished.
 *
 * Returns 0 on success or the error from dm_tm_read_lock(), in which case
 * '*ab' is left untouched.
 */
static int get_ablock(struct dm_array_info *info, dm_block_t b,
		      struct dm_block **block, struct array_block **ab)
{
	int err = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);

	if (!err)
		*ab = dm_block_data(*block);

	return err;
}
/*
 * Shadows btree node 'orig', returning the resulting block in '*result'.
 * When the shadow operation reports that an increment is required,
 * inc_children() is applied to the new node using 'vt'.
 *
 * Returns 0 on success or the error from dm_tm_shadow_block().
 */
static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
		     struct dm_btree_value_type *vt,
		     struct dm_block **result)
{
	int needs_inc;
	int err;

	err = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
				 result, &needs_inc);
	if (err)
		return err;

	if (needs_inc)
		inc_children(info->tm, dm_block_data(*result), vt);

	return 0;
}
/*
 * Validator hook run before a btree node is written: stamps the header
 * with the block's own location, stores the checksum computed from
 * 'flags' onwards, and then asserts that the node passes node_check().
 */
static void node_prepare_for_write(struct dm_block_validator *v,
				   struct dm_block *b,
				   size_t block_size)
{
	struct node *n = dm_block_data(b);
	struct node_header *h = &n->header;

	h->blocknr = cpu_to_le64(dm_block_location(b));
	h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
					     block_size - sizeof(__le32),
					     BTREE_CSUM_XOR));

	/*
	 * Re-validate with the same block size the checksum was computed
	 * over; a hard-coded size would make the check fail (and BUG) for
	 * any other block size.
	 */
	BUG_ON(node_check(v, b, block_size));
}
/*
 * Validator check for a btree node.
 *
 * Checks, in order: the stored block number against the block's location
 * (-ENOTBLK), the checksum computed from 'flags' onwards, that a node
 * with 'max_entries' keys and values fits in 'block_size', that
 * 'nr_entries' does not exceed 'max_entries', and that at least one of
 * the INTERNAL_NODE / LEAF_NODE flags is set (all -EILSEQ).
 *
 * Returns 0 if every check passes.
 */
static int node_check(struct dm_block_validator *v,
		      struct dm_block *b,
		      size_t block_size)
{
	struct node *node = dm_block_data(b);
	struct node_header *hdr = &node->header;
	size_t value_size;
	__le32 computed;
	uint32_t node_flags;

	if (dm_block_location(b) != le64_to_cpu(hdr->blocknr)) {
		DMERR_LIMIT("node_check failed blocknr %llu wanted %llu",
			    le64_to_cpu(hdr->blocknr), dm_block_location(b));
		return -ENOTBLK;
	}

	computed = cpu_to_le32(dm_bm_checksum(&hdr->flags,
					      block_size - sizeof(__le32),
					      BTREE_CSUM_XOR));
	if (computed != hdr->csum) {
		DMERR_LIMIT("node_check failed csum %u wanted %u",
			    le32_to_cpu(computed), le32_to_cpu(hdr->csum));
		return -EILSEQ;
	}

	value_size = le32_to_cpu(hdr->value_size);

	/* Header plus one key and one value per possible entry. */
	if (sizeof(struct node_header) +
	    (sizeof(__le64) + value_size) * le32_to_cpu(hdr->max_entries) > block_size) {
		DMERR_LIMIT("node_check failed: max_entries too large");
		return -EILSEQ;
	}

	if (le32_to_cpu(hdr->nr_entries) > le32_to_cpu(hdr->max_entries)) {
		DMERR_LIMIT("node_check failed, too many entries");
		return -EILSEQ;
	}

	/*
	 * The node must be either INTERNAL or LEAF.
	 */
	node_flags = le32_to_cpu(hdr->flags);
	if (node_flags & (INTERNAL_NODE | LEAF_NODE))
		return 0;

	DMERR_LIMIT("node_check failed, node is neither INTERNAL or LEAF");
	return -EILSEQ;
}
/*
 * Finds the array block stored under 'index' in the btree, shadows it and,
 * if the shadow lives at a different location, reinserts it so the btree
 * points at the shadow.  '*root' is both the root to look up in and, after
 * a reinsert, the new root.  The shadowed block and its data are returned
 * through 'block' and 'ab'.
 *
 * Returns 0 on success or the error from the lookup, shadow or insert.
 */
static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	uint64_t key = index;
	__le64 old_le;
	dm_block_t old_loc;
	int needs_inc;
	int err;

	/* Step 1: find the block currently stored for this index. */
	err = dm_btree_lookup(&info->btree_info, *root, &key, &old_le);
	if (err)
		return err;
	old_loc = le64_to_cpu(old_le);

	/* Step 2: shadow it. */
	err = dm_tm_shadow_block(info->btree_info.tm, old_loc,
				 &array_validator, block, &needs_inc);
	if (err)
		return err;

	*ab = dm_block_data(*block);
	if (needs_inc)
		inc_ablock_entries(info, *ab);

	/*
	 * Step 3: reinsert, but only when the shadow really copied data;
	 * often the shadow op is a noop and the location is unchanged.
	 */
	if (dm_block_location(*block) == old_loc)
		return 0;

	/*
	 * dm_tm_shadow_block will have already decremented the old
	 * block, but it is still referenced by the btree.  We
	 * increment to stop the insert decrementing it below zero
	 * when overwriting the old value.
	 */
	dm_tm_inc(info->btree_info.tm, old_loc);

	return insert_ablock(info, index, *block, root);
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent = dm_block_data(shadow_current(s)); struct child left, center, right; /* * FIXME: fill out an array? */ r = init_child(info, vt, parent, left_index, &left); if (r) return r; r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); return r; } __rebalance3(info, parent, &left, ¢er, &right); r = exit_child(info, &left); if (r) { exit_child(info, ¢er); exit_child(info, &right); return r; } r = exit_child(info, ¢er); if (r) { exit_child(info, &right); return r; } r = exit_child(info, &right); if (r) return r; return 0; }
/*
 * Reads the number of entries held in btree node 'b' into '*result'.
 * The node is read locked only for the duration of the call.
 *
 * Returns the error from dm_tm_read_lock() if locking fails ('*result' is
 * then untouched), otherwise the result of unlocking the node.
 */
static int get_nr_entries(struct dm_transaction_manager *tm,
			  dm_block_t b, uint32_t *result)
{
	struct dm_block *blk;
	struct btree_node *node;
	int err;

	err = dm_tm_read_lock(tm, b, &btree_node_validator, &blk);
	if (err)
		return err;

	node = dm_block_data(blk);
	*result = le32_to_cpu(node->header.nr_entries);

	return dm_tm_unlock(tm, blk);
}
/*
 * Allocates a new array block and initialises its header: 'max_entries'
 * as given, zero entries, and the value size taken from 'info'.  The
 * block is returned through 'block' and its data through 'ab'; the caller
 * will need to unlock the block.
 *
 * Returns 0 on success or the error from dm_tm_new_block().
 */
static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
			uint32_t max_entries,
			struct dm_block **block, struct array_block **ab)
{
	struct array_block *fresh;
	int err;

	err = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
	if (err)
		return err;

	fresh = dm_block_data(*block);
	*ab = fresh;

	fresh->max_entries = cpu_to_le32(max_entries);
	fresh->nr_entries = cpu_to_le32(0);
	fresh->value_size = cpu_to_le32(info->value_type.size);

	return 0;
}
/*
 * Pushes block 'b' onto the deletion stack.
 *
 * If the block's reference count is greater than one it is shared: its
 * count is simply decremented and nothing is pushed.  Otherwise the block
 * is read locked, a new frame is filled in for it, and its children are
 * prefetched when it is an internal node or sits on an internal level.
 *
 * Returns 0 on success, -ENOMEM if the stack is full, or the error from
 * dm_tm_ref() / dm_tm_read_lock().
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	struct frame *fr;
	uint32_t refs;
	uint32_t node_flags;
	int err;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	err = dm_tm_ref(s->tm, b, &refs);
	if (err)
		return err;

	if (refs > 1) {
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);
		return 0;
	}

	/* Claim the next frame; it is given back if the lock fails. */
	s->top++;
	fr = s->spine + s->top;

	err = dm_tm_read_lock(s->tm, b, &btree_node_validator, &fr->b);
	if (err) {
		s->top--;
		return err;
	}

	fr->n = dm_block_data(fr->b);
	fr->level = level;
	fr->nr_children = le32_to_cpu(fr->n->header.nr_entries);
	fr->current_child = 0;

	node_flags = le32_to_cpu(fr->n->header.flags);
	if ((node_flags & INTERNAL_NODE) || is_internal_level(s->info, fr))
		prefetch_children(s, fr);

	return 0;
}
/*
 * Looks up an array block in the btree.  Then shadows it, and updates the
 * btree to point to this new shadow.  'root' is an input/output parameter
 * for both the current root block, and the new one.
 *
 * The shadowed block and its data are returned through 'block' and 'ab'.
 * Returns 0 on success or the error from the lookup, shadow or insert.
 */
static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	int r, inc;
	uint64_t key = index;
	dm_block_t b;
	__le64 block_le;

	/*
	 * lookup
	 */
	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
	if (r)
		return r;
	b = le64_to_cpu(block_le);

	/*
	 * shadow
	 */
	r = dm_tm_shadow_block(info->btree_info.tm, b,
			       &array_validator, block, &inc);
	if (r)
		return r;

	*ab = dm_block_data(*block);
	if (inc)
		inc_ablock_entries(info, *ab);

	/*
	 * Reinsert.
	 *
	 * The shadow op will often be a noop.  Only insert if it really
	 * copied data.
	 */
	if (dm_block_location(*block) != b) {
		/*
		 * dm_tm_shadow_block will have already decremented the old
		 * block, but it is still referenced by the btree.  We
		 * increment to stop the insert decrementing it below zero
		 * when overwriting the old value.
		 */
		dm_tm_inc(info->btree_info.tm, b);
		r = insert_ablock(info, index, *block, root);
	}

	return r;
}
/*
 * Removes the entry identified by 'keys' (one key per btree level) from
 * the tree rooted at 'root'.
 *
 * For every level but the last, the value found is used as the root of
 * the next level.  On the last level the value type's 'dec' hook (if any)
 * is called on the value and the entry is deleted from the leaf.
 *
 * On success '*new_root' receives the root of the shadowed tree.  If the
 * removal fails, '*new_root' is left untouched.
 *
 * Returns 0 on success or the negative error from remove_raw().
 */
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		/* Upper levels hold block locations; only the last holds user values. */
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_vt),
			       root, keys[level], (unsigned *)&index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));

		delete_at(n, index);
	}

	/*
	 * Only hand back a root when the loop did not bail out with an
	 * error.  NOTE(review): after a failure (e.g. the very first
	 * shadow_step() failing) the spine presumably holds no meaningful
	 * root to report - confirm against shadow_root().
	 */
	if (r >= 0)
		*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
/*
 * Walks from 'root' down to the leaf where 'key' belongs, shadowing every
 * node on the way and splitting any node that is full.  On success
 * '*index' is the position in the leaf (the node on top of the spine) at
 * which the key lives or should be inserted.
 *
 * Returns 0 on success or the negative error from shadow_step() or one of
 * the split helpers.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int err;
	int slot = *index;
	int at_top = 1;
	struct btree_node *cur;

	while (1) {
		err = shadow_step(s, root, vt);
		if (err < 0)
			return err;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && slot >= 0) { /* FIXME: second clause unness. */
			__le64 loc = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&loc);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), slot),
				    &loc, sizeof(__le64));
		}

		/* Make room first: a full node is split before descending. */
		cur = dm_block_data(shadow_current(s));
		if (cur->header.nr_entries == cur->header.max_entries) {
			err = at_top ? btree_split_beneath(s, key)
				     : btree_split_sibling(s, root, slot, key);
			if (err < 0)
				return err;
		}

		/* A split may have changed which node is current. */
		cur = dm_block_data(shadow_current(s));
		slot = lower_bound(cur, key);

		if (le32_to_cpu(cur->header.flags) & LEAF_NODE) {
			/* Not an exact match: the key goes one slot further on. */
			if (slot < 0 || le64_to_cpu(cur->keys[slot]) != key)
				slot++;

			*index = slot;
			return 0;
		}

		if (slot < 0) {
			/* change the bounds on the lowest key */
			cur->keys[0] = cpu_to_le64(key);
			slot = 0;
		}

		root = value64(cur, slot);
		at_top = 0;
	}
}
/*
 * Splits a node by creating a sibling node and shifting half the nodes
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 *
 * On success the spine's second slot holds whichever of the two nodes
 * 'key' belongs to, and the other node is unlocked.
 *
 * Returns 0 on success, the negative error from new_block(), or the error
 * from insert_at() (the new sibling is unlocked before returning).
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	/* The right node takes the upper half (and the odd entry, if any). */
	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	/* Internal nodes store block locations; leaves store user values. */
	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		/* Do not leave the new sibling locked on the error path. */
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}
/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 *
 * The entries of the current node are divided between two new nodes and
 * the current node is rewritten as an internal node with exactly two
 * entries pointing at them.
 *
 * Both new children are unlocked before returning and the spine is left
 * pointing at the parent.  The caller (btree_insert_raw()) then continues
 * its usual descent from the parent, which both selects the right child
 * and lowers the parent's first key when 'key' is smaller than every key
 * already present.  'key' itself is not needed here; it is kept for the
 * existing callers.
 *
 * Returns 0 on success or the negative error from new_block().
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* Do not leave the first new block locked on the error path. */
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	/* The right node takes the upper half (and the odd entry, if any). */
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	/* Internal nodes store block locations; leaves store user values. */
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Leave the parent on the spine rather than stepping onto a child:
	 * the caller still has to look at the parent to fix up its lowest
	 * key, and will re-acquire the appropriate child itself.
	 */
	unlock_block(s->info, left);
	unlock_block(s->info, right);

	return 0;
}