static loff_t unatom_dict_write(struct inode *atable, atom_t atom, loff_t where)
{
	unsigned delta = tux3_get_current_delta();
	struct buffer_head *buffer, *clone;
	loff_t old;
	unsigned offset;

	buffer = blockread_unatom(atable, atom, &offset);
	if (!buffer)
		return -EIO;

	/*
	 * The atable is protected by i_mutex for now.
	 * blockdirty() should never return -EAGAIN.
	 * FIXME: need finer granularity locking
	 */
	clone = blockdirty(buffer, delta);
	if (IS_ERR(clone)) {
		assert(PTR_ERR(clone) != -EAGAIN);
		blockput(buffer);
		return PTR_ERR(clone);
	}

	__be64 *unatom_dict = bufdata(clone);
	old = be64_to_cpu(unatom_dict[offset]);
	unatom_dict[offset] = cpu_to_be64(where);
	mark_buffer_dirty_non(clone);
	blockput(clone);

	return old;
}
tux_dirent *tux_find_entry(struct inode *dir, const char *name, unsigned len,
			   struct buffer_head **result, loff_t size)
{
	struct sb *sb = tux_sb(dir->i_sb);
	unsigned reclen = TUX_REC_LEN(len);
	block_t block, blocks = size >> sb->blockbits;
	int err = -ENOENT;

	for (block = 0; block < blocks; block++) {
		struct buffer_head *buffer = blockread(mapping(dir), block);
		if (!buffer) {
			err = -EIO; // need ERR_PTR for blockread!!!
			goto error;
		}
		tux_dirent *entry = bufdata(buffer);
		tux_dirent *limit = (void *)entry + sb->blocksize - reclen;
		while (entry <= limit) {
			if (entry->rec_len == 0) {
				blockput(buffer);
				tux_zero_len_error(dir, block);
				err = -EIO;
				goto error;
			}
			if (tux_match(entry, name, len)) {
				*result = buffer;
				return entry;
			}
			entry = next_entry(entry);
		}
		blockput(buffer);
	}
error:
	*result = NULL;	/* for debug */
	return ERR_PTR(err);
}
/* Modify buffer of refcount, then release buffer */
static int update_refcount(struct sb *sb, struct buffer_head *buffer,
			   unsigned offset, u16 val)
{
	unsigned delta = tux3_get_current_delta();
	struct buffer_head *clone;
	__be16 *refcount;

	/*
	 * The atable is protected by i_mutex for now.
	 * blockdirty() should never return -EAGAIN.
	 * FIXME: need finer granularity locking
	 */
	clone = blockdirty(buffer, delta);
	if (IS_ERR(clone)) {
		assert(PTR_ERR(clone) != -EAGAIN);
		blockput(buffer);
		return PTR_ERR(clone);
	}

	refcount = bufdata(clone);
	refcount[offset] = cpu_to_be16(val);
	mark_buffer_dirty_non(clone);
	blockput(clone);

	return 0;
}
int tux_readdir(struct file *file, void *state, filldir_t filldir)
{
	loff_t pos = file->f_pos;
#ifdef __KERNEL__
	struct inode *dir = file->f_dentry->d_inode;
#else
	struct inode *dir = file->f_inode;
#endif
	int revalidate = file->f_version != dir->i_version;
	struct sb *sb = tux_sb(dir->i_sb);
	unsigned blockbits = sb->blockbits;
	block_t block, blocks = dir->i_size >> blockbits;
	unsigned offset = pos & sb->blockmask;

	assert(!(dir->i_size & sb->blockmask));

	for (block = pos >> blockbits; block < blocks; block++) {
		struct buffer_head *buffer = blockread(mapping(dir), block);
		if (!buffer)
			return -EIO;
		void *base = bufdata(buffer);
		if (revalidate) {
			if (offset) {
				tux_dirent *entry = base + offset;
				tux_dirent *p = base + (offset & sb->blockmask);
				while (p < entry && p->rec_len)
					p = next_entry(p);
				offset = (void *)p - base;
				file->f_pos = (block << blockbits) + offset;
			}
			file->f_version = dir->i_version;
			revalidate = 0;
		}
		tux_dirent *limit = base + sb->blocksize - TUX_REC_LEN(1);
		for (tux_dirent *entry = base + offset; entry <= limit;
		     entry = next_entry(entry)) {
			if (entry->rec_len == 0) {
				blockput(buffer);
				tux_zero_len_error(dir, block);
				return -EIO;
			}
			if (!is_deleted(entry)) {
				unsigned type = (entry->type < TUX_TYPES) ?
					filetype[entry->type] : DT_UNKNOWN;
				int lame = filldir(
					state, entry->name, entry->name_len,
					(block << blockbits) | ((void *)entry - base),
					be64_to_cpu(entry->inum), type);
				if (lame) {
					blockput(buffer);
					return 0;
				}
			}
			file->f_pos += tux_rec_len_from_disk(entry->rec_len);
		}
		blockput(buffer);
		offset = 0;
	}
	return 0;
}
static struct dentry *tux3_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct sb *sb = tux_sb(dir->i_sb);
	struct buffer_head *buffer;
	struct inode *inode;
	tux_dirent *entry;
	inum_t inum;

	entry = tux_find_dirent(dir, &dentry->d_name, &buffer);
	if (IS_ERR(entry)) {
		if (PTR_ERR(entry) != -ENOENT)
			return ERR_CAST(entry);
		inode = NULL;
		goto out;
	}

	inum = be64_to_cpu(entry->inum);
	blockput(buffer);

	inode = tux3_iget(sb, inum);
	if (IS_ERR(inode) && PTR_ERR(inode) == -ENOENT)
		tux3_warn(sb, "%s: inum %Lu not found", __func__, inum);
out:
	return d_splice_alias(inode, dentry);
}
static int bitmap_test(struct sb *sb, block_t start, block_t count, int set)
{
	struct inode *bitmap = sb->bitmap;
	unsigned mapshift = sb->blockbits + 3;
	unsigned mapsize = 1 << mapshift;
	unsigned mapmask = mapsize - 1;
	unsigned mapoffset = start & mapmask;
	block_t mapblock, mapblocks = (start + count + mapmask) >> mapshift;
	int (*test)(u8 *, unsigned, unsigned) = set ? all_set : all_clear;

	for (mapblock = start >> mapshift; mapblock < mapblocks; mapblock++) {
		struct buffer_head *buffer;
		unsigned len;
		int ret;

		buffer = blockget(mapping(bitmap), mapblock);
		assert(buffer);

		len = min_t(block_t, mapsize - mapoffset, count);
		ret = test(bufdata(buffer), mapoffset, len);
		blockput(buffer);

		if (!ret)
			return 0;

		mapoffset = 0;
		count -= len;
	}
	return 1;
}
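/*
 * Note: bitmap_test() above dispatches through all_set()/all_clear(), whose
 * definitions are not part of this excerpt.  The sketch below is only an
 * illustration of bit-range predicates matching that function-pointer
 * signature (data, start bit, bit count); it assumes the kernel-style u8
 * type and is not the actual tux3 implementation.
 */
static int all_set(u8 *data, unsigned start, unsigned count)
{
	/* Nonzero only if every bit in [start, start + count) is set */
	for (unsigned i = start; i < start + count; i++)
		if (!(data[i >> 3] & (1 << (i & 7))))
			return 0;
	return 1;
}

static int all_clear(u8 *data, unsigned start, unsigned count)
{
	/* Nonzero only if every bit in [start, start + count) is clear */
	for (unsigned i = start; i < start + count; i++)
		if (data[i >> 3] & (1 << (i & 7)))
			return 0;
	return 1;
}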
static inline void cursor_pop_blockput(struct cursor *cursor)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	blockput(cursor_pop(cursor));
}
static void tux_update_entry(struct buffer_head *buffer, tux_dirent *entry,
			     inum_t inum, umode_t mode)
{
	entry->inum = cpu_to_be64(inum);
	entry->type = tux_type_by_mode[(mode & S_IFMT) >> STAT_SHIFT];
	mark_buffer_dirty_non(buffer);
	blockput(buffer);
}
/* Unpin logblocks. */
static void replay_unpin_logblocks(struct sb *sb, unsigned i, unsigned logcount)
{
	if (DEBUG_MODE_K == 1)
		printk(KERN_INFO "%25s %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct buffer_head *logbuf;

	while (i < logcount) {
		logbuf = blockget(mapping(sb->logmap), i);
		assert(logbuf != NULL);
		blockput(logbuf);
		blockput(logbuf);	/* Unpin */
		i++;
	}
}
/* This is called for the freeing block on volmap */
static void __blockput_free(struct sb *sb, struct buffer_head *buffer,
			    unsigned delta)
{
	/* FIXME: buffer was freed, so we would like to free cache */
	tux3_clear_buffer_dirty(buffer, delta);
	tux3_try_cancel_dirty_page(buffer->b_page);
	blockput(buffer);
}
/* Convert atom to name */
static int unatom(struct inode *atable, atom_t atom, char *name, unsigned size)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = tux_sb(atable->i_sb);
	struct buffer_head *buffer;
	int err;

	loff_t where = unatom_dict_read(atable, atom);
	if (where < 0) {
		err = where;
		goto error;
	}

	buffer = blockread(mapping(atable), where >> sb->blockbits);
	if (!buffer) {
		err = -EIO;
		goto error;
	}

	tux_dirent *entry = bufdata(buffer) + (where & sb->blockmask);
	if (entry_atom(entry) != atom) {
		tux3_fs_error(sb, "atom %x reverse entry broken", atom);
		err = -EIO;
		goto error_blockput;
	}

	unsigned len = entry->name_len;
	if (size) {
		if (len > size) {
			err = -ERANGE;
			goto error_blockput;
		}
		memcpy(name, entry->name, len);
	}
	blockput(buffer);

	return len;

error_blockput:
	blockput(buffer);
error:
	return err;
}
int tux_delete_entry(struct inode *dir, struct buffer_head *buffer,
		     tux_dirent *entry)
{
	unsigned delta = tux3_get_current_delta();
	tux_dirent *prev = NULL, *this = bufdata(buffer);
	struct buffer_head *clone;
	void *olddata;

	while ((char *)this < (char *)entry) {
		if (this->rec_len == 0) {
			blockput(buffer);
			tux_zero_len_error(dir, bufindex(buffer));
			return -EIO;
		}
		prev = this;
		this = next_entry(this);
	}

	/*
	 * The directory is protected by i_mutex.
	 * blockdirty() should never return -EAGAIN.
	 */
	olddata = bufdata(buffer);
	clone = blockdirty(buffer, delta);
	if (IS_ERR(clone)) {
		assert(PTR_ERR(clone) != -EAGAIN);
		blockput(buffer);
		return PTR_ERR(clone);
	}
	entry = ptr_redirect(entry, olddata, bufdata(clone));
	prev = ptr_redirect(prev, olddata, bufdata(clone));

	if (prev)
		prev->rec_len = tux_rec_len_to_disk((void *)next_entry(entry) -
						    (void *)prev);
	memset(entry->name, 0, entry->name_len);
	entry->name_len = entry->type = 0;
	entry->inum = 0;
	mark_buffer_dirty_non(clone);
	blockput(clone);

	return 0;
}
/* Modify atom refcount */
static int atomref(struct inode *atable, atom_t atom, int use)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = tux_sb(atable->i_sb);
	unsigned shift = sb->blockbits - ATOMREF_BLKBITS;
	unsigned block = sb->atomref_base + ATOMREF_SIZE * (atom >> shift);
	unsigned offset = atom & ~(-1 << shift), kill = 0;
	struct buffer_head *buffer;
	__be16 *refcount;
	int err;

	buffer = blockread(mapping(atable), block);
	if (!buffer)
		return -EIO;

	refcount = bufdata(buffer);
	int low = be16_to_cpu(refcount[offset]) + use;
	trace("inc atom %x by %d, offset %x[%x], low = %d",
	      atom, use, block, offset, low);

	/* This releases buffer */
	err = update_refcount(sb, buffer, offset, low);
	if (err)
		return err;

	if (!low || (low & (-1 << 16))) {
		buffer = blockread(mapping(atable), block + 1);
		if (!buffer)
			return -EIO;

		refcount = bufdata(buffer);
		int high = be16_to_cpu(refcount[offset]);
		if (!low)
			blockput(buffer);
		else {
			trace("carry %d, offset %x[%x], high = %d",
			      (low >> 16), block, offset, high);
			high += (low >> 16);
			assert(high >= 0);	/* paranoia check */
			/* This releases buffer */
			err = update_refcount(sb, buffer, offset, high);
			if (err) {
				/* FIXME: better set a flag that atomref broke
				 * or something! */
				return err;
			}
		}
		kill = !(low | high);
	}
void
ventiproc(void *dummy)
{
	int i;
	Block *db;
	u32int bno;
	u64int bsize;

	USED(dummy);

	proccreate(vtsendproc, z, STACK);
	proccreate(vtrecvproc, z, STACK);

	writechan = chancreate(sizeof(WriteReq), 0);
	for(i=0; i<nwritethread; i++)
		threadcreate(writethread, nil, STACK);
	vtcachesetwrite(zcache, myvtwrite);

	bsize = fsys->blocksize;
	vtfilelock(vfile, -1);
	while((db = qread(qventi, &bno)) != nil){
		if(nop){
			blockput(db);
			continue;
		}
		if(vtfilewrite(vfile, db->data, bsize, bno*bsize) != bsize)
			sysfatal("ventiproc vtfilewrite: %r");
		if(vtfileflushbefore(vfile, (bno+1)*bsize) < 0)
			sysfatal("ventiproc vtfileflushbefore: %r");
		blockput(db);
	}
	vtfileunlock(vfile);
	vtcachesetwrite(zcache, nil);

	for(i=0; i<nwritethread; i++)
		send(writechan, nil);
	chanfree(writechan);

	if(statustime)
		print("# %T venti proc exiting - nsend %d nrecv %d\n",
			nsend, nrecv);
	runlock(&endlk);
}
int tux_dir_is_empty(struct inode *dir)
{
	struct sb *sb = tux_sb(dir->i_sb);
	block_t block, blocks = dir->i_size >> sb->blockbits;
	__be64 self = cpu_to_be64(tux_inode(dir)->inum);
	struct buffer_head *buffer;

	for (block = 0; block < blocks; block++) {
		buffer = blockread(mapping(dir), block);
		if (!buffer)
			return -EIO;

		tux_dirent *entry = bufdata(buffer);
		tux_dirent *limit = bufdata(buffer) + sb->blocksize -
			TUX_REC_LEN(1);
		for (; entry <= limit; entry = next_entry(entry)) {
			if (!entry->rec_len) {
				blockput(buffer);
				tux_zero_len_error(dir, block);
				return -EIO;
			}
			if (is_deleted(entry))
				continue;
			if (entry->name[0] != '.')
				goto not_empty;
			if (entry->name_len > 2)
				goto not_empty;
			if (entry->name_len < 2) {
				if (entry->inum != self)
					goto not_empty;
			} else if (entry->name[1] != '.')
				goto not_empty;
		}
		blockput(buffer);
	}
	return 0;

not_empty:
	blockput(buffer);
	return -ENOTEMPTY;
}
static void level_replace_blockput(struct cursor *cursor, int level,
				   struct buffer_head *buffer,
				   struct index_entry *next)
{
#ifdef CURSOR_DEBUG
	assert(buffer);
	assert(level <= cursor->level);
	assert(cursor->path[level].buffer != FREE_BUFFER);
	assert(cursor->path[level].next != FREE_NEXT);
#endif
	blockput(cursor->path[level].buffer);
	cursor->path[level].buffer = buffer;
	cursor->path[level].next = next;
}
static loff_t unatom_dict_read(struct inode *atable, atom_t atom)
{
	struct buffer_head *buffer;
	unsigned offset;

	buffer = blockread_unatom(atable, atom, &offset);
	if (!buffer)
		return -EIO;

	__be64 *unatom_dict = bufdata(buffer);
	loff_t where = be64_to_cpu(unatom_dict[offset]);
	blockput(buffer);

	return where;
}
static int
ext2sync(Fsys *fsys)
{
	int i;
	Group g;
	Block *b;
	Super super;
	Ext2 *fs;
	Disk *disk;

	fs = fsys->priv;
	disk = fs->disk;
	if((b = diskread(disk, SBSIZE, SBOFF)) == nil)
		return -1;
	parsesuper(&super, b->data);
	blockput(b);
	if(checksuper(&super) < 0)
		return -1;

	fs->blocksize = MINBLOCKSIZE<<super.logblocksize;
	fs->nblock = super.nblock;
	fs->ngroup = (super.nblock+super.blockspergroup-1)
		/ super.blockspergroup;
	fs->inospergroup = super.inospergroup;
	fs->blockspergroup = super.blockspergroup;
	if(super.revlevel >= 1)
		fs->inosize = super.inosize;
	else
		fs->inosize = 128;
	fs->inosperblock = fs->blocksize / fs->inosize;
	if(fs->blocksize == SBOFF)
		fs->groupaddr = 2;
	else
		fs->groupaddr = 1;
	fs->descperblock = fs->blocksize / GroupSize;
	fs->firstblock = super.firstdatablock;

	fsys->blocksize = fs->blocksize;
	fsys->nblock = fs->nblock;
	if(debug)
		fprint(2, "ext2 %d %d-byte blocks, first data block %d, %d groups of %d\n",
			fs->nblock, fs->blocksize, fs->firstblock,
			fs->ngroup, fs->blockspergroup);
	if(0){
		for(i=0; i<fs->ngroup; i++)
			if(ext2group(fs, i, &g) >= 0)
				fprint(2, "grp %d: bitblock=%d\n", i, g.bitblock);
	}
	return 0;
}
static int
ext2group(Ext2 *fs, u32int i, Group *g)
{
	Block *b;
	u64int addr;

	if(i >= fs->ngroup)
		return -1;

	addr = fs->groupaddr + i/fs->descperblock;
	b = diskread(fs->disk, fs->blocksize, addr*fs->blocksize);
	if(b == nil)
		return -1;
	parsegroup(g, b->data+i%fs->descperblock*GroupSize);
	blockput(b);
	return 0;
}
static void level_replace_blockput(struct cursor *cursor, int level,
				   struct buffer_head *buffer,
				   struct index_entry *next)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

#ifdef CURSOR_DEBUG
	assert(buffer);
	assert(level <= cursor->level);
	assert(cursor->path[level].buffer != FREE_BUFFER);
	assert(cursor->path[level].next != FREE_NEXT);
#endif
	blockput(cursor->path[level].buffer);
	cursor->path[level].buffer = buffer;
	cursor->path[level].next = next;
}
static loff_t unatom_dict_read(struct inode *atable, atom_t atom)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct buffer_head *buffer;
	unsigned offset;

	buffer = blockread_unatom(atable, atom, &offset);
	if (!buffer)
		return -EIO;

	__be64 *unatom_dict = bufdata(buffer);
	loff_t where = be64_to_cpu(unatom_dict[offset]);
	blockput(buffer);

	return where;
}
/* Find atom of name */
static int find_atom(struct inode *atable, const char *name, unsigned len,
		     atom_t *atom)
{
	struct sb *sb = tux_sb(atable->i_sb);
	struct buffer_head *buffer;
	tux_dirent *entry;

	entry = tux_find_entry(atable, name, len, &buffer, sb->atomdictsize);
	if (IS_ERR(entry)) {
		int err = PTR_ERR(entry);
		if (err == -ENOENT)
			return -ENODATA;
		return err;
	}

	*atom = entry_atom(entry);
	blockput(buffer);

	return 0;
}
static void add_maps(struct inode *inode, block_t index,
		     struct block_segment *seg, int nr_segs)
{
	unsigned delta = tux3_get_current_delta();

	for (int i = 0; i < nr_segs; i++) {
		struct block_segment *s = &seg[i];

		for (unsigned j = 0; j < s->count; j++) {
			struct buffer_head *buf;

			buf = blockget(inode->map, index + j);
			buf = blockdirty(buf, delta);
			memset(buf->data, 0, inode->i_sb->blocksize);
			*(block_t *)buf->data = s->block + j;
			mark_buffer_dirty_non(buf);
			blockput(buf);
		}
		index += s->count;
	}
}
static void check_maps(struct inode *inode, block_t index,
		       struct block_segment *seg, int nr_segs)
{
	for (int i = 0; i < nr_segs; i++) {
		struct block_segment *s = &seg[i];

		for (unsigned j = 0; j < s->count; j++) {
			struct buffer_head *buf;

			buf = peekblk(inode->map, index + j);
			if (s->state == BLOCK_SEG_HOLE)
				test_assert(buf == NULL);
			else {
				block_t blk = *(block_t *)buf->data;
				test_assert(blk == s->block + j);
				blockput(buf);
			}
		}
		index += s->count;
	}
}
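/*
 * The two test helpers above pair naturally: add_maps() materializes a
 * segment list into the block cache and check_maps() verifies it, with
 * BLOCK_SEG_HOLE segments expected to stay unmapped.  The wrapper below is
 * purely hypothetical (block numbers, counts and the assumption that a
 * zero-initialized state field denotes a normal mapped extent are all made
 * up for illustration); it only shows how the helpers fit together.
 */
static void example_map_roundtrip(struct inode *inode)
{
	struct block_segment seg[2] = {
		{ .block = 100, .count = 4 },			/* mapped extent (assumed state) */
		{ .count = 2, .state = BLOCK_SEG_HOLE },	/* hole */
	};

	/* Write back only the mapped extent; the hole gets no buffers */
	add_maps(inode, 0, &seg[0], 1);

	/* Blocks 0..3 should map to 100..103, blocks 4..5 should stay holes */
	check_maps(inode, 0, seg, 2);
}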
void
cmpproc(void *dummy)
{
	uchar *data;
	Block *db;
	u32int bno, bsize;
	uchar score[VtScoreSize];
	uchar score1[VtScoreSize];

	USED(dummy);

	if(incremental)
		vtfilelock(vscores, VtOREAD);
	bsize = fsys->blocksize;
	while((db = qread(qcmp, &bno)) != nil){
		data = db->data;
		sha1(data, vtzerotruncate(VtDataType, data, bsize), score, nil);
		if(incremental){
			if(vtfileblockscore(vscores, bno, score1) < 0)
				sysfatal("cmpproc vtfileblockscore %d: %r", bno);
		}else{
			if(Bseek(&bscores, (vlong)bno*VtScoreSize, 0) < 0)
				sysfatal("cmpproc Bseek: %r");
			if(Bread(&bscores, score1, VtScoreSize) != VtScoreSize)
				sysfatal("cmpproc Bread: %r");
		}
		if(memcmp(score, score1, VtScoreSize) != 0){
			nchange++;
			if(verbose)
				print("# block %ud: old %V new %V\n",
					bno, score1, score);
			qwrite(qventi, db, bno);
		}else
			blockput(db);
	}
	qclose(qventi);
	if(incremental)
		vtfileunlock(vscores);
	if(statustime)
		print("# %T cmp proc exiting\n");
	runlock(&endlk);
}
/* Find atom of name */
static int find_atom(struct inode *atable, const char *name, unsigned len,
		     atom_t *atom)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = tux_sb(atable->i_sb);
	struct buffer_head *buffer;
	tux_dirent *entry;

	entry = tux_find_entry(atable, name, len, &buffer, sb->atomdictsize);
	if (IS_ERR(entry)) {
		int err = PTR_ERR(entry);
		if (err == -ENOENT)
			return -ENODATA;
		return err;
	}

	*atom = entry_atom(entry);
	blockput(buffer);

	return 0;
}
int alloc_empty_btree(struct btree *btree)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = new_node(btree);
	if (IS_ERR(rootbuf))
		goto error;
	struct buffer_head *leafbuf = new_leaf(btree);
	if (IS_ERR(leafbuf))
		goto error_leafbuf;

	assert(!has_root(btree));
	struct bnode *rootnode = bufdata(rootbuf);
	block_t rootblock = bufindex(rootbuf);
	block_t leafblock = bufindex(leafbuf);
	trace("root at %Lx", rootblock);
	trace("leaf at %Lx", leafblock);
	bnode_init_root(rootnode, 1, leafblock, 0, 0);
	log_bnode_root(sb, rootblock, 1, leafblock, 0, 0);
	log_balloc(sb, leafblock, 1);

	mark_buffer_unify_non(rootbuf);
	blockput(rootbuf);
	mark_buffer_dirty_non(leafbuf);
	blockput(leafbuf);

	btree->root = (struct root){ .block = rootblock, .depth = 1 };
	tux3_mark_btree_dirty(btree);

	return 0;

error_leafbuf:
	(btree->ops->bfree)(sb, bufindex(rootbuf), 1);
	blockput(rootbuf);
	rootbuf = leafbuf;
error:
	return PTR_ERR(rootbuf);
}

/* FIXME: right? and this should be done by btree_chop()? */
int free_empty_btree(struct btree *btree)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct btree_ops *ops = btree->ops;

	if (!has_root(btree))
		return 0;

	assert(btree->root.depth == 1);
	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = vol_bread(sb, btree->root.block);
	if (!rootbuf)
		return -EIO;
	assert(bnode_sniff(bufdata(rootbuf)));

	/* Make the btree have no root */
	btree->root = no_root;
	tux3_mark_btree_dirty(btree);

	struct bnode *rootnode = bufdata(rootbuf);
	assert(bcount(rootnode) == 1);
	block_t leaf = be64_to_cpu(rootnode->entries[0].block);
	struct buffer_head *leafbuf = vol_find_get_block(sb, leaf);

	if (leafbuf && !leaf_need_redirect(sb, leafbuf)) {
		/*
		 * This is redirected leaf. So, in here, we can just
		 * cancel leaf_redirect by bfree(), instead of
		 * defered_bfree().
		 */
		bfree(sb, leaf, 1);
		log_leaf_free(sb, leaf);
		assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
		blockput_free(sb, leafbuf);
	} else {
		defer_bfree(&sb->defree, leaf, 1);
		log_bfree(sb, leaf, 1);
		if (leafbuf) {
			assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
			blockput(leafbuf);
		}
	}

	if (!bnode_need_redirect(sb, rootbuf)) {
		/*
		 * This is redirected bnode. So, in here, we can just
		 * cancel bnode_redirect by bfree(), instead of
		 * defered_bfree().
		 */
		bfree(sb, bufindex(rootbuf), 1);
		log_bnode_free(sb, bufindex(rootbuf));
		blockput_free_unify(sb, rootbuf);
	} else {
		defer_bfree(&sb->deunify, bufindex(rootbuf), 1);
		log_bfree_on_unify(sb, bufindex(rootbuf), 1);
		blockput(rootbuf);
	}

	return 0;
}

int replay_bnode_redirect(struct replay *rp, block_t oldblock, block_t newblock)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *newbuf, *oldbuf;
	int err = 0;

	newbuf = vol_getblk(sb, newblock);
	if (!newbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	oldbuf = vol_bread(sb, oldblock);
	if (!oldbuf) {
		err = -EIO;	/* FIXME: error code */
		goto error_put_newbuf;
	}
	assert(bnode_sniff(bufdata(oldbuf)));

	memcpy(bufdata(newbuf), bufdata(oldbuf), bufsize(newbuf));
	mark_buffer_unify_atomic(newbuf);

	blockput(oldbuf);
error_put_newbuf:
	blockput(newbuf);
error:
	return err;
}

int replay_bnode_root(struct replay *rp, block_t root, unsigned count,
		      block_t left, block_t right, tuxkey_t rkey)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *rootbuf;

	rootbuf = vol_getblk(sb, root);
	if (!rootbuf)
		return -ENOMEM;

	bnode_buffer_init(rootbuf);
	bnode_init_root(bufdata(rootbuf), count, left, right, rkey);

	mark_buffer_unify_atomic(rootbuf);
	blockput(rootbuf);

	return 0;
}

/*
 * Before this replay, replay should already dirty the buffer of src.
 * (e.g. by redirect)
 */
int replay_bnode_split(struct replay *rp, block_t src, unsigned pos,
		       block_t dst)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}
	bnode_buffer_init(dstbuf);

	bnode_split(bufdata(srcbuf), pos, bufdata(dstbuf));

	mark_buffer_unify_non(srcbuf);
	mark_buffer_unify_atomic(dstbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

/*
 * Before this replay, replay should already dirty the buffer of bnodeblock.
 * (e.g. by redirect)
 */
static int replay_bnode_change(struct sb *sb, block_t bnodeblock,
			       u64 val1, u64 val2,
			       void (*change)(struct bnode *, u64, u64))
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct buffer_head *bnodebuf;

	bnodebuf = vol_getblk(sb, bnodeblock);
	if (!bnodebuf)
		return -ENOMEM;	/* FIXME: error code */

	struct bnode *bnode = bufdata(bnodebuf);
	change(bnode, val1, val2);

	mark_buffer_unify_non(bnodebuf);
	blockput(bnodebuf);

	return 0;
}

static void add_func(struct bnode *bnode, u64 child, u64 key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key) + 1;
	bnode_add_index(bnode, entry, child, key);
}

int replay_bnode_add(struct replay *rp, block_t parent, block_t child,
		     tuxkey_t key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, parent, child, key, add_func);
}

static void update_func(struct bnode *bnode, u64 child, u64 key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	entry->block = cpu_to_be64(child);
}

int replay_bnode_update(struct replay *rp, block_t parent, block_t child,
			tuxkey_t key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, parent, child, key, update_func);
}

int replay_bnode_merge(struct replay *rp, block_t src, block_t dst)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0, ret;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}

	ret = bnode_merge_nodes(sb, bufdata(dstbuf), bufdata(srcbuf));
	assert(ret == 1);

	mark_buffer_unify_non(dstbuf);
	mark_buffer_unify_non(srcbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

static void del_func(struct bnode *bnode, u64 key, u64 count)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	bnode_remove_index(bnode, entry, count);
}

int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key,
		     unsigned count)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, bnode, key, count, del_func);
}

static void adjust_func(struct bnode *bnode, u64 from, u64 to)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, from);
	assert(be64_to_cpu(entry->key) == from);
	entry->key = cpu_to_be64(to);
}

int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from,
			tuxkey_t to)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, bnode, from, to, adjust_func);
}
/*
 * Insert new leaf to next cursor position.
 * keep == 1: keep current cursor position.
 * keep == 0: set cursor position to new leaf.
 */
static int insert_leaf(struct cursor *cursor, tuxkey_t childkey,
		       struct buffer_head *leafbuf, int keep)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct btree *btree = cursor->btree;
	struct sb *sb = btree->sb;
	int level = btree->root.depth;
	block_t childblock = bufindex(leafbuf);

	if (keep)
		blockput(leafbuf);
	else {
		cursor_pop_blockput(cursor);
		cursor_push(cursor, leafbuf, NULL);
	}

	while (level--) {
		struct path_level *at = &cursor->path[level];
		struct buffer_head *parentbuf = at->buffer;
		struct bnode *parent = bufdata(parentbuf);

		/* insert and exit if not full */
		if (bcount(parent) < btree->sb->entries_per_node) {
			bnode_add_index(parent, at->next, childblock, childkey);
			if (!keep)
				at->next++;
			log_bnode_add(sb, bufindex(parentbuf), childblock, childkey);
			mark_buffer_unify_non(parentbuf);
			cursor_check(cursor);
			return 0;
		}

		/* split a full index node */
		struct buffer_head *newbuf = new_node(btree);
		if (IS_ERR(newbuf))
			return PTR_ERR(newbuf);

		struct bnode *newnode = bufdata(newbuf);
		unsigned half = bcount(parent) / 2;
		u64 newkey = be64_to_cpu(parent->entries[half].key);
		bnode_split(parent, half, newnode);
		log_bnode_split(sb, bufindex(parentbuf), half, bufindex(newbuf));

		/* if the cursor is in the new node, use that as the parent */
		int child_is_left = at->next <= parent->entries + half;
		if (!child_is_left) {
			struct index_entry *newnext;
			mark_buffer_unify_non(parentbuf);
			newnext = newnode->entries +
				(at->next - &parent->entries[half]);
			get_bh(newbuf);
			level_replace_blockput(cursor, level, newbuf, newnext);
			parentbuf = newbuf;
			parent = newnode;
		} else
			mark_buffer_unify_non(newbuf);

		bnode_add_index(parent, at->next, childblock, childkey);
		if (!keep)
			at->next++;
		log_bnode_add(sb, bufindex(parentbuf), childblock, childkey);
		mark_buffer_unify_non(parentbuf);

		childkey = newkey;
		childblock = bufindex(newbuf);
		blockput(newbuf);

		/*
		 * If child is in left bnode, we should keep the
		 * cursor position to child, otherwise adjust cursor
		 * to new bnode.
		 */
		keep = child_is_left;
	}

	/* Make new root bnode */
	trace("add tree level");
	struct buffer_head *newbuf = new_node(btree);
	if (IS_ERR(newbuf))
		return PTR_ERR(newbuf);

	struct bnode *newroot = bufdata(newbuf);
	block_t newrootblock = bufindex(newbuf);
	block_t oldrootblock = btree->root.block;
	int left_node = bufindex(cursor->path[0].buffer) != childblock;
	bnode_init_root(newroot, 2, oldrootblock, childblock, childkey);
	cursor_root_add(cursor, newbuf, newroot->entries + 1 + !left_node);
	log_bnode_root(sb, newrootblock, 2, oldrootblock, childblock, childkey);

	/* Change btree to point the new root */
	btree->root.block = newrootblock;
	btree->root.depth++;

	mark_buffer_unify_non(newbuf);
	tux3_mark_btree_dirty(btree);
	cursor_check(cursor);

	return 0;
}
/*
 * This is range deletion. So, instead of adjusting balance of the
 * space on sibling nodes for each change, this just removes the range
 * and merges from right to left even if it is not same parent.
 *
 *               +---------------- (A, B, C) -----------------+
 *               |                     |                      |
 *     +--- (AA, AB, AC) ---+   +- (BA, BB, BC) -+     +- (CA, CB, CC) -+
 *     |        |           |   |       |        |     |       |        |
 * (AAA,AAB)(ABA,ABB)(ACA,ACB) (BAA,BAB)(BBA)(BCA,BCB) (CAA)(CBA,CBB)(CCA)
 *
 * [less : A, AA, AAA, AAB, AB, ABA, ABB, AC, ACA, ACB, B, BA ... : greater]
 *
 * If we merged from cousin (or re-distributed), we may have to update
 * the index until common parent. (e.g. removed (ACB), then merged
 * from (BAA,BAB) to (ACA), we have to adjust B in root node to BB)
 *
 * See, adjust_parent_sep().
 *
 * FIXME: no re-distribute. so, we don't guarantee above than 50%
 * space efficiency. And if range is end of key (truncate() case), we
 * don't need to merge, and adjust_parent_sep().
 *
 * FIXME2: we may want to split chop work for each step. instead of
 * blocking for a long time.
 */
int btree_chop(struct btree *btree, tuxkey_t start, u64 len)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = btree->sb;
	struct btree_ops *ops = btree->ops;
	struct buffer_head **prev, *leafprev = NULL;
	struct chopped_index_info *cii;
	struct cursor *cursor;
	tuxkey_t limit;
	int ret, done = 0;

	if (!has_root(btree))
		return 0;

	/* Chop all range if len >= TUXKEY_LIMIT */
	limit = (len >= TUXKEY_LIMIT) ? TUXKEY_LIMIT : start + len;

	prev = malloc(sizeof(*prev) * btree->root.depth);
	if (prev == NULL)
		return -ENOMEM;
	memset(prev, 0, sizeof(*prev) * btree->root.depth);

	cii = malloc(sizeof(*cii) * btree->root.depth);
	if (cii == NULL) {
		ret = -ENOMEM;
		goto error_cii;
	}
	memset(cii, 0, sizeof(*cii) * btree->root.depth);

	cursor = alloc_cursor(btree, 0);
	if (!cursor) {
		ret = -ENOMEM;
		goto error_alloc_cursor;
	}

	down_write(&btree->lock);
	ret = btree_probe(cursor, start);
	if (ret)
		goto error_btree_probe;

	/* Walk leaves */
	while (1) {
		struct buffer_head *leafbuf;
		tuxkey_t this_key;

		/*
		 * FIXME: If leaf was merged and freed later, we don't
		 * need to redirect leaf and leaf_chop()
		 */
		if ((ret = cursor_redirect(cursor)))
			goto out;
		leafbuf = cursor_pop(cursor);

		/* Adjust start and len for this leaf */
		this_key = cursor_level_this_key(cursor);
		if (start < this_key) {
			if (limit < TUXKEY_LIMIT)
				len -= this_key - start;
			start = this_key;
		}

		ret = ops->leaf_chop(btree, start, len, bufdata(leafbuf));
		if (ret) {
			if (ret < 0) {
				blockput(leafbuf);
				goto out;
			}
			mark_buffer_dirty_non(leafbuf);
		}

		/* Try to merge this leaf with prev */
		if (leafprev) {
			if (try_leaf_merge(btree, leafprev, leafbuf)) {
				trace(">>> can merge leaf %p into leaf %p",
				      leafbuf, leafprev);
				remove_index(cursor, cii);
				mark_buffer_dirty_non(leafprev);
				blockput_free(sb, leafbuf);
				goto keep_prev_leaf;
			}
			blockput(leafprev);
		}
		leafprev = leafbuf;

keep_prev_leaf:
		if (cursor_level_next_key(cursor) >= limit)
			done = 1;

		/* Pop and try to merge finished nodes */
		while (done || cursor_level_finished(cursor)) {
			struct buffer_head *buf;
			int level = cursor->level;
			struct chopped_index_info *ciil = &cii[level];

			/* Get merge src buffer, and go parent level */
			buf = cursor_pop(cursor);

			/*
			 * Logging chopped indexes
			 * FIXME: If node is freed later (e.g. merged),
			 * we don't need to log this
			 */
			if (ciil->count) {
				log_bnode_del(sb, bufindex(buf), ciil->start,
					      ciil->count);
			}
			memset(ciil, 0, sizeof(*ciil));

			/* Try to merge node with prev */
			if (prev[level]) {
				assert(level);
				if (try_bnode_merge(sb, prev[level], buf)) {
					trace(">>> can merge node %p into node %p",
					      buf, prev[level]);
					remove_index(cursor, cii);
					mark_buffer_unify_non(prev[level]);
					blockput_free_unify(sb, buf);
					goto keep_prev_node;
				}
				blockput(prev[level]);
			}
			prev[level] = buf;

keep_prev_node:
			if (!level)
				goto chop_root;
		}

		/* Push back down to leaf level */
		do {
			ret = cursor_advance_down(cursor);
			if (ret < 0)
				goto out;
		} while (ret);
	}

chop_root:
	/* Remove depth if possible */
	while (btree->root.depth > 1 && bcount(bufdata(prev[0])) == 1) {
		trace("drop btree level");
		btree->root.block = bufindex(prev[1]);
		btree->root.depth--;
		tux3_mark_btree_dirty(btree);

		/*
		 * We know prev[0] is redirected and dirty. So, in
		 * here, we can just cancel bnode_redirect by bfree(),
		 * instead of defered_bfree()
		 * FIXME: we can optimize freeing bnode without
		 * bnode_redirect, and if we did, this is not true.
		 */
		bfree(sb, bufindex(prev[0]), 1);
		log_bnode_free(sb, bufindex(prev[0]));
		blockput_free_unify(sb, prev[0]);

		vecmove(prev, prev + 1, btree->root.depth);
	}
	ret = 0;

out:
	if (leafprev)
		blockput(leafprev);
	for (int i = 0; i < btree->root.depth; i++) {
		if (prev[i])
			blockput(prev[i]);
	}
	release_cursor(cursor);
error_btree_probe:
	up_write(&btree->lock);

	free_cursor(cursor);
error_alloc_cursor:
	free(cii);
error_cii:
	free(prev);

	return ret;
}
static Block*
ext2blockread(Fsys *fsys, u64int vbno)
{
	Block *bitb;
	Group g;
	uchar *bits;
	u32int bno, boff, bitblock;
	u64int bitpos;
	Ext2 *fs;

	fs = fsys->priv;
	if(vbno >= fs->nblock)
		return nil;
	bno = vbno;
	if(bno != vbno)
		return nil;

/*
	if(bno < fs->firstblock)
		return diskread(fs->disk, fs->blocksize,
			(u64int)bno*fs->blocksize);
*/
	if(bno < fs->firstblock)
		return nil;

	bno -= fs->firstblock;
	if(ext2group(fs, bno/fs->blockspergroup, &g) < 0){
		if(debug)
			fprint(2, "loading group: %r...");
		return nil;
	}
/*
	if(debug)
		fprint(2, "ext2 group %d: bitblock=%ud inodebitblock=%ud inodeaddr=%ud freeblocks=%ud freeinodes=%ud useddirs=%ud\n",
			(int)(bno/fs->blockspergroup), g.bitblock,
			g.inodebitblock, g.inodeaddr, g.freeblockscount,
			g.freeinodescount, g.useddirscount);
	if(debug)
		fprint(2, "group %d bitblock=%d...",
			bno/fs->blockspergroup, g.bitblock);
*/
	bitblock = g.bitblock;
	bitpos = (u64int)bitblock*fs->blocksize;

	if((bitb = diskread(fs->disk, fs->blocksize, bitpos)) == nil){
		if(debug)
			fprint(2, "loading bitblock: %r...");
		return nil;
	}
	bits = bitb->data;
	boff = bno%fs->blockspergroup;
	if((bits[boff>>3] & (1<<(boff&7))) == 0){
		if(debug)
			fprint(2, "block %d not allocated in group %d: bitblock %d/%lld bits[%d] = %#x\n",
				boff, bno/fs->blockspergroup, (int)bitblock,
				bitpos, boff>>3, bits[boff>>3]);
		blockput(bitb);
		return nil;
	}
	blockput(bitb);

	bno += fs->firstblock;
	return diskread(fs->disk, fs->blocksize, (u64int)bno*fs->blocksize);
}