/*
 * Upgrade a shared lock to an exclusively held lock.  This function will
 * return EDEADLK if there is more than one shared holder.
 *
 * No error occurs and no action is taken if the lock is already exclusively
 * held by the caller.  If the lock is not held at all, or held exclusively
 * by someone else, this function will panic.
 */
int
hammer_lock_upgrade(struct hammer_lock *lock, int shcount)
{
        thread_t td = curthread;
        u_int lv;
        u_int nlv;
        int error;

        for (;;) {
                lv = lock->lockval;

                if ((lv & ~HAMMER_LOCKF_WANTED) == shcount) {
                        nlv = lv | HAMMER_LOCKF_EXCLUSIVE;
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
                                lock->lowner = td;
                                error = 0;
                                break;
                        }
                } else if (lv & HAMMER_LOCKF_EXCLUSIVE) {
                        if (lock->lowner != curthread)
                                hpanic("illegal state");
                        error = 0;
                        break;
                } else if ((lv & ~HAMMER_LOCKF_WANTED) == 0) {
                        hpanic("lock is not held");
                        /* NOT REACHED */
                        error = EDEADLK;
                        break;
                } else {
                        error = EDEADLK;
                        break;
                }
        }
        return (error);
}
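/*
 * Illustrative caller pattern (a minimal sketch, not part of the build):
 * upgrade a single shared hold to exclusive and fall back when other
 * shared holders exist.  hammer_lock_sh() is assumed to be the shared
 * acquire entry point of this lock API; hammer_lock_ex(), hammer_unlock()
 * and hammer_lock_upgrade() appear elsewhere in this file.
 */
#if 0
static void
example_upgrade(struct hammer_lock *lock)
{
        int error;

        hammer_lock_sh(lock);                   /* one shared hold */
        error = hammer_lock_upgrade(lock, 1);   /* we hold exactly 1 share */
        if (error == EDEADLK) {
                /*
                 * Another shared holder exists.  Drop the lock and
                 * re-acquire exclusively; any state derived while the
                 * lock was held must be revalidated afterwards.
                 */
                hammer_unlock(lock);
                hammer_lock_ex(lock);
        }
        /* ... exclusive work ... */
        hammer_unlock(lock);
}
#endif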
void
hammer_unlock(struct hammer_lock *lock)
{
        thread_t td __debugvar = curthread;
        u_int lv;
        u_int nlv;

        lv = lock->lockval;
        KKASSERT(lv != 0);
        if (lv & HAMMER_LOCKF_EXCLUSIVE)
                KKASSERT(lock->lowner == td);

        for (;;) {
                lv = lock->lockval;
                nlv = lv & ~(HAMMER_LOCKF_EXCLUSIVE | HAMMER_LOCKF_WANTED);
                if (nlv > 1) {
                        nlv = lv - 1;
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv))
                                break;
                } else if (nlv == 1) {
                        nlv = 0;
                        if (lv & HAMMER_LOCKF_EXCLUSIVE)
                                lock->lowner = NULL;
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
                                if (lv & HAMMER_LOCKF_WANTED)
                                        wakeup(&lock->lockval);
                                break;
                        }
                } else {
                        hpanic("lock %p is not held", lock);
                }
        }
}
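/*
 * Illustrative encoding of lockval as used by hammer_lock_upgrade() and
 * hammer_unlock() above (an inference from the code, not a definitive
 * description): the low bits count lock holders, HAMMER_LOCKF_EXCLUSIVE
 * marks an exclusive holder, and HAMMER_LOCKF_WANTED marks blocked waiters
 * that are woken when the last holder releases.
 */
#if 0
        lv = (holders) |
             (exclusive ? HAMMER_LOCKF_EXCLUSIVE : 0) |
             (waiters   ? HAMMER_LOCKF_WANTED    : 0);
#endif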
/*
 * The calling thread must be holding a shared or exclusive lock.
 * Returns < 0 if the lock is held shared, and > 0 if held exclusively.
 */
int
hammer_lock_status(struct hammer_lock *lock)
{
        u_int lv = lock->lockval;

        if (lv & HAMMER_LOCKF_EXCLUSIVE)
                return(1);
        else if (lv)
                return(-1);
        hpanic("lock must be held: %p", lock);
        /* NOT REACHED */
}
/*
 * Return the demarcation point between the two offsets where
 * the block size changes.
 */
int64_t
hammer_blockdemarc(int64_t file_offset1, int64_t file_offset2)
{
        if (file_offset1 < HAMMER_XDEMARC) {
                if (file_offset2 <= HAMMER_XDEMARC)
                        return(file_offset2);
                return(HAMMER_XDEMARC);
        }
        hpanic("illegal range %lld %lld",
               (long long)file_offset1, (long long)file_offset2);
}
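/*
 * Behavior sketch (illustrative only): for a range starting below
 * HAMMER_XDEMARC the function clamps the range end to the point where
 * HAMMER switches to the larger block size.  The offsets below are
 * hypothetical; only their relation to HAMMER_XDEMARC matters.
 */
#if 0
        int64_t end;

        /* range entirely below the demarcation point: end returned as-is */
        end = hammer_blockdemarc(0, HAMMER_XDEMARC / 2);

        /* range crossing the demarcation point: clamped to HAMMER_XDEMARC */
        end = hammer_blockdemarc(0, HAMMER_XDEMARC * 2);

        /* range starting at or above HAMMER_XDEMARC: panics */
#endif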
/*
 * nnode is a newly allocated node which replaces onode.  elm is the
 * element within nnode's parent that is adjusted to point at nnode,
 * or nnode becomes the root node if elm is NULL (does not exist).
 */
static void
hammer_move_node(hammer_cursor_t cursor, hammer_btree_elm_t elm,
                 hammer_node_t onode, hammer_node_t nnode)
{
        int error, i;

        bcopy(onode->ondisk, nnode->ondisk, sizeof(*nnode->ondisk));

        /*
         * Adjust the parent's pointer to us first.
         */
        if (elm) {
                /*
                 * We are not the root of the B-Tree
                 */
                KKASSERT(hammer_is_internal_node_elm(elm));
                hammer_modify_node(cursor->trans, cursor->parent,
                                   &elm->internal.subtree_offset,
                                   sizeof(elm->internal.subtree_offset));
                elm->internal.subtree_offset = nnode->node_offset;
                hammer_modify_node_done(cursor->parent);
        } else {
                /*
                 * We are the root of the B-Tree
                 */
                hammer_volume_t volume;

                volume = hammer_get_root_volume(cursor->trans->hmp, &error);
                KKASSERT(error == 0);

                hammer_modify_volume_field(cursor->trans, volume,
                                           vol0_btree_root);
                volume->ondisk->vol0_btree_root = nnode->node_offset;
                hammer_modify_volume_done(volume);
                hammer_rel_volume(volume, 0);
        }

        /*
         * Now adjust our children's pointers to us
         * if we are an internal node.
         */
        if (nnode->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                for (i = 0; i < nnode->ondisk->count; ++i) {
                        error = btree_set_parent_of_child(cursor->trans, nnode,
                                        &nnode->ondisk->elms[i]);
                        if (error)
                                hpanic("reblock internal node: fixup problem");
                }
        }
}
/*
 * This is the same as hammer_ref_interlock() but asserts that the
 * 0->1 transition is always true, thus the lock must have no references
 * on entry or have CHECK set, and will have one reference with the
 * interlock held on return.  It must also not be interlocked on entry
 * by anyone.
 *
 * NOTE that CHECK will never be found set when the ref-count is 0.
 *
 * 1 is always returned to match the API for hammer_ref_interlock().
 * This function returns with one ref, the lock held, and the CHECK bit set.
 */
int
hammer_ref_interlock_true(struct hammer_lock *lock)
{
        u_int lv;
        u_int nlv;

        for (;;) {
                lv = lock->refs;

                if (lv) {
                        hpanic("bad lock %p %08x", lock, lock->refs);
                }
                nlv = 1 | HAMMER_REFS_LOCKED | HAMMER_REFS_CHECK;
                if (atomic_cmpset_int(&lock->refs, lv, nlv)) {
                        lock->rowner = curthread;
                        return (1);
                }
        }
}
/*
 * ALGORITHM VERSION 0:
 *	Return a namekey hash.  The 64 bit namekey hash consists of a 32 bit
 *	crc in the MSB and 0 in the LSB.  The caller will use the low 32 bits
 *	to generate a unique key and will scan all entries with the same upper
 *	32 bits when issuing a lookup.
 *
 *	0hhhhhhhhhhhhhhh hhhhhhhhhhhhhhhh 0000000000000000 0000000000000000
 *
 * ALGORITHM VERSION 1:
 *
 *	This algorithm breaks the filename down into separate 32-bit crcs
 *	for each filename segment separated by a special character (dot,
 *	dash, underscore, or tilde).  The CRCs are then added together.
 *	This allows temporary names to be handled gracefully (see the
 *	create/rename note below).  A full-filename 16 bit crc is also
 *	generated to deal with degenerate conditions.
 *
 *	The algorithm is designed to handle create/rename situations such
 *	that a create with an extension followed by a rename without the
 *	extension only shifts the key space rather than randomizing it.
 *
 *	NOTE: The inode allocator cache can only match 10 bits so we do
 *	      not really have any room for a partial sorted name, and
 *	      numbers don't sort well in that situation anyway.
 *
 *	0mmmmmmmmmmmmmmm mmmmmmmmmmmmmmmm llllllllllllllll 0000000000000000
 *
 *
 * We strip bit 63 in order to provide a positive key, this way a seek
 * offset of 0 will represent the base of the directory.
 *
 * We usually strip bit 0 (set it to 0) in order to provide a consistent
 * iteration space for collisions.
 *
 * This function can never return 0.  We use the MSB-0 space to synthesize
 * artificial directory entries such as "." and "..".
 */
int64_t
hammer_directory_namekey(hammer_inode_t dip, const void *name, int len,
                         u_int32_t *max_iterationsp)
{
        const char *aname = name;
        int32_t crcx;
        int64_t key;
        int i;
        int j;

        switch (dip->ino_data.cap_flags & HAMMER_INODE_CAP_DIRHASH_MASK) {
        case HAMMER_INODE_CAP_DIRHASH_ALG0:
                /*
                 * Original algorithm
                 */
                key = (int64_t)(crc32(aname, len) & 0x7FFFFFFF) << 32;
                if (key == 0)
                        key |= 0x100000000LL;
                *max_iterationsp = 0xFFFFFFFFU;
                break;
        case HAMMER_INODE_CAP_DIRHASH_ALG1:
                /*
                 * Filesystem version 6 or better will create directories
                 * using the ALG1 dirhash.  This hash breaks the filename
                 * up into domains separated by special characters and
                 * hashes each domain independently.
                 *
                 * A simple sub-sort using the first character of the
                 * filename in the top 5 bits is sketched below but
                 * currently disabled (see the #if 0 block).
                 */
                key = 0;

                /*
                 * m32
                 */
                crcx = 0;
                for (i = j = 0; i < len; ++i) {
                        if (aname[i] == '.' ||
                            aname[i] == '-' ||
                            aname[i] == '_' ||
                            aname[i] == '~') {
                                if (i != j)
                                        crcx += crc32(aname + j, i - j);
                                j = i + 1;
                        }
                }
                if (i != j)
                        crcx += crc32(aname + j, i - j);

#if 0
                /*
                 * xor top 5 bits 0mmmm into low bits and steal the top 5
                 * bits as a semi sub sort using the first character of
                 * the filename.  bit 63 is always left as 0 so directory
                 * keys are positive numbers.
                 */
                crcx ^= (uint32_t)crcx >> (32 - 5);
                crcx = (crcx & 0x07FFFFFF) | ((aname[0] & 0x0F) << (32 - 5));
#endif
                crcx &= 0x7FFFFFFFU;

                key |= (uint64_t)crcx << 32;

                /*
                 * l16 - crc of entire filename
                 *
                 * This crc reduces degenerate hash collision conditions.
                 */
                crcx = crc32(aname, len);
                crcx = crcx ^ (crcx << 16);
                key |= crcx & 0xFFFF0000U;

                /*
                 * Cleanup
                 */
                if ((key & 0xFFFFFFFF00000000LL) == 0)
                        key |= 0x100000000LL;
                if (hammer_debug_general & 0x0400) {
                        hdkprintf("0x%016llx %*.*s\n",
                                (long long)key, len, len, aname);
                }
                *max_iterationsp = 0x00FFFFFF;
                break;
        case HAMMER_INODE_CAP_DIRHASH_ALG2:
        case HAMMER_INODE_CAP_DIRHASH_ALG3:
        default:
                key = 0;                        /* compiler warning */
                *max_iterationsp = 1;           /* sanity */
                hpanic("bad algorithm %p", dip);
                break;
        }
        return(key);
}
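/*
 * Illustrative property of the ALG1 hash (not part of the build): because
 * the m32 field is the sum of per-segment CRCs, renaming "name.tmp" to
 * "name" shifts the upper 32 bits of the key by crc32("tmp") instead of
 * changing them to an unrelated value.  The filenames below are
 * hypothetical.
 */
#if 0
        /* m32 of "name.tmp" sums the segment CRCs on either side of the dot */
        m32 = (crc32("name", 4) + crc32("tmp", 3)) & 0x7FFFFFFFU;

        /*
         * m32 of "name" after the rename is just crc32("name"), so the
         * upper key bits shift rather than randomize.  The l16 field
         * (bits 16-31) is recomputed over the full filename in both cases.
         */
        m32 = crc32("name", 4) & 0x7FFFFFFFU;
#endif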
/*
 * Reblock the B-Tree (leaf) node, record, and/or data if necessary.
 *
 * XXX We have no visibility into internal B-Tree nodes at the moment,
 *     only leaf nodes.
 */
static int
hammer_reblock_helper(struct hammer_ioc_reblock *reblock,
                      hammer_cursor_t cursor, hammer_btree_elm_t elm)
{
        hammer_mount_t hmp;
        hammer_off_t tmp_offset;
        hammer_node_ondisk_t ondisk;
        struct hammer_btree_leaf_elm leaf;
        int error;
        int bytes;
        int cur;
        int iocflags;

        error = 0;
        hmp = cursor->trans->hmp;

        /*
         * Reblock data.  Note that data embedded in a record is reblocked
         * by the record reblock code.  Data processing only occurs at leaf
         * nodes and for RECORD element types.
         */
        if (cursor->node->ondisk->type != HAMMER_BTREE_TYPE_LEAF)
                goto skip;
        if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
                return(EINVAL);
        tmp_offset = elm->leaf.data_offset;
        if (tmp_offset == 0)
                goto skip;

        /*
         * If reblock->vol_no is specified we only want to reblock data
         * in that volume, but ignore everything else.
         */
        if (reblock->vol_no != -1 &&
            reblock->vol_no != HAMMER_VOL_DECODE(tmp_offset))
                goto skip;

        /*
         * NOTE: Localization restrictions may also have been set-up, we can't
         *       just set the match flags willy-nilly here.
         */
        switch(elm->leaf.base.rec_type) {
        case HAMMER_RECTYPE_INODE:
        case HAMMER_RECTYPE_SNAPSHOT:
        case HAMMER_RECTYPE_CONFIG:
                iocflags = HAMMER_IOC_DO_INODES;
                break;
        case HAMMER_RECTYPE_EXT:
        case HAMMER_RECTYPE_FIX:
        case HAMMER_RECTYPE_PFS:
        case HAMMER_RECTYPE_DIRENTRY:
                iocflags = HAMMER_IOC_DO_DIRS;
                break;
        case HAMMER_RECTYPE_DATA:
        case HAMMER_RECTYPE_DB:
                iocflags = HAMMER_IOC_DO_DATA;
                break;
        default:
                iocflags = 0;
                break;
        }
        if (reblock->head.flags & iocflags) {
                ++reblock->data_count;
                reblock->data_byte_count += elm->leaf.data_len;
                bytes = hammer_blockmap_getfree(hmp, tmp_offset, &cur, &error);
                if (hammer_debug_general & 0x4000)
                        hdkprintf("D %6d/%d\n", bytes, reblock->free_level);
                /*
                 * Start data reblock if
                 * 1. there is no error
                 * 2. the data and allocator offset are not in the same
                 *    big-block, or the free level threshold is 0
                 * 3. free bytes in the data's big-block exceed the free
                 *    level threshold (a threshold of 0 therefore reblocks
                 *    unconditionally).
                 */
                if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
                    bytes >= reblock->free_level) {
                        /*
                         * This is nasty, the uncache code may have to get
                         * vnode locks and because of that we can't hold
                         * the cursor locked.
                         *
                         * WARNING: See warnings in hammer_unlock_cursor()
                         *          function.
                         */
                        leaf = elm->leaf;
                        hammer_unlock_cursor(cursor);
                        hammer_io_direct_uncache(hmp, &leaf);
                        hammer_lock_cursor(cursor);

                        /*
                         * elm may have become stale or invalid, reload it.
                         * ondisk variable is temporary only.  Note that
                         * cursor->node and thus cursor->node->ondisk may
                         * also have changed.
                         */
                        ondisk = cursor->node->ondisk;
                        elm = &ondisk->elms[cursor->index];
                        if (cursor->flags & HAMMER_CURSOR_RETEST) {
                                hkprintf("debug: retest on reblocker uncache\n");
                                error = EDEADLK;
                        } else if (ondisk->type != HAMMER_BTREE_TYPE_LEAF ||
                                   cursor->index >= ondisk->count) {
                                hkprintf("debug: shifted on reblocker uncache\n");
                                error = EDEADLK;
                        } else if (bcmp(&elm->leaf, &leaf, sizeof(leaf))) {
                                hkprintf("debug: changed on reblocker uncache\n");
                                error = EDEADLK;
                        }

                        if (error == 0)
                                error = hammer_cursor_upgrade(cursor);
                        if (error == 0) {
                                KKASSERT(cursor->index < ondisk->count);
                                error = hammer_reblock_data(reblock,
                                                            cursor, elm);
                        }
                        if (error == 0) {
                                ++reblock->data_moves;
                                reblock->data_byte_moves += elm->leaf.data_len;
                        }
                }
        }

skip:
        /*
         * Reblock a B-Tree internal or leaf node.  A leaf node is reblocked
         * on initial entry only (element 0).  An internal node is reblocked
         * when entered upward from its first leaf node only (also element 0,
         * see hammer_btree_iterate() where the cursor moves up and may
         * return).  Further revisits of the internal node (index > 0) are
         * ignored.
         */
        tmp_offset = cursor->node->node_offset;

        /*
         * If reblock->vol_no is specified we only want to reblock data
         * in that volume, but ignore everything else.
         */
        if (reblock->vol_no != -1 &&
            reblock->vol_no != HAMMER_VOL_DECODE(tmp_offset))
                goto end;

        if (cursor->index == 0 &&
            error == 0 && (reblock->head.flags & HAMMER_IOC_DO_BTREE)) {
                ++reblock->btree_count;
                bytes = hammer_blockmap_getfree(hmp, tmp_offset, &cur, &error);
                if (hammer_debug_general & 0x4000)
                        hdkprintf("B %6d/%d\n", bytes, reblock->free_level);
                /*
                 * Start node reblock if
                 * 1. there is no error
                 * 2. the node and allocator offset are not in the same
                 *    big-block, or the free level threshold is 0
                 * 3. free bytes in the node's big-block exceed the free
                 *    level threshold (a threshold of 0 therefore reblocks
                 *    unconditionally).
                 */
                if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
                    bytes >= reblock->free_level) {
                        error = hammer_cursor_upgrade(cursor);
                        if (error == 0) {
                                if (cursor->parent) {
                                        KKASSERT(cursor->parent_index <
                                                 cursor->parent->ondisk->count);
                                        elm = &cursor->parent->ondisk->elms[cursor->parent_index];
                                } else {
                                        elm = NULL;
                                }
                                switch(cursor->node->ondisk->type) {
                                case HAMMER_BTREE_TYPE_LEAF:
                                        error = hammer_reblock_leaf_node(
                                                        reblock, cursor, elm);
                                        break;
                                case HAMMER_BTREE_TYPE_INTERNAL:
                                        error = hammer_reblock_int_node(
                                                        reblock, cursor, elm);
                                        break;
                                default:
                                        hpanic("Illegal B-Tree node type");
                                }
                        }
                        if (error == 0) {
                                ++reblock->btree_moves;
                        }
                }
        }
end:
        hammer_cursor_downgrade(cursor);
        return(error);
}
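/*
 * Illustrative configuration of a reblock request (a minimal sketch, not
 * part of the build).  Only the fields consulted by the helper above are
 * shown; how the request is actually issued from userland (the ioctl path)
 * is outside the scope of this sketch.
 */
#if 0
        struct hammer_ioc_reblock reblock;

        bzero(&reblock, sizeof(reblock));
        reblock.head.flags = HAMMER_IOC_DO_DATA |       /* data records */
                             HAMMER_IOC_DO_DIRS |       /* directory entries */
                             HAMMER_IOC_DO_INODES |     /* inode records */
                             HAMMER_IOC_DO_BTREE;       /* B-Tree nodes */
        reblock.free_level = 0;         /* 0 = reblock unconditionally */
        reblock.vol_no = -1;            /* all volumes */
#endif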
/*
 * Generate UNDO record(s) for the block of data at the specified zone1
 * or zone2 offset.
 *
 * The recovery code will execute UNDOs in reverse order, allowing overlaps.
 * All the UNDOs are executed together so if we already laid one down we
 * do not have to lay another one down for the same range.
 *
 * For HAMMER version 4+ UNDO a 512 byte boundary is enforced and a PAD
 * will be laid down for any unused space.  UNDO FIFO media structures
 * will implement the hdr_seq field (it used to be reserved01), and
 * both flush and recovery mechanics will be very different.
 *
 * WARNING!  See also hammer_generate_redo() in hammer_redo.c
 */
int
hammer_generate_undo(hammer_transaction_t trans,
                     hammer_off_t zone_off, void *base, int len)
{
        hammer_mount_t hmp;
        hammer_volume_t root_volume;
        hammer_blockmap_t undomap;
        hammer_buffer_t buffer = NULL;
        hammer_fifo_undo_t undo;
        hammer_fifo_tail_t tail;
        hammer_off_t next_offset;
        int error;
        int bytes;
        int n;

        hmp = trans->hmp;

        /*
         * A SYNC record may be required before we can lay down a general
         * UNDO.  This ensures that the nominal recovery span contains
         * at least one SYNC record telling the recovery code how far
         * out-of-span it must go to run the REDOs.
         */
        if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
            hmp->version >= HAMMER_VOL_VERSION_FOUR) {
                hammer_generate_redo_sync(trans);
        }

        /*
         * Enter the offset into our undo history.  If there is an existing
         * undo we do not have to generate a new one.
         */
        if (hammer_enter_undo_history(hmp, zone_off, len) == EALREADY)
                return(0);

        root_volume = trans->rootvol;
        undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

        /* no undo recursion */
        hammer_modify_volume_noundo(NULL, root_volume);
        hammer_lock_ex(&hmp->undo_lock);

        /* undo had better not roll over (loose test) */
        if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
                hpanic("insufficient UNDO/REDO FIFO space for undo!");

        /*
         * Loop until the undo for the entire range has been laid down.
         */
        while (len) {
                /*
                 * Fetch the layout offset in the UNDO FIFO, wrap it as
                 * necessary.
                 */
                if (undomap->next_offset == undomap->alloc_offset)
                        undomap->next_offset = HAMMER_ENCODE_UNDO(0);
                next_offset = undomap->next_offset;

                /*
                 * This is a tail-chasing FIFO, when we hit the start of a new
                 * buffer we don't have to read it in.
                 */
                if ((next_offset & HAMMER_BUFMASK) == 0) {
                        undo = hammer_bnew(hmp, next_offset, &error, &buffer);
                        hammer_format_undo(undo, hmp->undo_seqno ^ 0x40000000);
                } else {
                        undo = hammer_bread(hmp, next_offset, &error, &buffer);
                }
                if (error)
                        break;

                /* no undo recursion */
                hammer_modify_buffer_noundo(NULL, buffer);

                /*
                 * Calculate how big a media structure fits up to the next
                 * alignment point and how large a data payload we can
                 * accommodate.
                 *
                 * If n calculates to 0 or negative there is no room for
                 * anything but a PAD.
                 */
                bytes = HAMMER_UNDO_ALIGN -
                        ((int)next_offset & HAMMER_UNDO_MASK);
                n = bytes -
                    (int)sizeof(struct hammer_fifo_undo) -
                    (int)sizeof(struct hammer_fifo_tail);

                /*
                 * If available space is insufficient for any payload
                 * we have to lay down a PAD.
                 *
                 * The minimum PAD is 8 bytes and the head and tail will
                 * overlap each other in that case.  PADs do not have
                 * sequence numbers or CRCs.
                 *
                 * A PAD may not start on a boundary.  That is, every
                 * 512-byte block in the UNDO/REDO FIFO must begin with
                 * a record containing a sequence number.
                 */
                if (n <= 0) {
                        KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
                        KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
                        tail = (void *)((char *)undo + bytes - sizeof(*tail));
                        if ((void *)undo != (void *)tail) {
                                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                                tail->tail_type = HAMMER_HEAD_TYPE_PAD;
                                tail->tail_size = bytes;
                        }
                        undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                        undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
                        undo->head.hdr_size = bytes;
                        /* NO CRC OR SEQ NO */
                        undomap->next_offset += bytes;
                        hammer_modify_buffer_done(buffer);
                        hammer_stats_undo += bytes;
                        continue;
                }

                /*
                 * Calculate the actual payload and recalculate the size
                 * of the media structure as necessary.
                 */
                if (n > len) {
                        n = len;
                        bytes = HAMMER_HEAD_DOALIGN(n) +
                                (int)sizeof(struct hammer_fifo_undo) +
                                (int)sizeof(struct hammer_fifo_tail);
                }
                if (hammer_debug_general & 0x0080) {
                        hdkprintf("undo %016jx %d %d\n",
                                (intmax_t)next_offset, bytes, n);
                }

                undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                undo->head.hdr_type = HAMMER_HEAD_TYPE_UNDO;
                undo->head.hdr_size = bytes;
                undo->head.hdr_seq = hmp->undo_seqno++;
                undo->head.hdr_crc = 0;
                undo->undo_offset = zone_off;
                undo->undo_data_bytes = n;
                bcopy(base, undo + 1, n);

                tail = (void *)((char *)undo + bytes - sizeof(*tail));
                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                tail->tail_type = HAMMER_HEAD_TYPE_UNDO;
                tail->tail_size = bytes;

                KKASSERT(bytes >= sizeof(undo->head));
                hammer_crc_set_fifo_head(&undo->head, bytes);
                undomap->next_offset += bytes;
                hammer_stats_undo += bytes;

                /*
                 * Before we finish off the buffer we have to deal with any
                 * junk between the end of the media structure we just laid
                 * down and the UNDO alignment boundary.  We do this by laying
                 * down a dummy PAD.  Even though we will probably overwrite
                 * it almost immediately we have to do this so recovery runs
                 * can iterate the UNDO space without having to depend on
                 * the indices in the volume header.
                 *
                 * This dummy PAD will be overwritten on the next undo so
                 * we do not adjust undomap->next_offset.
                 */
                bytes = HAMMER_UNDO_ALIGN -
                        ((int)undomap->next_offset & HAMMER_UNDO_MASK);
                if (bytes != HAMMER_UNDO_ALIGN) {
                        KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
                        undo = (void *)(tail + 1);
                        tail = (void *)((char *)undo + bytes - sizeof(*tail));
                        if ((void *)undo != (void *)tail) {
                                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                                tail->tail_type = HAMMER_HEAD_TYPE_PAD;
                                tail->tail_size = bytes;
                        }
                        undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                        undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
                        undo->head.hdr_size = bytes;
                        /* NO CRC OR SEQ NO */
                }
                hammer_modify_buffer_done(buffer);

                /*
                 * Adjust for loop
                 */
                len -= n;
                base = (char *)base + n;
                zone_off += n;
        }
        hammer_modify_volume_done(root_volume);
        hammer_unlock(&hmp->undo_lock);

        if (buffer)
                hammer_rel_buffer(buffer, 0);
        return(error);
}
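/*
 * Illustrative layout of one UNDO FIFO record as laid down above (an
 * inference from the code, not a definitive media specification).  The
 * payload length n is hypothetical; structure sizes come from
 * hammer_fifo_undo/hammer_fifo_tail as used in the code.
 *
 *	+------------------------------+  <- next_offset (inside a
 *	| hammer_fifo_undo header      |     HAMMER_UNDO_ALIGN slot)
 *	|   hdr_seq, hdr_crc,          |
 *	|   undo_offset, data_bytes    |
 *	+------------------------------+
 *	| payload: n bytes of caller   |
 *	|   data, rounded up by        |
 *	|   HAMMER_HEAD_DOALIGN(n)     |
 *	+------------------------------+
 *	| hammer_fifo_tail             |
 *	+------------------------------+
 *	| dummy PAD up to the next     |
 *	|   HAMMER_UNDO_ALIGN boundary |
 *	|   (overwritten by the next   |
 *	|   undo; next_offset is not   |
 *	|   advanced past it)          |
 *	+------------------------------+
 */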