/*
 * Copy records from userland to the target mirror.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.  In fact, there might not even be a root directory for
 * the PFS yet!
 */
int
hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	union hammer_ioc_mrecord_any mrec;
	struct hammer_cursor cursor;
	u_int32_t localization;
	int checkspace_count = 0;
	int error;
	int bytes;
	char *uptr;
	int seq;

	localization = (u_int32_t)mirror->pfs_id << 16;
	seq = trans->hmp->flusher.done;

	/*
	 * Validate the mirror structure and relocalize the tracking keys.
	 */
	if (mirror->size < 0 || mirror->size > 0x70000000)
		return(EINVAL);
	mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_beg.localization += localization;
	mirror->key_end.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_end.localization += localization;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;

	/*
	 * Set up our tracking cursor for the loop.  The tracking cursor
	 * is used to delete records that are no longer present on the
	 * master.  The last handled record at key_cur must be skipped.
	 */
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);

	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	error = hammer_btree_first(&cursor);
	if (error == 0)
		cursor.flags |= HAMMER_CURSOR_ATEDISK;
	if (error == ENOENT)
		error = 0;

	/*
	 * Loop until our input buffer has been exhausted.
	 */
	while (error == 0 &&
	       mirror->count + sizeof(mrec.head) <= mirror->size) {
		/*
		 * Don't blow out the buffer cache.  Leave room for frontend
		 * cache as well.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * If there is insufficient free space it may be due to
		 * reserved bigblocks, which flushing might fix.
		 */
		if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
		}

		/*
		 * Acquire and validate header
		 */
		if ((bytes = mirror->size - mirror->count) > sizeof(mrec))
			bytes = sizeof(mrec);
		uptr = (char *)mirror->ubuf + mirror->count;
		error = copyin(uptr, &mrec, bytes);
		if (error)
			break;
		if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
			error = EINVAL;
			break;
		}
		if (mrec.head.rec_size < sizeof(mrec.head) ||
		    mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE ||
		    mirror->count + mrec.head.rec_size > mirror->size) {
			error = EINVAL;
			break;
		}

		switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) {
		case HAMMER_MREC_TYPE_SKIP:
			if (mrec.head.rec_size != sizeof(mrec.skip))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_skip(
						&cursor, &mrec.skip,
						mirror, localization);
			break;
		case HAMMER_MREC_TYPE_REC:
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_rec(
						&cursor, &mrec.rec,
						mirror, localization,
						uptr + sizeof(mrec.rec));
			break;
		case HAMMER_MREC_TYPE_REC_NODATA:
		case HAMMER_MREC_TYPE_REC_BADCRC:
			/*
			 * Records with bad data payloads are ignored XXX.
			 * Records with no data payload have to be skipped
			 * (they shouldn't have been written in the
			 * first place).
			 */
			if (mrec.head.rec_size < sizeof(mrec.rec))
				error = EINVAL;
			break;
		case HAMMER_MREC_TYPE_PASS:
			if (mrec.head.rec_size != sizeof(mrec.rec))
				error = EINVAL;
			if (error == 0)
				error = hammer_ioc_mirror_write_pass(
						&cursor, &mrec.rec,
						mirror, localization);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * Retry the current record on deadlock, otherwise setup
		 * for the next loop.
		 */
		if (error == EDEADLK) {
			while (error == EDEADLK) {
				hammer_sync_lock_sh(trans);
				hammer_recover_cursor(&cursor);
				error = hammer_cursor_upgrade(&cursor);
				hammer_sync_unlock(trans);
			}
		} else {
			if (error == EALREADY)
				error = 0;
			if (error == 0) {
				mirror->count +=
					HAMMER_HEAD_DOALIGN(mrec.head.rec_size);
			}
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * cumulative error
	 */
	if (error) {
		mirror->head.flags |= HAMMER_IOC_HEAD_ERROR;
		mirror->head.error = error;
	}

	/*
	 * ioctls don't update the RW data structure if an error is returned,
	 * always return 0.
	 */
	return(0);
}
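#if 0
/*
 * Minimal userland sketch (not part of the kernel build) showing how a
 * mirroring client might drive one pass of this ioctl.  The
 * HAMMERIOC_MIRROR_WRITE command name is assumed from hammer_ioctl.h;
 * fd, pfs_id, and the packed mrecord stream in recbuf are hypothetical
 * caller state.  Because the kernel side always returns 0, progress and
 * failure must be read back from the structure itself: head.flags /
 * head.error carry the cumulative error, and key_cur / count report how
 * far the pass got so it can be resumed.
 */
static int
mirror_write_pass(int fd, int pfs_id, void *recbuf, int recbuf_bytes,
		  struct hammer_base_elm *key_curp,
		  struct hammer_base_elm key_end)
{
	struct hammer_ioc_mirror_rw mirror;

	bzero(&mirror, sizeof(mirror));
	mirror.key_beg = *key_curp;	/* resume point from last pass */
	mirror.key_cur = mirror.key_beg;
	mirror.key_end = key_end;
	mirror.pfs_id = pfs_id;
	mirror.ubuf = recbuf;		/* packed mrecord stream */
	mirror.size = recbuf_bytes;
	if (ioctl(fd, HAMMERIOC_MIRROR_WRITE, &mirror) < 0)
		return(errno);
	if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR)
		return(mirror.head.error);
	*key_curp = mirror.key_cur;	/* mirror.count bytes were consumed */
	return(0);
}
#endif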
/*
 * Reblock the B-Tree (leaf) node, record, and/or data if necessary.
 *
 * XXX We have no visibility into internal B-Tree nodes at the moment,
 * only leaf nodes.
 */
static int
hammer_reblock_helper(struct hammer_ioc_reblock *reblock,
		      hammer_cursor_t cursor, hammer_btree_elm_t elm)
{
	hammer_mount_t hmp;
	hammer_off_t tmp_offset;
	hammer_node_ondisk_t ondisk;
	struct hammer_btree_leaf_elm leaf;
	int error;
	int bytes;
	int cur;
	int iocflags;

	error = 0;
	hmp = cursor->trans->hmp;

	/*
	 * Reblock data.  Note that data embedded in a record is reblocked
	 * by the record reblock code.  Data processing only occurs at leaf
	 * nodes and for RECORD element types.
	 */
	if (cursor->node->ondisk->type != HAMMER_BTREE_TYPE_LEAF)
		goto skip;
	if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
		return(0);
	tmp_offset = elm->leaf.data_offset;
	if (tmp_offset == 0)
		goto skip;
	if (error)
		goto skip;

	/*
	 * NOTE: Localization restrictions may also have been set up; we
	 *	 can't just set the match flags willy-nilly here.
	 */
	switch (elm->leaf.base.rec_type) {
	case HAMMER_RECTYPE_INODE:
	case HAMMER_RECTYPE_SNAPSHOT:
	case HAMMER_RECTYPE_CONFIG:
		iocflags = HAMMER_IOC_DO_INODES;
		break;
	case HAMMER_RECTYPE_EXT:
	case HAMMER_RECTYPE_FIX:
	case HAMMER_RECTYPE_PFS:
	case HAMMER_RECTYPE_DIRENTRY:
		iocflags = HAMMER_IOC_DO_DIRS;
		break;
	case HAMMER_RECTYPE_DATA:
	case HAMMER_RECTYPE_DB:
		iocflags = HAMMER_IOC_DO_DATA;
		break;
	default:
		iocflags = 0;
		break;
	}
	if (reblock->head.flags & iocflags) {
		++reblock->data_count;
		reblock->data_byte_count += elm->leaf.data_len;
		bytes = hammer_blockmap_getfree(hmp, tmp_offset, &cur, &error);
		if (hammer_debug_general & 0x4000)
			kprintf("D %6d/%d\n", bytes, reblock->free_level);
		if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
		    bytes >= reblock->free_level) {
			/*
			 * This is nasty, the uncache code may have to get
			 * vnode locks and because of that we can't hold
			 * the cursor locked.
			 *
			 * WARNING: See warnings in hammer_unlock_cursor()
			 *	    function.
			 */
			leaf = elm->leaf;
			hammer_unlock_cursor(cursor);
			hammer_io_direct_uncache(hmp, &leaf);
			hammer_lock_cursor(cursor);

			/*
			 * elm may have become stale or invalid; reload it.
			 * The ondisk variable is temporary only.  Note that
			 * cursor->node, and thus cursor->node->ondisk, may
			 * also have changed.
			 */
			ondisk = cursor->node->ondisk;
			elm = &ondisk->elms[cursor->index];
			if (cursor->flags & HAMMER_CURSOR_RETEST) {
				kprintf("hammer: debug: retest on "
					"reblocker uncache\n");
				error = EDEADLK;
			} else if (ondisk->type != HAMMER_BTREE_TYPE_LEAF ||
				   cursor->index >= ondisk->count) {
				kprintf("hammer: debug: shifted on "
					"reblocker uncache\n");
				error = EDEADLK;
			} else if (bcmp(&elm->leaf, &leaf, sizeof(leaf))) {
				kprintf("hammer: debug: changed on "
					"reblocker uncache\n");
				error = EDEADLK;
			}
			if (error == 0)
				error = hammer_cursor_upgrade(cursor);
			if (error == 0) {
				KKASSERT(cursor->index < ondisk->count);
				error = hammer_reblock_data(reblock,
							    cursor, elm);
			}
			if (error == 0) {
				++reblock->data_moves;
				reblock->data_byte_moves += elm->leaf.data_len;
			}
		}
	}

skip:
	/*
	 * Reblock a B-Tree internal or leaf node.  A leaf node is reblocked
	 * on initial entry only (element 0).  An internal node is reblocked
	 * when entered upward from its first leaf node only (also element 0).
	 * Further revisits of the internal node (index > 0) are ignored.
	 */
	tmp_offset = cursor->node->node_offset;
	if (cursor->index == 0 &&
	    error == 0 && (reblock->head.flags & HAMMER_IOC_DO_BTREE)) {
		++reblock->btree_count;
		bytes = hammer_blockmap_getfree(hmp, tmp_offset, &cur, &error);
		if (hammer_debug_general & 0x4000)
			kprintf("B %6d/%d\n", bytes, reblock->free_level);
		if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
		    bytes >= reblock->free_level) {
			error = hammer_cursor_upgrade(cursor);
			if (error == 0) {
				if (cursor->parent) {
					KKASSERT(cursor->parent_index <
						 cursor->parent->ondisk->count);
					elm = &cursor->parent->ondisk->
						elms[cursor->parent_index];
				} else {
					elm = NULL;
				}
				switch(cursor->node->ondisk->type) {
				case HAMMER_BTREE_TYPE_LEAF:
					error = hammer_reblock_leaf_node(
							reblock, cursor, elm);
					break;
				case HAMMER_BTREE_TYPE_INTERNAL:
					error = hammer_reblock_int_node(
							reblock, cursor, elm);
					break;
				default:
					panic("Illegal B-Tree node type");
				}
			}
			if (error == 0) {
				++reblock->btree_moves;
			}
		}
	}
	hammer_cursor_downgrade(cursor);
	return(error);
}
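/*
 * Worked example of the move predicate used twice above (illustrative
 * numbers only; HAMMER big-blocks are 8MB).  hammer_blockmap_getfree()
 * returns the free byte count of the big-block backing the element, and
 * cur appears to flag the big-block currently being allocated from (as
 * suggested by the free_level == 0 bypass):
 *
 *	free_level = 4194304 (move anything in a <= 50%-full big-block)
 *
 *	bytes = 5242880, cur = 0 -> (cur == 0) && bytes >= free_level:
 *				    element is migrated and repacked
 *	bytes = 1048576, cur = 0 -> bytes < free_level: element stays
 *	bytes = 5242880, cur = 1 -> skipped; never reblock out of the
 *				    current allocation big-block unless
 *				    free_level == 0 (full defragment)
 */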
/*
 * NOTE: THIS CODE HAS BEEN REMOVED!  Pruning no longer attempts to realign
 *	 adjacent records because it seriously interferes with every
 *	 mirroring algorithm I could come up with.
 *
 *	 This means that historical accesses beyond the first snapshot
 *	 softlink should be on snapshot boundaries only.  Historical
 *	 accesses from "now" to the first snapshot softlink continue to
 *	 be fine-grained.
 *
 * NOTE: It also looks like there's a bug in the removed code.  It is
 *	 believed that create_tid can sometimes get set to
 *	 0xffffffffffffffff.  Just as well we no longer try to do this
 *	 fancy shit.  Probably the attempt to correct the rhb is blowing
 *	 up the cursor's indexing or addressing mapping.
 *
 * Align the record to cover any gaps created through the deletion of
 * records within the pruning space.  If we were to just delete the records
 * there would be gaps which in turn would cause a snapshot that is NOT on
 * a pruning boundary to appear corrupt to the user.  Forcing alignment
 * of the create_tid and delete_tid for retained records 'reconnects'
 * the previously contiguous space, making it contiguous again after the
 * deletions.
 *
 * The use of a reverse iteration allows us to safely align the records and
 * related elements without creating temporary overlaps.  XXX we should
 * add ordering dependencies for record buffers to guarantee consistency
 * during recovery.
 */
static int
realign_prune(struct hammer_ioc_prune *prune,
	      hammer_cursor_t cursor, int realign_cre, int realign_del)
{
	struct hammer_ioc_prune_elm *scan;
	hammer_btree_elm_t elm;
	hammer_tid_t delta;
	hammer_tid_t tid;
	int error;

	hammer_cursor_downgrade(cursor);

	elm = &cursor->node->ondisk->elms[cursor->index];
	++prune->stat_realignments;

	/*
	 * Align the create_tid.  By doing a reverse iteration we guarantee
	 * that all records after our current record have already been
	 * aligned, allowing us to safely correct the right-hand-boundary
	 * (because no record to our right, even if otherwise exactly
	 * matching, will have a create_tid to the left of our aligned
	 * create_tid).
	 */
	error = 0;
	if (realign_cre >= 0) {
		scan = &prune->elms[realign_cre];
		delta = (elm->leaf.base.create_tid - scan->beg_tid) %
			scan->mod_tid;
		if (delta) {
			tid = elm->leaf.base.create_tid - delta + scan->mod_tid;

			/* can EDEADLK */
			error = hammer_btree_correct_rhb(cursor, tid + 1);
			if (error == 0) {
				error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			}
			if (error == 0) {
				/* can EDEADLK */
				error = hammer_cursor_upgrade(cursor);
			}
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.create_tid,
					    sizeof(elm->leaf.base.create_tid));
				elm->leaf.base.create_tid = tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}

	/*
	 * Align the delete_tid.  This only occurs if the record is historical
	 * and was deleted at some point.  Realigning the delete_tid does not
	 * move the record within the B-Tree but may cause it to temporarily
	 * overlap a record that has not yet been pruned.
	 */
	if (error == 0 && realign_del >= 0) {
		scan = &prune->elms[realign_del];
		delta = (elm->leaf.base.delete_tid - scan->beg_tid) %
			scan->mod_tid;
		if (delta) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.delete_tid,
					    sizeof(elm->leaf.base.delete_tid));
				elm->leaf.base.delete_tid =
					    elm->leaf.base.delete_tid -
					    delta + scan->mod_tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}
	return (error);
}
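/*
 * Worked example of the alignment arithmetic above (illustrative numbers
 * only).  For a prune element with beg_tid = 1000 and mod_tid = 100, a
 * retained record with create_tid = 1234 is rounded UP to the next
 * pruning boundary (beg_tid + N * mod_tid):
 *
 *	delta = (1234 - 1000) % 100 = 34	(not on a boundary)
 *	tid   = 1234 - 34 + 100     = 1300	(beg_tid + 3 * mod_tid)
 *
 * Because the aligned tid moves to the right, the right-hand B-Tree
 * boundary must be corrected first via hammer_btree_correct_rhb(cursor,
 * tid + 1).  delete_tid is rounded up with the same formula, closing the
 * gap left by the records pruned out beneath it.
 */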