/*
 * Reap a dangling inode encountered during the prune scan.
 *
 * An inode record can be left with a zero link count when a process was
 * still holding a descriptor on a deleted file at the time the machine
 * crashed.  For such a record we simply acquire the in-memory inode and
 * release it again; the inode handling code is relied upon to do the
 * actual cleanup.
 *
 * Records that are not inodes, that already carry a delete_tid, whose data
 * cannot be extracted, or that still have links are silently ignored.
 */
static
void
prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
{
	hammer_inode_t dangling;
	int error;

	/* Only live (not yet deleted) inode records are of interest. */
	if (elm->base.rec_type != HAMMER_RECTYPE_INODE ||
	    elm->base.delete_tid != 0) {
		return;
	}

	/* The link count lives in the record data, so load that first. */
	if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA) != 0)
		return;
	if (cursor->data->inode.nlinks != 0)
		return;

	hammer_cursor_downgrade(cursor);
	dangling = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id,
				    HAMMER_MAX_TID,
				    elm->base.localization &
					HAMMER_LOCALIZE_PSEUDOFS_MASK,
				    0, &error);
	if (dangling == NULL) {
		kprintf("unable to prune disconnected inode %016llx\n",
			(long long)elm->base.obj_id);
		return;
	}

	if (hammer_debug_general & 0x0001) {
		kprintf("pruning disconnected inode %016llx\n",
			(long long)elm->base.obj_id);
	}

	/* Dropping the reference hands the inode to the reclaim path. */
	hammer_rel_inode(dangling, 0);
	hammer_inode_waitreclaims(cursor->trans);
}
/*
 * Relocate the data belonging to a B-Tree leaf record.
 *
 * New storage is allocated, the record's data is copied into it, the old
 * storage is handed back to the blockmap and the element's data_offset is
 * repointed at the new copy.
 *
 * Returns 0 on success, otherwise the error reported by the extraction or
 * by the allocation.  The reblock argument is not referenced here.
 */
static int
hammer_reblock_data(struct hammer_ioc_reblock *reblock,
		    hammer_cursor_t cursor, hammer_btree_elm_t elm)
{
	struct hammer_buffer *nbuf = NULL;
	hammer_off_t noffset;
	void *ncopy;
	int error;

	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
					     HAMMER_CURSOR_GET_LEAF);
	if (error)
		return (error);

	ncopy = hammer_alloc_data(cursor->trans, elm->leaf.data_len,
				  elm->leaf.base.rec_type,
				  &noffset, &nbuf,
				  0, &error);
	if (error == 0) {
		hammer_io_notmeta(nbuf);

		/*
		 * Duplicate the payload into the freshly allocated space.
		 */
		hammer_modify_buffer(cursor->trans, nbuf, NULL, 0);
		bcopy(cursor->data, ncopy, elm->leaf.data_len);
		hammer_modify_buffer_done(nbuf);

		/*
		 * Drop any data buffer cached in the cursor BEFORE freeing
		 * the old space.  The blockmap_free may release the entire
		 * large-block and cannot invalidate it while the cursor
		 * still holds a data buffer cached in that large-block.
		 */
		hammer_cursor_invalidate_cache(cursor);
		hammer_blockmap_free(cursor->trans,
				     elm->leaf.data_offset,
				     elm->leaf.data_len);

		/*
		 * Repoint the B-Tree element at the relocated data.
		 */
		hammer_modify_node(cursor->trans, cursor->node,
				   &elm->leaf.data_offset,
				   sizeof(hammer_off_t));
		elm->leaf.data_offset = noffset;
		hammer_modify_node_done(cursor->node);
	}

	if (nbuf)
		hammer_rel_buffer(nbuf, 0);
	return (error);
}
/*
 * Example #3
 * 0
 */
/*
 * Move a record's data to newly allocated storage.
 *
 * The data referenced by the leaf element is copied into a new allocation,
 * the old allocation is released through the blockmap, and the element's
 * data_offset is rewritten to reference the copy.
 *
 * Returns 0 on success or the error from hammer_btree_extract() /
 * hammer_alloc_data().  The reblock argument is unused.
 *
 * NOTE(review): another revision of this function in this file calls
 * hammer_cursor_invalidate_cache() before hammer_blockmap_free() so the
 * cursor is not holding a cached data buffer in the large-block being
 * freed.  Confirm whether this revision needs the same treatment.
 */
static int
hammer_reblock_data(struct hammer_ioc_reblock *reblock,
		    hammer_cursor_t cursor, hammer_btree_elm_t elm)
{
	struct hammer_buffer *dst_buffer = NULL;
	hammer_off_t dst_offset;
	void *dst;
	int error;

	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
					     HAMMER_CURSOR_GET_LEAF);
	if (error != 0)
		return (error);

	dst = hammer_alloc_data(cursor->trans, elm->leaf.data_len,
				elm->leaf.base.rec_type,
				&dst_offset, &dst_buffer, &error);
	if (error != 0)
		goto release;

	/* Copy the payload into its new home. */
	hammer_modify_buffer(cursor->trans, dst_buffer, NULL, 0);
	bcopy(cursor->data, dst, elm->leaf.data_len);
	hammer_modify_buffer_done(dst_buffer);

	/* Give the old space back to the blockmap. */
	hammer_blockmap_free(cursor->trans,
			     elm->leaf.data_offset, elm->leaf.data_len);

	/* Point the B-Tree element at the relocated data. */
	hammer_modify_node(cursor->trans, cursor->node,
			   &elm->leaf.data_offset, sizeof(hammer_off_t));
	elm->leaf.data_offset = dst_offset;
	hammer_modify_node_done(cursor->node);

release:
	if (dst_buffer != NULL)
		hammer_rel_buffer(dst_buffer, 0);
	return (error);
}
/*
 * Example #4
 * 0
 */
/*
 * All B-Tree records within the specified key range which also conform
 * to the transaction id range are returned.  Mirroring code keeps track
 * of the last transaction id fully scanned and can efficiently pick up
 * where it left off if interrupted.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.
 *
 * Output:
 *	Records (SKIP, PASS or REC) are copied out to mirror->ubuf at
 *	offset mirror->count, and mirror->count is advanced by the aligned
 *	size of each record written.  The scan ends without error as soon
 *	as the next record would not fit within mirror->size.
 *
 *	mirror->key_cur is updated to the key being processed; it is also
 *	the position an EDEADLK retry restarts from.  When the scan runs
 *	off the end of the range it is set to mirror->key_end.  The PFS
 *	bits are masked out of key_cur.localization before returning.
 *
 * Returns:
 *	EINVAL if either range key carries PFS localization bits or if
 *	key_beg sorts after key_end.  An EINTR result is converted into
 *	the HAMMER_IOC_HEAD_INTR flag in mirror->head.flags with a return
 *	value of 0.  Other errors from cursor setup, the B-Tree scan or
 *	copyout() are returned as-is.
 *
 * Side effect: HAMMER_TRANSF_CRCDOM is set in trans->flags and is not
 * cleared by this function.
 */
int
hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
		       struct hammer_ioc_mirror_rw *mirror)
{
	struct hammer_cmirror cmirror;
	struct hammer_cursor cursor;
	union hammer_ioc_mrecord_any mrec;
	hammer_btree_leaf_elm_t elm;
	const int crc_start = HAMMER_MREC_CRCOFF;
	char *uptr;
	int error;
	int data_len;
	int bytes;
	int eatdisk;
	int mrec_flags;
	u_int32_t localization;
	u_int32_t rec_crc;

	localization = (u_int32_t)mirror->pfs_id << 16;

	/*
	 * The caller's keys must not carry a PFS; it is supplied
	 * separately through mirror->pfs_id.
	 */
	if ((mirror->key_beg.localization | mirror->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
		return(EINVAL);

	mirror->key_cur = mirror->key_beg;
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	mirror->key_cur.localization += localization;
	bzero(&mrec, sizeof(mrec));
	bzero(&cmirror, sizeof(cmirror));

	/*
	 * Make CRC errors non-fatal (at least on data), causing an EDOM
	 * error instead of EIO.
	 */
	trans->flags |= HAMMER_TRANSF_CRCDOM;

retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = mirror->key_cur;
	cursor.key_end = mirror->key_end;
	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
	cursor.key_end.localization += localization;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * This flag filters the search to only return elements whos create
	 * or delete TID is >= mirror_tid.  The B-Tree uses the mirror_tid
	 * field stored with internal and leaf nodes to shortcut the scan.
	 */
	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
	cursor.cmirror = &cmirror;
	cmirror.mirror_tid = mirror->tid_beg;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Yield to more important tasks
		 */
		error = hammer_signal_check(trans->hmp);
		if (error)
			break;

		/*
		 * An internal node can be returned in mirror-filtered
		 * mode and indicates that the scan is returning a skip
		 * range in the cursor->cmirror structure.
		 */
		uptr = (char *)mirror->ubuf + mirror->count;
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Check space
			 */
			mirror->key_cur = cmirror.skip_beg;
			bytes = sizeof(mrec.skip);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_SKIP;
			mrec.head.rec_size = bytes;
			mrec.skip.skip_beg = cmirror.skip_beg;
			mrec.skip.skip_end = cmirror.skip_end;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						 bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 0;
			goto didwrite;
		}

		/*
		 * Leaf node.  In full-history mode we could filter out
		 * elements modified outside the user-requested TID range.
		 *
		 * However, such elements must be returned so the writer
		 * can compare them against the target to determine what
		 * needs to be deleted on the target, particular for
		 * no-history mirrors.
		 */
		KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		mirror->key_cur = elm->base;

		/*
		 * If the record was created after our end point we just
		 * ignore it.
		 */
		if (elm->base.create_tid > mirror->tid_end) {
			error = 0;
			bytes = 0;
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * Determine if we should generate a PASS or a REC.  PASS
		 * records are records without any data payload.  Such
		 * records will be generated if the target is already expected
		 * to have the record, allowing it to delete the gaps.
		 *
		 * A PASS record is also used to perform deletions on the
		 * target.
		 *
		 * Such deletions are needed if the master or files on the
		 * master are no-history, or if the slave is so far behind
		 * the master has already been pruned.
		 */
		if (elm->base.create_tid < mirror->tid_beg) {
			bytes = sizeof(mrec.rec);
			if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
			    mirror->size) {
				break;
			}

			/*
			 * Fill mrec.
			 */
			mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
			mrec.head.type = HAMMER_MREC_TYPE_PASS;
			mrec.head.rec_size = bytes;
			mrec.rec.leaf = *elm;
			mrec.head.rec_crc = crc32(&mrec.head.rec_size,
						 bytes - crc_start);
			error = copyout(&mrec, uptr, bytes);
			eatdisk = 1;
			goto didwrite;
		}

		/*
		 * The core code exports the data to userland.
		 *
		 * CRC errors on data are reported but passed through,
		 * but the data must be washed by the user program.
		 *
		 * If userland just wants the btree records it can
		 * request that bulk data not be returned.  This is
		 * use during mirror-stream histogram generation.
		 */
		mrec_flags = 0;
		data_len = (elm->data_offset) ? elm->data_len : 0;
		if (data_len &&
		    (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) {
			data_len = 0;
			mrec_flags |= HAMMER_MRECF_NODATA;
		}
		if (data_len) {
			error = hammer_btree_extract(&cursor,
						     HAMMER_CURSOR_GET_DATA);
			if (error) {
				if (error != EDOM)
					break;
				mrec_flags |= HAMMER_MRECF_CRC_ERROR |
					      HAMMER_MRECF_DATA_CRC_BAD;

				/*
				 * The CRC failure is now carried by the
				 * record flags.  Clear it so that running
				 * out of buffer space just below ends the
				 * scan normally instead of making the
				 * whole call return EDOM.
				 */
				error = 0;
			}
		}

		bytes = sizeof(mrec.rec) + data_len;
		if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size)
			break;

		/*
		 * Construct the record for userland and copyout.
		 *
		 * The user is asking for a snapshot, if the record was
		 * deleted beyond the user-requested ending tid, the record
		 * is not considered deleted from the point of view of
		 * userland and delete_tid is cleared.
		 */
		mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
		mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags;
		mrec.head.rec_size = bytes;
		mrec.rec.leaf = *elm;

		if (elm->base.delete_tid > mirror->tid_end)
			mrec.rec.leaf.base.delete_tid = 0;

		/*
		 * The CRC covers the record header past crc_start and is
		 * then extended over the data payload, if any.
		 */
		rec_crc = crc32(&mrec.head.rec_size,
				sizeof(mrec.rec) - crc_start);
		if (data_len)
			rec_crc = crc32_ext(cursor.data, data_len, rec_crc);
		mrec.head.rec_crc = rec_crc;
		error = copyout(&mrec, uptr, sizeof(mrec.rec));
		if (data_len && error == 0) {
			error = copyout(cursor.data, uptr + sizeof(mrec.rec),
					data_len);
		}
		eatdisk = 1;

		/*
		 * eatdisk controls whether we skip the current cursor
		 * position on the next scan or not.  If doing a SKIP
		 * the cursor is already positioned properly for the next
		 * scan and eatdisk will be 0.
		 */
didwrite:
		if (error == 0) {
			mirror->count += HAMMER_HEAD_DOALIGN(bytes);
			if (eatdisk)
				cursor.flags |= HAMMER_CURSOR_ATEDISK;
			else
				cursor.flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(&cursor);
		}
	}

	/*
	 * ENOENT means the scan ran off the end of the range, which is
	 * a normal completion.
	 */
	if (error == ENOENT) {
		mirror->key_cur = mirror->key_end;
		error = 0;
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	/*
	 * Strip the PFS bits again before handing key_cur back.
	 */
	mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}
/*
 * NOTE: THIS CODE HAS BEEN REMOVED!  Pruning no longer attempts to realign
 *	 adjacent records because it seriously interferes with every
 *	 mirroring algorithm I could come up with.
 *
 *	 This means that historical accesses beyond the first snapshot
 *	 softlink should be on snapshot boundaries only.  Historical
 *	 accesses from "now" to the first snapshot softlink continue to
 *	 be fine-grained.
 *
 * NOTE: It also looks like there's a bug in the removed code.  It is believed
 *	 that create_tid can sometimes get set to 0xffffffffffffffff.  Just as
 *	 well that we no longer attempt this.  Probably the attempt to correct
 *	 the rhb is blowing up the cursor's indexing or addressing mapping.
 *
 * Align the record to cover any gaps created through the deletion of
 * records within the pruning space.  If we were to just delete the records
 * there would be gaps which in turn would cause a snapshot that is NOT on
 * a pruning boundary to appear corrupt to the user.  Forcing alignment
 * of the create_tid and delete_tid for retained records 'reconnects'
 * the previously contiguous space, making it contiguous again after the
 * deletions.
 *
 * The use of a reverse iteration allows us to safely align the records and
 * related elements without creating temporary overlaps.  XXX we should
 * add ordering dependencies for record buffers to guarantee consistency
 * during recovery.
 *
 * Arguments:
 *	prune		- prune request; stat_realignments is incremented
 *			  and elms[] supplies the alignment rules.
 *	cursor		- cursor positioned at the element to realign.
 *	realign_cre	- index into prune->elms[] of the rule used for
 *			  create_tid, or negative to leave create_tid alone.
 *	realign_del	- index into prune->elms[] of the rule used for
 *			  delete_tid, or negative to leave delete_tid alone.
 *
 * Returns 0 on success or the error from the B-Tree / cursor operations
 * (which can include EDEADLK).
 *
 * A rule whose mod_tid is 0 is treated as "nothing to align" instead of
 * being used as a modulus, since a remainder by zero is undefined.
 * NOTE(review): confirm whether callers already reject a zero mod_tid.
 */
static int
realign_prune(struct hammer_ioc_prune *prune,
	      hammer_cursor_t cursor, int realign_cre, int realign_del)
{
	struct hammer_ioc_prune_elm *scan;
	hammer_btree_elm_t elm;
	hammer_tid_t delta;
	hammer_tid_t tid;
	int error;

	hammer_cursor_downgrade(cursor);

	elm = &cursor->node->ondisk->elms[cursor->index];
	++prune->stat_realignments;

	/*
	 * Align the create_tid.  By doing a reverse iteration we guarantee
	 * that all records after our current record have already been
	 * aligned, allowing us to safely correct the right-hand-boundary
	 * (because no record to our right is otherwise exactly matching
	 * will have a create_tid to the left of our aligned create_tid).
	 */
	error = 0;
	if (realign_cre >= 0) {
		scan = &prune->elms[realign_cre];

		/* Distance past the previous alignment boundary, if any. */
		delta = 0;
		if (scan->mod_tid != 0) {
			delta = (elm->leaf.base.create_tid - scan->beg_tid) %
				scan->mod_tid;
		}
		if (delta) {
			/* Round create_tid up to the next boundary. */
			tid = elm->leaf.base.create_tid - delta + scan->mod_tid;

			/* can EDEADLK */
			error = hammer_btree_correct_rhb(cursor, tid + 1);
			if (error == 0) {
				error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			}
			if (error == 0) {
				/* can EDEADLK */
				error = hammer_cursor_upgrade(cursor);
			}
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.create_tid,
					    sizeof(elm->leaf.base.create_tid));
				elm->leaf.base.create_tid = tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}

	/*
	 * Align the delete_tid.  This only occurs if the record is historical
	 * was deleted at some point.  Realigning the delete_tid does not
	 * move the record within the B-Tree but may cause it to temporarily
	 * overlap a record that has not yet been pruned.
	 *
	 * NOTE(review): the cursor was downgraded on entry and is only
	 * upgraded again inside the create_tid path above; confirm that
	 * modifying the node here without an upgrade is intended.
	 */
	if (error == 0 && realign_del >= 0) {
		scan = &prune->elms[realign_del];

		/* Distance past the previous alignment boundary, if any. */
		delta = 0;
		if (scan->mod_tid != 0) {
			delta = (elm->leaf.base.delete_tid - scan->beg_tid) %
				scan->mod_tid;
		}
		if (delta) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.delete_tid,
					    sizeof(elm->leaf.base.delete_tid));
				/* Round delete_tid up to the next boundary. */
				elm->leaf.base.delete_tid =
					    elm->leaf.base.delete_tid -
					    delta + scan->mod_tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}
	return (error);
}
/*
 * De-duplicate the data of two B-Tree records.
 *
 * The records named by dedup->elm1 and dedup->elm2 are looked up.  If
 * their data lives in a zone accepted by validate_zone(), is in the same
 * zone, has the same length and compares byte-for-byte equal, the second
 * record is repointed at the first record's data and the second record's
 * own data space is freed.
 *
 * Outcomes which are not errors are reported through dedup->head.flags
 * with a return value of 0:
 *	HAMMER_IOC_DEDUP_INVALID_ZONE	- zone validation failed
 *	HAMMER_IOC_DEDUP_CMP_FAILURE	- zone, length or content mismatch
 *	HAMMER_IOC_DEDUP_UNDERFLOW	- hammer_blockmap_dedup() gave ERANGE
 *
 * Returns EOPNOTSUPP if the filesystem version is below
 * HAMMER_VOL_VERSION_FIVE, otherwise 0 or the error from the cursor,
 * B-Tree or blockmap operation that failed.  The ip argument is not
 * referenced.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_dedup *dedup)
{
	struct hammer_cursor keeper, dupe;
	int error;
	int flush_seq;

	/*
	 * Enforce hammer filesystem version requirements
	 */
	if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
		kprintf("hammer: Filesystem must be upgraded to v5 "
			"before you can run dedup\n");
		return (EOPNOTSUPP);
	}

	/*
	 * First record (the one whose data is kept).  Any error here
	 * is returned to the caller -> candidate goes to pass2 list.
	 */
	error = hammer_init_cursor(trans, &keeper, NULL, NULL);
	if (error == 0) {
		keeper.key_beg = dedup->elm1;
		keeper.flags |= HAMMER_CURSOR_BACKEND;
		error = hammer_btree_lookup(&keeper);
	}
	if (error == 0) {
		error = hammer_btree_extract(&keeper,
					     HAMMER_CURSOR_GET_LEAF |
					     HAMMER_CURSOR_GET_DATA);
	}
	if (error)
		goto done_cursor;

	/*
	 * Second record (the one to be repointed).  Any error here
	 * is returned to the caller -> candidate goes to pass2 list.
	 */
	error = hammer_init_cursor(trans, &dupe, NULL, NULL);
	if (error == 0) {
		dupe.key_beg = dedup->elm2;
		dupe.flags |= HAMMER_CURSOR_BACKEND;
		error = hammer_btree_lookup(&dupe);
	}
	if (error == 0) {
		error = hammer_btree_extract(&dupe,
					     HAMMER_CURSOR_GET_LEAF |
					     HAMMER_CURSOR_GET_DATA);
	}
	if (error)
		goto done_cursors;

	/*
	 * Zone validation. We can't de-dup any of the other zones
	 * (BTREE or META) or bad things will happen.
	 *
	 * error is 0 at this point, so a failure is reported purely
	 * through the INVALID_ZONE flag.
	 */
	if (validate_zone(keeper.leaf->data_offset) +
	    validate_zone(dupe.leaf->data_offset) != 0) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
		goto done_cursors;
	}

	/*
	 * Comparison checks: same zone, same length, and finally a
	 * byte-by-byte comparison to be sure.  The content is only
	 * compared once the lengths are known to match.
	 *
	 * A mismatch returns error = 0 with the CMP_FAILURE flag set.
	 */
	if ((keeper.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
	    (dupe.leaf->data_offset & HAMMER_OFF_ZONE_MASK) ||
	    keeper.leaf->data_len != dupe.leaf->data_len ||
	    bcmp(keeper.data, dupe.data, keeper.leaf->data_len) != 0) {
		dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
		goto done_cursors;
	}

	/*
	 * Upgrade both cursors together to an exclusive lock
	 *
	 * Return an error -> candidate goes to pass2 list
	 */
	hammer_sync_lock_sh(trans);
	error = hammer_cursor_upgrade2(&keeper, &dupe);
	if (error) {
		hammer_sync_unlock(trans);
		goto done_cursors;
	}

	error = hammer_blockmap_dedup(keeper.trans,
				      keeper.leaf->data_offset,
				      keeper.leaf->data_len);
	if (error == ERANGE) {
		/*
		 * Return with error = 0, but set an UNDERFLOW flag
		 */
		dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
		error = 0;
	} else if (error == 0) {
		/*
		 * The second cursor's cache must be invalidated before
		 * calling hammer_blockmap_free(), otherwise it will not
		 * be able to invalidate the underlying data buffer.
		 */
		hammer_cursor_invalidate_cache(&dupe);
		hammer_blockmap_free(dupe.trans,
				     dupe.leaf->data_offset,
				     dupe.leaf->data_len);

		/*
		 * Repoint the second record at the first record's data.
		 */
		hammer_modify_node(dupe.trans, dupe.node,
				   &dupe.leaf->data_offset,
				   sizeof(hammer_off_t));
		dupe.leaf->data_offset = keeper.leaf->data_offset;
		hammer_modify_node_done(dupe.node);
	}
	/* Any other error falls through -> block goes to pass2 list */

	hammer_cursor_downgrade2(&keeper, &dupe);
	hammer_sync_unlock(trans);
done_cursors:
	hammer_done_cursor(&dupe);
done_cursor:
	hammer_done_cursor(&keeper);

	/*
	 * Avoid deadlocking the buffer cache
	 */
	for (flush_seq = trans->hmp->flusher.done;
	     hammer_flusher_meta_halflimit(trans->hmp) ||
	     hammer_flusher_undo_exhausted(trans, 2);
	     flush_seq = hammer_flusher_async_one(trans->hmp)) {
		hammer_flusher_wait(trans->hmp, flush_seq);
	}
	return (error);
}