Example #1
/*
 * eio_mem_init
 */
int eio_mem_init(struct cache_c *dmc)
{
	u_int32_t lsb_bits;
	u_int32_t msb_bits_24;  /* most significant bits in shrunk dbn */
	u_int64_t max_dbn;
	u_int64_t num_sets_64;

	/*
	 * Sanity check the number of sets.
	 */
	num_sets_64 = EIO_DIV(dmc->size, dmc->assoc);
	if (num_sets_64 > UINT_MAX) {
		pr_err("Number of cache sets (%lu) greater than maximum"
		       "allowed (%u)",
		       (long unsigned int)num_sets_64, UINT_MAX);
		return -1;
	}

	/*
	 * Find the number of bits required to encode the set number and
	 * its corresponding mask value.
	 */
	dmc->num_sets = (u_int32_t)num_sets_64;
	for (dmc->num_sets_bits = 0; (dmc->num_sets >> dmc->num_sets_bits) != 0;
	     dmc->num_sets_bits++) ;
	dmc->num_sets_mask = ULLONG_MAX >> (64 - dmc->num_sets_bits);

	/*
	 * If we don't have at least 16 bits to save,
	 * we can't use small metadata.
	 */
	if (dmc->num_sets_bits < 16) {
		dmc->cache_flags |= CACHE_FLAGS_MD8;
		pr_info("Not enough sets to use small metadata");
		return 1;
	}

	/*
	 * Now compute the largest sector number that we can shrink; then see
	 * if the source volume is smaller.
	 */
	lsb_bits = dmc->consecutive_shift + dmc->block_shift;
	msb_bits_24 = 24 - 1 - lsb_bits;        /* 1 for wrapped bit */
	max_dbn =
		((u_int64_t)1) << (msb_bits_24 + dmc->num_sets_bits + lsb_bits);
	if (to_sector(eio_get_device_size(dmc->disk_dev)) > max_dbn) {
		dmc->cache_flags |= CACHE_FLAGS_MD8;
		pr_info("Source volume too big to use small metadata");
		return 1;
	}

	return 0;
}
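The bit-width and mask arithmetic above is easy to check in isolation. Below is a minimal user-space sketch (hypothetical cache geometry, plain C, no EnhanceIO types) that reproduces the num_sets_bits loop and the mask derivation from eio_mem_init():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical geometry: 262144 cache blocks, 256-way sets. */
	uint64_t size = 262144;
	uint32_t assoc = 256;

	uint32_t num_sets = (uint32_t)(size / assoc);	/* 1024 */
	uint32_t num_sets_bits;
	uint64_t num_sets_mask;

	/* Same loop as eio_mem_init(): count bits until the set number shifts to zero. */
	for (num_sets_bits = 0; (num_sets >> num_sets_bits) != 0; num_sets_bits++)
		;
	num_sets_mask = ~0ULL >> (64 - num_sets_bits);

	/* Prints: sets=1024 bits=11 mask=0x7ff */
	printf("sets=%u bits=%u mask=0x%llx\n", num_sets, num_sets_bits,
	       (unsigned long long)num_sets_mask);
	return 0;
}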
Example #2
/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	/*
	 * where->count may be zero if rw holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized bio.
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		if (!bio) {
			printk(KERN_WARNING "%s : %s() failed\n", __FILE__, __func__);
			BUG_ON(1);
		}
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
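These do_region() variants lean on a few small device-mapper helpers for unit conversion. The rough sketch below shows what they amount to, assuming 512-byte sectors (SECTOR_SHIFT == 9); the kernel's own definitions in the device-mapper headers are authoritative, and the round-up helper here only approximates dm_sector_div_up:

#define SECTOR_SHIFT 9	/* assumed: 512-byte sectors */

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/* bytes -> sectors, as used for "remaining -= to_sector(len)" */
static inline sector_t to_sector(unsigned long n)
{
	return n >> SECTOR_SHIFT;
}

/* sectors -> bytes, as used for "len = min(len, to_bytes(remaining))" */
static inline unsigned long to_bytes(sector_t n)
{
	return n << SECTOR_SHIFT;
}

/* Round-up division: number of PAGE_SIZE-sized chunks needed to cover n
 * sectors when sz == PAGE_SIZE >> SECTOR_SHIFT (sketch of dm_sector_div_up). */
static inline sector_t sector_div_up(sector_t n, sector_t sz)
{
	return (n + sz - 1) / sz;
}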
Example #3
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;
	struct request_queue *q = bdev_get_queue(where->bdev);
	sector_t discard_sectors;

	do {
		if (rw & REQ_DISCARD)
			num_bvecs = 1;
		else
			num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));

		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		if (rw & REQ_DISCARD) {
			discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
			bio->bi_size = discard_sectors << SECTOR_SHIFT;
			remaining -= discard_sectors;
		} else while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
Example #4
static struct iostash_bio *_io_alloc(struct hdd_info *hdd, struct ssd_info *ssd, uint32_t fragnum,
				    struct bio *bio, sector_t psn)
{
	struct iostash_bio *io = mempool_alloc(hdd->io_pool, GFP_NOIO);

	if (io) {
		atomic_inc(&hdd->io_pending);

		io->hdd = hdd;
		io->ssd = ssd;
		io->fragnum = fragnum;
		io->base_bio = bio;
		io->psn = psn;
		io->nr_sctr = to_sector(BIO_SIZE(bio));
		io->error = 0;
		io->ssd_werr = 0;	/* SSD write error */
		atomic_set(&io->io_pending, 0);
	}
	return io;
}
Example #5
File: dm.c Project: wxlong/Test
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Create two copy bios to deal with io that has
		 * been split across a target.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;

		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset, max);
		__map_bio(ti, clone, tio);

		ci->sector += max;
		ci->sector_count -= max;
		ti = dm_table_find_target(ci->map, ci->sector);

		len = to_sector(bv->bv_len) - max;
		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset + to_bytes(max), len);
		tio = alloc_tio(ci->md);
		tio->io = ci->io;
		tio->ti = ti;
		memset(&tio->info, 0, sizeof(tio->info));
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx++;
	}
}
Example #6
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}
}
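The third branch above is the subtle one: a single bvec straddles a target boundary, so it is carved into pieces of at most max sectors, re-resolving the target after each piece. The toy user-space loop below (hypothetical sector counts, no kernel types) traces the same arithmetic:

#include <stdio.h>

int main(void)
{
	/* Hypothetical: a 96-sector bvec with 40 sectors left in the current
	 * target and 64-sector targets after that. */
	unsigned long remaining = 96;	/* to_sector(bv->bv_len) */
	unsigned long max = 40;		/* space left in the current target */
	unsigned long offset = 0;	/* byte offset into the bvec */
	unsigned long sector = 1000;	/* ci->sector */
	unsigned long len;

	do {
		len = remaining < max ? remaining : max;

		/* In __clone_and_map() this is where split_bvec()/__map_bio() run. */
		printf("piece: sector=%lu bvec_offset=%lu len=%lu\n",
		       sector, offset, len);

		sector += len;
		offset += len << 9;	/* to_bytes(len) */
		max = 64;		/* next target's max_io_len (assumed) */
	} while (remaining -= len);

	/* Prints two pieces: 40 sectors, then the remaining 56. */
	return 0;
}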
Example #7
File: dax.c Project: 020gzh/linux
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
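The kernel-doc above says filesystems call this helper from their pmd_fault handler for DAX files. A hedged sketch of that wiring is shown below; the myfs_* names and myfs_get_block are hypothetical placeholders, not an actual filesystem's implementation:

/* Hypothetical glue: myfs_get_block is assumed to be the filesystem's
 * get_block_t callback, and myfs_dax_fault its PTE-sized fault handler. */
static int myfs_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			      pmd_t *pmd, unsigned int flags)
{
	return dax_pmd_fault(vma, addr, pmd, flags, myfs_get_block, NULL);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,
	.pmd_fault	= myfs_dax_pmd_fault,
};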
Example #8
File: dax.c Project: 020gzh/linux
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
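The block/first arithmetic at the top of the loop simply decomposes pos into a page-aligned block number plus a byte offset into that block. A tiny standalone sketch with hypothetical values (4 KiB pages and 4 KiB blocks) shows the decomposition:

#include <stdio.h>

int main(void)
{
	/* Hypothetical: PAGE_SHIFT == 12 and inode->i_blkbits == 12. */
	const unsigned page_shift = 12, blkbits = 12;
	long long pos = 0x3247;	/* file position being transferred */

	long page = pos >> page_shift;					/* 3 */
	long long block = (long long)page << (page_shift - blkbits);	/* 3 */
	unsigned first = pos - (block << blkbits);			/* 0x247: offset into the block */

	printf("page=%ld block=%lld first=%#x\n", page, block, first);
	return 0;
}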
Example #9
void iostash_mkrequest(struct request_queue *q, struct bio *bio)
#endif
{
	struct hdd_info *hdd;
	struct ssd_info *ssd;
	struct iostash_bio *io;
	sce_fmap_t fmap;
	uint32_t nr_sctr;
	sector_t psn;
	make_request_fn *org_mapreq = NULL;
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
	blk_qc_t ret = BLK_QC_T_NONE;
#endif

	DBG("Got bio=%p bio->bi_rw(%lu) request at s=%lu l=%u.\n",
		bio, bio->bi_rw, BIO_SECTOR(bio), bio_sectors(bio));

	rcu_read_lock();
	hdd = hdd_search(bio);
	if (hdd) {
		atomic_inc(&hdd->nr_ref);
		org_mapreq = hdd->org_mapreq;
	}
	rcu_read_unlock();

	if (unlikely(NULL == hdd)) {
		/* have to requeue the request, somebody was holding a
		 * dangling reference */
		ERR("Request holding a dangling make_request_fn pointer\n.");

#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
		bio->bi_error = -EAGAIN;
		return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
		rmb();		/* read the change in make_request_fn */
		return -EAGAIN; /* retry */
#else
		/* no retry possible in newer kernels since the return
		 * of make_request_fn is no longer checked and retried
		 * if not zero, we cannot unload the module */
		BUG();
		return;
#endif
	}

	if (!hdd->online) {
		ERR("request re-routed due to hdd not being online.\n");
		/* being unloaded, re-route */
		goto out;
	}

	hdd->request_q = q;
	/* calculate physical sector number -- offset partition information */
	psn = BIO_SECTOR(bio) + bio->bi_bdev->bd_part->start_sect;
	nr_sctr = to_sector(BIO_SIZE(bio));
	do {
		if (bio_sectors(bio) == 0)
			break;

		/* partition boundary check */
		if ((psn < hdd->part_start) ||
			((psn + nr_sctr) > hdd->part_end))
			break;

		if (bio_data_dir(bio) == WRITE) {
			gctx.st_write++;

#ifdef SCE_AWT
			/* make sure the request is only for one fragment */
			if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) !=
				(psn / SCE_SCTRPERFRAG)) {
				sce_invalidate(hdd->lun, psn, nr_sctr);
				break;
			}
			rcu_read_lock();
			if (sce_get4write(hdd->lun, psn, nr_sctr, &fmap) 
				== SCE_SUCCESS) {
				ssd = (struct ssd_info *)fmap.cdevctx;
				atomic_inc(&ssd->nr_ref);
				rcu_read_unlock();
				if (!ssd->online) {
					sce_put4write(hdd->lun, psn,
						nr_sctr, 1);
					atomic_dec(&ssd->nr_ref);
				} else {
					io = _io_alloc(hdd, ssd, fmap.fragnum, bio, psn);
					if (NULL == io) {
						atomic_dec(&ssd->nr_ref);
						break;
					}
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
					ret = _io_worker_run(&io->work);
#else
					_io_queue(io);
#endif
					/* lose the reference to hdd, not needed anymore */
					atomic_dec(&hdd->nr_ref);
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
					return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
					return 0;
#else
					return;
#endif
				}
			} else
				rcu_read_unlock();
#else
			sce_invalidate(hdd->lun, psn, nr_sctr);
#endif
			break;
		} else {
			/* Read handling */
			gctx.st_read++;

			/* make sure the request is only for one fragment */
			if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) !=
				(psn / SCE_SCTRPERFRAG))
				break;

			/* cache hit/miss check */
			rcu_read_lock();
			if (sce_get4read(hdd->lun, psn, nr_sctr, &fmap) != SCE_SUCCESS) {
				rcu_read_unlock();
				break;
			}
			BUG_ON(NULL == fmap.cdevctx);
			ssd = (struct ssd_info *) fmap.cdevctx;
			atomic_inc(&ssd->nr_ref);
			rcu_read_unlock();
			/* make sure the request is within the SSD limits and the SSD is online */
			if (!ssd->online || ssd->queue_max_hw_sectors < nr_sctr) {
				sce_put4read(hdd->lun, psn, nr_sctr);
				atomic_dec(&ssd->nr_ref);
				break;
			}

			/* cache hit */
			io = _io_alloc(hdd, ssd, fmap.fragnum, bio, psn);
			if (NULL == io) {
				atomic_dec(&ssd->nr_ref);
				break;
			}

#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
			ret = _io_worker_run(&io->work);
#else
			_io_queue(io);
#endif
			/* lose the reference to hdd, not needed anymore */
			atomic_dec(&hdd->nr_ref);
		}

#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
		return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
		return 0;
#else
		return;
#endif
	} while (0);

out:
	/* lose the reference to hdd, not needed anymore */
	atomic_dec(&hdd->nr_ref);

	return (org_mapreq) (q, bio);
}
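Both the read and the write path above refuse requests that cross an SCE fragment boundary; the check just compares the fragment index of the first and last sector. A small sketch of the same arithmetic (the SCE_SCTRPERFRAG value is assumed for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCE_SCTRPERFRAG 64	/* assumed sectors per fragment */

/* true if [psn, psn + nr_sctr) lies entirely within one fragment */
static bool within_one_fragment(uint64_t psn, uint32_t nr_sctr)
{
	return ((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) == (psn / SCE_SCTRPERFRAG);
}

int main(void)
{
	printf("%d\n", within_one_fragment(60, 4));	/* 1: sectors 60..63 stay in fragment 0 */
	printf("%d\n", within_one_fragment(60, 8));	/* 0: crosses into the next fragment */
	return 0;
}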
Example #10
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
Example #11
0
/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;
	struct request_queue *q = bdev_get_queue(where->bdev);
	unsigned short logical_block_size = queue_logical_block_size(q);
	sector_t num_sectors;
	unsigned int uninitialized_var(special_cmd_max_sectors);

	/*
	 * Reject unsupported discard and write same requests.
	 */
	if (rw & REQ_DISCARD)
		special_cmd_max_sectors = q->limits.max_discard_sectors;
	else if (rw & REQ_WRITE_SAME)
		special_cmd_max_sectors = q->limits.max_write_same_sectors;
	if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) {
		dec_count(io, region, -EOPNOTSUPP);
		return;
	}

	/*
	 * where->count may be zero if rw holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized bio.
		 */
		if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
			num_bvecs = 1;
		else
			num_bvecs = min_t(int, BIO_MAX_PAGES,
					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));

		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		store_io_and_region_in_bio(bio, io, region);

		if (rw & REQ_DISCARD) {
			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
			remaining -= num_sectors;
		} else if (rw & REQ_WRITE_SAME) {
			/*
			 * WRITE SAME only uses a single page.
			 */
			dp->get_page(dp, &page, &len, &offset);
			bio_add_page(bio, page, logical_block_size, offset);
			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;

			offset = 0;
			remaining -= num_sectors;
			dp->next_page(dp);
		} else while (remaining) {
			/*
			 * Try and add as many pages as possible.
			 */
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
/**ltl
 * Purpose: split a bio request into multiple bios and dispatch them to the
 *          corresponding target devices.
 * Parameters:
 * Return value:
 * Notes:
 */
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	/* Find the target that covers the request's starting sector */
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); /* max I/O length for this target */
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));
	/* [Step 1] The request fits in what is left of this target, so a single clone completes it */
	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		/* Clone the bio */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		/* Map the cloned bio onto the target device */
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * [Step 2] The request overruns this target, but the bvec at
		 * ci->idx still fits inside it. There are some bvecs that
		 * don't span targets.  Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;
		/* Work out how much data this target can still take */
		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);
			/*
			 * This bio_vec no longer fits in the target: the data in
			 * bio->bi_io_vec[i] spans two targets, so the remainder
			 * is handled by [Step 3].
			 */
			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}
		/* Clone the bio */
		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		/* Map the clone onto the target device */
		__map_bio(ti, clone, tio);
		/* Starting sector of what remains of the request */
		ci->sector += len;
		/* Remaining sector count; if non-zero, the next pass ends up in [Step 3] */
		ci->sector_count -= len;
		/* Index of the next bvec to process */
		ci->idx = i;

	} else {
		/*
		 * [Step 3] The data in bio->bi_io_vec[ci->idx] spans two or
		 * more targets, so the bvec itself must be split.
		 */
		/* The bio_vec that straddles the target boundary */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		/* Its length in sectors */
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;
		/* Carve the bio_vec into pieces and hand each piece to its target */
		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector); /* look up in the btree */
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max); /* length of this piece */
			/* Split this piece off the bio_vec */
			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len);
			/* Map the piece onto its target device */
			__map_bio(ti, clone, tio);

			ci->sector += len;       /* advance the dm start sector */
			ci->sector_count -= len; /* sectors still to map */
			offset += to_bytes(len); /* byte offset within the bio_vec */
		} while (remaining -= len);
		/* Advance to the next bio_vec */
		ci->idx++;
	}
}
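For context, __clone_and_map() does not loop over the whole bio itself; dm's bio-splitting path calls it repeatedly until ci->sector_count reaches zero. A simplified, non-authoritative sketch of that caller's shape (helper names such as the table lookup are assumed; error handling and I/O accounting are omitted):

static void __split_bio_sketch(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.map = dm_get_table(md);	/* assumed table-lookup helper */
	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	/* Each call consumes part of the bio via one of the three branches. */
	while (ci.sector_count)
		__clone_and_map(&ci);

	dm_table_put(ci.map);	/* assumed to drop the table reference */
}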