/*
 * eio_mem_init
 */
int eio_mem_init(struct cache_c *dmc)
{
	u_int32_t lsb_bits;
	u_int32_t msb_bits_24;	/* most significant bits in shrunk dbn */
	u_int64_t max_dbn;
	u_int64_t num_sets_64;

	/*
	 * Sanity check the number of sets.
	 */
	num_sets_64 = EIO_DIV(dmc->size, dmc->assoc);
	if (num_sets_64 > UINT_MAX) {
		pr_err("Number of cache sets (%lu) greater than maximum allowed (%u)",
		       (long unsigned int)num_sets_64, UINT_MAX);
		return -1;
	}

	/*
	 * Find the number of bits required to encode the set number and
	 * its corresponding mask value.
	 */
	dmc->num_sets = (u_int32_t)num_sets_64;
	for (dmc->num_sets_bits = 0;
	     (dmc->num_sets >> dmc->num_sets_bits) != 0;
	     dmc->num_sets_bits++)
		;
	dmc->num_sets_mask = ULLONG_MAX >> (64 - dmc->num_sets_bits);

	/*
	 * If we don't have at least 16 bits to save,
	 * we can't use small metadata.
	 */
	if (dmc->num_sets_bits < 16) {
		dmc->cache_flags |= CACHE_FLAGS_MD8;
		pr_info("Not enough sets to use small metadata");
		return 1;
	}

	/*
	 * Now compute the largest sector number that we can shrink; then see
	 * if the source volume is smaller.
	 */
	lsb_bits = dmc->consecutive_shift + dmc->block_shift;
	msb_bits_24 = 24 - 1 - lsb_bits;	/* 1 for wrapped bit */
	max_dbn = ((u_int64_t)1) << (msb_bits_24 + dmc->num_sets_bits + lsb_bits);
	if (to_sector(eio_get_device_size(dmc->disk_dev)) > max_dbn) {
		dmc->cache_flags |= CACHE_FLAGS_MD8;
		pr_info("Source volume too big to use small metadata");
		return 1;
	}

	return 0;
}
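/*
 * Illustrative sketch (not part of EnhanceIO): the loop above derives
 * num_sets_bits as the number of bits needed to index a set, and
 * num_sets_mask as the matching low-bit mask.  The stand-alone user-space
 * program below mirrors that arithmetic; the size/assoc values are
 * made-up examples.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 4194304;	/* cache blocks -- assumed example */
	uint32_t assoc = 256;		/* associativity -- assumed example */
	uint64_t num_sets = size / assoc;
	uint32_t bits = 0;
	uint64_t mask;

	/* same loop shape as eio_mem_init(): bits = floor(log2(n)) + 1 */
	while ((num_sets >> bits) != 0)
		bits++;
	mask = ~0ULL >> (64 - bits);	/* valid here since bits >= 1 */

	printf("sets=%llu bits=%u mask=0x%llx\n",
	       (unsigned long long)num_sets, bits, (unsigned long long)mask);
	return 0;
}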
/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	/*
	 * where->count may be zero if rw holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized-bio.
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		if (!bio) {
			printk(KERN_WARNING "%s : %s() failed\n",
			       __FILE__, __func__);
			BUG_ON(1);
		}
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;
	struct request_queue *q = bdev_get_queue(where->bdev);
	sector_t discard_sectors;

	do {
		if (rw & REQ_DISCARD)
			num_bvecs = 1;
		else
			num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
					  dm_sector_div_up(remaining,
							   (PAGE_SIZE >> SECTOR_SHIFT)));

		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		if (rw & REQ_DISCARD) {
			discard_sectors = min_t(sector_t,
						q->limits.max_discard_sectors,
						remaining);
			bio->bi_size = discard_sectors << SECTOR_SHIFT;
			remaining -= discard_sectors;
		} else while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
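/*
 * Illustrative sketch (not dm-io code): the REQ_DISCARD branch above carries
 * no payload pages -- it just slices the remaining sector count into chunks
 * bounded by the queue's max_discard_sectors, one bio per chunk.  The values
 * below are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

int main(void)
{
	sector_t remaining = 10000;		/* sectors to discard (example) */
	const sector_t max_discard = 2048;	/* stands in for q->limits.max_discard_sectors */
	sector_t start = 0;

	while (remaining) {
		sector_t chunk = remaining < max_discard ? remaining : max_discard;

		/* each iteration corresponds to one zero-payload discard bio */
		printf("discard bio: sector=%llu len=%llu\n",
		       (unsigned long long)start, (unsigned long long)chunk);
		start += chunk;
		remaining -= chunk;
	}
	return 0;
}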
static struct iostash_bio *_io_alloc(struct hdd_info *hdd,
				     struct ssd_info *ssd, uint32_t fragnum,
				     struct bio *bio, sector_t psn)
{
	struct iostash_bio *io = mempool_alloc(hdd->io_pool, GFP_NOIO);

	if (io) {
		atomic_inc(&hdd->io_pending);

		io->hdd = hdd;
		io->ssd = ssd;
		io->fragnum = fragnum;
		io->base_bio = bio;
		io->psn = psn;
		io->nr_sctr = to_sector(BIO_SIZE(bio));
		io->error = 0;
		io->ssd_werr = 0;	/* SSD write error */
		atomic_set(&io->io_pending, 0);
	}

	return io;
}
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Create two copy bios to deal with io that has
		 * been split across a target.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;

		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset, max);
		__map_bio(ti, clone, tio);

		ci->sector += max;
		ci->sector_count -= max;
		ti = dm_table_find_target(ci->map, ci->sector);

		len = to_sector(bv->bv_len) - max;
		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset + to_bytes(max), len);
		tio = alloc_tio(ci->md);
		tio->io = ci->io;
		tio->ti = ti;
		memset(&tio->info, 0, sizeof(tio->info));
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx++;
	}
}
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);
			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}
}
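/*
 * Illustrative sketch (not dm code): the final branch above walks a single
 * bvec across successive targets, mapping min(remaining, max) sectors per
 * pass and advancing the byte offset by to_bytes(len).  max_io_len() below
 * is a hypothetical stand-in with an assumed 128-sector target boundary.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

static sector_t max_io_len(sector_t sector)
{
	const sector_t boundary = 128;	/* assumed target size in sectors */

	return boundary - (sector % boundary);
}

int main(void)
{
	sector_t sector = 120;		/* starting sector (example) */
	sector_t remaining = 24;	/* bvec length in sectors (example) */
	unsigned int offset = 0;	/* byte offset into the bvec */
	sector_t len, max;

	do {
		max = max_io_len(sector);
		len = remaining < max ? remaining : max;

		printf("split: sector=%llu len=%llu byte_offset=%u\n",
		       (unsigned long long)sector, (unsigned long long)len,
		       offset);

		sector += len;
		offset += (unsigned int)(len << 9);	/* to_bytes(len) */
	} while (remaining -= len);

	return 0;
}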
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
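/*
 * Illustrative sketch (not fs/dax.c code): __dax_pmd_fault() above rejects a
 * huge mapping when (pgoff | PG_PMD_COLOUR) >= size.  With 2 MiB PMDs and
 * 4 KiB pages PG_PMD_COLOUR is 511, so the OR rounds pgoff up to the last
 * page index the PMD would cover.  The sizes below are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t pmd_colour = 511;	/* (PMD_SIZE >> PAGE_SHIFT) - 1 */
	uint64_t size = 1000;			/* file size in pages (example) */
	uint64_t pgoff = 512;			/* faulting page index (example) */
	uint64_t last_covered = pgoff | pmd_colour;	/* 1023 here */

	if (last_covered >= size)
		printf("PMD would reach past EOF -> fall back to PTEs\n");
	else
		printf("whole PMD lies within the file\n");
	return 0;
}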
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
void iostash_mkrequest(struct request_queue *q, struct bio *bio)
#endif
{
	struct hdd_info *hdd;
	struct ssd_info *ssd;
	struct iostash_bio *io;
	sce_fmap_t fmap;
	uint32_t nr_sctr;
	sector_t psn;
	make_request_fn *org_mapreq = NULL;
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
	blk_qc_t ret = BLK_QC_T_NONE;
#endif

	DBG("Got bio=%p bio->bi_rw(%lu) request at s=%lu l=%u.\n",
	    bio, bio->bi_rw, BIO_SECTOR(bio), bio_sectors(bio));

	rcu_read_lock();
	hdd = hdd_search(bio);
	if (hdd) {
		atomic_inc(&hdd->nr_ref);
		org_mapreq = hdd->org_mapreq;
	}
	rcu_read_unlock();

	if (unlikely(NULL == hdd)) {
		/* have to requeue the request, somebody was holding a
		 * dangling reference */
		ERR("Request holding a dangling make_request_fn pointer\n.");

#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
		bio->bi_error = -EAGAIN;
		return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
		rmb();		/* read the change in make_request_fn */
		return -EAGAIN;	/* retry */
#else
		/* no retry possible in newer kernels since the return
		 * of make_request_fn is no longer checked and retried
		 * if not zero, we cannot unload the module */
		BUG();
		return;
#endif
	}

	if (!hdd->online) {
		ERR("request re-routed due to hdd not being online.\n");
		/* being unloaded, re-route */
		goto out;
	}

	hdd->request_q = q;

	/* calculate physical sector number -- offset partition information */
	psn = BIO_SECTOR(bio) + bio->bi_bdev->bd_part->start_sect;
	nr_sctr = to_sector(BIO_SIZE(bio));

	do {
		if (bio_sectors(bio) == 0)
			break;

		/* partition boundary check */
		if ((psn < hdd->part_start) ||
		    ((psn + nr_sctr) > hdd->part_end))
			break;

		if (bio_data_dir(bio) == WRITE) {
			gctx.st_write++;

#ifdef SCE_AWT
			/* make sure the request is only for one fragment */
			if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) !=
			    (psn / SCE_SCTRPERFRAG)) {
				sce_invalidate(hdd->lun, psn, nr_sctr);
				break;
			}
			rcu_read_lock();
			if (sce_get4write(hdd->lun, psn, nr_sctr, &fmap) ==
			    SCE_SUCCESS) {
				ssd = (struct ssd_info *)fmap.cdevctx;
				atomic_inc(&ssd->nr_ref);
				rcu_read_unlock();
				if (!ssd->online) {
					sce_put4write(hdd->lun, psn,
						      nr_sctr, 1);
					atomic_dec(&ssd->nr_ref);
				} else {
					io = _io_alloc(hdd, ssd, fmap.fragnum,
						       bio, psn);
					if (NULL == io) {
						atomic_dec(&ssd->nr_ref);
						break;
					}
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
					ret = _io_worker_run(&io->work);
#else
					_io_queue(io);
#endif
					/* lose the reference to hdd, not
					 * needed anymore */
					atomic_dec(&hdd->nr_ref);
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
					return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
					return 0;
#else
					return;
#endif
				}
			} else
				rcu_read_unlock();
#else
			sce_invalidate(hdd->lun, psn, nr_sctr);
#endif
			break;
		} else {
			/* Read handling */
			gctx.st_read++;

			/* make sure the request is only for one fragment */
			if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) !=
			    (psn / SCE_SCTRPERFRAG))
				break;

			/* cache hit/miss check */
			rcu_read_lock();
			if (sce_get4read(hdd->lun, psn, nr_sctr, &fmap) !=
			    SCE_SUCCESS) {
				rcu_read_unlock();
				break;
			}
			BUG_ON(NULL == fmap.cdevctx);
			ssd = (struct ssd_info *)fmap.cdevctx;
			atomic_inc(&ssd->nr_ref);
			rcu_read_unlock();

			/* make sure the request is within the SSD limits
			 * and the SSD is online */
			if (!ssd->online ||
			    ssd->queue_max_hw_sectors < nr_sctr) {
				sce_put4read(hdd->lun, psn, nr_sctr);
				atomic_dec(&ssd->nr_ref);
				break;
			}

			/* cache hit */
			io = _io_alloc(hdd, ssd, fmap.fragnum, bio, psn);
			if (NULL == io) {
				atomic_dec(&ssd->nr_ref);
				break;
			}
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
			ret = _io_worker_run(&io->work);
#else
			_io_queue(io);
#endif
			/* lose the reference to hdd, not needed anymore */
			atomic_dec(&hdd->nr_ref);
		}
#if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE
		return ret;
#elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0)
		return 0;
#else
		return;
#endif
	} while (0);

out:
	/* lose the reference to hdd, not needed anymore */
	atomic_dec(&hdd->nr_ref);

	return (org_mapreq)(q, bio);
}
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}

			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
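/*
 * Illustrative sketch (not fs/dax.c code): the min_t(u64, ...) comment above
 * exists because pos + size can exceed the signed 64-bit range at extreme
 * offsets, so the sum is taken as u64 before comparing with end.  The values
 * below are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t pos = INT64_MAX - 100;	/* extreme file offset (example) */
	int64_t size = 4096;
	int64_t end = INT64_MAX;

	/* add in u64 so the sum cannot overflow a signed type */
	uint64_t sum = (uint64_t)pos + (uint64_t)size;
	uint64_t max = sum < (uint64_t)end ? sum : (uint64_t)end;

	printf("max=%llu (end=%llu)\n",
	       (unsigned long long)max, (unsigned long long)end);
	return 0;
}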
/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;
	struct request_queue *q = bdev_get_queue(where->bdev);
	unsigned short logical_block_size = queue_logical_block_size(q);
	sector_t num_sectors;
	unsigned int uninitialized_var(special_cmd_max_sectors);

	/*
	 * Reject unsupported discard and write same requests.
	 */
	if (rw & REQ_DISCARD)
		special_cmd_max_sectors = q->limits.max_discard_sectors;
	else if (rw & REQ_WRITE_SAME)
		special_cmd_max_sectors = q->limits.max_write_same_sectors;
	if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) &&
	    special_cmd_max_sectors == 0) {
		dec_count(io, region, -EOPNOTSUPP);
		return;
	}

	/*
	 * where->count may be zero if rw holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized-bio.
		 */
		if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
			num_bvecs = 1;
		else
			num_bvecs = min_t(int, BIO_MAX_PAGES,
					  dm_sector_div_up(remaining,
							   (PAGE_SIZE >> SECTOR_SHIFT)));

		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		store_io_and_region_in_bio(bio, io, region);

		if (rw & REQ_DISCARD) {
			num_sectors = min_t(sector_t, special_cmd_max_sectors,
					    remaining);
			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
			remaining -= num_sectors;
		} else if (rw & REQ_WRITE_SAME) {
			/*
			 * WRITE SAME only uses a single page.
			 */
			dp->get_page(dp, &page, &len, &offset);
			bio_add_page(bio, page, logical_block_size, offset);
			num_sectors = min_t(sector_t, special_cmd_max_sectors,
					    remaining);
			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;

			offset = 0;
			remaining -= num_sectors;
			dp->next_page(dp);
		} else while (remaining) {
			/*
			 * Try and add as many pages as possible.
			 */
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
/**ltl
 * Purpose: split a bio request into multiple bios and dispatch them to
 *          multiple target devices.
 * Parameters:
 * Return value:
 * Note:
 */
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	/* look up the target device for the request's starting sector */
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	/* maximum length this target can accept */
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	/* [step 1] the request fits in what is left of this target,
	 * so a single operation completes it */
	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		/* clone the bio */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		/* map the cloned bio onto the target device */
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	/* [step 2] the request exceeds the target's remaining space, but the
	 * bvec at idx still fits entirely within this target */
	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		/* work out the maximum length that can go to this target */
		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			/* this bio_vec no longer fits in the target:
			 * bio->bi_io_vec[i] spans two targets, so the
			 * remainder is handled by [step 3] */
			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		/* clone the bio */
		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		/* map the clone onto the target */
		__map_bio(ti, clone, tio);

		/* starting sector of the remaining request */
		ci->sector += len;
		/* sectors still to map; if non-zero, [step 3] runs next */
		ci->sector_count -= len;
		/* bvec index */
		ci->idx = i;

	} else {
		/* [step 3] bio->bi_io_vec[idx] spans two targets and must
		 * be split */
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		/* the bio_vec that straddles two targets */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		/* its length in sectors */
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		/* split the bio_vec into pieces and dispatch each piece to
		 * its target */
		do {
			if (offset) {
				/* look up in the btree */
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			/* length of this piece */
			len = min(remaining, max);

			/* split off this part of the bio_vec */
			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len);
			/* map it onto the target */
			__map_bio(ti, clone, tio);

			ci->sector += len;		/* advance the dm starting sector */
			ci->sector_count -= len;	/* sectors remaining */
			offset += to_bytes(len);	/* byte offset within the bio_vec */
		} while (remaining -= len);

		/* move the index to the next bio_vec */
		ci->idx++;
	}
}