static ssize_t
ll_direct_IO(
# ifndef HAVE_IOV_ITER_RW
             int rw,
# endif
             struct kiocb *iocb, struct iov_iter *iter, loff_t file_offset)
{
        struct ll_cl_context *lcc;
        const struct lu_env *env;
        struct cl_io *io;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t count = iov_iter_count(iter);
        ssize_t tot_bytes = 0, result = 0;
        size_t size = MAX_DIO_SIZE;

        /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
        if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK))
                return -EINVAL;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
               "offset=%lld=%llx, pages %zd (max %lu)\n",
               PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
               file_offset, file_offset, count >> PAGE_SHIFT,
               MAX_DIO_SIZE >> PAGE_SHIFT);

        /* Check that all user buffers are aligned as well */
        if (iov_iter_alignment(iter) & ~PAGE_MASK)
                return -EINVAL;

        lcc = ll_cl_find(file);
        if (lcc == NULL)
                RETURN(-EIO);

        env = lcc->lcc_env;
        LASSERT(!IS_ERR(env));
        io = lcc->lcc_io;
        LASSERT(io != NULL);

        /* 0. Need locking between buffered and direct access, and to guard
         *    against races with size changes by concurrent truncates and
         *    writes.
         * 1. Need the inode mutex to operate on transient pages.
         */
        if (iov_iter_rw(iter) == READ)
                inode_lock(inode);

        while (iov_iter_count(iter)) {
                struct page **pages;
                size_t offs;

                count = min_t(size_t, iov_iter_count(iter), size);
                if (iov_iter_rw(iter) == READ) {
                        if (file_offset >= i_size_read(inode))
                                break;

                        if (file_offset + count > i_size_read(inode))
                                count = i_size_read(inode) - file_offset;
                }

                result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
                if (likely(result > 0)) {
                        int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);

                        result = ll_direct_IO_seg(env, io, iov_iter_rw(iter),
                                                  inode, result, file_offset,
                                                  pages, n);
                        ll_free_user_pages(pages, n,
                                           iov_iter_rw(iter) == READ);
                }
                if (unlikely(result <= 0)) {
                        /* If we can't allocate a large enough buffer
                         * for the request, shrink it to a smaller
                         * PAGE_SIZE multiple and try again.
                         * We should always be able to kmalloc for a
                         * page worth of page pointers = 4MB on i386.
                         */
                        if (result == -ENOMEM &&
                            size > (PAGE_SIZE / sizeof(*pages)) * PAGE_SIZE) {
                                size = ((((size / 2) - 1) |
                                         ~PAGE_MASK) + 1) & PAGE_MASK;
                                CDEBUG(D_VFSTRACE, "DIO size now %zu\n",
                                       size);
                                continue;
                        }

                        GOTO(out, result);
                }

                iov_iter_advance(iter, result);
                tot_bytes += result;
                file_offset += result;
        }
out:
        if (iov_iter_rw(iter) == READ)
                inode_unlock(inode);

        if (tot_bytes > 0) {
                struct vvp_io *vio = vvp_env_io(env);

                /* no commit async for direct IO */
                vio->u.write.vui_written += tot_bytes;
        }

        return tot_bytes ? : result;
}
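/*
 * Side note on the -ENOMEM fallback above: the expression
 *
 *         size = ((((size / 2) - 1) | ~PAGE_MASK) + 1) & PAGE_MASK;
 *
 * simply rounds size/2 up to the next PAGE_SIZE multiple, so each retry
 * roughly halves the request while keeping it page-aligned.  The helper
 * below is a hypothetical userspace sketch of that arithmetic, not code
 * from the Lustre tree; it assumes 4 KiB pages and the usual
 * PAGE_MASK == ~(PAGE_SIZE - 1) definition.
 */
#include <stddef.h>
#include <stdio.h>

#define SK_PAGE_SIZE    4096UL
#define SK_PAGE_MASK    (~(SK_PAGE_SIZE - 1))

static size_t dio_shrink_size(size_t size)
{
        /* OR-ing in the low bits and adding 1 carries to the next page
         * boundary: equivalent to round_up(size / 2, SK_PAGE_SIZE) */
        return ((((size / 2) - 1) | ~SK_PAGE_MASK) + 1) & SK_PAGE_MASK;
}

int main(void)
{
        /* 5 pages halve to 2.5 pages, which round up to 3 pages */
        printf("%zu\n", dio_shrink_size(5 * SK_PAGE_SIZE) / SK_PAGE_SIZE);
        /* prints 3 */
        return 0;
}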
/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.
 */
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
                              int *checkeof)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct page **pages;
        u64 off = iocb->ki_pos;
        int num_pages;
        ssize_t ret;
        size_t len = iov_iter_count(to);

        dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

        if (!len)
                return 0;
        /*
         * flush any page cache pages in this range.  this
         * will make concurrent normal and sync io slow,
         * but it will at least behave sensibly when they are
         * in sequence.
         */
        ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
        if (ret < 0)
                return ret;

        if (unlikely(to->type & ITER_PIPE)) {
                size_t page_off;

                ret = iov_iter_get_pages_alloc(to, &pages, len, &page_off);
                if (ret <= 0)
                        /* keep a real error; only map 0 pages to -ENOMEM */
                        return ret < 0 ? ret : -ENOMEM;
                num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);

                ret = striped_read(inode, off, ret, pages, num_pages,
                                   page_off, checkeof);
                if (ret > 0) {
                        iov_iter_advance(to, ret);
                        off += ret;
                } else {
                        iov_iter_advance(to, 0);
                }
                ceph_put_page_vector(pages, num_pages, false);
        } else {
                num_pages = calc_pages_for(off, len);
                pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages))
                        return PTR_ERR(pages);

                ret = striped_read(inode, off, len, pages, num_pages,
                                   (off & ~PAGE_MASK), checkeof);
                if (ret > 0) {
                        int l, k = 0;
                        size_t left = ret;

                        while (left) {
                                size_t page_off = off & ~PAGE_MASK;
                                size_t copy = min_t(size_t, left,
                                                    PAGE_SIZE - page_off);

                                l = copy_page_to_iter(pages[k++], page_off,
                                                      copy, to);
                                off += l;
                                left -= l;
                                if (l < copy)
                                        break;
                        }
                }
                ceph_release_page_vector(pages, num_pages);
        }

        if (off > iocb->ki_pos) {
                ret = off - iocb->ki_pos;
                iocb->ki_pos = off;
        }

        dout("sync_read result %zd\n", ret);
        return ret;
}
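/*
 * For reference, the page-count math used in the non-pipe branch above:
 * calc_pages_for(off, len) counts how many pages the byte range
 * [off, off + len) touches.  The helper below is a hypothetical userspace
 * sketch of the equivalent computation, assuming 4 KiB pages (the real
 * helper ships with libceph in the kernel tree):
 */
#include <stdio.h>

#define SK_PAGE_SHIFT   12
#define SK_PAGE_SIZE    (1ULL << SK_PAGE_SHIFT)

static int sk_calc_pages_for(unsigned long long off, unsigned long long len)
{
        /* index one past the last touched page, minus the first page index */
        return (int)(((off + len + SK_PAGE_SIZE - 1) >> SK_PAGE_SHIFT) -
                     (off >> SK_PAGE_SHIFT));
}

int main(void)
{
        /* 2 bytes starting at the last byte of a page straddle 2 pages */
        printf("%d\n", sk_calc_pages_for(SK_PAGE_SIZE - 1, 2)); /* prints 2 */
        /* a full page starting at a page boundary needs exactly 1 page */
        printf("%d\n", sk_calc_pages_for(0, SK_PAGE_SIZE));     /* prints 1 */
        return 0;
}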