static void ceph_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); }
/* * Dirty a page. Optimistically adjust accounting, on the assumption * that we won't race with invalidate. If we do, readjust. */ static int ceph_set_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; int ret; if (unlikely(!mapping)) return !TestSetPageDirty(page); if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", mapping->host, page, page->index); BUG_ON(!PagePrivate(page)); return 0; } inode = mapping->host; ci = ceph_inode(inode); /* dirty the head */ spin_lock(&ci->i_ceph_lock); BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snapc = ceph_get_snap_context(capsnap->context); capsnap->dirty_pages++; } else {
/* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date * info, depending on which capabilities are held, and whether * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime * and size are monotonically increasing, except when utimes() or * truncate() increments the corresponding _seq values.) */ int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size) { struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); inode->i_size = size; inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_LAZYIO)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; queue_trunc = 1; } } }
static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int io_align, page_align; int left, pages_left; int read; struct page **page_pos; int ret; bool hit_stripe, was_short; pos = off; left = len; page_pos = pages; pages_left = num_pages; read = 0; io_align = off & ~PAGE_MASK; more: if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); if (ret == -ENOENT) ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret > 0) { int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); ceph_zero_page_vector_range(page_align + read, pos - off - read, pages); } pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; if (left && hit_stripe) goto more; }
/* * initialize private struct file data. * if we fail, clean up by dropping fmode reference on the ceph_inode */ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; switch (inode->i_mode & S_IFMT) { case S_IFREG: ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; cf->readdir_cache_idx = -1; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; case S_IFLNK: dout("init_file %p %p 0%o (symlink)\n", inode, file, inode->i_mode); ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ break; default: dout("init_file %p %p 0%o (special)\n", inode, file, inode->i_mode); /* * we need to drop the open ref now, since we don't * have .release set to ceph_release. */ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ BUG_ON(inode->i_fop->release == ceph_release); /* call the proper open fop */ ret = inode->i_fop->open(inode, file); } return ret; }
/* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) * * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, int *checkeof) { struct ceph_client *client = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; struct page **page_pos; int ret; bool hit_stripe, was_short; /* * we may need to do multiple reads. not atomic, unfortunately. */ pos = off; left = len; page_pos = pages; pages_left = num_pages; read = 0; more: this_len = left; ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) ret = 0; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret > 0) { int didpages = ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); zero_page_vector_range(page_off + read, pos - off - read, pages); } pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; /* hit stripe? */ if (left && hit_stripe) goto more; }
/* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. Otherwise, only * branches/splits are included in i_fragtree) */ static int ceph_fill_dirfrag(struct inode *inode, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag; u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); int i; int err = 0; mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) goto out; if (frag->split_by == 0) { /* tree leaf, remove */ dout("fill_dirfrag removed %llx.%llx frag %x" " (no ref)\n", ceph_vinop(inode), id); rb_erase(&frag->node, &ci->i_fragtree); kfree(frag); } else { /* tree branch, keep and clear */ dout("fill_dirfrag cleared %llx.%llx frag %x" " referral\n", ceph_vinop(inode), id); frag->mds = -1; frag->ndist = 0; } goto out; } /* find/add this frag to store mds delegation info */ frag = __get_or_create_frag(ci, id); if (IS_ERR(frag)) { /* this is not the end of the world; we can continue with bad/inaccurate delegation info */ pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); err = -ENOMEM; goto out; } frag->mds = mds; frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); for (i = 0; i < frag->ndist; i++) frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", ceph_vinop(inode), frag->frag, frag->ndist); out: mutex_unlock(&ci->i_fragtree_mutex); return err; }
static inline void ceph_set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) { struct ceph_inode_info *ci = ceph_inode(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) set_cached_acl(inode, type, acl); spin_unlock(&ci->i_ceph_lock); }
/* * Return object name, size/offset information, and location (OSD * number, network address) for a given file offset. */ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) { struct ceph_ioctl_dataloc dl; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; struct ceph_pg pgid; int r; /* copy and validate */ if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; down_read(&osdc->map_sem); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) return -EIO; dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); /* block_offset = object_offset % block_size */ tmp = dl.object_offset; dl.block_offset = do_div(tmp, dl.block_size); snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, osdc->osdmap); pgid = ol.ol_pgid; dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); if (a) memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } up_read(&osdc->map_sem); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) return -EFAULT; return 0; }
/* * This function walks through the snaprealm for an inode and returns the * ceph_snap_realm for the first snaprealm that has quotas set (either max_files * or max_bytes). If the root is reached, return the root ceph_snap_realm * instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. */ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; struct inode *in; bool has_quota; if (ceph_snap(inode) != CEPH_NOSNAP) return NULL; realm = ceph_inode(inode)->i_snap_realm; if (realm) ceph_get_snap_realm(mdsc, realm); else pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " "null i_snap_realm\n", ceph_vinop(inode)); while (realm) { spin_lock(&realm->inodes_with_caps_lock); in = realm->inode ? igrab(realm->inode) : NULL; spin_unlock(&realm->inodes_with_caps_lock); if (!in) break; ci = ceph_inode(in); has_quota = __ceph_has_any_quota(ci); iput(in); next = realm->parent; if (has_quota || !next) return realm; ceph_get_snap_realm(mdsc, next); ceph_put_snap_realm(mdsc, realm); realm = next; } if (realm) ceph_put_snap_realm(mdsc, realm); return NULL; }
static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, int type) { struct ceph_inode_info *ci = ceph_inode(inode); struct posix_acl *acl = ACL_NOT_CACHED; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) acl = get_cached_acl(inode, type); spin_unlock(&ci->i_ceph_lock); return acl; }
static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; switch (inode->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; case S_IFLNK: dout("init_file %p %p 0%o (symlink)\n", inode, file, inode->i_mode); ceph_put_fmode(ceph_inode(inode), fmode); break; default: dout("init_file %p %p 0%o (special)\n", inode, file, inode->i_mode); ceph_put_fmode(ceph_inode(inode), fmode); BUG_ON(inode->i_fop->release == ceph_release); ret = inode->i_fop->open(inode, file); } return ret; }
/* * try renew caps after session gets killed. */ int ceph_renew_caps(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; spin_lock(&ci->i_ceph_lock); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { int issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", inode, ceph_cap_string(wanted), ceph_cap_string(issued)); ceph_check_caps(ci, 0, NULL); return 0; } spin_unlock(&ci->i_ceph_lock); flags = 0; if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) flags = O_RDWR; else if (wanted & CEPH_CAP_FILE_RD) flags = O_RDONLY; else if (wanted & CEPH_CAP_FILE_WR) flags = O_WRONLY; #ifdef O_LAZY if (wanted & CEPH_CAP_FILE_LAZYIO) flags |= O_LAZY; #endif req = prepare_open_request(inode->i_sb, flags, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } req->r_inode = inode; ihold(inode); req->r_num_caps = 1; req->r_fmode = -1; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); out: dout("renew caps %p open result=%d\n", inode, err); return err < 0 ? err : 0; }
int ceph_release(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *cf = file->private_data; dout("release inode %p file %p\n", inode, file); ceph_put_fmode(ci, cf->fmode); if (cf->last_readdir) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ wake_up_all(&ci->i_cap_wq); return 0; }
/* * get and set the file layout */ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) { struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout l; int err; err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (!err) { l.stripe_unit = ceph_file_layout_su(ci->i_layout); l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); l.object_size = ceph_file_layout_object_size(ci->i_layout); l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } return err; }
static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag; struct rb_node *rb_node; int i; u32 id, nsplits; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; } else
static void ceph_aio_complete(struct inode *inode, struct ceph_aio_request *aio_req) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; if (!atomic_dec_and_test(&aio_req->pending_reqs)) return; ret = aio_req->error; if (!ret) ret = aio_req->total_len; dout("ceph_aio_complete %p rc %d\n", inode, ret); if (ret >= 0 && aio_req->write) { int dirty; loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; if (endoff > i_size_read(inode)) { if (ceph_inode_set_size(inode, endoff)) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &aio_req->prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); } ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD)); aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); ceph_free_cap_flush(aio_req->prealloc_cf); kfree(aio_req); }
void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct super_block *sb = mdsc->fsc->sb; struct ceph_mds_quota *h = msg->front.iov_base; struct ceph_vino vino; struct inode *inode; struct ceph_inode_info *ci; if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); return; } /* increment msg sequence number */ mutex_lock(&session->s_mutex); session->s_seq++; mutex_unlock(&session->s_mutex); /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { pr_warn("Failed to find inode %llu\n", vino.ino); return; } ci = ceph_inode(inode); spin_lock(&ci->i_ceph_lock); ci->i_rbytes = le64_to_cpu(h->rbytes); ci->i_rfiles = le64_to_cpu(h->rfiles); ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); __ceph_update_quota(ci, le64_to_cpu(h->max_bytes), le64_to_cpu(h->max_files)); spin_unlock(&ci->i_ceph_lock); iput(inode); }
static long ceph_ioctl_lazyio(struct file *file) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&inode->i_lock); ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[fi->fmode]++; spin_unlock(&inode->i_lock); dout("ioctl_layzio: file %p marked lazy\n", file); ceph_check_caps(ci, 0, NULL); } else { dout("ioctl_layzio: file %p already lazy\n", file); } return 0; }
static void ceph_aio_retry_work(struct work_struct *work) { struct ceph_aio_work *aio_work = container_of(work, struct ceph_aio_work, work); struct ceph_osd_request *orig_req = aio_work->req; struct ceph_aio_request *aio_req = orig_req->r_priv; struct inode *inode = orig_req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; struct ceph_osd_request *req; int ret; spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snapc = ceph_get_snap_context(capsnap->context); } else {
void ceph_destroy_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag; struct rb_node *n; dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); ceph_queue_caps_release(inode); /* * we may still have a snap_realm reference if there are stray * caps in i_cap_exporting_issued or i_snap_caps. */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(mdsc, realm); } kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); kfree(frag); } __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); call_rcu(&inode->i_rcu, ceph_i_callback); }
/* * get/constuct snapdir inode for a given directory */ struct inode *ceph_get_snapdir(struct inode *parent) { struct ceph_vino vino = { .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; struct inode *inode = ceph_get_inode(parent->i_sb, vino); struct ceph_inode_info *ci = ceph_inode(inode); BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) return inode; inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; return inode; }
int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; if (cf) { dout("open file %p is already opened\n", file); return 0; } flags = file->f_flags & ~(O_CREAT|O_EXCL); if (S_ISDIR(inode->i_mode)) flags = O_DIRECTORY; dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, ceph_vinop(inode), file, flags, file->f_flags); fmode = ceph_flags_to_mode(flags); wanted = ceph_caps_for_mode(fmode); if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) return -EROFS; if (ceph_snap(inode) == CEPH_SNAPDIR) { spin_lock(&ci->i_ceph_lock); __ceph_get_fmode(ci, fmode); spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); dout("open %p fmode %d want %s issued %s using existing\n", inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); __ceph_get_fmode(ci, fmode); spin_unlock(&ci->i_ceph_lock); if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) ceph_check_caps(ci, 0, NULL); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_get_fmode(ci, fmode); spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } req->r_inode = inode; ihold(inode); req->r_num_caps = 1; if (flags & (O_CREAT|O_TRUNC)) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); out: return err; }
return err; } static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; <<<<<<< HEAD ======= <<<<<<< HEAD >>>>>>> ae1773bb70f3d7cf73324ce8fba787e01d8fa9f2 struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); struct ceph_ioctl_layout nl; int err, i; if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; /* validate changed params against current layout */ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); if (!err) { nl.stripe_unit = ceph_file_layout_su(ci->i_layout); nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); nl.object_size = ceph_file_layout_object_size(ci->i_layout); nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); nl.preferred_osd = (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
/* * find or create an inode, given the ceph ino number */ static int ceph_set_ino_cb(struct inode *inode, void *data) { ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); return 0; }
static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout nl; int err, i; if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; /* validate changed params against current layout */ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (!err) { nl.stripe_unit = ceph_file_layout_su(ci->i_layout); nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); nl.object_size = ceph_file_layout_object_size(ci->i_layout); nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); nl.preferred_osd = (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); } else return err; if (l.stripe_count) nl.stripe_count = l.stripe_count; if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; if (l.object_size) nl.object_size = l.object_size; if (l.data_pool) nl.data_pool = l.data_pool; if (l.preferred_osd) nl.preferred_osd = l.preferred_osd; if ((nl.object_size & ~PAGE_MASK) || (nl.stripe_unit & ~PAGE_MASK) || ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ if (l.data_pool > 0) { mutex_lock(&mdsc->mutex); err = -EINVAL; for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { err = 0; break; } mutex_unlock(&mdsc->mutex); if (err) return err; } req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); req->r_args.setlayout.layout.fl_stripe_count = cpu_to_le32(l.stripe_count); req->r_args.setlayout.layout.fl_object_size = cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); ceph_mdsc_put_request(req); return err; }
/* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) * * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; int io_align, page_align; int pages_left; int read; struct page **page_pos; int ret; bool hit_stripe, was_short; /* * we may need to do multiple reads. not atomic, unfortunately. */ pos = off; left = len; page_pos = pages; pages_left = num_pages; read = 0; io_align = off & ~PAGE_MASK; more: if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); if (ret == -ENOENT) ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret >= 0) { int didpages; if (was_short && (pos + ret < inode->i_size)) { u64 tmp = min(this_len - ret, inode->i_size - pos - ret); dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + tmp); ceph_zero_page_vector_range(page_align + read + ret, tmp, pages); ret += tmp; } didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; /* hit stripe and need continue*/ if (left && hit_stripe && pos < inode->i_size) goto more; }
/* * Do a lookup + open with a single request. If we get a non-existent * file or symlink, return 1 so the VFS can retry. */ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, int *opened) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct dentry *dn; int err; dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", dir, dentry, dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; err = ceph_init_dentry(dentry); if (err < 0) return err; /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) return PTR_ERR(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); if (err) goto out_err; err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); if (d_unhashed(dentry)) { dn = ceph_finish_lookup(req, dentry, err); if (IS_ERR(dn)) err = PTR_ERR(dn); } else { /* we were given a hashed negative dentry */ dn = NULL; } if (err) goto out_err; if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { ceph_init_acl(dentry, dentry->d_inode, dir); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); } out_err: if (!req->r_err && req->r_target_inode) ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); dout("atomic_open result=%d\n", err); return err; }
/* * If the filp already has private_data, that means the file was * already opened by intent during lookup, and we do nothing. * * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) * if our wanted caps set expands. */ int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; int err; int flags, fmode, wanted; if (cf) { dout("open file %p is already opened\n", file); return 0; } /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ flags = file->f_flags & ~(O_CREAT|O_EXCL); if (S_ISDIR(inode->i_mode)) flags = O_DIRECTORY; /* mds likes to know */ dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, ceph_vinop(inode), file, flags, file->f_flags); fmode = ceph_flags_to_mode(flags); wanted = ceph_caps_for_mode(fmode); /* snapped files are read-only */ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) return -EROFS; /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { spin_lock(&inode->i_lock); __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); return ceph_init_file(inode, file, fmode); } /* * No need to block if we have caps on the auth MDS (for * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); dout("open %p fmode %d want %s issued %s using existing\n", inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); /* adjust wanted? */ if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) ceph_check_caps(ci, 0, NULL); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); return ceph_init_file(inode, file, fmode); } spin_unlock(&inode->i_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } req->r_inode = igrab(inode); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, parent_inode, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); out: return err; }
static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout nl; int err; if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; /* validate changed params against current layout */ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (err) return err; memset(&nl, 0, sizeof(nl)); if (l.stripe_count) nl.stripe_count = l.stripe_count; else nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; else nl.stripe_unit = ceph_file_layout_su(ci->i_layout); if (l.object_size) nl.object_size = l.object_size; else nl.object_size = ceph_file_layout_object_size(ci->i_layout); if (l.data_pool) nl.data_pool = l.data_pool; else nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); /* this is obsolete, and always -1 */ nl.preferred_osd = le64_to_cpu(-1); err = __validate_layout(mdsc, &nl); if (err) return err; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); req->r_args.setlayout.layout.fl_stripe_count = cpu_to_le32(l.stripe_count); req->r_args.setlayout.layout.fl_object_size = cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); ceph_mdsc_put_request(req); return err; }