Example #1
/** Send a DONE_WRITING rpc. */
static void ll_done_writing(struct inode *inode)
{
	struct obd_client_handle *och = NULL;
	struct md_op_data *op_data;
	int rc;

	LASSERT(exp_connect_som(ll_i2mdexp(inode)));

	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
	if (!op_data)
		return;

	ll_prepare_done_writing(inode, op_data, &och);
	/* If there is no @och, we do not do DONE_WRITING yet. */
	if (och == NULL)
		goto out;

	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
	if (rc == -EAGAIN)
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr back to MDS. */
		rc = ll_som_update(inode, op_data);
	else if (rc)
		CERROR("inode %lu mdc done_writing failed: rc = %d\n",
		       inode->i_ino, rc);
out:
	ll_finish_md_op_data(op_data);
	if (och) {
		md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
		kfree(och);
	}
}
/** Send a DONE_WRITING rpc. */
static void ll_done_writing(struct inode *inode)
{
	struct obd_client_handle *och = NULL;
	struct md_op_data *op_data;
	int rc;

	LASSERT(exp_connect_som(ll_i2mdexp(inode)));

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL) {
		CERROR("can't allocate op_data\n");
		return;
	}

	ll_prepare_done_writing(inode, op_data, &och);
	/* If there is no @och, we do not do DONE_WRITING yet. */
	if (och == NULL)
		GOTO(out, 0);

	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
	if (rc == -EAGAIN) {
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr back to MDS. */
		rc = ll_som_update(inode, op_data);
	} else if (rc) {
		CERROR("inode %lu mdc done_writing failed: rc = %d\n",
		       inode->i_ino, rc);
	}
out:
	ll_finish_md_op_data(op_data);
	if (och) {
		md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
		OBD_FREE_PTR(och);
	}
}
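
The two variants above differ mainly in allocation style: the first uses upstream kzalloc()/kfree(), the second the older Lustre OBD_ALLOC_PTR()/OBD_FREE_PTR() wrappers and the GOTO() macro. The shared shape is the allocate, prepare, early-exit, RPC, cleanup-label idiom. A minimal user-space sketch of that idiom (all names here are illustrative stand-ins, not Lustre APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct op_data { char payload[64]; };		/* md_op_data stand-in */

static int send_rpc(struct op_data *op)		/* md_done_writing stand-in */
{
	(void)op;
	return 0;
}

static void done_writing_sketch(int have_handle)
{
	struct op_data *op;
	char *och = NULL;			/* open handle stand-in */
	int rc;

	op = calloc(1, sizeof(*op));		/* kzalloc analogue */
	if (!op)
		return;

	if (have_handle)
		och = strdup("och");		/* prepare_done_writing analogue */
	if (!och)
		goto out;			/* no handle: skip the RPC */

	rc = send_rpc(op);
	if (rc)
		fprintf(stderr, "done_writing failed: rc = %d\n", rc);
out:
	free(op);				/* cleanup runs on every path */
	free(och);
}

int main(void)
{
	done_writing_sketch(0);
	done_writing_sketch(1);
	return 0;
}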
Example #3
int ll_setxattr(struct dentry *dentry, const char *name,
		const void *value, size_t size, int flags)
{
	struct inode *inode = d_inode(dentry);

	LASSERT(inode);
	LASSERT(name);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
	       inode->i_ino, inode->i_generation, inode, name);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);

	if ((strncmp(name, XATTR_TRUSTED_PREFIX,
		     sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
	     strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
	    (strncmp(name, XATTR_LUSTRE_PREFIX,
		     sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
	     strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
		struct lov_user_md *lump = (struct lov_user_md *)value;
		int rc = 0;

		if (size != 0 && size < sizeof(struct lov_user_md))
			return -EINVAL;

		/* Attributes that are saved via getxattr will always have
		 * the stripe_offset as 0.  Instead, the MDS should be
		 * allowed to pick the starting OST index.   b=17846
		 */
		if (lump && lump->lmm_stripe_offset == 0)
			lump->lmm_stripe_offset = -1;

		if (lump && S_ISREG(inode->i_mode)) {
			int flags = FMODE_WRITE;
			int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
				sizeof(*lump) : sizeof(struct lov_user_md_v3);

			rc = ll_lov_setstripe_ea_info(inode, dentry, flags, lump,
						      lum_size);
			/* b10667: rc will always be 0 here for now */
			rc = 0;
		} else if (S_ISDIR(inode->i_mode)) {
			rc = ll_dir_setstripe(inode, lump, 0);
		}

		return rc;

	} else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
		   strcmp(name, XATTR_NAME_LINK) == 0)
		return 0;

	return ll_setxattr_common(inode, name, value, size, flags,
				  OBD_MD_FLXATTR);
}
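
ll_setxattr() turns a write to "trusted.lov"/"lustre.lov" into a striping request, and the comment above notes that descriptors saved via getxattr carry stripe_offset 0 so the MDS picks the starting OST. From user space this path is reached with the plain xattr syscalls; a hedged sketch that copies the striping descriptor from one file to a new, still-empty one (paths are placeholders for a mounted client):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	const char *src = "/mnt/lustre/striped-file";	/* assumed mount */
	const char *dst = "/mnt/lustre/new-file";	/* must have no data yet */
	char buf[4096];
	ssize_t len;

	len = getxattr(src, "lustre.lov", buf, sizeof(buf));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	/* apply before dst has objects; restriping a file with existing
	 * objects is typically rejected by the MDS */
	if (setxattr(dst, "lustre.lov", buf, (size_t)len, 0) < 0) {
		perror("setxattr");
		return 1;
	}
	printf("copied %zd-byte lov EA\n", len);
	return 0;
}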
Example #4
static int vvp_io_read_page(const struct lu_env *env,
                            const struct cl_io_slice *ios,
                            const struct cl_page_slice *slice)
{
        struct cl_io              *io     = ios->cis_io;
        struct cl_object          *obj    = slice->cpl_obj;
        struct ccc_page           *cp     = cl2ccc_page(slice);
        struct cl_page            *page   = slice->cpl_page;
        struct inode              *inode  = ccc_object_inode(obj);
        struct ll_sb_info         *sbi    = ll_i2sbi(inode);
        struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
        struct ll_readahead_state *ras    = &fd->fd_ras;
        struct page               *vmpage = cp->cpg_page;
        struct cl_2queue          *queue  = &io->ci_queue;
        int rc;

        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
        LASSERT(slice->cpl_obj == obj);

        ENTRY;

        if (sbi->ll_ra_info.ra_max_pages_per_file &&
            sbi->ll_ra_info.ra_max_pages)
                ras_update(sbi, inode, ras, page->cp_index,
                           cp->cpg_defer_uptodate);

        /* Sanity check whether the page is protected by a lock. */
        rc = cl_page_is_under_lock(env, io, page);
        if (rc != -EBUSY) {
                CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n",
                               rc == -ENODATA ? "without a lock" :
                               "match failed", rc);
                if (rc != -ENODATA)
                        RETURN(rc);
        }

        if (cp->cpg_defer_uptodate) {
                cp->cpg_ra_used = 1;
                cl_page_export(env, page, 1);
        }
        /*
         * Add page into the queue even when it is marked uptodate above.
         * this will unlock it automatically as part of cl_page_list_disown().
         */
        cl_2queue_add(queue, page);
        if (sbi->ll_ra_info.ra_max_pages_per_file &&
            sbi->ll_ra_info.ra_max_pages)
                ll_readahead(env, io, ras,
                             vmpage->mapping, &queue->c2_qin, fd->fd_flags);

        RETURN(0);
}
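
vvp_io_read_page() feeds the faulting page plus a readahead window into one queue so the pages go out in a single batch. The closest generic analogue available to a user-space program is advising the kernel ahead of a sequential scan; a minimal sketch using standard posix_fadvise(2):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "data.bin", O_RDONLY);
	char buf[1 << 16];
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* len == 0 means "to end of file" for both advices */
	posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);	/* start readahead */

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* consume sequentially */
	close(fd);
	return n < 0 ? 1 : 0;
}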
Example #5
/**
 * Client updates SOM attributes on the MDS (including llog cookies):
 * obd_getattr with no lock and md_setattr.
 */
int ll_som_update(struct inode *inode, struct md_op_data *op_data)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ptlrpc_request *request = NULL;
        __u32 old_flags;
        struct obdo *oa;
        int rc;
        ENTRY;

        LASSERT(op_data != NULL);
        if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
                CERROR("ino %lu/%u(flags %u) som valid just after "
                       "recovery\n", inode->i_ino, inode->i_generation,
                       lli->lli_flags);

        OBDO_ALLOC(oa);
        if (!oa) {
                CERROR("can't allocate memory for Size-on-MDS update.\n");
                RETURN(-ENOMEM);
        }

        old_flags = op_data->op_flags;
        op_data->op_flags = MF_SOM_CHANGE;

        /* If inode is already in another epoch, skip getattr from OSTs. */
        if (lli->lli_ioepoch == op_data->op_ioepoch) {
                rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
                                      old_flags & MF_GETATTR_LOCK);
                if (rc) {
                        oa->o_valid = 0;
                        if (rc != -ENOENT)
                                CERROR("inode_getattr failed (%d): unable to "
                                       "send a Size-on-MDS attribute update "
                                       "for inode %lu/%u\n", rc, inode->i_ino,
                                       inode->i_generation);
                } else {
                        CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
                               PFID(&lli->lli_fid));
                }
                /* Install attributes into op_data. */
                md_from_obdo(op_data, oa, oa->o_valid);
        }

        rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
                        NULL, 0, NULL, 0, &request, NULL);
        ptlrpc_req_finished(request);

        OBDO_FREE(oa);
        RETURN(rc);
}
Example #6
int ll_removexattr(struct dentry *dentry, const char *name)
{
	struct inode *inode = dentry->d_inode;

	LASSERT(inode);
	LASSERT(name);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
	       inode->i_ino, inode->i_generation, inode, name);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
	return ll_setxattr_common(inode, name, NULL, 0, 0,
				  OBD_MD_FLXATTRRM);
}
Example #7
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	bool retry;
	int result;

	ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
			   LPROC_LL_MKWRITE, 1);

	file_update_time(vma->vm_file);
        do {
                retry = false;
                result = ll_page_mkwrite0(vma, vmf->page, &retry);

                if (!printed && ++count > 16) {
			const struct dentry *de = file_dentry(vma->vm_file);

			CWARN("app(%s): the page %lu of file "DFID" is under"
			      " heavy contention\n",
			      current->comm, vmf->pgoff,
			      PFID(ll_inode2fid(de->d_inode)));
                        printed = true;
                }
        } while (retry);

        switch(result) {
        case 0:
                LASSERT(PageLocked(vmf->page));
                result = VM_FAULT_LOCKED;
                break;
        case -ENODATA:
        case -EFAULT:
                result = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                result = VM_FAULT_OOM;
                break;
        case -EAGAIN:
                result = VM_FAULT_RETRY;
                break;
        default:
                result = VM_FAULT_SIGBUS;
                break;
        }

        return result;
}
Example #8
static int vvp_io_write_start(const struct lu_env *env,
                              const struct cl_io_slice *ios)
{
        struct ccc_io      *cio   = cl2ccc_io(env, ios);
        struct cl_io       *io    = ios->cis_io;
        struct cl_object   *obj   = io->ci_obj;
        struct inode       *inode = ccc_object_inode(obj);
        struct file        *file  = cio->cui_fd->fd_file;
        ssize_t result = 0;
        loff_t pos = io->u.ci_wr.wr.crw_pos;
        size_t cnt = io->u.ci_wr.wr.crw_count;

        ENTRY;

	if (!can_populate_pages(env, io, inode))
		return 0;

        if (cl_io_is_append(io)) {
                /*
                 * PARALLEL IO This has to be changed for parallel IO doing
                 * out-of-order writes.
                 */
                pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
                cio->cui_iocb->ki_pos = pos;
        }

        CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);

        if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */
                result = 0;
        else
                result = lustre_generic_file_write(file, cio, &pos);

	if (result > 0) {
		if (result < cnt)
			io->ci_continue = 0;
		io->ci_nob += result;
		ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
				  cio->cui_fd, pos, result, WRITE);
		result = 0;
	}

	RETURN(result);
}
Example #9
int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
	rc = generic_file_mmap(file, vma);
	if (rc == 0) {
		vma->vm_ops = &ll_file_vm_ops;
		vma->vm_ops->open(vma);
		/* update the inode's size and mtime */
		rc = ll_glimpse_size(inode);
	}

	return rc;
}
Example #10
/* Find any ldlm lock of the inode in mdc and lov.
 * return 0     if no lock is found
 *        1     if a lock is found
 *      < 0     on error */
static int find_cbdata(struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        int rc = 0;
        ENTRY;

        LASSERT(inode);
        rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
                            return_if_equal, NULL);
        if (rc != 0)
                RETURN(rc);

        if (lli->lli_smd)
                rc = obd_find_cbdata(sbi->ll_dt_exp, lli->lli_smd,
                                     return_if_equal, NULL);

        RETURN(rc);
}
Example #11
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	int result;
	sigset_t set;

	/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
	 * so that the process can be killed by the admin but other
	 * signals do not cause a segfault. */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
			   LPROC_LL_FAULT, 1);

restart:
	result = ll_fault0(vma, vmf);
	if (!(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) {
		struct page *vmpage = vmf->page;

		/* check if this page has been truncated */
		lock_page(vmpage);
		if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
			unlock_page(vmpage);
			put_page(vmpage);
			vmf->page = NULL;

			if (!printed && ++count > 16) {
				CWARN("the page is under heavy contention, "
				      "maybe your app(%s) needs revising :-)\n",
				      current->comm);
				printed = true;
			}

			goto restart;
		}

		result |= VM_FAULT_LOCKED;
	}
	cfs_restore_sigs(set);
	return result;
}
Example #12
/** Queues DONE_WRITING if
 * - done writing is allowed;
 * - inode has no dirty pages; */
void ll_queue_done_writing(struct inode *inode, unsigned long flags)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
	ENTRY;

	spin_lock(&lli->lli_lock);
	lli->lli_flags |= flags;

	if ((lli->lli_flags & LLIF_DONE_WRITING) &&
	    cfs_list_empty(&club->cob_pending_list)) {
		struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;

		if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
			CWARN("ino %lu/%u(flags %u) som valid just after "
			      "recovery\n",
			      inode->i_ino, inode->i_generation,
			      lli->lli_flags);
		/* DONE_WRITING is allowed and inode has no dirty page. */
		spin_lock(&lcq->lcq_lock);

		LASSERT(cfs_list_empty(&lli->lli_close_list));
		CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
		       inode->i_ino, inode->i_generation);
		cfs_list_add_tail(&lli->lli_close_list, &lcq->lcq_head);

		/* Avoid a concurrent insertion into the close thread queue:
		 * an inode is already in the close thread, open(), write(),
		 * close() happen, epoch is closed as the inode is marked as
		 * LLIF_EPOCH_PENDING. When pages are written inode should not
		 * be inserted into the queue again, clear this flag to avoid
		 * it. */
		lli->lli_flags &= ~LLIF_DONE_WRITING;

		cfs_waitq_signal(&lcq->lcq_waitq);
		spin_unlock(&lcq->lcq_lock);
	}
	spin_unlock(&lli->lli_lock);
	EXIT;
}
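
ll_queue_done_writing() is the producer half of a classic handoff: append under a lock, clear the flag that guards against re-insertion, and signal the close thread. A compact pthread model of that handoff (names are illustrative, not Lustre APIs; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int ino; };

static struct node *close_list;
static pthread_mutex_t lcq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  lcq_waitq = PTHREAD_COND_INITIALIZER;

static void queue_done_writing(int ino)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->ino = ino;
	pthread_mutex_lock(&lcq_lock);
	n->next = close_list;			/* list_add analogue */
	close_list = n;
	pthread_cond_signal(&lcq_waitq);	/* cfs_waitq_signal analogue */
	pthread_mutex_unlock(&lcq_lock);
}

static void *close_thread(void *arg)
{
	struct node *n;

	(void)arg;
	pthread_mutex_lock(&lcq_lock);
	while (!close_list)
		pthread_cond_wait(&lcq_waitq, &lcq_lock);
	n = close_list;
	close_list = n->next;
	pthread_mutex_unlock(&lcq_lock);

	printf("closing inode %d\n", n->ino);
	free(n);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, close_thread, NULL);
	queue_done_writing(42);
	pthread_join(t, NULL);
	return 0;
}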
Example #13
void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
{
        LASSERT(it != NULL);
        LASSERT(dentry != NULL);

        if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
                struct inode *inode = dentry->d_inode;
                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);

                CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
                       inode, inode->i_ino, inode->i_generation);
                ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
        }

        /* drop lookup or getattr locks immediately */
        if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
                /* on 2.6 there are situations when several lookups and
                 * revalidations may be requested during a single operation;
                 * therefore, we don't release the intent here -bzzz */
                ll_intent_drop_lock(it);
        }
}
Example #14
static int ll_readlink_internal(struct inode *inode,
				struct ptlrpc_request **request, char **symname)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	int rc, symlen = i_size_read(inode) + 1;
	struct mdt_body *body;
	struct md_op_data *op_data;

	*request = NULL;

	if (lli->lli_symlink_name) {
		int print_limit = min_t(int, PAGE_SIZE - 128, symlen);

		*symname = lli->lli_symlink_name;
		/* If the total CDEBUG() size is larger than a page, it
		 * will print a warning to the console; avoid this by
		 * printing just the last part of the symlink. */
		CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
		       print_limit < symlen ? "..." : "", print_limit,
		       (*symname) + symlen - print_limit, symlen);
		return 0;
	}
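
The cached branch above sizes everything from i_size + 1. User-space callers follow the same convention with lstat() before readlink(2), which does not NUL-terminate; a minimal sketch (the path is a placeholder, and the race with a concurrently replaced link is ignored):

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/tmp/link";
	struct stat st;
	char *buf;
	ssize_t n;

	if (lstat(path, &st) < 0) {
		perror("lstat");
		return 1;
	}

	buf = malloc(st.st_size + 1);	/* symlen = i_size + 1 */
	if (!buf)
		return 1;

	n = readlink(path, buf, st.st_size + 1);
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';			/* readlink() does not do this */

	printf("%s -> %s (len %zd)\n", path, buf, n);
	free(buf);
	return 0;
}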
Example #15
int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_remote_perm *lrp = NULL, *tmp = NULL;
	struct hlist_head *head, *perm_hash = NULL;

	LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);

#if 0
	if (perm->rp_uid != current->uid ||
	    perm->rp_gid != current->gid ||
	    perm->rp_fsuid != current->fsuid ||
	    perm->rp_fsgid != current->fsgid) {
		/* user might setxid in this small period */
		CDEBUG(D_SEC,
		       "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
		       perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
		       perm->rp_fsgid, current->uid, current->gid,
		       current->fsuid, current->fsgid);
		return -EAGAIN;
	}
#endif

	if (!lli->lli_remote_perms) {
		perm_hash = alloc_rmtperm_hash();
		if (perm_hash == NULL) {
			CERROR("alloc lli_remote_perms failed!\n");
			return -ENOMEM;
		}
	}

	spin_lock(&lli->lli_lock);

	if (!lli->lli_remote_perms)
		lli->lli_remote_perms = perm_hash;
	else if (perm_hash)
		free_rmtperm_hash(perm_hash);

	head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);

again:
	hlist_for_each_entry(tmp, head, lrp_list) {
		if (tmp->lrp_uid != perm->rp_uid)
			continue;
		if (tmp->lrp_gid != perm->rp_gid)
			continue;
		if (tmp->lrp_fsuid != perm->rp_fsuid)
			continue;
		if (tmp->lrp_fsgid != perm->rp_fsgid)
			continue;
		if (lrp)
			free_ll_remote_perm(lrp);
		lrp = tmp;
		break;
	}

	if (!lrp) {
		spin_unlock(&lli->lli_lock);
		lrp = alloc_ll_remote_perm();
		if (!lrp) {
			CERROR("alloc memory for ll_remote_perm failed!\n");
			return -ENOMEM;
		}
		spin_lock(&lli->lli_lock);
		goto again;
	}

	lrp->lrp_access_perm = perm->rp_access_perm;
	if (lrp != tmp) {
		lrp->lrp_uid	 = perm->rp_uid;
		lrp->lrp_gid	 = perm->rp_gid;
		lrp->lrp_fsuid       = perm->rp_fsuid;
		lrp->lrp_fsgid       = perm->rp_fsgid;
		hlist_add_head(&lrp->lrp_list, head);
	}
	lli->lli_rmtperm_time = cfs_time_current();
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
	       lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
	       lrp->lrp_access_perm);

	return 0;
}
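
The `goto again` above exists because allocating while holding a spinlock is forbidden: the code drops lli_lock, allocates, retakes the lock, and must then re-scan since a racing thread may have inserted the same entry. A user-space skeleton of that discipline with a pthread mutex (illustrative names; compile with -pthread):

#include <pthread.h>
#include <stdlib.h>

struct perm { struct perm *next; unsigned uid; unsigned access; };

static struct perm *perm_list;
static pthread_mutex_t lli_lock = PTHREAD_MUTEX_INITIALIZER;

static int update_perm(unsigned uid, unsigned access)
{
	struct perm *p, *fresh = NULL;

	pthread_mutex_lock(&lli_lock);
again:
	for (p = perm_list; p; p = p->next)
		if (p->uid == uid)
			break;

	if (!p) {
		if (!fresh) {
			/* mirror the kernel rule: never allocate while
			 * holding the (spin)lock */
			pthread_mutex_unlock(&lli_lock);
			fresh = malloc(sizeof(*fresh));
			if (!fresh)
				return -1;
			pthread_mutex_lock(&lli_lock);
			goto again;	/* the list may have changed */
		}
		p = fresh;
		fresh = NULL;
		p->uid = uid;
		p->next = perm_list;
		perm_list = p;
	}
	p->access = access;
	pthread_mutex_unlock(&lli_lock);

	free(fresh);	/* lost the race: entry appeared while unlocked */
	return 0;
}

int main(void)
{
	update_perm(1000, 0700);
	update_perm(1000, 0755);	/* second call updates in place */
	return 0;
}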
Example #16
ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
	struct inode *inode = dentry->d_inode;
	int rc = 0, rc2 = 0;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *request = NULL;
	int lmmsize;

	LASSERT(inode);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
	       inode->i_ino, inode->i_generation, inode);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);

	rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
	if (rc < 0)
		GOTO(out, rc);

	if (buffer != NULL) {
		struct ll_sb_info *sbi = ll_i2sbi(inode);
		char *xattr_name = buffer;
		int xlen, rem = rc;

		while (rem > 0) {
			xlen = strnlen(xattr_name, rem - 1) + 1;
			rem -= xlen;
			if (xattr_type_filter(sbi,
					get_xattr_type(xattr_name)) == 0) {
				/* skip OK xattr type
				 * leave it in buffer
				 */
				xattr_name += xlen;
				continue;
			}
			/* move up remaining xattrs in buffer
			 * removing the xattr that is not OK
			 */
			memmove(xattr_name, xattr_name + xlen, rem);
			rc -= xlen;
		}
	}
	if (S_ISREG(inode->i_mode)) {
		if (!ll_i2info(inode)->lli_has_smd)
			rc2 = -1;
	} else if (S_ISDIR(inode->i_mode)) {
		rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
	}

	if (rc2 < 0) {
		GOTO(out, rc2 = 0);
	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
		const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
		const size_t name_len   = sizeof("lov") - 1;
		const size_t total_len  = prefix_len + name_len + 1;

		if (((rc + total_len) > size) && (buffer != NULL)) {
			ptlrpc_req_finished(request);
			return -ERANGE;
		}

		if (buffer != NULL) {
			buffer += rc;
			memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
			memcpy(buffer + prefix_len, "lov", name_len);
			buffer[prefix_len + name_len] = '\0';
		}
		rc2 = total_len;
	}
out:
	ptlrpc_req_finished(request);
	rc = rc + rc2;

	return rc;
}
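
ll_listxattr() returns the filtered, NUL-separated name list and appends a synthesized "lustre.lov" entry. The matching user-space pattern is the usual two-call sequence: probe for the size, then fetch and walk the names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t len = listxattr(path, NULL, 0);	/* size probe */
	char *buf, *p;

	if (len < 0) {
		perror("listxattr");
		return 1;
	}
	if (len == 0)
		return 0;	/* no xattrs at all */

	buf = malloc(len);
	if (!buf)
		return 1;

	len = listxattr(path, buf, len);
	if (len < 0) {
		perror("listxattr");
		return 1;
	}

	for (p = buf; p < buf + len; p += strlen(p) + 1)
		puts(p);	/* one NUL-terminated name per entry */
	free(buf);
	return 0;
}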
Example #17
ssize_t ll_getxattr(struct dentry *dentry, const char *name,
		    void *buffer, size_t size)
{
	struct inode *inode = dentry->d_inode;

	LASSERT(inode);
	LASSERT(name);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
	       inode->i_ino, inode->i_generation, inode, name);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);

	if ((strncmp(name, XATTR_TRUSTED_PREFIX,
		     sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
	     strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
	    (strncmp(name, XATTR_LUSTRE_PREFIX,
		     sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
	     strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
		struct lov_stripe_md *lsm;
		struct lov_user_md *lump;
		struct lov_mds_md *lmm = NULL;
		struct ptlrpc_request *request = NULL;
		int rc = 0, lmmsize = 0;

		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
			return -ENODATA;

		if (size == 0 && S_ISDIR(inode->i_mode)) {
			/* XXX directory EA is fixed for now; optimize later
			 * to save an RPC transfer */
			GOTO(out, rc = sizeof(struct lov_user_md));
		}

		lsm = ccc_inode_lsm_get(inode);
		if (lsm == NULL) {
			if (S_ISDIR(inode->i_mode)) {
				rc = ll_dir_getstripe(inode, &lmm,
						      &lmmsize, &request);
			} else {
				rc = -ENODATA;
			}
		} else {
			/* LSM is already present after the lookup/getattr
			 * call; we need to grab the layout lock once it is
			 * implemented */
			rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
			lmmsize = rc;
		}
		ccc_inode_lsm_put(inode, lsm);

		if (rc < 0)
			GOTO(out, rc);

		if (size == 0) {
			/* we used to call ll_get_max_mdsize() here to get
			 * the maximum buffer size, but some apps (such as
			 * rsync 3.0.x) care much about the exact xattr
			 * value size */
			rc = lmmsize;
			GOTO(out, rc);
		}

		if (size < lmmsize) {
			CERROR("server bug: replied size %d > %d for %s (%s)\n",
			       lmmsize, (int)size, dentry->d_name.name, name);
			GOTO(out, rc = -ERANGE);
		}

		lump = (struct lov_user_md *)buffer;
		memcpy(lump, lmm, lmmsize);
		/* do not return the layout gen for getxattr; otherwise it
		 * would confuse tar --xattr by recognizing the layout gen as
		 * a stripe offset when the file is restored. See LU-2809. */
		lump->lmm_layout_gen = 0;

		rc = lmmsize;
out:
		if (request)
			ptlrpc_req_finished(request);
		else if (lmm)
			obd_free_diskmd(ll_i2dtexp(inode), &lmm);
		return rc;
	}

	return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
}
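
The size == 0 branch above is why a zero-length probe returns the exact EA size rather than an upper bound (the comment cites rsync 3.0.x). The corresponding user-space pattern with getxattr(2):

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/mnt/lustre/file";
	ssize_t len = getxattr(path, "lustre.lov", NULL, 0);	/* probe */
	void *buf;

	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	buf = malloc(len);
	if (!buf)
		return 1;

	len = getxattr(path, "lustre.lov", buf, len);	/* real fetch */
	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	printf("lov EA is %zd bytes\n", len);
	free(buf);
	return 0;
}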
Example #18
static
int ll_getxattr_common(struct inode *inode, const char *name,
		       void *buffer, size_t size, __u64 valid)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	struct mdt_body *body;
	int xattr_type, rc;
	void *xdata;
	struct obd_capa *oc;
	struct rmtacl_ctl_entry *rce = NULL;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
	       inode->i_ino, inode->i_generation, inode);

	/* listxattr has slightly different behavior from that of ext3:
	 * without 'user_xattr' ext3 will list all xattr names but
	 * filter out "^user..*"; we list them all for simplicity.
	 */
	if (!name) {
		xattr_type = XATTR_OTHER_T;
		goto do_getxattr;
	}

	xattr_type = get_xattr_type(name);
	rc = xattr_type_filter(sbi, xattr_type);
	if (rc)
		return rc;

	/* b15587: ignore security.capability xattr for now */
	if ((xattr_type == XATTR_SECURITY_T &&
	    strcmp(name, "security.capability") == 0))
		return -ENODATA;

	/* LU-549:  Disable security.selinux when selinux is disabled */
	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
	    strcmp(name, "security.selinux") == 0)
		return -EOPNOTSUPP;

#ifdef CONFIG_FS_POSIX_ACL
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    (xattr_type == XATTR_ACL_ACCESS_T ||
	    xattr_type == XATTR_ACL_DEFAULT_T)) {
		rce = rct_search(&sbi->ll_rct, current_pid());
		if (rce == NULL ||
		    (rce->rce_ops != RMT_LSETFACL &&
		    rce->rce_ops != RMT_LGETFACL &&
		    rce->rce_ops != RMT_RSETFACL &&
		    rce->rce_ops != RMT_RGETFACL))
			return -EOPNOTSUPP;
	}

	/* The POSIX ACL is under the protection of the LOOKUP lock; when we
	 * get here, path resolution to the target inode has just completed,
	 * so there is a good chance that the cached ACL is up to date.
	 */
	if (xattr_type == XATTR_ACL_ACCESS_T &&
	    !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct posix_acl *acl;

		spin_lock(&lli->lli_lock);
		acl = posix_acl_dup(lli->lli_posix_acl);
		spin_unlock(&lli->lli_lock);

		if (!acl)
			return -ENODATA;

		rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
		posix_acl_release(acl);
		return rc;
	}
	if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
		return -ENODATA;
#endif

do_getxattr:
	if (sbi->ll_xattr_cache_enabled && (rce == NULL ||
					    rce->rce_ops == RMT_LGETFACL ||
					    rce->rce_ops == RMT_LSETFACL)) {
		rc = ll_xattr_cache_get(inode, name, buffer, size, valid);
		if (rc < 0)
			GOTO(out_xattr, rc);
	} else {
		oc = ll_mdscapa_get(inode);
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
				name, NULL, 0, size, 0, &req);
		capa_put(oc);

		if (rc < 0)
			GOTO(out_xattr, rc);

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		LASSERT(body);

		/* only detect the xattr size */
		if (size == 0)
			GOTO(out, rc = body->eadatasize);

		if (size < body->eadatasize) {
			CERROR("server bug: replied size %u > %u\n",
				body->eadatasize, (int)size);
			GOTO(out, rc = -ERANGE);
		}

		if (body->eadatasize == 0)
			GOTO(out, rc = -ENODATA);

		/* no need to swab xattr data */
		xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
							body->eadatasize);
		if (!xdata)
			GOTO(out, rc = -EFAULT);

		memcpy(buffer, xdata, body->eadatasize);
		rc = body->eadatasize;
	}

#ifdef CONFIG_FS_POSIX_ACL
	if (rce && rce->rce_ops == RMT_LSETFACL) {
		ext_acl_xattr_header *acl;

		acl = lustre_posix_acl_xattr_2ext(
					(posix_acl_xattr_header *)buffer, rc);
		if (IS_ERR(acl))
			GOTO(out, rc = PTR_ERR(acl));

		rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
			    xattr_type, acl);
		if (unlikely(rc < 0)) {
			lustre_ext_acl_xattr_free(acl);
			GOTO(out, rc);
		}
	}
#endif

out_xattr:
	if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
		LCONSOLE_INFO(
			"%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n",
			ll_get_fsname(inode->i_sb, NULL, 0), rc);
		sbi->ll_flags &= ~LL_SBI_USER_XATTR;
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
Example #19
static int vvp_io_read_start(const struct lu_env *env,
                             const struct cl_io_slice *ios)
{
        struct vvp_io     *vio   = cl2vvp_io(env, ios);
        struct ccc_io     *cio   = cl2ccc_io(env, ios);
        struct cl_io      *io    = ios->cis_io;
        struct cl_object  *obj   = io->ci_obj;
        struct inode      *inode = ccc_object_inode(obj);
        struct ll_ra_read *bead  = &vio->cui_bead;
        struct file       *file  = cio->cui_fd->fd_file;

        int     result;
        loff_t  pos = io->u.ci_rd.rd.crw_pos;
        long    cnt = io->u.ci_rd.rd.crw_count;
        long    tot = cio->cui_tot_count;
        int     exceed = 0;

        CLOBINVRNT(env, obj, ccc_object_invariant(obj));

        CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);

	if (!can_populate_pages(env, io, inode))
		return 0;

        result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
        if (result != 0)
                return result;
        else if (exceed != 0)
                goto out;

        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
                        "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
                        inode->i_ino, cnt, pos, i_size_read(inode));

        /* turn off the kernel's read-ahead */
        cio->cui_fd->fd_file->f_ra.ra_pages = 0;

        /* initialize read-ahead window once per syscall */
        if (!vio->cui_ra_window_set) {
                vio->cui_ra_window_set = 1;
                bead->lrr_start = cl_index(obj, pos);
		bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
                ll_ra_read_in(file, bead);
        }

        /* BUG: 5972 */
        file_accessed(file);
        switch (vio->cui_io_subtype) {
        case IO_NORMAL:
                 result = lustre_generic_file_read(file, cio, &pos);
                 break;
        case IO_SPLICE:
                result = generic_file_splice_read(file, &pos,
                                vio->u.splice.cui_pipe, cnt,
                                vio->u.splice.cui_flags);
                /* LU-1109: do splice read stripe by stripe, otherwise it
                 * may make nfsd stuck if this read occupies all internal
                 * pipe buffers. */
                io->ci_continue = 0;
                break;
        default:
                CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
                LBUG();
        }

out:
	if (result >= 0) {
		if (result < cnt)
			io->ci_continue = 0;
		io->ci_nob += result;
		ll_rw_stats_tally(ll_i2sbi(inode), current->pid, cio->cui_fd,
				  pos, result, READ);
		result = 0;
	}

	return result;
}
Example #20
static
int ll_setxattr_common(struct inode *inode, const char *name,
		       const void *value, size_t size,
		       int flags, __u64 valid)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	int xattr_type, rc;
	struct obd_capa *oc;
	struct rmtacl_ctl_entry *rce = NULL;
#ifdef CONFIG_FS_POSIX_ACL
	posix_acl_xattr_header *new_value = NULL;
	ext_acl_xattr_header *acl = NULL;
#endif
	const char *pv = value;

	xattr_type = get_xattr_type(name);
	rc = xattr_type_filter(sbi, xattr_type);
	if (rc)
		return rc;

	/* b10667: ignore lustre special xattr for now */
	if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
	    (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
		return 0;

	/* b15587: ignore security.capability xattr for now */
	if ((xattr_type == XATTR_SECURITY_T &&
	    strcmp(name, "security.capability") == 0))
		return 0;

	/* LU-549:  Disable security.selinux when selinux is disabled */
	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
	    strcmp(name, "security.selinux") == 0)
		return -EOPNOTSUPP;

#ifdef CONFIG_FS_POSIX_ACL
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    (xattr_type == XATTR_ACL_ACCESS_T ||
	    xattr_type == XATTR_ACL_DEFAULT_T)) {
		rce = rct_search(&sbi->ll_rct, current_pid());
		if (rce == NULL ||
		    (rce->rce_ops != RMT_LSETFACL &&
		    rce->rce_ops != RMT_RSETFACL))
			return -EOPNOTSUPP;

		if (rce->rce_ops == RMT_LSETFACL) {
			struct eacl_entry *ee;

			ee = et_search_del(&sbi->ll_et, current_pid(),
					   ll_inode2fid(inode), xattr_type);
			LASSERT(ee != NULL);
			if (valid & OBD_MD_FLXATTR) {
				acl = lustre_acl_xattr_merge2ext(
						(posix_acl_xattr_header *)value,
						size, ee->ee_acl);
				if (IS_ERR(acl)) {
					ee_free(ee);
					return PTR_ERR(acl);
				}
				size = CFS_ACL_XATTR_SIZE(
						le32_to_cpu(acl->a_count),
						ext_acl_xattr);
				pv = (const char *)acl;
			}
			ee_free(ee);
		} else if (rce->rce_ops == RMT_RSETFACL) {
			int filtered_size;

			filtered_size = lustre_posix_acl_xattr_filter(
						(posix_acl_xattr_header *)value,
						size, &new_value);
			/* size is a size_t, so the old "size < 0" check
			 * could never fire; keep the result signed */
			if (unlikely(filtered_size < 0))
				return filtered_size;
			size = filtered_size;

			pv = (const char *)new_value;
		} else
			return -EOPNOTSUPP;

		valid |= rce_ops2valid(rce->rce_ops);
	}
#endif
	if (sbi->ll_xattr_cache_enabled &&
	    (rce == NULL || rce->rce_ops == RMT_LSETFACL)) {
		rc = ll_xattr_cache_update(inode, name, pv, size, valid, flags);
	} else {
		oc = ll_mdscapa_get(inode);
		rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				valid, name, pv, size, 0, flags,
				ll_i2suppgid(inode), &req);
		capa_put(oc);
	}

#ifdef CONFIG_FS_POSIX_ACL
	if (new_value != NULL)
		lustre_posix_acl_xattr_free(new_value, size);
	if (acl != NULL)
		lustre_ext_acl_xattr_free(acl);
#endif
	if (rc) {
		if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
			LCONSOLE_INFO("Disabling user_xattr feature because "
				      "it is not supported on the server\n");
			sbi->ll_flags &= ~LL_SBI_USER_XATTR;
		}
		return rc;
	}

	ptlrpc_req_finished(req);
	return 0;
}
Example #21
int ll_revalidate_it(struct dentry *de, int lookup_flags,
                     struct lookup_intent *it)
{
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
        struct obd_export *exp;
        struct inode *parent = de->d_parent->d_inode;
        int rc, first = 0;

        ENTRY;
        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
               LL_IT2STR(it));

        if (de->d_inode == NULL) {
                /* We can only use negative dentries if this is stat or
                 * lookup; for opens and such we do need to query the
                 * server. */
                /* If IT_CREAT is in the intent op set, then we must throw
                 * away this negative dentry and actually send the request
                 * to create whatever needs to be created (if possible). */
                if (it && (it->it_op & IT_CREAT))
                        RETURN(0);

                if (de->d_flags & DCACHE_LUSTRE_INVALID)
                        RETURN(0);

                rc = ll_have_md_lock(parent, MDS_INODELOCK_UPDATE, LCK_MINMODE);
                GOTO(out_sa, rc);
        }

        /* Never execute intents for mount points.
         * Attributes will be fixed up in ll_inode_revalidate_it */
        if (d_mountpoint(de))
                GOTO(out_sa, rc = 1);

        /* need to get attributes in case root got changed from another client */
        if (de == de->d_sb->s_root) {
                rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
                if (rc == 0)
                        rc = 1;
                GOTO(out_sa, rc);
        }

        exp = ll_i2mdexp(de->d_inode);

        OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
        ll_frob_intent(&it, &lookup_it);
        LASSERT(it);

        if (it->it_op == IT_LOOKUP && !(de->d_flags & DCACHE_LUSTRE_INVALID))
                GOTO(out_sa, rc = 1);

        op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
                                     de->d_name.name, de->d_name.len,
                                     0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        if ((it->it_op == IT_OPEN) && de->d_inode) {
                struct inode *inode = de->d_inode;
                struct ll_inode_info *lli = ll_i2info(inode);
                struct obd_client_handle **och_p;
                __u64 *och_usecount;

                /*
                 * We used to check for MDS_INODELOCK_OPEN here, but in fact
                 * just having the LOOKUP lock is enough to justify that the
                 * inode is the same. And if the inode is the same and we have
                 * a suitable openhandle, then there is no point in doing
                 * another OPEN RPC just to throw away the newly received
                 * openhandle. There are no security implications either: if
                 * the file owner or access mode is changed, the LOOKUP lock
                 * is revoked.
                 */


                if (it->it_flags & FMODE_WRITE) {
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                } else if (it->it_flags & FMODE_EXEC) {
                        och_p = &lli->lli_mds_exec_och;
                        och_usecount = &lli->lli_open_fd_exec_count;
                } else {
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;
                }
                /* Check for the proper lock. */
                if (!ll_have_md_lock(inode, MDS_INODELOCK_LOOKUP, LCK_MINMODE))
                        goto do_lock;
                cfs_down(&lli->lli_och_sem);
                if (*och_p) { /* Everything is open already, do nothing */
                        /* (*och_usecount)++;  Do not let them steal our open
                         * handle from under us */
                        /* XXX The code above was my original idea, but in case
                         * we have the handle yet cannot use it due to later
                         * checks (e.g. O_CREAT|O_EXCL flags set), nobody
                         * would decrement the counter increased here. So we
                         * just hope the lock won't be invalidated in between.
                         * But if it is, we'll reopen the open request to the
                         * MDS later during the file open path. */
                        cfs_up(&lli->lli_och_sem);
                        ll_finish_md_op_data(op_data);
                        RETURN(1);
                } else {
                        cfs_up(&lli->lli_och_sem);
                }
        }

        if (it->it_op == IT_GETATTR) {
                first = ll_statahead_enter(parent, &de, 0);
                if (first == 1) {
                        ll_statahead_exit(parent, de, 1);
                        ll_finish_md_op_data(op_data);
                        GOTO(out, rc = 1);
                }
        }

do_lock:
        it->it_create_mode &= ~current->fs->umask;
        it->it_create_mode |= M_CHECK_STALE;
        rc = md_intent_lock(exp, op_data, NULL, 0, it,
                            lookup_flags,
                            &req, ll_md_blocking_ast, 0);
        it->it_create_mode &= ~M_CHECK_STALE;
        ll_finish_md_op_data(op_data);
        if (it->it_op == IT_GETATTR && !first)
                /* If there are too many locks on the client side, then some
                 * locks taken by statahead may be dropped automatically
                 * before the real "revalidate" uses them. */
                ll_statahead_exit(parent, de, req == NULL ? rc : 0);
        else if (first == -EEXIST)
                ll_statahead_mark(parent, de);

        /* If req is NULL, then md_intent_lock only tried to do a lock match;
         * if all was well, it will return 1 if it found locks, 0 otherwise. */
        if (req == NULL && rc >= 0) {
                if (!rc)
                        goto do_lookup;
                GOTO(out, rc);
        }

        if (rc < 0) {
                if (rc != -ESTALE) {
                        CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
                               "%d\n", rc, it->d.lustre.it_status);
                }
                GOTO(out, rc = 0);
        }

revalidate_finish:
        rc = ll_revalidate_it_finish(req, it, de);
        if (rc != 0) {
                if (rc != -ESTALE && rc != -ENOENT)
                        ll_intent_release(it);
                GOTO(out, rc = 0);
        }

        if ((it->it_op & IT_OPEN) && de->d_inode &&
            !S_ISREG(de->d_inode->i_mode) &&
            !S_ISDIR(de->d_inode->i_mode)) {
                ll_release_openhandle(de, it);
        }
        rc = 1;

        /* unfortunately ll_intent_lock may cause a callback and revoke our
         * dentry */
        cfs_spin_lock(&ll_lookup_lock);
        spin_lock(&dcache_lock);
        lock_dentry(de);
        __d_drop(de);
        unlock_dentry(de);
        d_rehash_cond(de, 0);
        spin_unlock(&dcache_lock);
        cfs_spin_unlock(&ll_lookup_lock);

out:
        /* We do not free the request as it may be reused during a following
         * lookup (see the comment in mdc/mdc_locks.c::mdc_intent_lock());
         * the request will be freed in ll_lookup_it or in ll_intent_release.
         * But if the request was not completed, we need to free it here.
         * (bug 5154, 9903) */
        if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
                ptlrpc_req_finished(req);
        if (rc == 0) {
                ll_unhash_aliases(de->d_inode);
                /* done in ll_unhash_aliases()
                   dentry->d_flags |= DCACHE_LUSTRE_INVALID; */
        } else {
                CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
                       "inode %p refc %d\n", de->d_name.len,
                       de->d_name.name, de, de->d_parent, de->d_inode,
                       atomic_read(&de->d_count));
                if (first != 1) {
                        if (de->d_flags & DCACHE_LUSTRE_INVALID) {
                                lock_dentry(de);
                                de->d_flags &= ~DCACHE_LUSTRE_INVALID;
                                unlock_dentry(de);
                        }
                        ll_lookup_finish_locks(it, de);
                }
        }
        RETURN(rc);

        /*
         * This part is here to combat an evil-evil race in real_lookup on 2.6
         * kernels. The race details are: we enter do_lookup() looking for some
         * name, there is nothing in the dcache for this name yet and
         * d_lookup() returns NULL. We proceed to real_lookup(), and while we
         * do this, another process does an open on the same file we are
         * looking up (simplest reproducer), the open succeeds and the dentry
         * is added. Now back to us. In real_lookup() we do d_lookup() again
         * and suddenly find the dentry, so we call d_revalidate on it, but
         * there is no lock, so without this code we would return 0, but
         * unpatched real_lookup just returns -ENOENT in such a case instead
         * of retrying the lookup. Once this is dealt with in real_lookup(),
         * all of this ugly mess can go and we can just check locks in
         * ->d_revalidate without doing any RPCs ever.
         */
do_lookup:
        if (it != &lookup_it) {
                /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
                if (it->it_op == IT_GETATTR)
                        lookup_it.it_op = IT_GETATTR;
                ll_lookup_finish_locks(it, de);
                it = &lookup_it;
        }

        /* Do real lookup here. */
        op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
                                     de->d_name.len, 0, (it->it_op & IT_CREAT ?
                                                         LUSTRE_OPC_CREATE :
                                                         LUSTRE_OPC_ANY), NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
                            ll_md_blocking_ast, 0);
        if (rc >= 0) {
                struct mdt_body *mdt_body;
                struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
                mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);

                if (de->d_inode)
                        fid = *ll_inode2fid(de->d_inode);

                /* see if we got same inode, if not - return error */
                if (lu_fid_eq(&fid, &mdt_body->fid1)) {
                        ll_finish_md_op_data(op_data);
                        op_data = NULL;
                        goto revalidate_finish;
                }
                ll_intent_release(it);
        }
        ll_finish_md_op_data(op_data);
        GOTO(out, rc = 0);

out_sa:
        /*
         * For rc == 1 case, should not return directly to prevent losing
         * statahead windows; for rc == 0 case, the "lookup" will be done later.
         */
        if (it && it->it_op == IT_GETATTR && rc == 1) {
                first = ll_statahead_enter(parent, &de, 0);
                if (first >= 0)
                        ll_statahead_exit(parent, de, 1);
                else if (first == -EEXIST)
                        ll_statahead_mark(parent, de);
        }

        return rc;
}

#if 0
static void ll_pin(struct dentry *de, struct vfsmount *mnt, int flag)
{
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_dentry_data *ldd = ll_d2d(de);
        struct obd_client_handle *handle;
        struct obd_capa *oc;
        int rc = 0;
        ENTRY;
        LASSERT(ldd);

        cfs_lock_kernel();
        /* Strictly speaking this introduces an additional race: the
         * increments should wait until the rpc has returned.
         * However, given that at present the function is void, this
         * issue is moot. */
        if (flag == 1 && (++ldd->lld_mnt_count) > 1) {
                cfs_unlock_kernel();
                EXIT;
                return;
        }

        if (flag == 0 && (++ldd->lld_cwd_count) > 1) {
                cfs_unlock_kernel();
                EXIT;
                return;
        }
        cfs_unlock_kernel();

        handle = (flag) ? &ldd->lld_mnt_och : &ldd->lld_cwd_och;
        oc = ll_mdscapa_get(inode);
        rc = obd_pin(sbi->ll_md_exp, ll_inode2fid(inode), oc, handle, flag);
        capa_put(oc);
        if (rc) {
                cfs_lock_kernel();
                memset(handle, 0, sizeof(*handle));
                if (flag == 0)
                        ldd->lld_cwd_count--;
                else
                        ldd->lld_mnt_count--;
                cfs_unlock_kernel();
        }

        EXIT;
        return;
}

static void ll_unpin(struct dentry *de, struct vfsmount *mnt, int flag)
{
        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct ll_dentry_data *ldd = ll_d2d(de);
        struct obd_client_handle handle;
        int count, rc = 0;
        ENTRY;
        LASSERT(ldd);

        cfs_lock_kernel();
        /* Strictly speaking this introduces an additional race: the
         * increments should wait until the rpc has returned.
         * However, given that at present the function is void, this
         * issue is moot. */
        handle = (flag) ? ldd->lld_mnt_och : ldd->lld_cwd_och;
        if (handle.och_magic != OBD_CLIENT_HANDLE_MAGIC) {
                /* the "pin" failed */
                cfs_unlock_kernel();
                EXIT;
                return;
        }

        if (flag)
                count = --ldd->lld_mnt_count;
        else
                count = --ldd->lld_cwd_count;
        cfs_unlock_kernel();

        if (count != 0) {
                EXIT;
                return;
        }

        rc = obd_unpin(sbi->ll_md_exp, &handle, flag);
        EXIT;
        return;
}
#endif

#ifdef HAVE_VFS_INTENT_PATCHES
int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
{
        int rc;
        ENTRY;

        if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
                rc = ll_revalidate_it(dentry, nd->flags, &nd->intent);
        else
                rc = ll_revalidate_it(dentry, 0, NULL);

        RETURN(rc);
}
Example #22
static int vvp_io_commit_write(const struct lu_env *env,
                               const struct cl_io_slice *ios,
                               const struct cl_page_slice *slice,
                               unsigned from, unsigned to)
{
        struct cl_object  *obj    = slice->cpl_obj;
        struct cl_io      *io     = ios->cis_io;
        struct ccc_page   *cp     = cl2ccc_page(slice);
        struct cl_page    *pg     = slice->cpl_page;
        struct inode      *inode  = ccc_object_inode(obj);
        struct ll_sb_info *sbi    = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct page       *vmpage = cp->cpg_page;

        int    result;
        int    tallyop;
        loff_t size;

        ENTRY;

        LINVRNT(cl_page_is_vmlocked(env, pg));
        LASSERT(vmpage->mapping->host == inode);

        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "commiting page write\n");
        CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);

        /*
         * queue a write for some time in the future the first time we
         * dirty the page.
         *
         * This is different from what other file systems do: they usually
         * just mark page (and some of its buffers) dirty and rely on
         * balance_dirty_pages() to start a write-back. Lustre wants write-back
         * to be started earlier for the following reasons:
         *
         *     (1) with a large number of clients we need to limit the amount
         *     of cached data on the clients a lot;
         *
         *     (2) large compute jobs generally want compute-only then io-only
         *     and the IO should complete as quickly as possible;
         *
         *     (3) IO is batched up to the RPC size and is async until the
         *     client max cache is hit
         *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
         *
         */
        if (!PageDirty(vmpage)) {
                tallyop = LPROC_LL_DIRTY_MISSES;
                result = cl_page_cache_add(env, io, pg, CRT_WRITE);
                if (result == 0) {
                        /* page was added into cache successfully. */
                        set_page_dirty(vmpage);
                        vvp_write_pending(cl2ccc(obj), cp);
                } else if (result == -EDQUOT) {
                        pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
                        bool need_clip = true;

                        /*
                         * Client ran out of disk space grant. Possible
                         * strategies are:
                         *
                         *     (a) do a sync write, renewing grant;
                         *
                         *     (b) stop writing on this stripe, switch to the
                         *     next one.
                         *
                         * (b) is a part of "parallel io" design that is the
                         * ultimate goal. (a) is what "old" client did, and
                         * what the new code continues to do for the time
                         * being.
                         */
                        if (last_index > pg->cp_index) {
                                to = PAGE_CACHE_SIZE;
                                need_clip = false;
                        } else if (last_index == pg->cp_index) {
                                int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
                                if (to < size_to)
                                        to = size_to;
                        }
                        if (need_clip)
                                cl_page_clip(env, pg, 0, to);
                        result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
                        if (result)
                                CERROR("Write page %lu of inode %p failed %d\n",
                                       pg->cp_index, inode, result);
                }
Example #23
int lustre_check_remote_perm(struct inode *inode, int mask)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	struct mdt_remote_perm *perm;
	struct obd_capa *oc;
	unsigned long save;
	int i = 0, rc;

	do {
		save = lli->lli_rmtperm_time;
		rc = do_check_remote_perm(lli, mask);
		if (!rc || (rc != -ENOENT && i))
			break;

		might_sleep();

		mutex_lock(&lli->lli_rmtperm_mutex);
		/* check again */
		if (save != lli->lli_rmtperm_time) {
			rc = do_check_remote_perm(lli, mask);
			if (!rc || (rc != -ENOENT && i)) {
				mutex_unlock(&lli->lli_rmtperm_mutex);
				break;
			}
		}

		if (i++ > 5) {
			CERROR("check remote perm falls in dead loop!\n");
			LBUG();
		}

		oc = ll_mdscapa_get(inode);
		rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
					ll_i2suppgid(inode), &req);
		capa_put(oc);
		if (rc) {
			mutex_unlock(&lli->lli_rmtperm_mutex);
			break;
		}

		perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
						   lustre_swab_mdt_remote_perm);
		if (unlikely(perm == NULL)) {
			mutex_unlock(&lli->lli_rmtperm_mutex);
			rc = -EPROTO;
			break;
		}

		rc = ll_update_remote_perm(inode, perm);
		mutex_unlock(&lli->lli_rmtperm_mutex);
		if (rc == -ENOMEM)
			break;

		ptlrpc_req_finished(req);
		req = NULL;
	} while (1);
	ptlrpc_req_finished(req);
	return rc;
}
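
The loop above re-reads lli_rmtperm_time after taking the mutex, so a refresh performed by a racing thread is not repeated. The same double-checked refresh idiom in stand-alone form (toy cache and illustrative names; the unlocked read of the generation counter mirrors the kernel code and is benign here):

#include <pthread.h>
#include <stdio.h>

static int table[4];			/* toy permission cache */
static unsigned long cache_time;	/* bumped on every refresh */
static pthread_mutex_t refresh_mutex = PTHREAD_MUTEX_INITIALIZER;

static int lookup_cached(int key)
{
	return table[key & 3] ? table[key & 3] : -1;	/* -1 == miss */
}

static void refresh_cache(void)		/* expensive RPC analogue */
{
	for (int i = 0; i < 4; i++)
		table[i] = 1;	/* pretend the server granted access */
}

static int check_perm(int key)
{
	unsigned long save;
	int rc;

	for (;;) {
		save = cache_time;
		rc = lookup_cached(key);
		if (rc >= 0)
			return rc;

		pthread_mutex_lock(&refresh_mutex);
		/* check again: a racing thread may have refreshed the
		 * cache while we were waiting for the mutex */
		if (save != cache_time) {
			rc = lookup_cached(key);
			if (rc >= 0) {
				pthread_mutex_unlock(&refresh_mutex);
				return rc;
			}
		}
		refresh_cache();
		cache_time++;
		pthread_mutex_unlock(&refresh_mutex);
	}
}

int main(void)
{
	printf("perm = %d\n", check_perm(7));
	return 0;
}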
Example #24
int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data,
		struct dir_context *ctx)
{
	struct ll_sb_info    *sbi	= ll_i2sbi(inode);
	__u64		   pos		= *ppos;
	int		   is_api32 = ll_need_32bit_api(sbi);
	int		   is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
	struct page	  *page;
	bool		   done = false;
	int		   rc = 0;

	page = ll_get_dir_page(inode, op_data, pos);

	while (rc == 0 && !done) {
		struct lu_dirpage *dp;
		struct lu_dirent  *ent;
		__u64 hash;
		__u64 next;

		if (IS_ERR(page)) {
			rc = PTR_ERR(page);
			break;
		}

		hash = MDS_DIR_END_OFF;
		dp = page_address(page);
		for (ent = lu_dirent_start(dp); ent && !done;
		     ent = lu_dirent_next(ent)) {
			__u16	  type;
			int	    namelen;
			struct lu_fid  fid;
			__u64	  lhash;
			__u64	  ino;

			hash = le64_to_cpu(ent->lde_hash);
			if (hash < pos)
				/*
				 * Skip until we find target hash
				 * value.
				 */
				continue;

			namelen = le16_to_cpu(ent->lde_namelen);
			if (namelen == 0)
				/*
				 * Skip dummy record.
				 */
				continue;

			if (is_api32 && is_hash64)
				lhash = hash >> 32;
			else
				lhash = hash;
			fid_le_to_cpu(&fid, &ent->lde_fid);
			ino = cl_fid_build_ino(&fid, is_api32);
			type = ll_dirent_type_get(ent);
			ctx->pos = lhash;
			/* For 'll_nfs_get_name_filldir()', it will try
			 * to access the 'ent' through its 'lde_name',
			 * so the parameter 'name' for 'ctx->actor()'
			 * must be part of the 'ent'.
			 */
			done = !dir_emit(ctx, ent->lde_name,
					 namelen, ino, type);
		}

		if (done) {
			pos = hash;
			ll_release_page(inode, page, false);
			break;
		}

		next = le64_to_cpu(dp->ldp_hash_end);
		pos = next;
		if (pos == MDS_DIR_END_OFF) {
			/*
			 * End of directory reached.
			 */
			done = 1;
			ll_release_page(inode, page, false);
		} else {
			/*
			 * Normal case: continue to the next
			 * page.
			 */
			ll_release_page(inode, page,
					le32_to_cpu(dp->ldp_flags) &
					LDF_COLLIDE);
			next = pos;
			page = ll_get_dir_page(inode, op_data, pos);
		}
	}
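
ll_dir_read() resumes a listing from a 64-bit hash cookie carried in *ppos. Plain POSIX directory streams expose the same resumable-cursor idea through telldir(3)/seekdir(3); a minimal sketch:

#include <dirent.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	DIR *d = opendir(argc > 1 ? argv[1] : ".");
	struct dirent *ent;
	long pos;
	int n = 0;

	if (!d) {
		perror("opendir");
		return 1;
	}
	pos = telldir(d);	/* cookie for the start of the stream */

	/* read a few entries, remembering the cookie after each one */
	while (n++ < 3 && (ent = readdir(d)))
		pos = telldir(d);

	rewinddir(d);		/* lose our place... */
	seekdir(d, pos);	/* ...and jump straight back to it */

	while ((ent = readdir(d)))
		puts(ent->d_name);

	closedir(d);
	return 0;
}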
Example #25
/**
 * Lustre implementation of a vm_operations_struct::fault() method, called by
 * the VM to serve a page fault (both in kernel and user space).
 *
 * \param vma - is the virtual area struct related to the page fault
 * \param vmf - structure which describes the type and address of the fault
 *
 * \return allocated and filled _locked_ page for the faulting address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM if there is no memory to allocate a new page
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct lu_env           *env;
	struct cl_io            *io;
	struct vvp_io           *vio = NULL;
	struct page             *vmpage;
	unsigned long            ra_flags;
	int                      result = 0;
	int                      fault_ret = 0;
	__u16			 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) {
		/* do fast fault */
		ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP);
		fault_ret = filemap_fault(vma, vmf);
		ll_cl_remove(vma->vm_file, env);

		/* - If there is no error, then the page was found in cache and
		 *   uptodate;
		 * - If VM_FAULT_RETRY is set, the page existed but failed to
		 *   lock. It will return to kernel and retry;
		 * - Otherwise, it should try normal fault under DLM lock. */
		if ((fault_ret & VM_FAULT_RETRY) ||
		    !(fault_ret & VM_FAULT_ERROR))
			GOTO(out, result = 0);

		fault_ret = 0;
	}

	io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags);
	if (IS_ERR(io))
		GOTO(out, result = PTR_ERR(io));

	result = io->ci_result;
	if (result == 0) {
		vio = vvp_env_io(env);
		vio->u.fault.ft_vma       = vma;
		vio->u.fault.ft_vmpage    = NULL;
		vio->u.fault.ft_vmf = vmf;
		vio->u.fault.ft_flags = 0;
		vio->u.fault.ft_flags_valid = 0;

		/* May call ll_readpage() */
		ll_cl_add(vma->vm_file, env, io, LCC_MMAP);

		result = cl_io_loop(env, io);

		ll_cl_remove(vma->vm_file, env);

		/* ft_flags are only valid if we reached
		 * the call to filemap_fault */
		if (vio->u.fault.ft_flags_valid)
			fault_ret = vio->u.fault.ft_flags;

		vmpage = vio->u.fault.ft_vmpage;
		if (result != 0 && vmpage != NULL) {
			put_page(vmpage);
			vmf->page = NULL;
		}
	}
	cl_io_fini(env, io);

	vma->vm_flags |= ra_flags;

out:
	cl_env_put(env, &refcheck);
	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
		fault_ret |= to_fault_error(result);

	CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result);
	RETURN(fault_ret);
}