Example no. 1
0
int filter_recov_log_mds_ost_cb(struct llog_handle *llh,
                                struct llog_rec_hdr *rec, void *data)
{
        struct llog_ctxt *ctxt = llh->lgh_ctxt;
        struct llog_cookie cookie;
        int rc = 0;
        ENTRY;

        if (ctxt->loc_obd->obd_stopping)
                RETURN(LLOG_PROC_BREAK);

        if (rec == NULL) {
                cfs_spin_lock(&ctxt->loc_obd->u.filter.fo_flags_lock);
                ctxt->loc_obd->u.filter.fo_mds_ost_sync = 0;
                cfs_spin_unlock(&ctxt->loc_obd->u.filter.fo_flags_lock);
                RETURN(0);
        }

        if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
                CERROR("log is not plain\n");
                RETURN(-EINVAL);
        }

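        /* Fault-injection point: if fail_loc is set to
         * OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT, pause llog recovery here for
         * 30 seconds so tests can simulate a slow recovery. */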
        OBD_FAIL_TIMEOUT(OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT, 30);
        cookie.lgc_lgl = llh->lgh_id;
        cookie.lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
        cookie.lgc_index = rec->lrh_index;

        switch (rec->lrh_type) {
        case MDS_UNLINK_REC:
                rc = filter_recov_log_unlink_cb(ctxt, rec, &cookie);
                break;
        case MDS_SETATTR_REC:
        case MDS_SETATTR64_REC:
                rc = filter_recov_log_setattr_cb(ctxt, rec, &cookie);
                break;
        case LLOG_GEN_REC: {
                struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
                if (llog_gen_lt(lgr->lgr_gen, ctxt->loc_gen))
                        rc = 0;
                else
                        rc = LLOG_PROC_BREAK;
                CDEBUG(D_HA, "fetch generation log, send cookie\n");
                llog_cancel(ctxt, NULL, 1, &cookie, 0);
                RETURN(rc);
                }
        default:
                CERROR("log record type %08x unknown\n", rec->lrh_type);
                RETURN(-EINVAL);
        }

        RETURN(rc);
}
Example no. 2
0
static int osc_io_write_start(const struct lu_env *env,
			      const struct cl_io_slice *slice)
{
	struct cl_object *obj   = slice->cis_obj;
	struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
	int rc = 0;

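	/* Fault-injection point: OBD_FAIL_OSC_DELAY_SETTIME delays the
	 * timestamp update below by 1 second when set via fail_loc. */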
	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
	cl_object_attr_lock(obj);
	attr->cat_mtime = attr->cat_ctime = LTIME_S(CURRENT_TIME);
	rc = cl_object_attr_set(env, obj, attr, CAT_MTIME | CAT_CTIME);
	cl_object_attr_unlock(obj);

	return rc;
}
Example no. 3
0
int cl_glimpse_size0(struct inode *inode, int agl)
{
        /*
         * We don't need ast_flags argument to cl_glimpse_size(), because
         * osc_lock_enqueue() takes care of the possible deadlock that said
         * argument was introduced to avoid.
         */
        /*
         * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
         * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
         * blocking anyway.
         */
	struct lu_env *env = NULL;
	struct cl_io *io = NULL;
	__u16 refcheck;
	int retried = 0;
	int result;

	ENTRY;

	result = cl_io_get(inode, &env, &io, &refcheck);
	if (result <= 0)
		RETURN(result);

	do {
		io->ci_ndelay_tried = retried++;
		io->ci_ndelay = io->ci_verify_layout = 1;
		result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj);
		if (result > 0) {
			/*
			 * nothing to do for this I/O. This currently happens
			 * when the stripe sub-objects are not yet created.
			 */
			result = io->ci_result;
		} else if (result == 0) {
			result = cl_glimpse_lock(env, io, inode, io->ci_obj,
						 agl);
			if (!agl && result == -EWOULDBLOCK)
				io->ci_need_restart = 1;
		}

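		/* Fault-injection point: OBD_FAIL_GLIMPSE_DELAY stalls the
		 * glimpse for 2 seconds before the io is torn down, letting
		 * tests race glimpse against other lock activity. */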
		OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
		cl_io_fini(env, io);
	} while (unlikely(io->ci_need_restart));

	cl_env_put(env, &refcheck);
	RETURN(result);
}
Example no. 4
0
static int osc_io_write_start(const struct lu_env *env,
                              const struct cl_io_slice *slice)
{
        struct osc_io    *oio   = cl2osc_io(env, slice);
        struct cl_object *obj   = slice->cis_obj;
        struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
        int              result = 0;
        ENTRY;

        if (oio->oi_lockless == 0) {
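		/* Fault-injection point: same fail_loc hook as the newer
		 * variant above; delay the attribute update by 1 second. */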
		OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
                cl_object_attr_lock(obj);
                result = cl_object_attr_get(env, obj, attr);
                if (result == 0) {
                        attr->cat_mtime = attr->cat_ctime =
                                LTIME_S(CFS_CURRENT_TIME);
                        result = cl_object_attr_set(env, obj, attr,
                                                    CAT_MTIME | CAT_CTIME);
                }
                cl_object_attr_unlock(obj);
        }
        RETURN(result);
}
Example no. 5
0
/* When this (destroy) operation is committed, return the cancel cookie */
void filter_cancel_cookies_cb(struct obd_device *obd, __u64 transno,
                              void *cb_data, int error)
{
        struct llog_cookie *cookie = cb_data;
        struct obd_llog_group *olg;
        struct llog_ctxt *ctxt;
        int rc;

        /* we have to find context for right group */
        if (error != 0 || obd->obd_stopping) {
                CDEBUG(D_INODE, "not cancelling logcookie: err %d stopping %d\n",
                       error, obd->obd_stopping);
                GOTO(out, rc = 0);
        }

        olg = filter_find_olg(obd, cookie->lgc_lgl.lgl_oseq);
        if (!olg) {
                CDEBUG(D_HA, "unknown group "LPU64"!\n", cookie->lgc_lgl.lgl_oseq);
                GOTO(out, rc = 0);
        }

        ctxt = llog_group_get_ctxt(olg, cookie->lgc_subsys + 1);
        if (!ctxt) {
                CERROR("no valid context for group "LPU64"\n",
                        cookie->lgc_lgl.lgl_oseq);
                GOTO(out, rc = 0);
        }

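        /* Fault-injection point: OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT holds
         * the cookie cancellation here for 30 seconds under test. */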
        OBD_FAIL_TIMEOUT(OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT, 30);

        rc = llog_cancel(ctxt, NULL, 1, cookie, 0);
        if (rc)
                CERROR("error cancelling log cookies: rc = %d\n", rc);
        llog_ctxt_put(ctxt);
out:
        OBD_FREE(cookie, sizeof(*cookie));
}
Example no. 6
0
/**
 * Send request \a request.
 * If \a noreply is set, don't expect any reply back and don't set up
 * reply buffers.
 * Returns 0 on success, or an error code on failure.
 */
int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
{
	int rc;
	int rc2;
	int mpflag = 0;
	struct ptlrpc_connection *connection;
	lnet_handle_me_t reply_me_h;
	lnet_md_t reply_md;
	struct obd_device *obd = request->rq_import->imp_obd;

	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
		return 0;

	LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
	LASSERT(request->rq_wait_ctx == 0);

	/* If this is a re-transmit, we're required to have disengaged
	 * cleanly from the previous attempt */
	LASSERT(!request->rq_receiving_reply);
	LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) &&
		(request->rq_import->imp_state == LUSTRE_IMP_FULL)));

	if (unlikely(obd != NULL && obd->obd_fail)) {
		CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
			obd->obd_name);
		/* this prevents us from waiting in ptlrpc_queue_wait */
		spin_lock(&request->rq_lock);
		request->rq_err = 1;
		spin_unlock(&request->rq_lock);
		request->rq_status = -ENODEV;
		return -ENODEV;
	}

	connection = request->rq_import->imp_connection;

	lustre_msg_set_handle(request->rq_reqmsg,
			      &request->rq_import->imp_remote_handle);
	lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
	lustre_msg_set_conn_cnt(request->rq_reqmsg,
				request->rq_import->imp_conn_cnt);
	lustre_msghdr_set_flags(request->rq_reqmsg,
				request->rq_import->imp_msghdr_flags);

	if (request->rq_resend)
		lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);

	if (request->rq_memalloc)
		mpflag = cfs_memory_pressure_get_and_set();

	rc = sptlrpc_cli_wrap_request(request);
	if (rc)
		goto out;

	/* bulk register should be done after wrap_request() */
	if (request->rq_bulk != NULL) {
		rc = ptlrpc_register_bulk(request);
		if (rc != 0)
			goto out;
	}

	if (!noreply) {
		LASSERT(request->rq_replen != 0);
		if (request->rq_repbuf == NULL) {
			LASSERT(request->rq_repdata == NULL);
			LASSERT(request->rq_repmsg == NULL);
			rc = sptlrpc_cli_alloc_repbuf(request,
						      request->rq_replen);
			if (rc) {
				/* this prevents us from looping in
				 * ptlrpc_queue_wait */
				spin_lock(&request->rq_lock);
				request->rq_err = 1;
				spin_unlock(&request->rq_lock);
				request->rq_status = rc;
				goto cleanup_bulk;
			}
		} else {
			request->rq_repdata = NULL;
			request->rq_repmsg = NULL;
		}

		rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
				  connection->c_peer, request->rq_xid, 0,
				  LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
		if (rc != 0) {
			CERROR("LNetMEAttach failed: %d\n", rc);
			LASSERT(rc == -ENOMEM);
			rc = -ENOMEM;
			goto cleanup_bulk;
		}
	}

	spin_lock(&request->rq_lock);
	/* If the MD attach succeeds, there _will_ be a reply_in callback */
	request->rq_receiving_reply = !noreply;
	request->rq_req_unlink = 1;
	/* We are responsible for unlinking the reply buffer */
	request->rq_reply_unlink = !noreply;
	/* Clear any flags that may be present from previous sends. */
	request->rq_replied = 0;
	request->rq_err = 0;
	request->rq_timedout = 0;
	request->rq_net_err = 0;
	request->rq_resend = 0;
	request->rq_restart = 0;
	request->rq_reply_truncate = 0;
	spin_unlock(&request->rq_lock);

	if (!noreply) {
		reply_md.start = request->rq_repbuf;
		reply_md.length = request->rq_repbuf_len;
		/* Allow multiple early replies */
		reply_md.threshold = LNET_MD_THRESH_INF;
		/* Manage remote for early replies */
		reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
			LNET_MD_MANAGE_REMOTE |
			LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */
		reply_md.user_ptr = &request->rq_reply_cbid;
		reply_md.eq_handle = ptlrpc_eq_h;

		/* We must see the unlink callback to unset rq_reply_unlink,
		   so we can't auto-unlink */
		rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
				  &request->rq_reply_md_h);
		if (rc != 0) {
			CERROR("LNetMDAttach failed: %d\n", rc);
			LASSERT(rc == -ENOMEM);
			spin_lock(&request->rq_lock);
			/* ...but the MD attach didn't succeed... */
			request->rq_receiving_reply = 0;
			spin_unlock(&request->rq_lock);
			rc = -ENOMEM;
			goto cleanup_me;
		}

		CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n",
		       request->rq_repbuf_len, request->rq_xid,
		       request->rq_reply_portal);
	}

	/* add references on request for request_out_callback */
	ptlrpc_request_addref(request);
	if (obd != NULL && obd->obd_svc_stats != NULL)
		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
			atomic_read(&request->rq_import->imp_inflight));

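	/* Fault-injection point: OBD_FAIL_PTLRPC_DELAY_SEND holds the send
	 * for rq_timeout + 5 seconds, long enough for the request to time
	 * out locally in tests. */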
	OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);

	ktime_get_real_ts64(&request->rq_arrival_time);
	request->rq_sent = ktime_get_real_seconds();
	/* We give the server rq_timeout secs to process the req, and
	   add the network latency for our local timeout. */
	request->rq_deadline = request->rq_sent + request->rq_timeout +
		ptlrpc_at_get_net_latency(request);

	ptlrpc_pinger_sending_on_import(request->rq_import);

	DEBUG_REQ(D_INFO, request, "send flg=%x",
		  lustre_msg_get_flags(request->rq_reqmsg));
	rc = ptl_send_buf(&request->rq_req_md_h,
			  request->rq_reqbuf, request->rq_reqdata_len,
			  LNET_NOACK_REQ, &request->rq_req_cbid,
			  connection,
			  request->rq_request_portal,
			  request->rq_xid, 0);
	if (rc == 0)
		goto out;

	ptlrpc_req_finished(request);
	if (noreply)
		goto out;

 cleanup_me:
	/* MEUnlink is safe; the PUT didn't even get off the ground, and
	 * nobody apart from the PUT's target has the right nid+XID to
	 * access the reply buffer. */
	rc2 = LNetMEUnlink(reply_me_h);
	LASSERT(rc2 == 0);
	/* UNLINKED callback called synchronously */
	LASSERT(!request->rq_receiving_reply);

 cleanup_bulk:
	/* We unlink synchronously here: since no real transfer took place,
	 * the chance of a long unlink over a sluggish net is smaller. */
	ptlrpc_unregister_bulk(request, 0);
 out:
	if (request->rq_memalloc)
		cfs_memory_pressure_restore(mpflag);
	return rc;
}
Example no. 7
0
/**
 * Callback handler for receiving incoming completion ASTs.
 *
 * This only can happen on client side.
 */
static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
				    struct ldlm_namespace *ns,
				    struct ldlm_request *dlm_req,
				    struct ldlm_lock *lock)
{
	int lvb_len;
	LIST_HEAD(ast_list);
	int rc = 0;

	LDLM_DEBUG(lock, "client completion callback handler START");

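	/* Fault-injection point: OBD_FAIL_LDLM_CANCEL_BL_CB_RACE makes this
	 * handler poll below until the lock is granted or destroyed, to
	 * provoke the cancel vs. blocking-callback race under test. */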
	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
		int to = cfs_time_seconds(1);
		while (to > 0) {
			schedule_timeout_and_set_state(
				TASK_INTERRUPTIBLE, to);
			if (lock->l_granted_mode == lock->l_req_mode ||
			    lock->l_flags & LDLM_FL_DESTROYED)
				break;
		}
	}

	lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
	if (lvb_len < 0) {
		LDLM_ERROR(lock, "Failed to get lvb_len, rc = %d", lvb_len);
		GOTO(out, rc = lvb_len);
	} else if (lvb_len > 0) {
		if (lock->l_lvb_len > 0) {
			/* for extent lock, lvb contains ost_lvb{}. */
			LASSERT(lock->l_lvb_data != NULL);

			if (unlikely(lock->l_lvb_len < lvb_len)) {
				LDLM_ERROR(lock, "Replied LVB is larger than "
					   "expectation, expected = %d, "
					   "replied = %d",
					   lock->l_lvb_len, lvb_len);
				GOTO(out, rc = -EINVAL);
			}
		} else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
						     * variable length */
			void *lvb_data;

			OBD_ALLOC(lvb_data, lvb_len);
			if (lvb_data == NULL) {
				LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
				GOTO(out, rc = -ENOMEM);
			}

			lock_res_and_lock(lock);
			LASSERT(lock->l_lvb_data == NULL);
			lock->l_lvb_data = lvb_data;
			lock->l_lvb_len = lvb_len;
			unlock_res_and_lock(lock);
		}
	}

	lock_res_and_lock(lock);
	if ((lock->l_flags & LDLM_FL_DESTROYED) ||
	    lock->l_granted_mode == lock->l_req_mode) {
		/* bug 11300: the lock has already been granted */
		unlock_res_and_lock(lock);
		LDLM_DEBUG(lock, "Double grant race happened");
		GOTO(out, rc = 0);
	}

	/* If we receive the completion AST before the actual enqueue returned,
	 * then we might need to switch lock modes, resources, or extents. */
	if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
		lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
		LDLM_DEBUG(lock, "completion AST, new lock mode");
	}

	if (lock->l_resource->lr_type != LDLM_PLAIN) {
		ldlm_convert_policy_to_local(req->rq_export,
					  dlm_req->lock_desc.l_resource.lr_type,
					  &dlm_req->lock_desc.l_policy_data,
					  &lock->l_policy_data);
		LDLM_DEBUG(lock, "completion AST, new policy data");
	}

	ldlm_resource_unlink_lock(lock);
	if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
		   &lock->l_resource->lr_name,
		   sizeof(lock->l_resource->lr_name)) != 0) {
		unlock_res_and_lock(lock);
		rc = ldlm_lock_change_resource(ns, lock,
				&dlm_req->lock_desc.l_resource.lr_name);
		if (rc < 0) {
			LDLM_ERROR(lock, "Failed to allocate resource");
			GOTO(out, rc);
		}
		LDLM_DEBUG(lock, "completion AST, new resource");
		CERROR("change resource!\n");
		lock_res_and_lock(lock);
	}

	if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
		/* BL_AST locks are not needed in LRU.
		 * Let ldlm_cancel_lru() be fast. */
		ldlm_lock_remove_from_lru(lock);
		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
		LDLM_DEBUG(lock, "completion AST includes blocking AST");
	}

	if (lock->l_lvb_len > 0) {
		rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
				   lock->l_lvb_data, lvb_len);
		if (rc < 0) {
			unlock_res_and_lock(lock);
			GOTO(out, rc);
		}
	}

	ldlm_grant_lock(lock, &ast_list);
	unlock_res_and_lock(lock);

	LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");

	/* Let the enqueue path call osc_lock_upcall() and initialize
	 * l_ast_data */
	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);

	ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);

	LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
			  lock);
	GOTO(out, rc);

out:
	if (rc < 0) {
		lock_res_and_lock(lock);
		lock->l_flags |= LDLM_FL_FAILED;
		unlock_res_and_lock(lock);
		wake_up(&lock->l_waitq);
	}
	LDLM_LOCK_RELEASE(lock);
}
Example no. 8
0
int ll_revalidate_it(struct dentry *de, int lookup_flags,
                     struct lookup_intent *it)
{
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
        struct obd_export *exp;
        struct inode *parent = de->d_parent->d_inode;
        int rc, first = 0;

        ENTRY;
        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
               LL_IT2STR(it));

        if (de->d_inode == NULL) {
                /* We can only use negative dentries if this is stat or lookup;
                   for opens and the like we do need to query the server. */
                /* If IT_CREAT is set in the intent op, we must throw away
                   this negative dentry and actually issue the request to
                   create whatever needs to be created (if possible). */
                if (it && (it->it_op & IT_CREAT))
                        RETURN(0);

                if (de->d_flags & DCACHE_LUSTRE_INVALID)
                        RETURN(0);

                rc = ll_have_md_lock(parent, MDS_INODELOCK_UPDATE, LCK_MINMODE);
                GOTO(out_sa, rc);
        }

        /* Never execute intents for mount points.
         * Attributes will be fixed up in ll_inode_revalidate_it */
        if (d_mountpoint(de))
                GOTO(out_sa, rc = 1);

        /* need to get attributes in case root got changed from other client */
        if (de == de->d_sb->s_root) {
                rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
                if (rc == 0)
                        rc = 1;
                GOTO(out_sa, rc);
        }

        exp = ll_i2mdexp(de->d_inode);

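        /* Fault-injection point: OBD_FAIL_MDC_REVALIDATE_PAUSE pauses
         * revalidation for 5 seconds so tests can race it against
         * concurrent namespace operations. */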
        OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
        ll_frob_intent(&it, &lookup_it);
        LASSERT(it);

        if (it->it_op == IT_LOOKUP && !(de->d_flags & DCACHE_LUSTRE_INVALID))
                GOTO(out_sa, rc = 1);

        op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
                                     de->d_name.name, de->d_name.len,
                                     0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        if ((it->it_op == IT_OPEN) && de->d_inode) {
                struct inode *inode = de->d_inode;
                struct ll_inode_info *lli = ll_i2info(inode);
                struct obd_client_handle **och_p;
                __u64 *och_usecount;

                /*
                 * We used to check for MDS_INODELOCK_OPEN here, but in fact
                 * just having the LOOKUP lock is enough to justify that the
                 * inode is the same. And if the inode is the same and we have
                 * a suitable open handle, there is no point in doing another
                 * OPEN RPC just to throw away the newly received open handle.
                 * There are no security implications either: if the file
                 * owner or access mode changes, the LOOKUP lock is revoked.
                 */


                if (it->it_flags & FMODE_WRITE) {
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                } else if (it->it_flags & FMODE_EXEC) {
                        och_p = &lli->lli_mds_exec_och;
                        och_usecount = &lli->lli_open_fd_exec_count;
                } else {
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;
                }
                /* Check for the proper lock. */
                if (!ll_have_md_lock(inode, MDS_INODELOCK_LOOKUP, LCK_MINMODE))
                        goto do_lock;
                cfs_down(&lli->lli_och_sem);
                if (*och_p) { /* Everything is open already, do nothing */
                        /*(*och_usecount)++;  Do not let them steal our open
                          handle from under us */
                        /* XXX The code above was my original idea, but if we
                           have the handle and cannot use it due to later
                           checks (e.g. O_CREAT|O_EXCL flags set), nobody
                           would decrement the counter incremented here. So we
                           just hope the lock won't be invalidated in between.
                           If it is, we'll resend the open request to the MDS
                           later during the file open path. */
                        cfs_up(&lli->lli_och_sem);
                        ll_finish_md_op_data(op_data);
                        RETURN(1);
                } else {
                        cfs_up(&lli->lli_och_sem);
                }
        }

        if (it->it_op == IT_GETATTR) {
                first = ll_statahead_enter(parent, &de, 0);
                if (first == 1) {
                        ll_statahead_exit(parent, de, 1);
                        ll_finish_md_op_data(op_data);
                        GOTO(out, rc = 1);
                }
        }

do_lock:
        it->it_create_mode &= ~current->fs->umask;
        it->it_create_mode |= M_CHECK_STALE;
        rc = md_intent_lock(exp, op_data, NULL, 0, it,
                            lookup_flags,
                            &req, ll_md_blocking_ast, 0);
        it->it_create_mode &= ~M_CHECK_STALE;
        ll_finish_md_op_data(op_data);
        if (it->it_op == IT_GETATTR && !first)
                /* If there are too many locks on the client side, some of the
                 * locks taken by statahead may be dropped automatically
                 * before the real "revalidate" uses them. */
                ll_statahead_exit(parent, de, req == NULL ? rc : 0);
        else if (first == -EEXIST)
                ll_statahead_mark(parent, de);

        /* If req is NULL, then md_intent_lock only tried to do a lock match;
         * if all was well, it will return 1 if it found locks, 0 otherwise. */
        if (req == NULL && rc >= 0) {
                if (!rc)
                        goto do_lookup;
                GOTO(out, rc);
        }

        if (rc < 0) {
                if (rc != -ESTALE) {
                        CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
                               "%d\n", rc, it->d.lustre.it_status);
                }
                GOTO(out, rc = 0);
        }

revalidate_finish:
        rc = ll_revalidate_it_finish(req, it, de);
        if (rc != 0) {
                if (rc != -ESTALE && rc != -ENOENT)
                        ll_intent_release(it);
                GOTO(out, rc = 0);
        }

        if ((it->it_op & IT_OPEN) && de->d_inode &&
            !S_ISREG(de->d_inode->i_mode) &&
            !S_ISDIR(de->d_inode->i_mode)) {
                ll_release_openhandle(de, it);
        }
        rc = 1;

        /* unfortunately ll_intent_lock may cause a callback and revoke our
         * dentry */
        cfs_spin_lock(&ll_lookup_lock);
        spin_lock(&dcache_lock);
        lock_dentry(de);
        __d_drop(de);
        unlock_dentry(de);
        d_rehash_cond(de, 0);
        spin_unlock(&dcache_lock);
        cfs_spin_unlock(&ll_lookup_lock);

out:
        /* We do not free request as it may be reused during following lookup
         * (see comment in mdc/mdc_locks.c::mdc_intent_lock()), request will
         * be freed in ll_lookup_it or in ll_intent_release. But if
         * request was not completed, we need to free it. (bug 5154, 9903) */
        if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
                ptlrpc_req_finished(req);
        if (rc == 0) {
                ll_unhash_aliases(de->d_inode);
                /* done in ll_unhash_aliases()
                   dentry->d_flags |= DCACHE_LUSTRE_INVALID; */
        } else {
                CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
                       "inode %p refc %d\n", de->d_name.len,
                       de->d_name.name, de, de->d_parent, de->d_inode,
                       atomic_read(&de->d_count));
                if (first != 1) {
                        if (de->d_flags & DCACHE_LUSTRE_INVALID) {
                                lock_dentry(de);
                                de->d_flags &= ~DCACHE_LUSTRE_INVALID;
                                unlock_dentry(de);
                        }
                        ll_lookup_finish_locks(it, de);
                }
        }
        RETURN(rc);

        /*
         * This part is here to combat evil-evil race in real_lookup on 2.6
         * kernels.  The race details are: We enter do_lookup() looking for some
         * name, there is nothing in dcache for this name yet and d_lookup()
         * returns NULL.  We proceed to real_lookup(), and while we do this,
         * another process opens the same file we are looking up (the simplest
         * reproducer), the open succeeds and the dentry is added. Now back to
         * us. In real_lookup() we do d_lookup() again and suddenly find the
         * dentry, so we call d_revalidate on it, but there is no lock, so
         * without this code we would return 0, but unpatched real_lookup just
         * returns -ENOENT in such a case instead of retrying the lookup. Once
         * this is dealt with in real_lookup(), all of this ugly mess can go and
         * we can just check locks in ->d_revalidate without doing any RPCs
         * ever.
         */
do_lookup:
        if (it != &lookup_it) {
                /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
                if (it->it_op == IT_GETATTR)
                        lookup_it.it_op = IT_GETATTR;
                ll_lookup_finish_locks(it, de);
                it = &lookup_it;
        }

        /* Do real lookup here. */
        op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
                                     de->d_name.len, 0, (it->it_op & IT_CREAT ?
                                                         LUSTRE_OPC_CREATE :
                                                         LUSTRE_OPC_ANY), NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
                            ll_md_blocking_ast, 0);
        if (rc >= 0) {
                struct mdt_body *mdt_body;
                struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
                mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);

                if (de->d_inode)
                        fid = *ll_inode2fid(de->d_inode);

                /* see if we got same inode, if not - return error */
                if (lu_fid_eq(&fid, &mdt_body->fid1)) {
                        ll_finish_md_op_data(op_data);
                        op_data = NULL;
                        goto revalidate_finish;
                }
                ll_intent_release(it);
        }
        ll_finish_md_op_data(op_data);
        GOTO(out, rc = 0);

out_sa:
        /*
         * For rc == 1 case, should not return directly to prevent losing
         * statahead windows; for rc == 0 case, the "lookup" will be done later.
         */
        if (it && it->it_op == IT_GETATTR && rc == 1) {
                first = ll_statahead_enter(parent, &de, 0);
                if (first >= 0)
                        ll_statahead_exit(parent, de, 1);
                else if (first == -EEXIST)
                        ll_statahead_mark(parent, de);
        }

        return rc;
}

#if 0
static void ll_pin(struct dentry *de, struct vfsmount *mnt, int flag)
{
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_dentry_data *ldd = ll_d2d(de);
        struct obd_client_handle *handle;
        struct obd_capa *oc;
        int rc = 0;
        ENTRY;
        LASSERT(ldd);

        cfs_lock_kernel();
        /* Strictly speaking this introduces an additional race: the
         * increments should wait until the rpc has returned.
         * However, given that at present the function is void, this
         * issue is moot. */
        if (flag == 1 && (++ldd->lld_mnt_count) > 1) {
                cfs_unlock_kernel();
                EXIT;
                return;
        }

        if (flag == 0 && (++ldd->lld_cwd_count) > 1) {
                cfs_unlock_kernel();
                EXIT;
                return;
        }
        cfs_unlock_kernel();

        handle = (flag) ? &ldd->lld_mnt_och : &ldd->lld_cwd_och;
        oc = ll_mdscapa_get(inode);
        rc = obd_pin(sbi->ll_md_exp, ll_inode2fid(inode), oc, handle, flag);
        capa_put(oc);
        if (rc) {
                cfs_lock_kernel();
                memset(handle, 0, sizeof(*handle));
                if (flag == 0)
                        ldd->lld_cwd_count--;
                else
                        ldd->lld_mnt_count--;
                cfs_unlock_kernel();
        }

        EXIT;
        return;
}

static void ll_unpin(struct dentry *de, struct vfsmount *mnt, int flag)
{
        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct ll_dentry_data *ldd = ll_d2d(de);
        struct obd_client_handle handle;
        int count, rc = 0;
        ENTRY;
        LASSERT(ldd);

        cfs_lock_kernel();
        /* Strictly speaking this introduces an additional race: the
         * increments should wait until the rpc has returned.
         * However, given that at present the function is void, this
         * issue is moot. */
        handle = (flag) ? ldd->lld_mnt_och : ldd->lld_cwd_och;
        if (handle.och_magic != OBD_CLIENT_HANDLE_MAGIC) {
                /* the "pin" failed */
                cfs_unlock_kernel();
                EXIT;
                return;
        }

        if (flag)
                count = --ldd->lld_mnt_count;
        else
                count = --ldd->lld_cwd_count;
        cfs_unlock_kernel();

        if (count != 0) {
                EXIT;
                return;
        }

        rc = obd_unpin(sbi->ll_md_exp, &handle, flag);
        EXIT;
        return;
}
#endif

#ifdef HAVE_VFS_INTENT_PATCHES
int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
{
        int rc;
        ENTRY;

        if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
                rc = ll_revalidate_it(dentry, nd->flags, &nd->intent);
        else
                rc = ll_revalidate_it(dentry, 0, NULL);

        RETURN(rc);
}
Example no. 9
0
/**
 * Send request \a request.
 * If \a noreply is set, don't expect any reply back and don't set up
 * reply buffers.
 * Returns 0 on success, or an error code on failure.
 */
int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
{
        int rc;
        int rc2;
        int mpflag = 0;
        struct ptlrpc_connection *connection;
        lnet_handle_me_t  reply_me_h;
        lnet_md_t         reply_md;
	struct obd_import *imp = request->rq_import;
	struct obd_device *obd = imp->imp_obd;
        ENTRY;

        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
                RETURN(0);

        LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
        LASSERT(request->rq_wait_ctx == 0);

        /* If this is a re-transmit, we're required to have disengaged
         * cleanly from the previous attempt */
        LASSERT(!request->rq_receiving_reply);
	LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) &&
		(imp->imp_state == LUSTRE_IMP_FULL)));

	if (unlikely(obd != NULL && obd->obd_fail)) {
		CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
			obd->obd_name);
		/* this prevents us from waiting in ptlrpc_queue_wait */
		spin_lock(&request->rq_lock);
		request->rq_err = 1;
		spin_unlock(&request->rq_lock);
                request->rq_status = -ENODEV;
                RETURN(-ENODEV);
        }

	connection = imp->imp_connection;

	lustre_msg_set_handle(request->rq_reqmsg,
			      &imp->imp_remote_handle);
	lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
	lustre_msg_set_conn_cnt(request->rq_reqmsg,
				imp->imp_conn_cnt);
	lustre_msghdr_set_flags(request->rq_reqmsg,
				imp->imp_msghdr_flags);

	/* If it's the first time to resend the request for EINPROGRESS,
	 * we need to allocate a new XID (see after_reply()), it's different
	 * from the resend for reply timeout. */
	if (request->rq_nr_resend != 0 &&
	    list_empty(&request->rq_unreplied_list)) {
		__u64 min_xid = 0;
		/* resend for EINPROGRESS, allocate new xid to avoid reply
		 * reconstruction */
		spin_lock(&imp->imp_lock);
		ptlrpc_assign_next_xid_nolock(request);
		request->rq_mbits = request->rq_xid;
		min_xid = ptlrpc_known_replied_xid(imp);
		spin_unlock(&imp->imp_lock);

		lustre_msg_set_last_xid(request->rq_reqmsg, min_xid);
		DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for "
			  "resend on EINPROGRESS");
	} else if (request->rq_bulk != NULL) {
		ptlrpc_set_bulk_mbits(request);
		lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits);
	}

	if (list_empty(&request->rq_unreplied_list) ||
	    request->rq_xid <= imp->imp_known_replied_xid) {
		DEBUG_REQ(D_ERROR, request, "xid: "LPU64", replied: "LPU64", "
			  "list_empty:%d\n", request->rq_xid,
			  imp->imp_known_replied_xid,
			  list_empty(&request->rq_unreplied_list));
		LBUG();
	}

	/* When AT is enabled, all requests should have AT_SUPPORT in the
	 * FULL import state when OBD_CONNECT_AT is set */
	LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL ||
		(imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) ||
		!(imp->imp_connect_data.ocd_connect_flags &
		OBD_CONNECT_AT));

	if (request->rq_resend) {
		lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
		if (request->rq_resend_cb != NULL)
			request->rq_resend_cb(request, &request->rq_async_args);
	}
        if (request->rq_memalloc)
                mpflag = cfs_memory_pressure_get_and_set();

	rc = sptlrpc_cli_wrap_request(request);
	if (rc == -ENOMEM)
		/* set rq_sent so that this request is treated
		 * as a delayed send in the upper layers */
		request->rq_sent = cfs_time_current_sec();
	if (rc)
		GOTO(out, rc);

        /* bulk register should be done after wrap_request() */
        if (request->rq_bulk != NULL) {
                rc = ptlrpc_register_bulk (request);
                if (rc != 0)
                        GOTO(out, rc);
        }

        if (!noreply) {
                LASSERT (request->rq_replen != 0);
                if (request->rq_repbuf == NULL) {
                        LASSERT(request->rq_repdata == NULL);
                        LASSERT(request->rq_repmsg == NULL);
                        rc = sptlrpc_cli_alloc_repbuf(request,
                                                      request->rq_replen);
                        if (rc) {
                                /* this prevents us from looping in
                                 * ptlrpc_queue_wait */
				spin_lock(&request->rq_lock);
				request->rq_err = 1;
				spin_unlock(&request->rq_lock);
                                request->rq_status = rc;
                                GOTO(cleanup_bulk, rc);
                        }
                } else {
                        request->rq_repdata = NULL;
                        request->rq_repmsg = NULL;
                }

                rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
                                  connection->c_peer, request->rq_xid, 0,
                                  LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
                if (rc != 0) {
                        CERROR("LNetMEAttach failed: %d\n", rc);
                        LASSERT (rc == -ENOMEM);
                        GOTO(cleanup_bulk, rc = -ENOMEM);
                }
        }

	spin_lock(&request->rq_lock);
	/* We are responsible for unlinking the reply buffer */
	request->rq_reply_unlinked = noreply;
	request->rq_receiving_reply = !noreply;
	/* Clear any flags that may be present from previous sends. */
	request->rq_req_unlinked = 0;
        request->rq_replied = 0;
        request->rq_err = 0;
        request->rq_timedout = 0;
        request->rq_net_err = 0;
        request->rq_resend = 0;
        request->rq_restart = 0;
	request->rq_reply_truncated = 0;
	spin_unlock(&request->rq_lock);

        if (!noreply) {
                reply_md.start     = request->rq_repbuf;
                reply_md.length    = request->rq_repbuf_len;
                /* Allow multiple early replies */
                reply_md.threshold = LNET_MD_THRESH_INF;
                /* Manage remote for early replies */
                reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
                        LNET_MD_MANAGE_REMOTE |
                        LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */
                reply_md.user_ptr  = &request->rq_reply_cbid;
                reply_md.eq_handle = ptlrpc_eq_h;

		/* We must see the unlink callback to set rq_reply_unlinked,
		 * so we can't auto-unlink */
                rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
                                  &request->rq_reply_md_h);
                if (rc != 0) {
                        CERROR("LNetMDAttach failed: %d\n", rc);
                        LASSERT (rc == -ENOMEM);
			spin_lock(&request->rq_lock);
			/* ...but the MD attach didn't succeed... */
			request->rq_receiving_reply = 0;
			spin_unlock(&request->rq_lock);
                        GOTO(cleanup_me, rc = -ENOMEM);
                }

                CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
                       ", portal %u\n",
                       request->rq_repbuf_len, request->rq_xid,
                       request->rq_reply_portal);
        }

        /* add references on request for request_out_callback */
        ptlrpc_request_addref(request);
	if (obd != NULL && obd->obd_svc_stats != NULL)
		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
			atomic_read(&imp->imp_inflight));

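	/* Fault-injection point: delay the send by rq_timeout + 5 seconds
	 * when OBD_FAIL_PTLRPC_DELAY_SEND is set via fail_loc. */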
	OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);

	do_gettimeofday(&request->rq_sent_tv);
	request->rq_sent = cfs_time_current_sec();
	/* We give the server rq_timeout secs to process the req, and
	   add the network latency for our local timeout. */
        request->rq_deadline = request->rq_sent + request->rq_timeout +
                ptlrpc_at_get_net_latency(request);

	ptlrpc_pinger_sending_on_import(imp);

        DEBUG_REQ(D_INFO, request, "send flg=%x",
                  lustre_msg_get_flags(request->rq_reqmsg));
        rc = ptl_send_buf(&request->rq_req_md_h,
                          request->rq_reqbuf, request->rq_reqdata_len,
                          LNET_NOACK_REQ, &request->rq_req_cbid,
                          connection,
                          request->rq_request_portal,
                          request->rq_xid, 0);
	if (likely(rc == 0))
		GOTO(out, rc);

	request->rq_req_unlinked = 1;
        ptlrpc_req_finished(request);
        if (noreply)
                GOTO(out, rc);

 cleanup_me:
        /* MEUnlink is safe; the PUT didn't even get off the ground, and
         * nobody apart from the PUT's target has the right nid+XID to
         * access the reply buffer. */
        rc2 = LNetMEUnlink(reply_me_h);
        LASSERT (rc2 == 0);
        /* UNLINKED callback called synchronously */
        LASSERT(!request->rq_receiving_reply);

 cleanup_bulk:
        /* We unlink synchronously here: since no real transfer took place,
         * the chance of a long unlink over a sluggish net is smaller. */
        ptlrpc_unregister_bulk(request, 0);
 out:
        if (request->rq_memalloc)
                cfs_memory_pressure_restore(mpflag);
        return rc;
}
Example no. 10
0
static int quota_chk_acq_common(struct obd_device *obd, struct obd_export *exp,
                                const unsigned int id[], int pending[],
                                int count, quota_acquire acquire,
                                struct obd_trans_info *oti, int isblk,
                                struct inode *inode, int frags)
{
        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
        struct timeval work_start;
        struct timeval work_end;
        long timediff;
        struct l_wait_info lwi = { 0 };
        int rc = 0, cycle = 0, count_err = 1;
        ENTRY;

        if (!quota_is_set(obd, id, isblk ? QB_SET : QI_SET))
                RETURN(0);

        if (isblk && (exp->exp_failed || exp->exp_abort_active_req))
                /* If the client has been evicted or if it
                 * timed out and tried to reconnect already,
                 * abort the request immediately */
                RETURN(-ENOTCONN);

        CDEBUG(D_QUOTA, "check quota for %s\n", obd->obd_name);
        pending[USRQUOTA] = pending[GRPQUOTA] = 0;
        /* Unfortunately, if the quota master is too busy to handle the
         * pre-dqacq in time and the quota hash on the OST is used up, we
         * have to wait for the completion of in-flight dqacq/dqrel in
         * order to get enough quota for the write. b=12588 */
        cfs_gettimeofday(&work_start);
        while ((rc = quota_check_common(obd, id, pending, count, cycle, isblk,
                                        inode, frags)) &
               QUOTA_RET_ACQUOTA) {

                cfs_spin_lock(&qctxt->lqc_lock);
                if (!qctxt->lqc_import && oti) {
                        cfs_spin_unlock(&qctxt->lqc_lock);

                        LASSERT(oti && oti->oti_thread &&
                                oti->oti_thread->t_watchdog);

                        lc_watchdog_disable(oti->oti_thread->t_watchdog);
                        CDEBUG(D_QUOTA, "sleep for quota master\n");
                        l_wait_event(qctxt->lqc_wait_for_qmaster, check_qm(qctxt),
                                     &lwi);
                        CDEBUG(D_QUOTA, "wake up when quota master is back\n");
                        lc_watchdog_touch(oti->oti_thread->t_watchdog,
                                 CFS_GET_TIMEOUT(oti->oti_thread->t_svc));
                } else {
                        cfs_spin_unlock(&qctxt->lqc_lock);
                }

                cycle++;
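                /* Fault-injection point: OBD_FAIL_OST_HOLD_WRITE_RPC holds
                 * a block-quota write RPC here for 90 seconds under test. */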
                if (isblk)
                        OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90);
                /* after acquire(), we should run quota_check_common again
                 * to confirm there is enough quota to finish the write */
                rc = acquire(obd, id, oti, isblk);

                /* please refer to dqacq_completion for the cases below */
                /* a new request is finished, try again */
                if (rc == QUOTA_REQ_RETURNED) {
                        CDEBUG(D_QUOTA, "finish a quota req, try again\n");
                        continue;
                }

                /* it is out of quota already */
                if (rc == -EDQUOT) {
                        CDEBUG(D_QUOTA, "out of quota,  return -EDQUOT\n");
                        break;
                }

                /* Related quota has been disabled by master, but enabled by
                 * slave, do not try again. */
                if (unlikely(rc == -ESRCH)) {
                        CERROR("mismatched quota configuration, stop try.\n");
                        break;
                }

                if (isblk && (exp->exp_failed || exp->exp_abort_active_req))
                        /* The client has been evicted or has already tried
                         * to reconnect; abort the request */
                        RETURN(-ENOTCONN);

                /* -EBUSY and others, wait a second and try again */
                if (rc < 0) {
                        cfs_waitq_t        waitq;
                        struct l_wait_info lwi;

                        if (oti && oti->oti_thread && oti->oti_thread->t_watchdog)
                                lc_watchdog_touch(oti->oti_thread->t_watchdog,
                                       CFS_GET_TIMEOUT(oti->oti_thread->t_svc));
                        CDEBUG(D_QUOTA, "rc: %d, count_err: %d\n", rc,
                               count_err++);

                        cfs_waitq_init(&waitq);
                        lwi = LWI_TIMEOUT(cfs_time_seconds(min(cycle, 10)), NULL,
                                          NULL);
                        l_wait_event(waitq, 0, &lwi);
                }

                if (rc < 0 || cycle % 10 == 0) {
                        cfs_spin_lock(&last_print_lock);
                        if (last_print == 0 ||
                            cfs_time_before((last_print + cfs_time_seconds(30)),
                                            cfs_time_current())) {
                                last_print = cfs_time_current();
                                cfs_spin_unlock(&last_print_lock);
                                CWARN("still haven't managed to acquire quota "
                                      "space from the quota master after %d "
                                      "retries (err=%d, rc=%d)\n",
                                      cycle, count_err - 1, rc);
                        } else {
                                cfs_spin_unlock(&last_print_lock);
                        }
                }

                CDEBUG(D_QUOTA, "recheck quota with rc: %d, cycle: %d\n", rc,
                       cycle);
        }
        cfs_gettimeofday(&work_end);
        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
        lprocfs_counter_add(qctxt->lqc_stats,
                            isblk ? LQUOTA_WAIT_FOR_CHK_BLK :
                                    LQUOTA_WAIT_FOR_CHK_INO,
                            timediff);

        if (rc > 0)
                rc = 0;
        RETURN(rc);
}
Example no. 11
0
/**
 * Check whether the remaining quota for the given uid and gid can satisfy
 * a block_write or inode_create rpc. When quota needs to be acquired,
 * return QUOTA_RET_ACQUOTA.
 */
static int quota_check_common(struct obd_device *obd, const unsigned int id[],
                              int pending[], int count, int cycle, int isblk,
                              struct inode *inode, int frags)
{
        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
        int i;
        struct qunit_data qdata[MAXQUOTAS];
        int mb = 0;
        int rc = 0, rc2[2] = { 0, 0 };
        ENTRY;

        cfs_spin_lock(&qctxt->lqc_lock);
        if (!qctxt->lqc_valid){
                cfs_spin_unlock(&qctxt->lqc_lock);
                RETURN(rc);
        }
        cfs_spin_unlock(&qctxt->lqc_lock);

        for (i = 0; i < MAXQUOTAS; i++) {
                struct lustre_qunit_size *lqs = NULL;

                qdata[i].qd_id = id[i];
                qdata[i].qd_flags = i;
                if (isblk)
                        QDATA_SET_BLK(&qdata[i]);
                qdata[i].qd_count = 0;

                /* ignore root user */
                if (qdata[i].qd_id == 0 && !QDATA_IS_GRP(&qdata[i]))
                        continue;

                lqs = quota_search_lqs(LQS_KEY(i, id[i]), qctxt, 0);
                if (lqs == NULL)
                        continue;

                if (IS_ERR(lqs)) {
                        CERROR("can not find lqs for check_common: "
                               "[id %u] [%c] [isblk %d] [count %d] [rc %ld]\n",
                               id[i], i % 2 ? 'g': 'u', isblk, count,
                               PTR_ERR(lqs));
                        RETURN(PTR_ERR(lqs));
                }

                rc2[i] = compute_remquota(obd, qctxt, &qdata[i], isblk);
                cfs_spin_lock(&lqs->lqs_lock);
                if (!cycle) {
                        if (isblk) {
                                pending[i] = count * CFS_PAGE_SIZE;
                                /* in order to complete this write, we need
                                 * extra meta blocks; compute them from the
                                 * data to be written. b=16542 */
                                if (inode) {
                                        mb = pending[i];
                                        rc = fsfilt_get_mblk(obd, qctxt->lqc_sb,
                                                             &mb, inode,
                                                             frags);
                                        if (rc)
                                                CERROR("%s: can't get extra "
                                                       "meta blocks\n",
                                                       obd->obd_name);
                                        else
                                                pending[i] += mb;
                                }
                                lqs->lqs_bwrite_pending += pending[i];
                        } else {
                                pending[i] = count;
                                lqs->lqs_iwrite_pending += pending[i];
                        }
                }

                /* if xx_rec < 0, quota is being released, and the release
                 * may complete before we use the quota. So if we find this
                 * situation, we assume the quota has already been returned.
                 * b=18491 */
                if (isblk && lqs->lqs_blk_rec < 0) {
                        if (qdata[i].qd_count < -lqs->lqs_blk_rec)
                                qdata[i].qd_count = 0;
                        else
                                qdata[i].qd_count += lqs->lqs_blk_rec;
                }
                if (!isblk && lqs->lqs_ino_rec < 0) {
                        if (qdata[i].qd_count < -lqs->lqs_ino_rec)
                                qdata[i].qd_count = 0;
                        else
                                qdata[i].qd_count += lqs->lqs_ino_rec;
                }

                CDEBUG(D_QUOTA, "[id %u] [%c] [isblk %d] [count %d]"
                       " [lqs pending: %lu] [qd_count: "LPU64"] [metablocks: %d]"
                       " [pending: %d]\n", id[i], i % 2 ? 'g': 'u', isblk, count,
                       isblk ? lqs->lqs_bwrite_pending : lqs->lqs_iwrite_pending,
                       qdata[i].qd_count, mb, pending[i]);
                if (rc2[i] == QUOTA_RET_OK) {
                        if (isblk && qdata[i].qd_count < lqs->lqs_bwrite_pending)
                                rc2[i] = QUOTA_RET_ACQUOTA;
                        if (!isblk && qdata[i].qd_count <
                            lqs->lqs_iwrite_pending)
                                rc2[i] = QUOTA_RET_ACQUOTA;
                }

                cfs_spin_unlock(&lqs->lqs_lock);

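                /* Fault-injection point: when a block release is still
                 * pending, OBD_FAIL_QUOTA_DELAY_REL delays for 5 seconds to
                 * widen the release/acquire window under test. */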
                if (lqs->lqs_blk_rec  < 0 &&
                    qdata[i].qd_count <
                    lqs->lqs_bwrite_pending - lqs->lqs_blk_rec - mb)
                        OBD_FAIL_TIMEOUT(OBD_FAIL_QUOTA_DELAY_REL, 5);

                /* When cycle is zero, lqs_*_pending will be changed. We take
                 * a reference on the lqs here and drop it in
                 * quota_pending_commit. b=14784 */
                if (!cycle)
                        lqs_getref(lqs);

                /* this is for quota_search_lqs */
                lqs_putref(lqs);
        }

        if (rc2[0] == QUOTA_RET_ACQUOTA || rc2[1] == QUOTA_RET_ACQUOTA)
                RETURN(QUOTA_RET_ACQUOTA);
        else
                RETURN(rc);
}
Example no. 12
0
/* Called whenever a target starts up.  Flags indicate first connect, etc. */
static int mgs_handle_target_reg(struct ptlrpc_request *req)
{
        struct obd_device *obd = req->rq_export->exp_obd;
        struct mgs_target_info *mti, *rep_mti;
        struct fs_db *fsdb;
        int opc;
        int rc = 0;
        ENTRY;

        mgs_counter_incr(req->rq_export, LPROC_MGS_TARGET_REG);

        mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);

        opc = mti->mti_flags & LDD_F_OPC_MASK;
        if (opc == LDD_F_OPC_READY) {
                CDEBUG(D_MGS, "fs: %s index: %d is ready to reconnect.\n",
                       mti->mti_fsname, mti->mti_stripe_index);
                rc = mgs_ir_update(obd, mti);
                if (rc) {
                        LASSERT(!(mti->mti_flags & LDD_F_IR_CAPABLE));
                        CERROR("Update IR return with %d(ignore and IR "
                               "disabled)\n", rc);
                }
                GOTO(out_nolock, rc);
        }

        /* Do not support unregistering right now. */
        if (opc != LDD_F_OPC_REG)
                GOTO(out_nolock, rc = -EINVAL);

        CDEBUG(D_MGS, "fs: %s index: %d is registered to MGS.\n",
               mti->mti_fsname, mti->mti_stripe_index);

        if (mti->mti_flags & LDD_F_NEED_INDEX)
                mti->mti_flags |= LDD_F_WRITECONF;

        if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 |
                                LDD_F_UPDATE))) {
                /* We're just here as a startup ping. */
                CDEBUG(D_MGS, "Server %s is running on %s\n",
                       mti->mti_svname, obd_export_nid2str(req->rq_export));
                rc = mgs_check_target(obd, mti);
                /* above will set appropriate mti flags */
                if (rc <= 0)
                        /* Nothing wrong, or fatal error */
                        GOTO(out_nolock, rc);
        } else {
                if (!(mti->mti_flags & LDD_F_NO_PRIMNODE)
                    && (rc = mgs_check_failover_reg(mti)))
                        GOTO(out_nolock, rc);
        }

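        /* Fault-injection point: OBD_FAIL_MGS_PAUSE_TARGET_REG stalls
         * target registration for 10 seconds under test. */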
        OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10);

        if (mti->mti_flags & LDD_F_WRITECONF) {
                if (mti->mti_flags & LDD_F_SV_TYPE_MDT &&
                    mti->mti_stripe_index == 0) {
                        rc = mgs_erase_logs(obd, mti->mti_fsname);
                        LCONSOLE_WARN("%s: Logs for fs %s were removed by user "
                                      "request.  All servers must be restarted "
                                      "in order to regenerate the logs."
                                      "\n", obd->obd_name, mti->mti_fsname);
                } else if (mti->mti_flags &
                           (LDD_F_SV_TYPE_OST | LDD_F_SV_TYPE_MDT)) {
                        rc = mgs_erase_log(obd, mti->mti_svname);
                        LCONSOLE_WARN("%s: Regenerating %s log by user "
                                      "request.\n",
                                      obd->obd_name, mti->mti_svname);
                }
                mti->mti_flags |= LDD_F_UPDATE;
                /* Erased logs means start from scratch. */
                mti->mti_flags &= ~LDD_F_UPGRADE14;
        }

        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
        if (rc) {
                CERROR("Can't get db for %s: %d\n", mti->mti_fsname, rc);
                GOTO(out_nolock, rc);
        }

        /*
         * Log writing contention is handled by the fsdb_mutex.
         *
         * It should be alright if someone was reading while we were
         * updating the logs - if we revoke at the end they will just update
         * from where they left off.
         */

        /* COMPAT_146 */
        if (mti->mti_flags & LDD_F_UPGRADE14) {
                rc = mgs_upgrade_sv_14(obd, mti, fsdb);
                if (rc) {
                        CERROR("Can't upgrade from 1.4 (%d)\n", rc);
                        GOTO(out, rc);
                }

                /* We're good to go */
                mti->mti_flags |= LDD_F_UPDATE;
        }
        /* end COMPAT_146 */

        if (mti->mti_flags & LDD_F_UPDATE) {
                CDEBUG(D_MGS, "updating %s, index=%d\n", mti->mti_svname,
                       mti->mti_stripe_index);

                /* create or update the target log
                   and update the client/mdt logs */
                rc = mgs_write_log_target(obd, mti, fsdb);
                if (rc) {
                        CERROR("Failed to write %s log (%d)\n",
                               mti->mti_svname, rc);
                        GOTO(out, rc);
                }

                mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE |
                                    LDD_F_NEED_INDEX | LDD_F_WRITECONF |
                                    LDD_F_UPGRADE14);
                mti->mti_flags |= LDD_F_REWRITE_LDD;
        }

out:
        mgs_revoke_lock(obd, fsdb, CONFIG_T_CONFIG);

out_nolock:
        CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
               mti->mti_stripe_index, rc);
        req->rq_status = rc;
        if (rc)
                /* we need an error flag to tell the target what's going on,
                 * instead of relying on the error code alone. */
                mti->mti_flags |= LDD_F_ERROR;

        rc = req_capsule_server_pack(&req->rq_pill);
        if (rc)
                RETURN(rc);

        /* send back the whole mti in the reply */
        rep_mti = req_capsule_server_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
        *rep_mti = *mti;

        /* Flush logs to disk */
        fsfilt_sync(obd, obd->u.mgs.mgs_sb);
        RETURN(rc);
}
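
The handler above funnels both the success and failure paths through the
"out:" label, so mgs_revoke_lock() runs whenever the fsdb was obtained and
clients re-read the configuration logs.  Below is a minimal standalone
sketch of that update-then-revoke pattern; the names (cfg_db, cfg_update,
cfg_revoke_readers) are hypothetical, and plain pthreads stand in for
fsdb_mutex and the MGS config lock.

#include <pthread.h>

struct cfg_db {
        pthread_mutex_t cdb_mutex;      /* plays the role of fsdb_mutex */
        int             cdb_version;    /* bumped so readers refetch */
};

/* stand-in for mgs_revoke_lock(): readers notice the version bump and
 * update from where they left off */
static void cfg_revoke_readers(struct cfg_db *db)
{
        db->cdb_version++;
}

static int cfg_update(struct cfg_db *db, int (*write_logs)(struct cfg_db *))
{
        int rc;

        pthread_mutex_lock(&db->cdb_mutex);     /* serialize log writers */
        rc = write_logs(db);                    /* cf. mgs_write_log_target() */
        pthread_mutex_unlock(&db->cdb_mutex);

        /* revoke even on error, like the "out:" label above, so readers
         * never keep stale state longer than one update cycle */
        cfg_revoke_readers(db);
        return rc;
}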
Example n. 13
/*
 * VBR: rename versions in reply: 0 - src parent; 1 - tgt parent;
 * 2 - src child; 3 - tgt child.
 * Update the on-disk version of the src child.
 */
static int mdt_reint_rename(struct mdt_thread_info *info,
                            struct mdt_lock_handle *lhc)
{
        struct mdt_reint_record *rr = &info->mti_rr;
        struct md_attr          *ma = &info->mti_attr;
        struct ptlrpc_request   *req = mdt_info_req(info);
        struct mdt_object       *msrcdir;
        struct mdt_object       *mtgtdir;
        struct mdt_object       *mold;
        struct mdt_object       *mnew = NULL;
        struct mdt_lock_handle  *lh_srcdirp;
        struct mdt_lock_handle  *lh_tgtdirp;
        struct mdt_lock_handle  *lh_oldp;
        struct mdt_lock_handle  *lh_newp;
        struct lu_fid           *old_fid = &info->mti_tmp_fid1;
        struct lu_fid           *new_fid = &info->mti_tmp_fid2;
        struct lustre_handle     rename_lh = { 0 };
        struct lu_name           slname = { 0 };
        struct lu_name          *lname;
        int                      rc;
        ENTRY;

        if (info->mti_dlm_req)
                ldlm_request_cancel(req, info->mti_dlm_req, 0);

        DEBUG_REQ(D_INODE, req, "rename "DFID"/%s to "DFID"/%s",
                  PFID(rr->rr_fid1), rr->rr_name,
                  PFID(rr->rr_fid2), rr->rr_tgt);

	rc = mdt_rename_lock(info, &rename_lh);
	if (rc) {
		CERROR("Can't lock FS for rename, rc %d\n", rc);
		RETURN(rc);
	}

        lh_newp = &info->mti_lh[MDT_LH_NEW];

        /* step 1: lock the source dir. */
        lh_srcdirp = &info->mti_lh[MDT_LH_PARENT];
        mdt_lock_pdo_init(lh_srcdirp, LCK_PW, rr->rr_name,
                          rr->rr_namelen);
        msrcdir = mdt_object_find_lock(info, rr->rr_fid1, lh_srcdirp,
                                       MDS_INODELOCK_UPDATE);
        if (IS_ERR(msrcdir))
                GOTO(out_rename_lock, rc = PTR_ERR(msrcdir));

        if (mdt_object_obf(msrcdir))
                GOTO(out_unlock_source, rc = -EPERM);

        rc = mdt_version_get_check_save(info, msrcdir, 0);
        if (rc)
                GOTO(out_unlock_source, rc);

        /* step 2: find & lock the target dir. */
        lh_tgtdirp = &info->mti_lh[MDT_LH_CHILD];
        mdt_lock_pdo_init(lh_tgtdirp, LCK_PW, rr->rr_tgt,
                          rr->rr_tgtlen);
        if (lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
                mdt_object_get(info->mti_env, msrcdir);
                mtgtdir = msrcdir;
                if (lh_tgtdirp->mlh_pdo_hash != lh_srcdirp->mlh_pdo_hash) {
                        rc = mdt_pdir_hash_lock(info, lh_tgtdirp, mtgtdir,
                                                MDS_INODELOCK_UPDATE);
                        if (rc)
                                GOTO(out_unlock_source, rc);
                        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK2, 10);
                }
        } else {
                mtgtdir = mdt_object_find(info->mti_env, info->mti_mdt,
                                          rr->rr_fid2);
                if (IS_ERR(mtgtdir))
                        GOTO(out_unlock_source, rc = PTR_ERR(mtgtdir));

                if (mdt_object_obf(mtgtdir))
                        GOTO(out_put_target, rc = -EPERM);

                /* check early, the real version will be saved after locking */
                rc = mdt_version_get_check(info, mtgtdir, 1);
                if (rc)
                        GOTO(out_put_target, rc);

		if (unlikely(mdt_object_remote(mtgtdir))) {
			CDEBUG(D_INFO, "Source dir "DFID" target dir "DFID
			       "on different MDTs\n", PFID(rr->rr_fid1),
			       PFID(rr->rr_fid2));
			GOTO(out_put_target, rc = -EXDEV);
		} else {
			if (likely(mdt_object_exists(mtgtdir))) {
				/* we lock the target dir if it is local */
				rc = mdt_object_lock(info, mtgtdir, lh_tgtdirp,
						     MDS_INODELOCK_UPDATE,
						     MDT_LOCAL_LOCK);
				if (rc != 0)
					GOTO(out_put_target, rc);
				/* get and save correct version after locking */
				mdt_version_get_save(info, mtgtdir, 1);
			} else {
				GOTO(out_put_target, rc = -ESTALE);
			}
		}
	}

        /* step 3: find & lock the old object. */
        lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen);
        mdt_name_copy(&slname, lname);
        fid_zero(old_fid);
        rc = mdt_lookup_version_check(info, msrcdir, &slname, old_fid, 2);
        if (rc != 0)
                GOTO(out_unlock_target, rc);

        if (lu_fid_eq(old_fid, rr->rr_fid1) || lu_fid_eq(old_fid, rr->rr_fid2))
                GOTO(out_unlock_target, rc = -EINVAL);

	mold = mdt_object_find(info->mti_env, info->mti_mdt, old_fid);
	if (IS_ERR(mold))
		GOTO(out_unlock_target, rc = PTR_ERR(mold));
	if (mdt_object_remote(mold)) {
		mdt_object_put(info->mti_env, mold);
		CDEBUG(D_INFO, "Source child "DFID" is on another MDT\n",
		       PFID(old_fid));
		GOTO(out_unlock_target, rc = -EXDEV);
	}

	if (mdt_object_obf(mold)) {
		mdt_object_put(info->mti_env, mold);
		GOTO(out_unlock_target, rc = -EPERM);
	}

        lh_oldp = &info->mti_lh[MDT_LH_OLD];
        mdt_lock_reg_init(lh_oldp, LCK_EX);
        rc = mdt_object_lock(info, mold, lh_oldp, MDS_INODELOCK_LOOKUP,
                             MDT_CROSS_LOCK);
        if (rc != 0) {
                mdt_object_put(info->mti_env, mold);
                GOTO(out_unlock_target, rc);
        }

        info->mti_mos = mold;
        /* save version after locking */
        mdt_version_get_save(info, mold, 2);
        mdt_set_capainfo(info, 2, old_fid, BYPASS_CAPA);

        /* step 4: find & lock the new object. */
        /* new target object may not exist now */
        lname = mdt_name(info->mti_env, (char *)rr->rr_tgt, rr->rr_tgtlen);
        /* lookup with version checking */
        fid_zero(new_fid);
        rc = mdt_lookup_version_check(info, mtgtdir, lname, new_fid, 3);
        if (rc == 0) {
                /* new_fid should have been filled in by this point */
                if (lu_fid_eq(old_fid, new_fid))
                        GOTO(out_unlock_old, rc);

                if (lu_fid_eq(new_fid, rr->rr_fid1) ||
                    lu_fid_eq(new_fid, rr->rr_fid2))
                        GOTO(out_unlock_old, rc = -EINVAL);

                mdt_lock_reg_init(lh_newp, LCK_EX);
                mnew = mdt_object_find(info->mti_env, info->mti_mdt, new_fid);
                if (IS_ERR(mnew))
                        GOTO(out_unlock_old, rc = PTR_ERR(mnew));

		if (mdt_object_obf(mnew)) {
			mdt_object_put(info->mti_env, mnew);
			GOTO(out_unlock_old, rc = -EPERM);
		}

		if (mdt_object_remote(mnew)) {
			mdt_object_put(info->mti_env, mnew);
			CDEBUG(D_INFO, "src child "DFID" is on another MDT\n",
			       PFID(new_fid));
			GOTO(out_unlock_old, rc = -EXDEV);
		}

                rc = mdt_object_lock(info, mnew, lh_newp,
                                     MDS_INODELOCK_FULL, MDT_CROSS_LOCK);
                if (rc != 0) {
                        mdt_object_put(info->mti_env, mnew);
                        GOTO(out_unlock_old, rc);
                }
                /* get and save version after locking */
                mdt_version_get_save(info, mnew, 3);
                mdt_set_capainfo(info, 3, new_fid, BYPASS_CAPA);
        } else if (rc != -EREMOTE && rc != -ENOENT) {
                GOTO(out_unlock_old, rc);
        } else {
                mdt_enoent_version_save(info, 3);
        }

        /* step 5: rename it */
        mdt_reint_init_ma(info, ma);

        mdt_fail_write(info->mti_env, info->mti_mdt->mdt_bottom,
                       OBD_FAIL_MDS_REINT_RENAME_WRITE);

        /* Check whether @dst is a subdirectory of @src. */
        rc = mdt_rename_sanity(info, old_fid);
        if (rc)
                GOTO(out_unlock_new, rc);

        rc = mdo_rename(info->mti_env, mdt_object_child(msrcdir),
                        mdt_object_child(mtgtdir), old_fid, &slname,
                        (mnew ? mdt_object_child(mnew) : NULL),
                        lname, ma);

        /* handle last link of tgt object */
        if (rc == 0) {
		mdt_counter_incr(req, LPROC_MDT_RENAME);
                if (mnew)
                        mdt_handle_last_unlink(info, mnew, ma);

		mdt_rename_counter_tally(info, info->mti_mdt, req,
                                         msrcdir, mtgtdir);
        }

        EXIT;
out_unlock_new:
        if (mnew)
                mdt_object_unlock_put(info, mnew, lh_newp, rc);
out_unlock_old:
        mdt_object_unlock_put(info, mold, lh_oldp, rc);
out_unlock_target:
        mdt_object_unlock(info, mtgtdir, lh_tgtdirp, rc);
out_put_target:
        mdt_object_put(info->mti_env, mtgtdir);
out_unlock_source:
        mdt_object_unlock_put(info, msrcdir, lh_srcdirp, rc);
out_rename_lock:
	if (lustre_handle_is_used(&rename_lh))
		mdt_rename_unlock(&rename_lh);
	return rc;
}
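
To stay deadlock-free, the function above first serializes whole-filesystem
renames via mdt_rename_lock() and then acquires the remaining locks in one
fixed sequence: source parent, target parent (skipped when both parents are
the same object and hash to the same PDO lock), source child, and finally
the target child if it exists.  A minimal sketch of that discipline,
assuming hypothetical struct dir / struct node types and plain pthread
mutexes in place of LDLM locks:

#include <pthread.h>
#include <stddef.h>

struct dir  { pthread_mutex_t d_lock; };
struct node { pthread_mutex_t n_lock; };

static pthread_mutex_t rename_big_lock = PTHREAD_MUTEX_INITIALIZER;

static void do_rename_locked(struct dir *src, struct dir *tgt,
                             struct node *old_child, struct node *new_child)
{
        /* step 0: concurrent renames exclude each other, so the fixed
         * order below cannot deadlock against another rename */
        pthread_mutex_lock(&rename_big_lock);

        /* steps 1-2: parents, with the same-directory special case */
        pthread_mutex_lock(&src->d_lock);
        if (tgt != src)
                pthread_mutex_lock(&tgt->d_lock);

        /* steps 3-4: source child, then the target child if any */
        pthread_mutex_lock(&old_child->n_lock);
        if (new_child != NULL)
                pthread_mutex_lock(&new_child->n_lock);

        /* ... step 5: perform the rename itself ... */

        /* unwind in reverse order, mirroring the out_unlock_* labels */
        if (new_child != NULL)
                pthread_mutex_unlock(&new_child->n_lock);
        pthread_mutex_unlock(&old_child->n_lock);
        if (tgt != src)
                pthread_mutex_unlock(&tgt->d_lock);
        pthread_mutex_unlock(&src->d_lock);
        pthread_mutex_unlock(&rename_big_lock);
}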
Example n. 14
File: dcache.c Project: LLNL/lustre
int ll_revalidate_it(struct dentry *de, int lookup_flags,
                     struct lookup_intent *it)
{
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
        struct obd_export *exp;
        struct inode *parent = de->d_parent->d_inode;
        int rc;

        ENTRY;
        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
               LL_IT2STR(it));

        if (de->d_inode == NULL) {
                __u64 ibits;

                /* We can only use negative dentries if this is a stat or
                 * lookup; for opens and other operations we need to query
                 * the server. */
                /* If IT_CREAT is set in the intent op, we must throw away
                 * this negative dentry and actually send a request to the
                 * server to create whatever needs to be created (if
                 * possible). */
                if (it && (it->it_op & IT_CREAT))
                        RETURN(0);

                if (de->d_flags & DCACHE_LUSTRE_INVALID)
                        RETURN(0);

                ibits = MDS_INODELOCK_UPDATE;
                rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE);
                GOTO(out_sa, rc);
        }

        /* Never execute intents for mount points.
         * Attributes will be fixed up in ll_inode_revalidate_it */
        if (d_mountpoint(de))
                GOTO(out_sa, rc = 1);

        /* need to get attributes in case root was changed by another client */
        if (de == de->d_sb->s_root) {
                rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
                if (rc == 0)
                        rc = 1;
                GOTO(out_sa, rc);
        }

        exp = ll_i2mdexp(de->d_inode);

        OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
        ll_frob_intent(&it, &lookup_it);
        LASSERT(it);

        if (it->it_op == IT_LOOKUP && !(de->d_flags & DCACHE_LUSTRE_INVALID))
                RETURN(1);

        if ((it->it_op == IT_OPEN) && de->d_inode) {
                struct inode *inode = de->d_inode;
                struct ll_inode_info *lli = ll_i2info(inode);
                struct obd_client_handle **och_p;
                __u64 *och_usecount;
                __u64 ibits;

                /*
                 * We used to check for MDS_INODELOCK_OPEN here, but in fact
                 * just having the LOOKUP lock is enough to justify that the
                 * inode is the same.  And if the inode is the same and we
                 * have a suitable open handle, there is no point in doing
                 * another OPEN RPC just to throw away the newly received
                 * open handle.  There are no security implications either:
                 * if the file owner or access mode is changed, the LOOKUP
                 * lock is revoked.
                 */

                if (it->it_flags & FMODE_WRITE) {
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                } else if (it->it_flags & FMODE_EXEC) {
                        och_p = &lli->lli_mds_exec_och;
                        och_usecount = &lli->lli_open_fd_exec_count;
                } else {
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;
                }
                /* Check for the proper lock. */
                ibits = MDS_INODELOCK_LOOKUP;
                if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE))
                        goto do_lock;
                cfs_mutex_lock(&lli->lli_och_mutex);
                if (*och_p) { /* Everything is open already, do nothing */
                        /* (*och_usecount)++;  Do not let them steal our
                         * open handle from under us. */
                        SET_BUT_UNUSED(och_usecount);
                        /* XXX Bumping the use count was the original idea,
                         * but if we have the handle and cannot use it due
                         * to later checks (e.g. O_CREAT|O_EXCL set), nobody
                         * would decrement the counter incremented here.  So
                         * we just hope the lock is not invalidated in the
                         * meantime; if it is, we will resend the open
                         * request to the MDS later, during the file open
                         * path. */
                        cfs_mutex_unlock(&lli->lli_och_mutex);
                        RETURN(1);
                } else {
                        cfs_mutex_unlock(&lli->lli_och_mutex);
                }
        }

        if (it->it_op == IT_GETATTR) {
                rc = ll_statahead_enter(parent, &de, 0);
                if (rc == 1)
                        goto mark;
        }

do_lock:
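        /* no usable cached lock: fall through to a full intent-lock RPC */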
        op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
                                     de->d_name.name, de->d_name.len,
                                     0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        it->it_create_mode &= ~cfs_curproc_umask();
        it->it_create_mode |= M_CHECK_STALE;
        rc = md_intent_lock(exp, op_data, NULL, 0, it,
                            lookup_flags,
                            &req, ll_md_blocking_ast, 0);
        it->it_create_mode &= ~M_CHECK_STALE;
        ll_finish_md_op_data(op_data);

        /* If req is NULL, md_intent_lock() only tried a lock match; on
         * success it returns 1 if it found locks and 0 otherwise. */
        if (req == NULL && rc >= 0) {
                if (!rc)
                        goto do_lookup;
                GOTO(out, rc);
        }

        if (rc < 0) {
                if (rc != -ESTALE) {
                        CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
                               "%d\n", rc, it->d.lustre.it_status);
                }
                GOTO(out, rc = 0);
        }

revalidate_finish:
        rc = ll_revalidate_it_finish(req, it, de);
        if (rc != 0) {
                if (rc != -ESTALE && rc != -ENOENT)
                        ll_intent_release(it);
                GOTO(out, rc = 0);
        }

        if ((it->it_op & IT_OPEN) && de->d_inode &&
            !S_ISREG(de->d_inode->i_mode) &&
            !S_ISDIR(de->d_inode->i_mode)) {
                ll_release_openhandle(de, it);
        }
        rc = 1;

        /* unfortunately ll_intent_lock may cause a callback that invalidates
         * our dentry, so rehash it */
        ll_dentry_rehash(de, 0);

out:
        /* We do not free the request here, as it may be reused by a
         * following lookup (see the comment in
         * mdc/mdc_locks.c::mdc_intent_lock()); it will be freed in
         * ll_lookup_it or in ll_intent_release.  But if the request was not
         * completed, we need to free it.  (bug 5154, 9903) */
        if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
                ptlrpc_req_finished(req);
        if (rc == 0) {
                ll_unhash_aliases(de->d_inode);
                /* done in ll_unhash_aliases()
                   dentry->d_flags |= DCACHE_LUSTRE_INVALID; */
        } else {
                __u64 bits = 0;

                CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
                       "inode %p refc %d\n", de->d_name.len,
                       de->d_name.name, de, de->d_parent, de->d_inode,
                       atomic_read(&de->d_count));
                ll_set_lock_data(exp, de->d_inode, it, &bits);
                ll_dentry_reset_flags(de, bits);
                ll_lookup_finish_locks(it, de);
        }

mark:
        if (it != NULL && it->it_op == IT_GETATTR && rc > 0)
                ll_statahead_mark(parent, de);
        RETURN(rc);

        /*
         * This part is here to combat an evil race in real_lookup() on 2.6
         * kernels.  The race: we enter do_lookup() looking for some name,
         * there is nothing in the dcache for this name yet, and d_lookup()
         * returns NULL.  We proceed to real_lookup(), and while we do this,
         * another process opens the same file we are looking up (the
         * simplest reproducer); the open succeeds and the dentry is added.
         * Now back to us: in real_lookup() we do d_lookup() again and
         * suddenly find the dentry, so we call d_revalidate on it, but
         * there is no lock, so without this code we would return 0; yet
         * unpatched real_lookup just returns -ENOENT in such a case instead
         * of retrying the lookup.  Once this is dealt with in real_lookup(),
         * all of this ugly mess can go, and we can just check locks in
         * ->d_revalidate without ever doing any RPCs.
         */
do_lookup:
        if (it != &lookup_it) {
                /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
                if (it->it_op == IT_GETATTR)
                        lookup_it.it_op = IT_GETATTR;
                ll_lookup_finish_locks(it, de);
                it = &lookup_it;
        }

        /* Do real lookup here. */
        op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
                                     de->d_name.len, 0, (it->it_op & IT_CREAT ?
                                                         LUSTRE_OPC_CREATE :
                                                         LUSTRE_OPC_ANY), NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
                            ll_md_blocking_ast, 0);
        if (rc >= 0) {
                struct mdt_body *mdt_body;
                struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
                mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);

                if (de->d_inode)
                        fid = *ll_inode2fid(de->d_inode);

                /* see if we got the same inode; if not, treat the dentry
                 * as invalid */
                if (lu_fid_eq(&fid, &mdt_body->fid1)) {
                        ll_finish_md_op_data(op_data);
                        op_data = NULL;
                        goto revalidate_finish;
                }
                ll_intent_release(it);
        }
        ll_finish_md_op_data(op_data);
        GOTO(out, rc = 0);

out_sa:
        /*
         * In the rc == 1 case we must not return directly, to avoid losing
         * the statahead window; in the rc == 0 case the "lookup" will be
         * done later.
         */
        if (it != NULL && it->it_op == IT_GETATTR && rc == 1)
                ll_statahead_enter(parent, &de, 1);
        goto mark;
}
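
ll_revalidate_it() implements the standard d_revalidate contract: return 1
to keep the cached dentry, 0 to make the VFS drop it and redo the lookup.
Below is a tiny decision sketch of the early negative-dentry cases handled
at the top of the function; the helper and its parameters are hypothetical,
not the Lustre code.

/* 1 = trust the dcache entry, 0 = force a fresh lookup */
static int negative_dentry_usable(int intent_is_create, int marked_invalid,
                                  int parent_update_lock_held)
{
        if (intent_is_create)
                return 0;       /* must ask the server to create the file */
        if (marked_invalid)
                return 0;       /* dentry was explicitly invalidated */
        /* a negative entry is only trustworthy while the parent's UPDATE
         * lock guarantees the directory has not changed under us */
        return parent_update_lock_held;
}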

int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
{
        int rc;
        ENTRY;

        if (nd && !(nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))) {
                struct lookup_intent *it;

                it = ll_convert_intent(&nd->intent.open, nd->flags);
                if (IS_ERR(it))
                        RETURN(0);

                if (it->it_op == (IT_OPEN|IT_CREAT) &&
                    nd->intent.open.flags & O_EXCL) {
                        CDEBUG(D_VFSTRACE, "create O_EXCL, returning 0\n");
                        rc = 0;
                        goto out_it;
                }

                rc = ll_revalidate_it(dentry, nd->flags, it);

                if (rc && (nd->flags & LOOKUP_OPEN) &&
                    it_disposition(it, DISP_OPEN_OPEN)) {/*Open*/
                        /* XXX Code duplication with ll_lookup_nd */
                        if (S_ISFIFO(dentry->d_inode->i_mode)) {
                                /* We cannot call open here as it would
                                 * deadlock. */
                                ptlrpc_req_finished(
                                               (struct ptlrpc_request *)
                                                  it->d.lustre.it_data);
                        } else {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17))
/* Kernels 2.6.14-2.6.16 have a bug in open_namei() that forgets to check
 * nd->intent.open.file for errors, so we need to return it as the lookup
 * result instead */
                                struct file *filp;

                                nd->intent.open.file->private_data = it;
                                filp = lookup_instantiate_filp(nd, dentry,
                                                               NULL);
                                if (IS_ERR(filp))
                                        rc = PTR_ERR(filp);
#else
                                nd->intent.open.file->private_data = it;
                                (void)lookup_instantiate_filp(nd, dentry, NULL);
#endif
                        }
                }
                if (!rc && (nd->flags & LOOKUP_CREATE) &&
                    it_disposition(it, DISP_OPEN_CREATE)) {
                        /* We created something, but we may only return a
                         * negative dentry here, so save the request in the
                         * dentry.  If lookup is called later, it will pick
                         * up the request; otherwise the request would be
                         * freed with the dentry. */
                        ll_d2d(dentry)->lld_it = it;
                        it = NULL; /* avoid freeing */
                }

out_it:
                if (it) {
                        ll_intent_release(it);
                        OBD_FREE(it, sizeof(*it));
                }
        } else {
                rc = ll_revalidate_it(dentry, 0, NULL);
        }

        RETURN(rc);
}
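
Note the ownership handoff at the end of ll_revalidate_nd(): the intent
allocated by ll_convert_intent() is either stashed in the dentry (lld_it)
for the subsequent lookup to consume, or released and freed right here;
NULLing the local pointer is what prevents a double free.  A minimal
sketch of that pattern with hypothetical names:

#include <stdlib.h>

struct intent      { int it_op; };
struct dentry_priv { struct intent *lld_it; };  /* cf. ll_d2d(dentry) */

static void finish_intent(struct dentry_priv *lld, struct intent *it,
                          int open_created)
{
        if (open_created) {
                lld->lld_it = it;       /* the later lookup picks it up */
                it = NULL;              /* avoid freeing it below */
        }
        free(it);                       /* free(NULL) is a harmless no-op */
}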