static void osp_statfs_timer_cb(unsigned long _d)
{
	struct osp_device *d = (struct osp_device *) _d;

	LASSERT(d);
	cfs_waitq_signal(&d->opd_pre_waitq);
}
/*
 * this function relies on a reservation made beforehand
 */
int osp_precreate_get_fid(const struct lu_env *env, struct osp_device *d,
			  struct lu_fid *fid)
{
	/* grab next id from the pool */
	spin_lock(&d->opd_pre_lock);

	LASSERTF(lu_fid_diff(&d->opd_pre_used_fid,
			     &d->opd_pre_last_created_fid) < 0,
		 "next fid "DFID" last created fid "DFID"\n",
		 PFID(&d->opd_pre_used_fid),
		 PFID(&d->opd_pre_last_created_fid));

	d->opd_pre_used_fid.f_oid++;
	memcpy(fid, &d->opd_pre_used_fid, sizeof(*fid));
	d->opd_pre_reserved--;
	/*
	 * the last used id must be changed along with getting a new id,
	 * otherwise we might miscalculate the gap, causing object loss or leak
	 */
	osp_update_last_fid(d, fid);
	spin_unlock(&d->opd_pre_lock);

	/*
	 * probably main thread suspended orphan cleanup till
	 * all reservations are released, see comment in
	 * osp_precreate_thread() just before orphan cleanup
	 */
	if (unlikely(d->opd_pre_reserved == 0 && d->opd_pre_status))
		cfs_waitq_signal(&d->opd_pre_waitq);

	return 0;
}
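As the comment above notes, a reservation must be made before osp_precreate_get_fid() is called. Below is a minimal caller sketch of that pairing, using osp_precreate_reserve() shown further down in this collection; the helper name and error handling are illustrative only, not the actual LOD call path.

/* hypothetical caller: reservation must precede FID allocation */
static int example_alloc_fid(const struct lu_env *env, struct osp_device *d,
			     struct lu_fid *fid)
{
	int rc;

	/* take one slot from the precreated pool; may return -ENOSPC,
	 * -EAGAIN, -EIO etc., see osp_precreate_reserve() below */
	rc = osp_precreate_reserve(env, d);
	if (rc != 0)
		return rc;

	/* the reservation above is what osp_precreate_get_fid() relies on */
	return osp_precreate_get_fid(env, d, fid);
}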
Example #3
static void seq_fid_alloc_fini(struct lu_client_seq *seq)
{
        LASSERT(seq->lcs_update == 1);
	mutex_lock(&seq->lcs_mutex);
        --seq->lcs_update;
        cfs_waitq_signal(&seq->lcs_waitq);
}
Example #4
/*
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
void
cfs_wi_schedule(cfs_workitem_t *wi)
{
        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);

        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
        LASSERT (!sched->ws_shuttingdown);

        cfs_wi_sched_lock(sched);

        if (!wi->wi_scheduled) {
                LASSERT (cfs_list_empty(&wi->wi_list));

                wi->wi_scheduled = 1;
                if (!wi->wi_running) {
                        cfs_list_add_tail(&wi->wi_list, &sched->ws_runq);
#ifdef __KERNEL__
                        cfs_waitq_signal(&sched->ws_waitq);
#endif
                } else {
                        cfs_list_add(&wi->wi_list, &sched->ws_rerunq);
                }
        }

        LASSERT (!cfs_list_empty(&wi->wi_list));
        cfs_wi_sched_unlock(sched);
        return;
}
Example #5
void mdt_ck_timer_callback(unsigned long castmeharder)
{
        struct mdt_device *mdt = (struct mdt_device *)castmeharder;
        struct ptlrpc_thread *thread = &mdt->mdt_ck_thread;

        ENTRY;
        thread_add_flags(thread, SVC_EVENT);
        cfs_waitq_signal(&thread->t_ctl_waitq);
        EXIT;
}
Example #6
int LL_PROC_PROTO(proc_fail_loc)
{
        int rc;
        long old_fail_loc = cfs_fail_loc;

        rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos);
        if (old_fail_loc != cfs_fail_loc)
                cfs_waitq_signal(&cfs_race_waitq);
        return rc;
}
Example #7
void mdt_ck_thread_stop(struct mdt_device *mdt)
{
        struct ptlrpc_thread *thread = &mdt->mdt_ck_thread;

        if (!thread_is_running(thread))
                return;

        thread_set_flags(thread, SVC_STOPPING);
        cfs_waitq_signal(&thread->t_ctl_waitq);
        l_wait_condition(thread->t_ctl_waitq, thread_is_stopped(thread));
}
/*
 * XXX: there might be a case where removed object(s) do not add free
 * space (empty objects). If the number of such deletions is high, we
 * can start updating statfs too often, causing an RPC storm.
 * TODO: some throttling is needed
 */
void osp_statfs_need_now(struct osp_device *d)
{
	if (!d->opd_statfs_update_in_progress) {
		/*
		 * if current status is -ENOSPC (lack of free space on OST)
		 * then we should poll OST immediately once object destroy
		 * is replied
		 */
		d->opd_statfs_fresh_till = cfs_time_shift(-1);
		cfs_timer_disarm(&d->opd_statfs_timer);
		cfs_waitq_signal(&d->opd_pre_waitq);
	}
}
/*
 * this function updates the current precreation status: functional or not
 *
 * rc is the last code from the transport; rc == 0 means the transport works
 * well and users of lod can use objects from this OSP
 *
 * the status also depends on the current usage of the OST
 */
void osp_pre_update_status(struct osp_device *d, int rc)
{
	struct obd_statfs	*msfs = &d->opd_statfs;
	int			 old = d->opd_pre_status;
	__u64			 used;

	d->opd_pre_status = rc;
	if (rc)
		goto out;

	/* Add a bit of hysteresis so this flag isn't continually flapping,
	 * and ensure that new files don't get extremely fragmented due to
	 * only a small amount of available space in the filesystem.
	 * We want to set the NOSPC flag when there is less than ~0.1% free
	 * and clear it when there is at least ~0.2% free space, so:
	 *                   avail < ~0.1% max          max = avail + used
	 *            1025 * avail < avail + used       used = blocks - free
	 *            1024 * avail < used
	 *            1024 * avail < blocks - free
	 *                   avail < ((blocks - free) >> 10)
	 *
	 * On a very large disk, say 16TB, 0.1% will be 16 GB. We don't want
	 * to lose that amount of space, so in those cases we report no space
	 * left if there is less than 1 GB left.                         */
	if (likely(msfs->os_type)) {
		used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
				    1 << 30);
		if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
			d->opd_pre_status = -ENOSPC;
			if (old != -ENOSPC)
				CDEBUG(D_INFO, "%s: status: "LPU64" blocks, "
				       LPU64" free, "LPU64" used, "LPU64" "
				       "avail -> %d: rc = %d\n",
				       d->opd_obd->obd_name, msfs->os_blocks,
				       msfs->os_bfree, used, msfs->os_bavail,
				       d->opd_pre_status, rc);
			CDEBUG(D_INFO,
			       "non-commited changes: %lu, in progress: %u\n",
			       d->opd_syn_changes, d->opd_syn_rpc_in_progress);
		} else if (old == -ENOSPC) {
			d->opd_pre_status = 0;
			d->opd_pre_grow_slow = 0;
			d->opd_pre_grow_count = OST_MIN_PRECREATE;
			cfs_waitq_signal(&d->opd_pre_waitq);
			CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64
			       " free, "LPU64" used, "LPU64" avail -> %d: "
			       "rc = %d\n", d->opd_obd->obd_name,
			       msfs->os_blocks, msfs->os_bfree, used,
			       msfs->os_bavail, d->opd_pre_status, rc);
		}
	}

out:
	cfs_waitq_signal(&d->opd_pre_user_waitq);
}
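To make the hysteresis arithmetic in the comment above concrete, here is a small self-contained sketch (user-space C, sample numbers are made up) that evaluates the same threshold the function applies:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* made-up statfs sample, values in blocks/inodes */
	uint64_t os_blocks = 1000000;	/* total blocks on the OST */
	uint64_t os_bfree  = 1500;	/* free blocks */
	uint64_t os_bavail = 900;	/* blocks available to users */
	uint64_t os_ffree  = 100000;	/* free inodes */

	/* used = min((blocks - free) >> 10, 1 << 30), mirroring the check
	 * above: avail < used is roughly "less than ~0.1% free" */
	uint64_t used = (os_blocks - os_bfree) >> 10;
	if (used > (1ULL << 30))
		used = 1ULL << 30;

	if (os_ffree < 32 || os_bavail < used)
		printf("would report -ENOSPC (avail %llu < threshold %llu)\n",
		       (unsigned long long)os_bavail,
		       (unsigned long long)used);
	else
		printf("space OK (avail %llu >= threshold %llu)\n",
		       (unsigned long long)os_bavail,
		       (unsigned long long)used);
	return 0;
}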
void osp_precreate_fini(struct osp_device *d)
{
	struct ptlrpc_thread *thread = &d->opd_pre_thread;

	ENTRY;

	cfs_timer_disarm(&d->opd_statfs_timer);

	thread->t_flags = SVC_STOPPING;
	cfs_waitq_signal(&d->opd_pre_waitq);

	cfs_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED);

	EXIT;
}
Example #11
static void lcw_dispatch_stop(void)
{
        ENTRY;
        LASSERT(lcw_refcount == 0);

        CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");

        cfs_set_bit(LCW_FLAG_STOP, &lcw_flags);
        cfs_waitq_signal(&lcw_event_waitq);

        cfs_wait_for_completion(&lcw_stop_completion);

        CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n");

        EXIT;
}
Example #12
/* return a page that has 'len' bytes left at the end */
static struct cfs_trace_page *
cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
{
        struct cfs_trace_page *tage;

        if (tcd->tcd_cur_pages > 0) {
                __LASSERT(!cfs_list_empty(&tcd->tcd_pages));
                tage = cfs_tage_from_list(tcd->tcd_pages.prev);
                if (tage->used + len <= CFS_PAGE_SIZE)
                        return tage;
        }

        if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
                if (tcd->tcd_cur_stock_pages > 0) {
                        tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
                        -- tcd->tcd_cur_stock_pages;
                        cfs_list_del_init(&tage->linkage);
                } else {
                        tage = cfs_tage_alloc(CFS_ALLOC_ATOMIC);
                        if (tage == NULL) {
                                if (printk_ratelimit())
                                        printk(CFS_KERN_WARNING
                                               "cannot allocate a tage (%ld)\n",
                                       tcd->tcd_cur_pages);
                                return NULL;
                        }
                }

                tage->used = 0;
                tage->cpu = cfs_smp_processor_id();
                tage->type = tcd->tcd_type;
                cfs_list_add_tail(&tage->linkage, &tcd->tcd_pages);
                tcd->tcd_cur_pages++;

                if (tcd->tcd_cur_pages > 8 && thread_running) {
                        struct tracefiled_ctl *tctl = &trace_tctl;
                        /*
                         * wake up tracefiled to process some pages.
                         */
                        cfs_waitq_signal(&tctl->tctl_waitq);
                }
                return tage;
        }
        return NULL;
}
static int osp_statfs_interpret(const struct lu_env *env,
				struct ptlrpc_request *req,
				union ptlrpc_async_args *aa, int rc)
{
	struct obd_import	*imp = req->rq_import;
	struct obd_statfs	*msfs;
	struct osp_device	*d;

	ENTRY;

	aa = ptlrpc_req_async_args(req);
	d = aa->pointer_arg[0];
	LASSERT(d);

	if (rc != 0)
		GOTO(out, rc);

	msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
	if (msfs == NULL)
		GOTO(out, rc = -EPROTO);

	d->opd_statfs = *msfs;

	osp_pre_update_status(d, rc);

	/* schedule next update */
	d->opd_statfs_fresh_till = cfs_time_shift(d->opd_statfs_maxage);
	cfs_timer_arm(&d->opd_statfs_timer, d->opd_statfs_fresh_till);
	d->opd_statfs_update_in_progress = 0;

	CDEBUG(D_CACHE, "updated statfs %p\n", d);

	RETURN(0);
out:
	/* couldn't update statfs, try again as soon as possible */
	cfs_waitq_signal(&d->opd_pre_waitq);
	if (req->rq_import_generation == imp->imp_generation)
		CDEBUG(D_CACHE, "%s: couldn't update statfs: rc = %d\n",
		       d->opd_obd->obd_name, rc);
	RETURN(rc);
}
Example #14
/** Queues DONE_WRITING if
 * - done writing is allowed;
 * - inode has no dirty pages; */
void ll_queue_done_writing(struct inode *inode, unsigned long flags)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
	ENTRY;

	spin_lock(&lli->lli_lock);
        lli->lli_flags |= flags;

        if ((lli->lli_flags & LLIF_DONE_WRITING) &&
            cfs_list_empty(&club->cob_pending_list)) {
                struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;

                if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
                        CWARN("ino %lu/%u(flags %u) som valid it just after "
                              "recovery\n",
                              inode->i_ino, inode->i_generation,
                              lli->lli_flags);
                /* DONE_WRITING is allowed and inode has no dirty page. */
		spin_lock(&lcq->lcq_lock);

                LASSERT(cfs_list_empty(&lli->lli_close_list));
                CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
                       inode->i_ino, inode->i_generation);
                cfs_list_add_tail(&lli->lli_close_list, &lcq->lcq_head);

                /* Avoid a concurrent insertion into the close thread queue:
                 * an inode is already in the close thread, open(), write(),
                 * close() happen, epoch is closed as the inode is marked as
                 * LLIF_EPOCH_PENDING. When pages are written inode should not
                 * be inserted into the queue again, clear this flag to avoid
                 * it. */
                lli->lli_flags &= ~LLIF_DONE_WRITING;

                cfs_waitq_signal(&lcq->lcq_waitq);
		spin_unlock(&lcq->lcq_lock);
	}
	spin_unlock(&lli->lli_lock);
	EXIT;
}
Example #15
static int filter_quota_setinfo(struct obd_device *obd, void *data)
{
        struct obd_export *exp = data;
        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
        struct obd_import *imp = exp->exp_imp_reverse;
        ENTRY;

        LASSERT(imp != NULL);

        /* setup the quota context import */
        cfs_spin_lock(&qctxt->lqc_lock);
        if (qctxt->lqc_import != NULL) {
                cfs_spin_unlock(&qctxt->lqc_lock);
                if (qctxt->lqc_import == imp)
                        CDEBUG(D_WARNING, "%s: lqc_import(%p) of obd(%p) was "
                               "activated already.\n", obd->obd_name, imp, obd);
                else
                        CERROR("%s: lqc_import(%p:%p) of obd(%p) was "
                               "activated by others.\n", obd->obd_name,
                               qctxt->lqc_import, imp, obd);
        } else {
                qctxt->lqc_import = imp;
                /* make imp's connect flags match the relevant exp connect
                 * flags; keeping them here avoids scanning the export list */
                imp->imp_connect_data.ocd_connect_flags |=
                                (exp->exp_connect_flags &
                                 (OBD_CONNECT_QUOTA64 | OBD_CONNECT_CHANGE_QS));
                cfs_spin_unlock(&qctxt->lqc_lock);
                CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is reactivated "
                       "now.\n", obd->obd_name, imp, obd);

                cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster);
                /* start quota slave recovery thread. (release high limits) */
                qslave_start_recovery(obd, qctxt);
        }
        RETURN(0);
}
Example #16
static void lcw_cb(ulong_ptr_t data)
{
        struct lc_watchdog *lcw = (struct lc_watchdog *)data;
        ENTRY;

        if (lcw->lcw_state != LC_WATCHDOG_ENABLED) {
                EXIT;
                return;
        }

        lcw->lcw_state = LC_WATCHDOG_EXPIRED;

        cfs_spin_lock_bh(&lcw->lcw_lock);
        LASSERT(cfs_list_empty(&lcw->lcw_list));

        cfs_spin_lock_bh(&lcw_pending_timers_lock);
        lcw->lcw_refcount++; /* +1 for pending list */
        cfs_list_add(&lcw->lcw_list, &lcw_pending_timers);
        cfs_waitq_signal(&lcw_event_waitq);

        cfs_spin_unlock_bh(&lcw_pending_timers_lock);
        cfs_spin_unlock_bh(&lcw->lcw_lock);
        EXIT;
}
static int osp_precreate_thread(void *_arg)
{
	struct osp_device	*d = _arg;
	struct ptlrpc_thread	*thread = &d->opd_pre_thread;
	struct l_wait_info	 lwi = { 0 };
	struct lu_env		 env;
	int			 rc;

	ENTRY;

	rc = lu_env_init(&env, d->opd_dt_dev.dd_lu_dev.ld_type->ldt_ctx_tags);
	if (rc) {
		CERROR("%s: init env error: rc = %d\n", d->opd_obd->obd_name,
		       rc);
		RETURN(rc);
	}

	spin_lock(&d->opd_pre_lock);
	thread->t_flags = SVC_RUNNING;
	spin_unlock(&d->opd_pre_lock);
	cfs_waitq_signal(&thread->t_ctl_waitq);

	while (osp_precreate_running(d)) {
		/*
		 * need to be connected to OST
		 */
		while (osp_precreate_running(d)) {
			l_wait_event(d->opd_pre_waitq,
				     !osp_precreate_running(d) ||
				     d->opd_new_connection,
				     &lwi);

			if (!d->opd_new_connection)
				continue;

			d->opd_new_connection = 0;
			d->opd_got_disconnected = 0;
			break;
		}

		if (!osp_precreate_running(d))
			break;

		LASSERT(d->opd_obd->u.cli.cl_seq != NULL);
		if (d->opd_obd->u.cli.cl_seq->lcs_exp == NULL) {
			/* Get new sequence for client first */
			LASSERT(d->opd_exp != NULL);
			d->opd_obd->u.cli.cl_seq->lcs_exp =
			class_export_get(d->opd_exp);
			rc = osp_init_pre_fid(d);
			if (rc != 0) {
				class_export_put(d->opd_exp);
				d->opd_obd->u.cli.cl_seq->lcs_exp = NULL;
				CERROR("%s: init pre fid error: rc = %d\n",
				       d->opd_obd->obd_name, rc);
				continue;
			}
		}

		osp_statfs_update(d);

		/*
		 * Clean up orphans or recreate missing objects.
		 */
		rc = osp_precreate_cleanup_orphans(&env, d);
		if (rc != 0)
			continue;
		/*
		 * connected, can handle precreates now
		 */
		while (osp_precreate_running(d)) {
			l_wait_event(d->opd_pre_waitq,
				     !osp_precreate_running(d) ||
				     osp_precreate_near_empty(&env, d) ||
				     osp_statfs_need_update(d) ||
				     d->opd_got_disconnected, &lwi);

			if (!osp_precreate_running(d))
				break;

			/* something happened to the connection,
			 * have to start from the beginning */
			if (d->opd_got_disconnected)
				break;

			if (osp_statfs_need_update(d))
				osp_statfs_update(d);

			/* To avoid handling different seq in precreate/orphan
			 * cleanup, it will hold precreate until current seq is
			 * used up. */
			if (unlikely(osp_precreate_end_seq(&env, d) &&
			    !osp_create_end_seq(&env, d)))
				continue;

			if (unlikely(osp_precreate_end_seq(&env, d) &&
				     osp_create_end_seq(&env, d))) {
				LCONSOLE_INFO("%s:"LPX64" is used up."
					      " Update to new seq\n",
					      d->opd_obd->obd_name,
					 fid_seq(&d->opd_pre_last_created_fid));
				rc = osp_precreate_rollover_new_seq(&env, d);
				if (rc)
					continue;
			}

			if (osp_precreate_near_empty(&env, d)) {
				rc = osp_precreate_send(&env, d);
				/* osp_precreate_send() sets opd_pre_status
				 * in case of error, which prevents further
				 * use of the failed device. */
				if (rc < 0 && rc != -ENOSPC &&
				    rc != -ETIMEDOUT && rc != -ENOTCONN)
					CERROR("%s: cannot precreate objects:"
					       " rc = %d\n",
					       d->opd_obd->obd_name, rc);
			}
		}
	}

	thread->t_flags = SVC_STOPPED;
	lu_env_fini(&env);
	cfs_waitq_signal(&thread->t_ctl_waitq);

	RETURN(0);
}
Example #18
static int mdt_ck_thread_main(void *args)
{
        struct mdt_device      *mdt = args;
        struct ptlrpc_thread   *thread = &mdt->mdt_ck_thread;
        struct lustre_capa_key *bkey = &mdt->mdt_capa_keys[0],
                               *rkey = &mdt->mdt_capa_keys[1];
        struct lustre_capa_key *tmp;
        struct lu_env           env;
        struct mdt_thread_info *info;
        struct md_device       *next;
        struct l_wait_info      lwi = { 0 };
        mdsno_t                 mdsnum;
        int                     rc;
        ENTRY;

        cfs_daemonize_ctxt("mdt_ck");
        cfs_block_allsigs();

        thread_set_flags(thread, SVC_RUNNING);
        cfs_waitq_signal(&thread->t_ctl_waitq);

        rc = lu_env_init(&env, LCT_MD_THREAD|LCT_REMEMBER|LCT_NOREF);
        if (rc)
                RETURN(rc);

        thread->t_env = &env;
        env.le_ctx.lc_thread = thread;
        env.le_ctx.lc_cookie = 0x1;

        info = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
        LASSERT(info != NULL);

        tmp = &info->mti_capa_key;
        mdsnum = mdt_md_site(mdt)->ms_node_id;
        while (1) {
                l_wait_event(thread->t_ctl_waitq,
                             thread_is_stopping(thread) ||
                             thread_is_event(thread),
                             &lwi);

                if (thread_is_stopping(thread))
                        break;
                thread_clear_flags(thread, SVC_EVENT);

                if (cfs_time_before(cfs_time_current(), mdt->mdt_ck_expiry))
                        break;

                *tmp = *rkey;
                make_capa_key(tmp, mdsnum, rkey->lk_keyid);

                next = mdt->mdt_child;
                rc = next->md_ops->mdo_update_capa_key(&env, next, tmp);
                if (!rc) {
                        cfs_spin_lock(&capa_lock);
                        *bkey = *rkey;
                        *rkey = *tmp;
                        cfs_spin_unlock(&capa_lock);

                        rc = write_capa_keys(&env, mdt, mdt->mdt_capa_keys);
                        if (rc) {
                                cfs_spin_lock(&capa_lock);
                                *rkey = *bkey;
                                memset(bkey, 0, sizeof(*bkey));
                                cfs_spin_unlock(&capa_lock);
                        } else {
                                set_capa_key_expiry(mdt);
                                DEBUG_CAPA_KEY(D_SEC, rkey, "new");
                        }
                }
                if (rc) {
                        DEBUG_CAPA_KEY(D_ERROR, rkey, "update failed for");
                        /* next retry is in 300 sec */
                        mdt->mdt_ck_expiry = jiffies + 300 * CFS_HZ;
                }

                cfs_timer_arm(&mdt->mdt_ck_timer, mdt->mdt_ck_expiry);
                CDEBUG(D_SEC, "mdt_ck_timer %lu\n", mdt->mdt_ck_expiry);
        }
        lu_env_fini(&env);

        thread_set_flags(thread, SVC_STOPPED);
        cfs_waitq_signal(&thread->t_ctl_waitq);
        RETURN(0);
}
Example #19
int libcfs_debug_dumplog_thread(void *arg)
{
        libcfs_debug_dumplog_internal(arg);
        cfs_waitq_signal(&debug_ctlwq);
        return 0;
}
/**
 * asks OST to clean precreate orphans
 * and gets next id for new objects
 */
static int osp_precreate_cleanup_orphans(struct lu_env *env,
					 struct osp_device *d)
{
	struct osp_thread_info	*osi = osp_env_info(env);
	struct lu_fid		*last_fid = &osi->osi_fid;
	struct ptlrpc_request	*req = NULL;
	struct obd_import	*imp;
	struct ost_body		*body;
	struct l_wait_info	 lwi = { 0 };
	int			 update_status = 0;
	int			 rc;
	int			 diff;

	ENTRY;

	/*
	 * wait for local recovery to finish, so we can cleanup orphans.
	 * orphans are all objects since "last used" (assigned), but
	 * there might be objects reserved and in some cases they won't
	 * be used. we can't cleanup them till we're sure they won't be
	 * used. nor can we allow new reservations, because they may end
	 * up among the orphans being cleaned up below. so we block new
	 * reservations and wait till all reserved objects are either
	 * used or released.
	 */
	spin_lock(&d->opd_pre_lock);
	d->opd_pre_recovering = 1;
	spin_unlock(&d->opd_pre_lock);
	/*
	 * The locking above makes sure the opd_pre_reserved check below will
	 * catch all osp_precreate_reserve() calls who find
	 * "!opd_pre_recovering".
	 */
	l_wait_event(d->opd_pre_waitq,
		     (!d->opd_pre_reserved && d->opd_recovery_completed) ||
		     !osp_precreate_running(d) || d->opd_got_disconnected,
		     &lwi);
	if (!osp_precreate_running(d) || d->opd_got_disconnected)
		GOTO(out, rc = -EAGAIN);

	CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n",
	       d->opd_obd->obd_name, PFID(&d->opd_last_used_fid));

	*last_fid = d->opd_last_used_fid;
	/* The OSP should already get the valid seq now */
	LASSERT(!fid_is_zero(last_fid));
	if (fid_oid(&d->opd_last_used_fid) < 2) {
		/* lastfid looks strange... ask OST */
		rc = osp_get_lastfid_from_ost(env, d);
		if (rc)
			GOTO(out, rc);
	}

	imp = d->opd_obd->u.cli.cl_import;
	LASSERT(imp);

	req = ptlrpc_request_alloc(imp, &RQF_OST_CREATE);
	if (req == NULL)
		GOTO(out, rc = -ENOMEM);

	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
	if (rc) {
		ptlrpc_request_free(req);
		req = NULL;
		GOTO(out, rc);
	}

	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	body->oa.o_flags = OBD_FL_DELORPHAN;
	body->oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP;

	fid_to_ostid(&d->opd_last_used_fid, &body->oa.o_oi);

	ptlrpc_request_set_replen(req);

	/* Don't resend the delorphan req */
	req->rq_no_resend = req->rq_no_delay = 1;

	rc = ptlrpc_queue_wait(req);
	if (rc) {
		update_status = 1;
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	/*
	 * the OST provides us with the id the new pool starts from,
	 * in body->oa.o_oi
	 */
	ostid_to_fid(last_fid, &body->oa.o_oi, d->opd_index);

	spin_lock(&d->opd_pre_lock);
	diff = lu_fid_diff(&d->opd_last_used_fid, last_fid);
	if (diff > 0) {
		d->opd_pre_grow_count = OST_MIN_PRECREATE + diff;
		d->opd_pre_last_created_fid = d->opd_last_used_fid;
	} else {
		d->opd_pre_grow_count = OST_MIN_PRECREATE;
		d->opd_pre_last_created_fid = *last_fid;
	}
	/*
	 * This empties the pre-creation pool and effectively blocks any new
	 * reservations.
	 */
	LASSERT(fid_oid(&d->opd_pre_last_created_fid) <=
		LUSTRE_DATA_SEQ_MAX_WIDTH);
	d->opd_pre_used_fid = d->opd_pre_last_created_fid;
	d->opd_pre_grow_slow = 0;
	spin_unlock(&d->opd_pre_lock);

	CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID
	       "last_used is "DFID"\n", d->opd_obd->obd_name, PFID(last_fid),
	       PFID(&d->opd_pre_last_created_fid), PFID(&d->opd_last_used_fid));
out:
	if (req)
		ptlrpc_req_finished(req);

	d->opd_pre_recovering = 0;

	/*
	 * If rc is zero, the pre-creation window should have been emptied.
	 * Since waking up the herd would be useless without pre-created
	 * objects, we defer the signal to osp_precreate_send() in that case.
	 */
	if (rc != 0) {
		if (update_status) {
			CERROR("%s: cannot cleanup orphans: rc = %d\n",
			       d->opd_obd->obd_name, rc);
			/* we can't proceed from here, OST seem to
			 * be in a bad shape, better to wait for
			 * a new instance of the server and repeat
			 * from the beginning. notify possible waiters
			 * this OSP isn't quite functional yet */
			osp_pre_update_status(d, rc);
		} else {
			cfs_waitq_signal(&d->opd_pre_user_waitq);
		}
	}

	RETURN(rc);
}
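The reconciliation step above, diff = lu_fid_diff(&d->opd_last_used_fid, last_fid), decides how the precreate window is re-seeded after orphan cleanup. A toy user-space rendering of that decision follows (OIDs only, illustrative names, TOY_MIN_PRECREATE standing in for OST_MIN_PRECREATE):

#include <stdio.h>
#include <stdint.h>

#define TOY_MIN_PRECREATE 32

int main(void)
{
	uint64_t last_used_oid = 200;	/* last id the MDT handed out */
	uint64_t ost_last_oid  = 180;	/* last id the OST reports after
					 * cleaning orphans */
	int64_t diff = (int64_t)(last_used_oid - ost_last_oid);
	uint64_t grow_count, last_created_oid;

	if (diff > 0) {
		/* OST is behind: ask for enough extra objects to catch up */
		grow_count = TOY_MIN_PRECREATE + diff;
		last_created_oid = last_used_oid;
	} else {
		/* OST is at or ahead of us: start with the minimum window */
		grow_count = TOY_MIN_PRECREATE;
		last_created_oid = ost_last_oid;
	}

	printf("grow_count %llu, last_created %llu\n",
	       (unsigned long long)grow_count,
	       (unsigned long long)last_created_oid);
	return 0;
}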
static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
{
	struct osp_thread_info	*oti = osp_env_info(env);
	struct ptlrpc_request	*req;
	struct obd_import	*imp;
	struct ost_body		*body;
	int			 rc, grow, diff;
	struct lu_fid		*fid = &oti->osi_fid;
	ENTRY;

	/* don't precreate new objects till OST healthy and has free space */
	if (unlikely(d->opd_pre_status)) {
		CDEBUG(D_INFO, "%s: don't send new precreate: rc = %d\n",
		       d->opd_obd->obd_name, d->opd_pre_status);
		RETURN(0);
	}

	/*
	 * if the connection/initialization is not completed yet, ignore
	 */
	imp = d->opd_obd->u.cli.cl_import;
	LASSERT(imp);

	req = ptlrpc_request_alloc(imp, &RQF_OST_CREATE);
	if (req == NULL)
		RETURN(-ENOMEM);
	req->rq_request_portal = OST_CREATE_PORTAL;
	/* we should not resend create request - anyway we will have delorphan
	 * and kill these objects */
	req->rq_no_delay = req->rq_no_resend = 1;

	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
	if (rc) {
		ptlrpc_request_free(req);
		RETURN(rc);
	}

	spin_lock(&d->opd_pre_lock);
	if (d->opd_pre_grow_count > d->opd_pre_max_grow_count / 2)
		d->opd_pre_grow_count = d->opd_pre_max_grow_count / 2;
	grow = d->opd_pre_grow_count;
	spin_unlock(&d->opd_pre_lock);

	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
	LASSERT(body);

	*fid = d->opd_pre_last_created_fid;
	rc = osp_precreate_fids(env, d, fid, &grow);
	if (rc == 1) {
		/* Current seq has been used up */
		if (!osp_is_fid_client(d)) {
			osp_pre_update_status(d, -ENOSPC);
			rc = -ENOSPC;
		}
		cfs_waitq_signal(&d->opd_pre_waitq);
		GOTO(out_req, rc);
	}

	if (!osp_is_fid_client(d)) {
		/* Non-FID client will always send seq 0 because of
		 * compatibility */
		LASSERTF(fid_is_idif(fid), "Invalid fid "DFID"\n", PFID(fid));
		fid->f_seq = 0;
	}

	fid_to_ostid(fid, &body->oa.o_oi);
	body->oa.o_valid = OBD_MD_FLGROUP;

	ptlrpc_request_set_replen(req);

	rc = ptlrpc_queue_wait(req);
	if (rc) {
		CERROR("%s: can't precreate: rc = %d\n", d->opd_obd->obd_name,
		       rc);
		GOTO(out_req, rc);
	}
	LASSERT(req->rq_transno == 0);

	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
	if (body == NULL)
		GOTO(out_req, rc = -EPROTO);

	ostid_to_fid(fid, &body->oa.o_oi, d->opd_index);
	LASSERTF(lu_fid_diff(fid, &d->opd_pre_used_fid) > 0,
		 "reply fid "DFID" pre used fid "DFID"\n", PFID(fid),
		 PFID(&d->opd_pre_used_fid));

	diff = lu_fid_diff(fid, &d->opd_pre_last_created_fid);

	spin_lock(&d->opd_pre_lock);
	if (diff < grow) {
		/* the OST has not managed to create all the
		 * objects we asked for */
		d->opd_pre_grow_count = max(diff, OST_MIN_PRECREATE);
		d->opd_pre_grow_slow = 1;
	} else {
		/* the OST is able to keep up with the work,
		 * we could consider increasing grow_count
		 * next time if needed */
		d->opd_pre_grow_slow = 0;
	}

	d->opd_pre_last_created_fid = *fid;
	spin_unlock(&d->opd_pre_lock);

	CDEBUG(D_HA, "%s: current precreated pool: "DFID"-"DFID"\n",
	       d->opd_obd->obd_name, PFID(&d->opd_pre_used_fid),
	       PFID(&d->opd_pre_last_created_fid));
out_req:
	/* now we can wake up all users waiting for objects */
	osp_pre_update_status(d, rc);
	cfs_waitq_signal(&d->opd_pre_user_waitq);

	ptlrpc_req_finished(req);
	RETURN(rc);
}
/*
 * called to reserve object in the pool
 * return codes:
 *  ENOSPC - no space on the corresponding OST
 *  EAGAIN - precreation is in progress, try later
 *  EIO    - no access to OST
 */
int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d)
{
	struct l_wait_info	 lwi;
	cfs_time_t		 expire = cfs_time_shift(obd_timeout);
	int			 precreated, rc;

	ENTRY;

	LASSERTF(osp_objs_precreated(env, d) >= 0, "Last created FID "DFID
		 "Next FID "DFID"\n", PFID(&d->opd_pre_last_created_fid),
		 PFID(&d->opd_pre_used_fid));

	/*
	 * wait till:
	 *  - preallocation is done
	 *  - no free space expected soon
	 *  - can't connect to OST for too long (obd_timeout)
	 *  - OST can allocate fid sequence.
	 */
	while ((rc = d->opd_pre_status) == 0 || rc == -ENOSPC ||
		rc == -ENODEV || rc == -EAGAIN) {

		/*
		 * increase number of precreations
		 */
		precreated = osp_objs_precreated(env, d);
		if (d->opd_pre_grow_count < d->opd_pre_max_grow_count &&
		    d->opd_pre_grow_slow == 0 &&
		    precreated <= (d->opd_pre_grow_count / 4 + 1)) {
			spin_lock(&d->opd_pre_lock);
			d->opd_pre_grow_slow = 1;
			d->opd_pre_grow_count *= 2;
			spin_unlock(&d->opd_pre_lock);
		}

		spin_lock(&d->opd_pre_lock);
		precreated = osp_objs_precreated(env, d);
		if (precreated > d->opd_pre_reserved &&
		    !d->opd_pre_recovering) {
			d->opd_pre_reserved++;
			spin_unlock(&d->opd_pre_lock);
			rc = 0;

			/* XXX: don't wake up if precreation is in progress */
			if (osp_precreate_near_empty_nolock(env, d) &&
			   !osp_precreate_end_seq_nolock(env, d))
				cfs_waitq_signal(&d->opd_pre_waitq);

			break;
		}
		spin_unlock(&d->opd_pre_lock);

		/*
		 * all precreated objects have been used and the no-space
		 * status leaves us no chance to succeed very soon.
		 * but if there is a destroy in progress, we should wait
		 * till that is done - some space might be released
		 */
		if (unlikely(rc == -ENOSPC)) {
			if (d->opd_syn_changes) {
				/* force local commit to release space */
				dt_commit_async(env, d->opd_storage);
			}
			if (d->opd_syn_rpc_in_progress) {
				/* just wait till destroys are done */
				/* see l_wait_event() a few lines below */
			}
			if (d->opd_syn_changes +
			    d->opd_syn_rpc_in_progress == 0) {
				/* no hope for free space */
				break;
			}
		}

		/* XXX: don't wake up if precreation is in progress */
		cfs_waitq_signal(&d->opd_pre_waitq);

		lwi = LWI_TIMEOUT(expire - cfs_time_current(),
				osp_precreate_timeout_condition, d);
		if (cfs_time_aftereq(cfs_time_current(), expire)) {
			rc = -ETIMEDOUT;
			break;
		}

		l_wait_event(d->opd_pre_user_waitq,
			     osp_precreate_ready_condition(env, d), &lwi);
	}

	RETURN(rc);
}
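The reserve/consume logic in osp_precreate_reserve() and osp_precreate_get_fid() boils down to window arithmetic over the precreated objects. A toy user-space model of that bookkeeping (OID component only, illustrative names) shows why a reservation is granted only while more objects are precreated than reserved:

#include <stdio.h>
#include <stdint.h>

/* simplified model of the precreate window; field names are illustrative */
struct pre_window {
	uint64_t used_oid;		/* cf. opd_pre_used_fid */
	uint64_t last_created_oid;	/* cf. opd_pre_last_created_fid */
	uint64_t reserved;		/* cf. opd_pre_reserved */
};

/* objects precreated but not yet handed out */
static uint64_t pre_available(const struct pre_window *w)
{
	return w->last_created_oid - w->used_oid;
}

int main(void)
{
	struct pre_window w = { .used_oid = 100, .last_created_oid = 164,
				.reserved = 0 };

	/* reserve (cf. osp_precreate_reserve): only while available > reserved */
	if (pre_available(&w) > w.reserved)
		w.reserved++;

	/* consume (cf. osp_precreate_get_fid): advance used, drop reservation */
	w.used_oid++;
	w.reserved--;

	printf("available %llu, reserved %llu\n",
	       (unsigned long long)pre_available(&w),
	       (unsigned long long)w.reserved);
	return 0;
}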
Example #23
void cfs_complete(cfs_completion_t *c)
{
        LASSERT(c != NULL);
        c->done  = 1;
        cfs_waitq_signal(&c->wait);
}