Example #1
static int slp_io_rw_lock(const struct lu_env *env,
                          const struct cl_io_slice *ios)
{
        struct ccc_io *cio = ccc_env_io(env);
        struct cl_io *io   = ios->cis_io;
        loff_t start;
        loff_t end;

        if (cl_io_is_append(io)) {
                start = 0;
                end   = OBD_OBJECT_EOF;
        } else {
                start = io->u.ci_wr.wr.crw_pos;
                end   = start + io->u.ci_wr.wr.crw_count - 1;
        }

        ccc_io_update_iov(env, cio, io);

        /*
         * This acquires real DLM lock only in O_APPEND case, because of
         * the io->ci_lockreq setting in llu_io_init().
         */
        LASSERT(ergo(cl_io_is_append(io), io->ci_lockreq == CILR_MANDATORY));
        LASSERT(ergo(!cl_io_is_append(io), io->ci_lockreq == CILR_NEVER));
        return ccc_io_one_lock(env, io, 0,
                               io->ci_type == CIT_READ ? CLM_READ : CLM_WRITE,
                               start, end);
}
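The examples above and below all hinge on the ergo() macro, which encodes logical implication for use inside assertions. A minimal sketch of how it is commonly defined (both Lustre's libcfs and reiser4 carry a variant along these lines; the exact header it lives in is an assumption here):

/* ergo(a, b) reads "a implies b": it is false only when the
 * antecedent holds and the consequent does not. */
#define ergo(a, b) (!(a) || (b))

Because || short-circuits, the consequent is evaluated only when the antecedent is true, which is why expressions such as ergo(olock != NULL, olock->l_handle.h_cookie == ols->ols_handle.cookie) in Example #3 can safely dereference the pointer tested by the first argument.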
Example #2
int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
{
	int ret;
	reiser4_context *ctx;

	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
				   lock_stack_isclean(get_current_lock_stack
						      ())));
	ctx = get_current_context();
	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
		return 0;

	ret = reiser4_grab(ctx, count, flags);
	if (ret == -ENOSPC) {
		/* Try to commit all transactions if the BA_CAN_COMMIT
		   flag is present */
		if (flags & BA_CAN_COMMIT) {
			txnmgr_force_commit_all(ctx->super, 0);
			ctx->grab_enabled = 1;
			ret = reiser4_grab(ctx, count, flags);
		}
	}
	/*
	 * Allocation from the reserved pool cannot fail; that would be a
	 * severe error.
	 */
	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
	return ret;
}
Example #3
/**
 * Invariant that has to be true all of the time.
 */
static int osc_lock_invariant(struct osc_lock *ols)
{
	struct ldlm_lock *lock	= osc_handle_ptr(&ols->ols_handle);
	struct ldlm_lock *olock       = ols->ols_lock;
	int	       handle_used = lustre_handle_is_used(&ols->ols_handle);

	if (osc_lock_is_lockless(ols))
		return ols->ols_locklessable && ols->ols_lock == NULL;

	return
		(ergo(olock != NULL, handle_used) &&
		 ergo(olock != NULL,
		      olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
		 /*
		  * Check that ->ols_handle and ->ols_lock are consistent, but
		  * take into account that they are set at different times.
		  */
		 ergo(handle_used,
		      ergo(lock != NULL && olock != NULL, lock == olock) &&
		      ergo(lock == NULL, olock == NULL)) &&
		 ergo(ols->ols_state == OLS_CANCELLED,
		      olock == NULL && !handle_used) &&
		 /*
		  * DLM lock is destroyed only after we have seen cancellation
		  * ast.
		  */
		 ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
		      !olock->l_destroyed) &&
		 ergo(ols->ols_state == OLS_GRANTED,
		      olock != NULL &&
		      olock->l_req_mode == olock->l_granted_mode &&
		      ols->ols_hold));
}
Example #4
static int lov_page_invariant(const struct cl_page_slice *slice)
{
	const struct cl_page  *page = slice->cpl_page;
	const struct cl_page  *sub  = lov_sub_page(slice);

	return ergo(sub != NULL,
		    page->cp_child == sub &&
		    sub->cp_parent == page &&
		    page->cp_state == sub->cp_state);
}
Example #5
static struct cl_object *lov_sub_find(const struct lu_env *env,
				      struct cl_device *dev,
				      const struct lu_fid *fid,
				      const struct cl_object_conf *conf)
{
	struct lu_object *o;

	o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
	LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
	return lu2cl(o);
}
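lov_sub_find() above reports failure through the returned pointer itself, the standard kernel ERR_PTR convention from <linux/err.h>. A minimal sketch of that pattern (the item type and lookup function are hypothetical):

#include <linux/err.h>
#include <linux/errno.h>

struct item { int v; };
static struct item the_item;

/* hypothetical lookup that encodes a negative errno in the pointer */
static struct item *item_find(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);
	return &the_item;
}

static int item_use(void)
{
	struct item *it = item_find(0);

	if (IS_ERR(it))			/* true only for ERR_PTR() values */
		return PTR_ERR(it);	/* recover the negative errno */
	return it->v;			/* safe: it is a real pointer */
}

One pitfall of the convention: PTR_ERR() must be applied to the pointer that actually failed. Example #20 below originally passed th to PTR_ERR() where db was meant.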
Example #6
/**
 * Invariant that has to be true all of the time.
 */
static int osc_lock_invariant(struct osc_lock *ols)
{
	struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle);
	struct ldlm_lock *olock = ols->ols_lock;
	int handle_used = lustre_handle_is_used(&ols->ols_handle);

	if (osc_lock_is_lockless(ols))
		return ols->ols_locklessable && ols->ols_lock == NULL;

	/*
	 * If all the following "ergo"s are true, return 1, otherwise 0
	 */
	if (!ergo(olock != NULL, handle_used))
		return 0;

	if (!ergo(olock != NULL,
		   olock->l_handle.h_cookie == ols->ols_handle.cookie))
		return 0;

	if (!ergo(handle_used,
		   ergo(lock != NULL && olock != NULL, lock == olock) &&
		   ergo(lock == NULL, olock == NULL)))
		return 0;
	/*
	 * Check that ->ols_handle and ->ols_lock are consistent, but
	 * take into account that they are set at different times.
	 */
	if (!ergo(ols->ols_state == OLS_CANCELLED,
		   olock == NULL && !handle_used))
		return 0;
	/*
	 * DLM lock is destroyed only after we have seen cancellation
	 * ast.
	 */
	if (!ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
		   ((olock->l_flags & LDLM_FL_DESTROYED) == 0)))
		return 0;

	if (!ergo(ols->ols_state == OLS_GRANTED,
		   olock != NULL &&
		   olock->l_req_mode == olock->l_granted_mode &&
		   ols->ols_hold))
		return 0;
	return 1;
}
Example #7
/* cde_check ->check() method for compressed directory items

   used for debugging; every item should have here the most complete
   possible consistency check that its author can construct
*/
int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
		      const char **error /* where to store error message */)
{
	int i;
	int result;
	char *item_start;
	char *item_end;
	reiser4_key key;

	coord_t c;

	assert("nikita-1357", coord != NULL);
	assert("nikita-1358", error != NULL);

	if (!ergo(coord->item_pos != 0,
		  is_dot_key(item_key_by_coord(coord, &key)))) {
		*error = "CDE doesn't start with dot";
		return -1;
	}
	item_start = item_body_by_coord(coord);
	item_end = item_start + item_length_by_coord(coord);

	coord_dup(&c, coord);
	result = 0;
	for (i = 0; i < units(coord); ++i) {
		directory_entry_format *entry;

		if ((char *)(header_at(coord, i) + 1) >
		    item_end - units(coord) * sizeof *entry) {
			*error = "CDE header is out of bounds";
			result = -1;
			break;
		}
		entry = entry_at(coord, i);
		if ((char *)entry < item_start + sizeof(cde_item_format)) {
			*error = "CDE header is too low";
			result = -1;
			break;
		}
		if ((char *)(entry + 1) > item_end) {
			*error = "CDE header is too high";
			result = -1;
			break;
		}
	}

	return result;
}
Example #8
static int llog_osd_declare_write_rec(const struct lu_env *env,
				      struct llog_handle *loghandle,
				      struct llog_rec_hdr *rec,
				      int idx, struct thandle *th)
{
	struct llog_thread_info	*lgi = llog_info(env);
	struct dt_object	*o;
	int			 rc;

	ENTRY;

	LASSERT(env);
	LASSERT(th);
	LASSERT(loghandle);

	o = loghandle->lgh_obj;
	LASSERT(o);

	/* the header is updated on each write */
	rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
				     th);
	if (rc || idx == 0) /* if error or just header */
		RETURN(rc);

	if (dt_object_exists(o)) {
		rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
		lgi->lgi_off = lgi->lgi_attr.la_size;
		LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
		if (rc)
			RETURN(rc);

		rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
		if (rc)
			RETURN(rc);
	} else {
		lgi->lgi_off = 0;
	}

	/* XXX: implement declared window or multi-chunks approach */
	rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);

	RETURN(rc);
}
Example #9
/* like coord_by_key(), but starts traversal from vroot of @object rather than
 * from tree root. */
lookup_result reiser4_object_lookup(struct inode *object,
				    const reiser4_key * key,
				    coord_t *coord,
				    lock_handle * lh,
				    znode_lock_mode lock_mode,
				    lookup_bias bias,
				    tree_level lock_level,
				    tree_level stop_level, __u32 flags,
				    ra_info_t *info)
{
	cbk_handle handle;
	lock_handle parent_lh;
	lookup_result result;

	init_lh(lh);
	init_lh(&parent_lh);

	assert("nikita-3023", reiser4_schedulable());

	assert("nikita-354", key != NULL);
	assert("nikita-355", coord != NULL);
	assert("nikita-356", (bias == FIND_EXACT)
	       || (bias == FIND_MAX_NOT_MORE_THAN));
	assert("nikita-357", stop_level >= LEAF_LEVEL);
	/* no locks can be held during tree search by key */
	assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));

	cbk_pack(&handle,
		 object != NULL ? reiser4_tree_by_inode(object) : current_tree,
		 key,
		 coord,
		 lh,
		 &parent_lh,
		 lock_mode, bias, lock_level, stop_level, flags, info);
	handle.object = object;

	result = coord_by_handle(&handle);
	assert("nikita-3247",
	       ergo(!IS_CBKERR(result), coord->node == lh->node));
	return result;
}
Example #10
static int lov_init_raid0(const struct lu_env *env,
			  struct lov_device *dev, struct lov_object *lov,
			  const struct cl_object_conf *conf,
			  union  lov_layout_state *state)
{
	int result;
	int i;

	struct cl_object	*stripe;
	struct lov_thread_info  *lti     = lov_env_info(env);
	struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
	struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
	struct lu_fid	   *ofid    = &lti->lti_fid;
	struct lov_layout_raid0 *r0      = &state->raid0;

	if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
		dump_lsm(D_ERROR, lsm);
		LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
			 LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
	}

	LASSERT(!lov->lo_lsm);
	lov->lo_lsm = lsm_addref(lsm);
	r0->lo_nr  = lsm->lsm_stripe_count;
	LASSERT(r0->lo_nr <= lov_targets_nr(dev));

	r0->lo_sub = libcfs_kvzalloc(r0->lo_nr * sizeof(r0->lo_sub[0]),
				     GFP_NOFS);
	if (r0->lo_sub) {
		int psz = 0;

		result = 0;
		subconf->coc_inode = conf->coc_inode;
		spin_lock_init(&r0->lo_sub_lock);
		/*
		 * Create stripe cl_objects.
		 */
		for (i = 0; i < r0->lo_nr && result == 0; ++i) {
			struct cl_device *subdev;
			struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
			int ost_idx = oinfo->loi_ost_idx;

			if (lov_oinfo_is_dummy(oinfo))
				continue;

			result = ostid_to_fid(ofid, &oinfo->loi_oi,
					      oinfo->loi_ost_idx);
			if (result != 0)
				goto out;

			subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
			subconf->u.coc_oinfo = oinfo;
			LASSERTF(subdev, "not init ost %d\n", ost_idx);
			/* In the function below, .hs_keycmp resolves to
			 * lu_obj_hop_keycmp()
			 */
			/* coverity[overrun-buffer-val] */
			stripe = lov_sub_find(env, subdev, ofid, subconf);
			if (!IS_ERR(stripe)) {
				result = lov_init_sub(env, lov, stripe, r0, i);
				if (result == -EAGAIN) { /* try again */
					--i;
					result = 0;
					continue;
				}
			} else {
				result = PTR_ERR(stripe);
			}

			if (result == 0) {
				int sz = lov_page_slice_fixup(lov, stripe);

				LASSERT(ergo(psz > 0, psz == sz));
				psz = sz;
			}
		}
		if (result == 0)
			cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
	} else {
		result = -ENOMEM;
	}
out:
	return result;
}
Example #11
int osd_oi_init(struct osd_thread_info *info, struct osd_device *osd,
		bool restored)
{
	struct osd_scrub  *scrub = &osd->od_scrub;
	struct scrub_file *sf = &scrub->os_file;
	struct osd_oi    **oi;
	int		   rc;
	ENTRY;

	if (restored) {
		rc = osd_remove_ois(info, osd);
		if (rc != 0)
			return rc;
	}

	OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
	if (oi == NULL)
		RETURN(-ENOMEM);

	mutex_lock(&oi_init_lock);
	/* try to open existing multiple OIs first */
	rc = osd_oi_table_open(info, osd, oi, sf->sf_oi_count, false);
	if (rc < 0)
		GOTO(out, rc);

	if (rc > 0) {
		if (rc == sf->sf_oi_count || sf->sf_oi_count == 0)
			GOTO(out, rc);

		osd_scrub_file_reset(scrub,
				     LDISKFS_SB(osd_sb(osd))->s_es->s_uuid,
				     SF_RECREATED);
		osd_oi_count = sf->sf_oi_count;
		goto create;
	}

	/* if the previous open failed, try to find the single OI from an
	 * old filesystem */
	rc = osd_oi_open(info, osd, OSD_OI_NAME_BASE, &oi[0], false);
	if (rc == 0) { /* found single OI from old filesystem */
		ldiskfs_clear_bit(0, sf->sf_oi_bitmap);
		if (sf->sf_success_count == 0)
			/* XXX: There is one corner case: if the OI_scrub
			 *	file crashed or was lost and we regard it as
			 *	an upgrade, then we allow IGIF lookup to
			 *	bypass OI files.
			 *
			 *	The risk is that osd_fid_lookup() may find
			 *	a wrong inode for the given IGIF, especially
			 *	when the MDT has performed a file-level backup
			 *	and restore after a former upgrade from 1.8
			 *	to 2.x. Fortunately, osd_fid_lookup() can
			 *	verify the inode to decrease the risk. */
			osd_scrub_file_reset(scrub,
					LDISKFS_SB(osd_sb(osd))->s_es->s_uuid,
					SF_UPGRADE);
		GOTO(out, rc = 1);
	} else if (rc != -ENOENT) {
		CERROR("%.16s: can't open %s: rc = %d\n",
		       LDISKFS_SB(osd_sb(osd))->s_es->s_volume_name,
		       OSD_OI_NAME_BASE, rc);
		GOTO(out, rc);
	}

	if (sf->sf_oi_count > 0) {
		int i;

		memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE);
		for (i = 0; i < osd_oi_count; i++)
			ldiskfs_set_bit(i, sf->sf_oi_bitmap);
		osd_scrub_file_reset(scrub,
				     LDISKFS_SB(osd_sb(osd))->s_es->s_uuid,
				     SF_RECREATED);
	}
	sf->sf_oi_count = osd_oi_count;

create:
	rc = osd_scrub_file_store(scrub);
	if (rc < 0) {
		osd_oi_table_put(info, oi, sf->sf_oi_count);
		GOTO(out, rc);
	}

	/* No OIs exist, new filesystem, create OI objects */
	rc = osd_oi_table_open(info, osd, oi, osd_oi_count, true);
	LASSERT(ergo(rc >= 0, rc == osd_oi_count));

	GOTO(out, rc);

out:
	if (rc < 0) {
		OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
	} else {
		LASSERT((rc & (rc - 1)) == 0);
		osd->od_oi_table = oi;
		osd->od_oi_count = rc;
		if (sf->sf_oi_count != rc) {
			sf->sf_oi_count = rc;
			rc = osd_scrub_file_store(scrub);
			if (rc < 0) {
				osd_oi_table_put(info, oi, sf->sf_oi_count);
				OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
			}
		} else {
			rc = 0;
		}
	}

	mutex_unlock(&oi_init_lock);
	return rc;
}
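The LASSERT((rc & (rc - 1)) == 0) near the end of osd_oi_init() is the classic single-bit test: a power of two has exactly one set bit, so subtracting one flips that bit and every bit below it, leaving no overlap. A standalone sketch (note the raw test also accepts zero):

#include <assert.h>

/* true for 0, 1, 2, 4, 8, ...: x and x-1 share no bits iff x has at
 * most one bit set */
static int single_bit_or_zero(unsigned int x)
{
	return (x & (x - 1)) == 0;
}

int main(void)
{
	assert(single_bit_or_zero(64));
	assert(!single_bit_or_zero(6));
	assert(single_bit_or_zero(0));	/* degenerate case the trick allows */
	return 0;
}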
Example #12
/**
 * Implementation of struct cl_req_operations::cro_attr_set() for osc
 * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
 * fields.
 */
static void osc_req_attr_set(const struct lu_env *env,
			     const struct cl_req_slice *slice,
			     const struct cl_object *obj,
			     struct cl_req_attr *attr, u64 flags)
{
	struct lov_oinfo *oinfo;
	struct cl_req    *clerq;
	struct cl_page   *apage; /* _some_ page in @clerq */
	struct cl_lock   *lock;  /* _some_ lock protecting @apage */
	struct osc_lock  *olck;
	struct osc_page  *opg;
	struct obdo      *oa;
	struct ost_lvb   *lvb;

	oinfo	= cl2osc(obj)->oo_oinfo;
	lvb	= &oinfo->loi_lvb;
	oa	= attr->cra_oa;

	if ((flags & OBD_MD_FLMTIME) != 0) {
		oa->o_mtime = lvb->lvb_mtime;
		oa->o_valid |= OBD_MD_FLMTIME;
	}
	if ((flags & OBD_MD_FLATIME) != 0) {
		oa->o_atime = lvb->lvb_atime;
		oa->o_valid |= OBD_MD_FLATIME;
	}
	if ((flags & OBD_MD_FLCTIME) != 0) {
		oa->o_ctime = lvb->lvb_ctime;
		oa->o_valid |= OBD_MD_FLCTIME;
	}
	if (flags & OBD_MD_FLGROUP) {
		ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
		oa->o_valid |= OBD_MD_FLGROUP;
	}
	if (flags & OBD_MD_FLID) {
		ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
		oa->o_valid |= OBD_MD_FLID;
	}
	if (flags & OBD_MD_FLHANDLE) {
		clerq = slice->crs_req;
		LASSERT(!list_empty(&clerq->crq_pages));
		apage = container_of(clerq->crq_pages.next,
				     struct cl_page, cp_flight);
		opg = osc_cl_page_osc(apage);
		apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
		lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
		if (lock == NULL) {
			struct cl_object_header *head;
			struct cl_lock	  *scan;

			head = cl_object_header(apage->cp_obj);
			list_for_each_entry(scan, &head->coh_locks,
						cll_linkage)
				CL_LOCK_DEBUG(D_ERROR, env, scan,
					      "no cover page!\n");
			CL_PAGE_DEBUG(D_ERROR, env, apage,
				      "dump uncovered page!\n");
			dump_stack();
			LBUG();
		}

		olck = osc_lock_at(lock);
		LASSERT(olck != NULL);
		LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
		/* check for lockless io. */
		if (olck->ols_lock != NULL) {
			oa->o_handle = olck->ols_lock->l_remote_handle;
			oa->o_valid |= OBD_MD_FLHANDLE;
		}
		cl_lock_put(env, lock);
	}
Example #13
int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
			  obd_id id, struct ofd_seq *oseq, int nr, int sync)
{
	struct ofd_thread_info	*info = ofd_info(env);
	struct ofd_object	*fo = NULL;
	struct dt_object	*next;
	struct thandle		*th;
	struct ofd_object	**batch;
	struct lu_fid		*fid = &info->fti_fid;
	obd_id			 tmp;
	int			 rc;
	int			 i;
	int			 objects = 0;
	int			 nr_saved = nr;

	ENTRY;

	/* Don't create objects beyond the valid range for this SEQ */
	if (unlikely(fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
		     (id + nr) >= IDIF_MAX_OID)) {
		CERROR("%s:"DOSTID" hit the IDIF_MAX_OID (1<<48)!\n",
		       ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
		RETURN(rc = -ENOSPC);
	} else if (unlikely(!fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
			    (id + nr) >= OBIF_MAX_OID)) {
		CERROR("%s:"DOSTID" hit the OBIF_MAX_OID (1<<32)!\n",
		       ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
		RETURN(rc = -ENOSPC);
	}

	OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
	if (batch == NULL)
		RETURN(-ENOMEM);

	info->fti_attr.la_valid = LA_TYPE | LA_MODE;
	/*
	 * We mark object SUID+SGID to flag it for accepting UID+GID from
	 * client on first write.  Currently the permission bits on the OST are
	 * never used, so this is OK.
	 */
	info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | 0666;
	info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);

	/* Initialize a/c/m time so any client timestamp will always
	 * be newer and update the inode. ctime = 0 is also handled
	 * specially in osd_inode_setattr(). See LU-221, LU-1042 */
	info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
	info->fti_attr.la_atime = 0;
	info->fti_attr.la_mtime = 0;
	info->fti_attr.la_ctime = 0;

	LASSERT(id != 0);

	/* prepare objects */
	*fid = *lu_object_fid(&oseq->os_lastid_obj->do_lu);
	for (i = 0; i < nr; i++) {
		rc = fid_set_id(fid, id + i);
		if (rc != 0) {
			if (i == 0)
				GOTO(out, rc);

			nr = i;
			break;
		}

		fo = ofd_object_find(env, ofd, fid);
		if (IS_ERR(fo)) {
			if (i == 0)
				GOTO(out, rc = PTR_ERR(fo));

			nr = i;
			break;
		}

		ofd_write_lock(env, fo);
		batch[i] = fo;
	}
	info->fti_buf.lb_buf = &tmp;
	info->fti_buf.lb_len = sizeof(tmp);
	info->fti_off = 0;

	th = ofd_trans_create(env, ofd);
	if (IS_ERR(th))
		GOTO(out, rc = PTR_ERR(th));

	th->th_sync |= sync;

	rc = dt_declare_record_write(env, oseq->os_lastid_obj, &info->fti_buf,
				     info->fti_off, th);
	if (rc)
		GOTO(trans_stop, rc);

	for (i = 0; i < nr; i++) {
		fo = batch[i];
		LASSERT(fo);

		if (unlikely(ofd_object_exists(fo))) {
			/* object may exist being re-created by write replay */
			CDEBUG(D_INODE, "object "LPX64"/"LPX64" exists: "
			       DFID"\n", ostid_seq(&oseq->os_oi), id,
			       PFID(lu_object_fid(&fo->ofo_obj.do_lu)));
			continue;
		}

		next = ofd_object_child(fo);
		LASSERT(next != NULL);

		rc = dt_declare_create(env, next, &info->fti_attr, NULL,
				       &info->fti_dof, th);
		if (rc) {
			nr = i;
			break;
		}
	}

	rc = dt_trans_start_local(env, ofd->ofd_osd, th);
	if (rc)
		GOTO(trans_stop, rc);

	CDEBUG(D_OTHER, "%s: create new object "DFID" nr %d\n",
	       ofd_name(ofd), PFID(fid), nr);

	LASSERT(nr > 0);

	 /* When the LFSCK scans the whole device to verify the LAST_ID file
	  * consistency, it first loads the last_id into RAM and compares it
	  * with each OST-object's ID. If the latter is larger, it would
	  * regard the LAST_ID file as crashed. But during the LFSCK
	  * scanning, the OFD may continue to create new OST-objects, and
	  * those newly created OST-objects will have larger IDs than the
	  * ones the LFSCK knows. So from the LFSCK view, it needs to
	  * re-load the last_id from the disk file, and only if the latest
	  * last_id is still smaller than the object's ID is the LAST_ID
	  * file really crashed.
	  *
	  * For the above mechanism to work, the OFD needs to update the
	  * LAST_ID file before pre-creating OST-objects; otherwise the
	  * LFSCK may not get the latest last_id even though new OST-objects
	  * were created. */
	if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_SKIP_LASTID)) {
		tmp = cpu_to_le64(id + nr - 1);
		dt_write_lock(env, oseq->os_lastid_obj, 0);
		rc = dt_record_write(env, oseq->os_lastid_obj,
				     &info->fti_buf, &info->fti_off, th);
		dt_write_unlock(env, oseq->os_lastid_obj);
		if (rc != 0)
			GOTO(trans_stop, rc);
	}

	for (i = 0; i < nr; i++) {
		fo = batch[i];
		LASSERT(fo);

		/* Only the newly created objects need to be recorded. */
		if (ofd->ofd_osd->dd_record_fid_accessed) {
			lfsck_pack_rfa(&ofd_info(env)->fti_lr,
				       lu_object_fid(&fo->ofo_obj.do_lu));
			lfsck_in_notify(env, ofd->ofd_osd,
					&ofd_info(env)->fti_lr);
		}

		if (likely(!ofd_object_exists(fo) &&
			   !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DANGLING))) {
			next = ofd_object_child(fo);
			LASSERT(next != NULL);

			rc = dt_create(env, next, &info->fti_attr, NULL,
				       &info->fti_dof, th);
			if (rc)
				break;
			LASSERT(ofd_object_exists(fo));
		}
		ofd_seq_last_oid_set(oseq, id + i);
	}

	objects = i;
	/* Not all of the wanted objects were created, so set LAST_ID to
	 * what was actually created. */
	if (unlikely(objects < nr)) {
		int rc1;

		info->fti_off = 0;
		tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
		dt_write_lock(env, oseq->os_lastid_obj, 0);
		rc1 = dt_record_write(env, oseq->os_lastid_obj,
				      &info->fti_buf, &info->fti_off, th);
		dt_write_unlock(env, oseq->os_lastid_obj);
		if (rc1 != 0)
			CERROR("%s: fail to reset the LAST_ID for seq ("LPX64
			       ") from "LPU64" to "LPU64"\n", ofd_name(ofd),
			       ostid_seq(&oseq->os_oi), id + nr - 1,
			       ofd_seq_last_oid(oseq));
	}

trans_stop:
	ofd_trans_stop(env, ofd, th, rc);
out:
	for (i = 0; i < nr_saved; i++) {
		fo = batch[i];
		if (fo) {
			ofd_write_unlock(env, fo);
			ofd_object_put(env, fo);
		}
	}
	OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));

	CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
	       "created %d/%d objects: %d\n", objects, nr_saved, rc);

	LASSERT(ergo(objects == 0, rc < 0));
	RETURN(objects > 0 ? objects : rc);
}
Example #14
/* main tree lookup procedure

   Check coord cache. If key we are looking for is not found there, call cbk()
   to do real tree traversal.

   As we have extents on the twig level, @lock_level and @stop_level can
   be different from LEAF_LEVEL and each other.

   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
   long term locks) while calling this.
*/
lookup_result coord_by_key(reiser4_tree * tree	/* tree to perform search
						 * in. Usually this tree is
						 * part of file-system
						 * super-block */ ,
			   const reiser4_key * key /* key to look for */ ,
			   coord_t *coord	/* where to store found
						 * position in a tree. Fields
						 * in "coord" are only valid if
						 * coord_by_key() returned
						 * "CBK_COORD_FOUND" */ ,
			   lock_handle * lh,	/* resulting lock handle */
			   znode_lock_mode lock_mode	/* type of lookup we
							 * want on node. Pass
							 * ZNODE_READ_LOCK here
							 * if you only want to
							 * read item found and
							 * ZNODE_WRITE_LOCK if
							 * you want to modify
							 * it */ ,
			   lookup_bias bias	/* what to return if coord
						 * with exactly the @key is
						 * not in the tree */ ,
			   tree_level lock_level/* tree level at which to
						 * start taking locks of type
						 * @lock_mode */ ,
			   tree_level stop_level/* tree level to stop at.
						 * Pass LEAF_LEVEL or
						 * TWIG_LEVEL here. The item
						 * being looked for has to be
						 * between @lock_level and
						 * @stop_level, inclusive */ ,
			   __u32 flags /* search flags */ ,
			   ra_info_t *
			   info
			   /* information about desired tree traversal
			    * readahead */
			   )
{
	cbk_handle handle;
	lock_handle parent_lh;
	lookup_result result;

	init_lh(lh);
	init_lh(&parent_lh);

	assert("nikita-3023", reiser4_schedulable());

	assert("nikita-353", tree != NULL);
	assert("nikita-354", key != NULL);
	assert("nikita-355", coord != NULL);
	assert("nikita-356", (bias == FIND_EXACT)
	       || (bias == FIND_MAX_NOT_MORE_THAN));
	assert("nikita-357", stop_level >= LEAF_LEVEL);
	/* no locks can be held during tree traversal */
	assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));

	cbk_pack(&handle,
		 tree,
		 key,
		 coord,
		 lh,
		 &parent_lh,
		 lock_mode, bias, lock_level, stop_level, flags, info);

	result = coord_by_handle(&handle);
	assert("nikita-3247",
	       ergo(!IS_CBKERR(result), coord->node == lh->node));
	return result;
}
Example #15
/* Code shared by the page_mkwrite methods of rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
			    bool *retry)
{
	struct lu_env	   *env;
	struct cl_io	    *io;
	struct vvp_io	   *vio;
	struct cl_env_nest       nest;
	int		      result;
	sigset_t	     set;
	struct inode	     *inode;
	struct ll_inode_info     *lli;

	io = ll_fault_io_init(vma, &env,  &nest, vmpage->index, NULL);
	if (IS_ERR(io)) {
		result = PTR_ERR(io);
		goto out;
	}

	result = io->ci_result;
	if (result < 0)
		goto out_io;

	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma    = vma;
	vio->u.fault.ft_vmpage = vmpage;

	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	/* we grab lli_trunc_sem to exclude truncate case.
	 * Otherwise, we could add dirty pages into osc cache
	 * while truncate is on-going.
	 */
	inode = ccc_object_inode(io->ci_obj);
	lli = ll_i2info(inode);
	down_read(&lli->lli_trunc_sem);

	result = cl_io_loop(env, io);

	up_read(&lli->lli_trunc_sem);

	cfs_restore_sigs(set);

	if (result == 0) {
		struct inode *inode = file_inode(vma->vm_file);
		struct ll_inode_info *lli = ll_i2info(inode);

		lock_page(vmpage);
		if (!vmpage->mapping) {
			unlock_page(vmpage);

			/* page was truncated and lock was cancelled, return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault().
			 */
			if (result == 0)
				result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* race, the page has been cleaned by ptlrpcd after
			 * it was unlocked, it has to be added into dirty
			 * cache again otherwise this soon-to-dirty page won't
			 * consume any grants, even worse if this page is being
			 * transferred because it will break RPC checksum.
			 */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0) {
			spin_lock(&lli->lli_lock);
			lli->lli_flags |= LLIF_DATA_MODIFIED;
			spin_unlock(&lli->lli_lock);
		}
	}

out_io:
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
out:
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}
Example #16
/* Code shared by the page_mkwrite methods of rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
                            bool *retry)
{
	struct lu_env           *env;
	struct cl_io            *io;
	struct vvp_io           *vio;
	int                      result;
	__u16			 refcheck;
	sigset_t		 set;
	struct inode             *inode;
	struct ll_inode_info     *lli;
	ENTRY;

	LASSERT(vmpage != NULL);
	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	io = ll_fault_io_init(env, vma, vmpage->index, NULL);
	if (IS_ERR(io))
		GOTO(out, result = PTR_ERR(io));

	result = io->ci_result;
	if (result < 0)
		GOTO(out_io, result);

	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma    = vma;
	vio->u.fault.ft_vmpage = vmpage;

	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	inode = vvp_object_inode(io->ci_obj);
	lli = ll_i2info(inode);

	result = cl_io_loop(env, io);

	cfs_restore_sigs(set);

	if (result == 0) {
		lock_page(vmpage);
		if (vmpage->mapping == NULL) {
			unlock_page(vmpage);

			/* page was truncated and lock was cancelled, return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault(). */
			if (result == 0)
				result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* race, the page has been cleaned by ptlrpcd after
			 * it was unlocked, it has to be added into dirty
			 * cache again otherwise this soon-to-dirty page won't
			 * consume any grants, even worse if this page is being
			 * transferred because it will break RPC checksum.
			 */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
			       "been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0)
			ll_file_set_flag(lli, LLIF_DATA_MODIFIED);
	}
	EXIT;

out_io:
	cl_io_fini(env, io);
out:
	cl_env_put(env, &refcheck);
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}
Example #17
/* main function that handles common parts of tree traversal: starting
    (fake znode handling), restarts, error handling, completion */
static lookup_result traverse_tree(cbk_handle * h/* search handle */)
{
	int done;
	int iterations;
	int vroot_used;

	assert("nikita-365", h != NULL);
	assert("nikita-366", h->tree != NULL);
	assert("nikita-367", h->key != NULL);
	assert("nikita-368", h->coord != NULL);
	assert("nikita-369", (h->bias == FIND_EXACT)
	       || (h->bias == FIND_MAX_NOT_MORE_THAN));
	assert("nikita-370", h->stop_level >= LEAF_LEVEL);
	assert("nikita-2949", !(h->flags & CBK_DKSET));
	assert("zam-355", lock_stack_isclean(get_current_lock_stack()));

	done = 0;
	iterations = 0;
	vroot_used = 0;

	/* loop for restarts */
restart:

	assert("nikita-3024", reiser4_schedulable());

	h->result = CBK_COORD_FOUND;
	/* connect_znode() needs it */
	h->ld_key = *reiser4_min_key();
	h->rd_key = *reiser4_max_key();
	h->flags |= CBK_DKSET;
	h->error = NULL;

	if (!vroot_used && h->object != NULL) {
		vroot_used = 1;
		done = prepare_object_lookup(h);
		if (done == LOOKUP_REST)
			goto restart;
		else if (done == LOOKUP_DONE)
			return h->result;
	}
	if (h->parent_lh->node == NULL) {
		done =
		    get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
				   h->parent_lh);

		assert("nikita-1637", done != -E_DEADLOCK);

		h->block = h->tree->root_block;
		h->level = h->tree->height;
		h->coord->node = h->parent_lh->node;

		if (done != 0)
			return done;
	}

	/* loop descending a tree */
	while (!done) {

		if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
			     IS_POW(iterations))) {
			warning("nikita-1481", "Too many iterations: %i",
				iterations);
			reiser4_print_key("key", h->key);
			++iterations;
		} else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
			h->error =
			    "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
			h->result = RETERR(-EIO);
			break;
		}
		switch (cbk_level_lookup(h)) {
		case LOOKUP_CONT:
			move_lh(h->parent_lh, h->active_lh);
			continue;
		default:
			wrong_return_value("nikita-372", "cbk_level");
		case LOOKUP_DONE:
			done = 1;
			break;
		case LOOKUP_REST:
			hput(h);
			/* deadlock avoidance is the normal case. */
			if (h->result != -E_DEADLOCK)
				++iterations;
			reiser4_preempt_point();
			goto restart;
		}
	}
	/* that's all. The rest is error handling */
	if (unlikely(h->error != NULL)) {
		warning("nikita-373", "%s: level: %i, "
			"lock_level: %i, stop_level: %i "
			"lock_mode: %s, bias: %s",
			h->error, h->level, h->lock_level, h->stop_level,
			lock_mode_name(h->lock_mode), bias_name(h->bias));
		reiser4_print_address("block", &h->block);
		reiser4_print_key("key", h->key);
		print_coord_content("coord", h->coord);
	}
	/* `unlikely' error case */
	if (unlikely(IS_CBKERR(h->result))) {
		/* failure. do cleanup */
		hput(h);
	} else {
		assert("nikita-1605", WITH_DATA_RET
		       (h->coord->node, 1,
			ergo((h->result == CBK_COORD_FOUND) &&
			     (h->bias == FIND_EXACT) &&
			     (!node_is_empty(h->coord->node)),
			     coord_is_existing_item(h->coord))));
	}
	return h->result;
}
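traverse_tree() above throttles its "Too many iterations" warning by firing only when the iteration count past the limit is a power of two, so a livelocked search logs at exponentially growing intervals instead of flooding the console. A minimal sketch of that pattern, with IS_POW assumed to be the same bit test seen in Example #11:

#include <stdio.h>

#define ITERATIONS_LIMIT 16
#define IS_POW(i) (((i) & ((i) - 1)) == 0)

int main(void)
{
	int iterations;

	/* simulate a search that keeps restarting */
	for (iterations = 0; iterations < 300; iterations++)
		if (iterations > ITERATIONS_LIMIT && IS_POW(iterations))
			/* fires at 32, 64, 128, 256: exponential back-off */
			printf("too many iterations: %d\n", iterations);
	return 0;
}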
Example #18
int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
			  obd_id id, struct ofd_seq *oseq, int nr, int sync)
{
	struct ofd_thread_info	*info = ofd_info(env);
	struct ofd_object	*fo = NULL;
	struct dt_object	*next;
	struct thandle		*th;
	struct ofd_object	**batch;
	obd_id			 tmp;
	int			 rc;
	int			 i;
	int			 objects = 0;
	int			 nr_saved = nr;

	ENTRY;

	/* Don't create objects beyond the valid range for this SEQ */
	if (unlikely(fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
		     (id + nr) >= IDIF_MAX_OID)) {
		CERROR("%s:"DOSTID" hit the IDIF_MAX_OID (1<<48)!\n",
		       ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
		RETURN(rc = -ENOSPC);
	} else if (unlikely(!fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
			    (id + nr) >= OBIF_MAX_OID)) {
		CERROR("%s:"DOSTID" hit the OBIF_MAX_OID (1<<32)!\n",
		       ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
		RETURN(rc = -ENOSPC);
	}

	OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
	if (batch == NULL)
		RETURN(-ENOMEM);

	info->fti_attr.la_valid = LA_TYPE | LA_MODE;
	/*
	 * We mark object SUID+SGID to flag it for accepting UID+GID from
	 * client on first write.  Currently the permission bits on the OST are
	 * never used, so this is OK.
	 */
	info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | 0666;
	info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);

	/* Initialize a/c/m time so any client timestamp will always
	 * be newer and update the inode. ctime = 0 is also handled
	 * specially in osd_inode_setattr(). See LU-221, LU-1042 */
	info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
	info->fti_attr.la_atime = 0;
	info->fti_attr.la_mtime = 0;
	info->fti_attr.la_ctime = 0;

	/* prepare objects */
	ostid_set_seq(&info->fti_ostid, ostid_seq(&oseq->os_oi));
	for (i = 0; i < nr; i++) {
		ostid_set_id(&info->fti_ostid, id + i);
		rc = ostid_to_fid(&info->fti_fid, &info->fti_ostid, 0);
		if (rc) {
			if (i == 0)
				GOTO(out, rc);

			nr = i;
			break;
		}

		fo = ofd_object_find(env, ofd, &info->fti_fid);
		if (IS_ERR(fo)) {
			if (i == 0)
				GOTO(out, rc = PTR_ERR(fo));

			nr = i;
			break;
		}

		ofd_write_lock(env, fo);
		batch[i] = fo;
	}
	info->fti_buf.lb_buf = &tmp;
	info->fti_buf.lb_len = sizeof(tmp);
	info->fti_off = 0;

	th = ofd_trans_create(env, ofd);
	if (IS_ERR(th))
		GOTO(out, rc = PTR_ERR(th));

	th->th_sync |= sync;

	rc = dt_declare_record_write(env, oseq->os_lastid_obj, sizeof(tmp),
				     info->fti_off, th);
	if (rc)
		GOTO(trans_stop, rc);

	for (i = 0; i < nr; i++) {
		fo = batch[i];
		LASSERT(fo);

		if (unlikely(ofd_object_exists(fo))) {
			/* object may exist being re-created by write replay */
			CDEBUG(D_INODE, "object "LPX64"/"LPX64" exists: "
			       DFID"\n", ostid_seq(&oseq->os_oi), id,
			       PFID(&info->fti_fid));
			continue;
		}

		next = ofd_object_child(fo);
		LASSERT(next != NULL);

		rc = dt_declare_create(env, next, &info->fti_attr, NULL,
				       &info->fti_dof, th);
		if (rc) {
			nr = i;
			break;
		}
	}

	rc = dt_trans_start_local(env, ofd->ofd_osd, th);
	if (rc)
		GOTO(trans_stop, rc);

	CDEBUG(D_OTHER, "%s: create new object "DFID" nr %d\n",
	       ofd_name(ofd), PFID(&info->fti_fid), nr);

	for (i = 0; i < nr; i++) {
		fo = batch[i];
		LASSERT(fo);

		if (likely(!ofd_object_exists(fo))) {
			next = ofd_object_child(fo);
			LASSERT(next != NULL);

			rc = dt_create(env, next, &info->fti_attr, NULL,
				       &info->fti_dof, th);
			if (rc)
				break;
			LASSERT(ofd_object_exists(fo));
		}
		ofd_seq_last_oid_set(oseq, id + i);
	}

	objects = i;
	if (objects > 0) {
		tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
		rc = dt_record_write(env, oseq->os_lastid_obj,
				     &info->fti_buf, &info->fti_off, th);
	}
trans_stop:
	ofd_trans_stop(env, ofd, th, rc);
out:
	for (i = 0; i < nr_saved; i++) {
		fo = batch[i];
		if (fo) {
			ofd_write_unlock(env, fo);
			ofd_object_put(env, fo);
		}
	}
	OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));

	CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
	       "created %d/%d objects: %d\n", objects, nr_saved, rc);

	LASSERT(ergo(objects == 0, rc < 0));
	RETURN(objects > 0 ? objects : rc);
}
Example #19
/**
 * process_cursors - do action on each cursor attached to inode
 * @inode:
 * @act: action to do
 *
 * Finds all cursors of @inode in reiser4's super block radix tree of cursors
 * and performs action specified by @act on each of cursors.
 */
static void process_cursors(struct inode *inode, enum cursor_action act)
{
	oid_t oid;
	dir_cursor *start;
	struct list_head *head;
	reiser4_context *ctx;
	struct d_cursor_info *info;

	/* this can be called by
	 *
	 * kswapd->...->prune_icache->..reiser4_destroy_inode
	 *
	 * without reiser4_context
	 */
	ctx = reiser4_init_context(inode->i_sb);
	if (IS_ERR(ctx)) {
		warning("vs-23", "failed to init context");
		return;
	}

	assert("nikita-3558", inode != NULL);

	info = d_info(inode);
	oid = get_inode_oid(inode);
	spin_lock_inode(inode);
	head = get_readdir_list(inode);
	spin_lock(&d_lock);
	/* find any cursor for this oid: a reference to it is hanging off the
	 * radix tree */
	start = lookup(info, (unsigned long)oid);
	if (start != NULL) {
		dir_cursor *scan;
		reiser4_file_fsdata *fsdata;

		/* process circular list of cursors for this oid */
		scan = start;
		do {
			dir_cursor *next;

			next = list_entry(scan->list.next, dir_cursor, list);
			fsdata = scan->fsdata;
			assert("nikita-3557", fsdata != NULL);
			if (scan->key.oid == oid) {
				switch (act) {
				case CURSOR_DISPOSE:
					list_del_init(&fsdata->dir.linkage);
					break;
				case CURSOR_LOAD:
					list_add(&fsdata->dir.linkage, head);
					break;
				case CURSOR_KILL:
					kill_cursor(scan);
					break;
				}
			}
			if (scan == next)
				/* last cursor was just killed */
				break;
			scan = next;
		} while (scan != start);
	}
	spin_unlock(&d_lock);
	/* check that we killed 'em all */
	assert("nikita-3568",
	       ergo(act == CURSOR_KILL,
		    list_empty_careful(get_readdir_list(inode))));
	assert("nikita-3569",
	       ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
	spin_unlock_inode(inode);
	reiser4_exit_context(ctx);
}
Example #20
/*
 * Concurrency: @dt is write locked.
 */
static int osd_object_create(const struct lu_env *env, struct dt_object *dt,
			     struct lu_attr *attr,
			     struct dt_allocation_hint *hint,
			     struct dt_object_format *dof,
			     struct thandle *th)
{
	struct zpl_direntry	*zde = &osd_oti_get(env)->oti_zde.lzd_reg;
	const struct lu_fid	*fid = lu_object_fid(&dt->do_lu);
	struct osd_object	*obj = osd_dt_obj(dt);
	struct osd_device	*osd = osd_obj2dev(obj);
	char			*buf = osd_oti_get(env)->oti_str;
	struct osd_thandle	*oh;
	dmu_buf_t		*db;
	uint64_t		 zapid;
	int			 rc;

	ENTRY;

	/* concurrent create declarations should not see the object in an
	 * inconsistent state (db, attr, etc.);
	 * in regular cases acquisition should be cheap */
	down(&obj->oo_guard);

	LASSERT(osd_invariant(obj));
	LASSERT(!dt_object_exists(dt));
	LASSERT(dof != NULL);

	LASSERT(th != NULL);
	oh = container_of0(th, struct osd_thandle, ot_super);

	/*
	 * XXX missing: quota handling.
	 */

	LASSERT(obj->oo_db == NULL);

	db = osd_create_type_f(dof->dof_type)(env, osd, attr, oh);
	if (IS_ERR(db))
		GOTO(out, rc = PTR_ERR(db));

	zde->zde_pad = 0;
	zde->zde_dnode = db->db_object;
	zde->zde_type = IFTODT(attr->la_mode & S_IFMT);

	zapid = osd_get_name_n_idx(env, osd, fid, buf);

	rc = -zap_add(osd->od_objset.os, zapid, buf, 8, 1, zde, oh->ot_tx);
	if (rc)
		GOTO(out, rc);

	/* Add new object to inode accounting.
	 * Errors are not considered as fatal */
	rc = -zap_increment_int(osd->od_objset.os, osd->od_iusr_oid,
				(attr->la_valid & LA_UID) ? attr->la_uid : 0, 1,
				oh->ot_tx);
	if (rc)
		CERROR("%s: failed to add "DFID" to accounting ZAP for usr %d "
			"(%d)\n", osd->od_svname, PFID(fid), attr->la_uid, rc);
	rc = -zap_increment_int(osd->od_objset.os, osd->od_igrp_oid,
				(attr->la_valid & LA_GID) ? attr->la_gid : 0, 1,
				oh->ot_tx);
	if (rc)
		CERROR("%s: failed to add "DFID" to accounting ZAP for grp %d "
			"(%d)\n", osd->od_svname, PFID(fid), attr->la_gid, rc);

	/* configure new osd object */
	obj->oo_db = db;
	rc = osd_object_init0(env, obj);
	LASSERT(ergo(rc == 0, dt_object_exists(dt)));
	LASSERT(osd_invariant(obj));

	rc = osd_init_lma(env, obj, fid, oh);
	if (rc) {
		CERROR("%s: can not set LMA on "DFID": rc = %d\n",
		       osd->od_svname, PFID(fid), rc);
		/* ignore errors during LMA initialization */
		rc = 0;
	}

out:
	up(&obj->oo_guard);
	RETURN(rc);
}
Example #21
/* Allocate "real" disk blocks by calling a proper space allocation plugin
 * method. Blocks are allocated in one contiguous disk region. The plugin
 * independent part accounts blocks by subtracting allocated amount from grabbed
 * or fake block counter and add the same amount to the counter of allocated
 * blocks.
 *
 * @hint -- a reiser4 blocknr hint object which contains further block
 *          allocation hints and parameters (search start, a stage of block
 *          which will be mapped to disk, etc.),
 * @blk  -- an out parameter for the beginning of the allocated region,
 * @len  -- in/out parameter, it should contain the maximum number of allocated
 *          blocks, after block allocation completes, it contains the length of
 *          allocated disk region.
 * @flags -- see reiser4_ba_flags_t description.
 *
 * @return -- 0 if success, error code otherwise.
 */
int
reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
		     reiser4_block_nr * len, reiser4_ba_flags_t flags)
{
	__u64 needed = *len;
	reiser4_context *ctx;
	reiser4_super_info_data *sbinfo;
	int ret;

	assert("zam-986", hint != NULL);

	ctx = get_current_context();
	sbinfo = get_super_private(ctx->super);

	/* For write-optimized data we use default search start value, which is
	 * close to last write location. */
	if (flags & BA_USE_DEFAULT_SEARCH_START)
		get_blocknr_hint_default(&hint->blk);

	/* VITALY: allocator should grab this for internal/tx-lists/similar
	   only. */
/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/
	if (hint->block_stage == BLOCK_NOT_COUNTED) {
		ret = reiser4_grab_space_force(*len, flags);
		if (ret != 0)
			return ret;
	}

	ret =
	    sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
			    hint, (int)needed, blk, len);

	if (!ret) {
		assert("zam-680", *blk < reiser4_block_count(ctx->super));
		assert("zam-681",
		       *blk + *len <= reiser4_block_count(ctx->super));

		if (flags & BA_PERMANENT) {
			/* we assume that current atom exists at this moment */
			txn_atom *atom = get_current_atom_locked();
			atom->nr_blocks_allocated += *len;
			spin_unlock_atom(atom);
		}

		switch (hint->block_stage) {
		case BLOCK_NOT_COUNTED:
		case BLOCK_GRABBED:
			grabbed2used(ctx, sbinfo, *len);
			break;
		case BLOCK_UNALLOCATED:
			fake_allocated2used(sbinfo, *len, flags);
			break;
		case BLOCK_FLUSH_RESERVED:
			{
				txn_atom *atom = get_current_atom_locked();
				flush_reserved2used(atom, *len);
				spin_unlock_atom(atom);
			}
			break;
		default:
			impossible("zam-531", "wrong block stage");
		}
	} else {
		assert("zam-821",
		       ergo(hint->max_dist == 0
			    && !hint->backward, ret != -ENOSPC));
		if (hint->block_stage == BLOCK_NOT_COUNTED)
			grabbed2free(ctx, sbinfo, needed);
	}

	return ret;
}
Example #22
/**
 * Creates sub-locks for a given lov_lock for the first time.
 *
 * Goes through all sub-objects of top-object, and creates sub-locks on every
 * sub-object intersecting with top-lock extent. This is complicated by the
 * fact that top-lock (that is being created) can be accessed concurrently
 * through already created sub-locks (possibly shared with other top-locks).
 */
static struct lov_lock *lov_lock_sub_init(const struct lu_env *env,
					  const struct cl_object *obj,
					  struct cl_lock *lock)
{
	int result = 0;
	int i;
	int nr;
	u64 start;
	u64 end;
	u64 file_start;
	u64 file_end;

	struct lov_object       *loo    = cl2lov(obj);
	struct lov_layout_raid0 *r0     = lov_r0(loo);
	struct lov_lock		*lovlck;

	file_start = cl_offset(lov2cl(loo), lock->cll_descr.cld_start);
	file_end   = cl_offset(lov2cl(loo), lock->cll_descr.cld_end + 1) - 1;

	for (i = 0, nr = 0; i < r0->lo_nr; i++) {
		/*
		 * XXX for wide striping a smarter algorithm is desirable,
		 * one that breaks out of the loop early.
		 */
		if (likely(r0->lo_sub[i]) && /* spare layout */
		    lov_stripe_intersects(loo->lo_lsm, i,
					  file_start, file_end, &start, &end))
			nr++;
	}
	LASSERT(nr > 0);
	lovlck = libcfs_kvzalloc(offsetof(struct lov_lock, lls_sub[nr]),
				 GFP_NOFS);
	if (!lovlck)
		return ERR_PTR(-ENOMEM);

	lovlck->lls_nr = nr;
	for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
		if (likely(r0->lo_sub[i]) &&
		    lov_stripe_intersects(loo->lo_lsm, i,
					  file_start, file_end, &start, &end)) {
			struct lov_lock_sub *lls = &lovlck->lls_sub[nr];
			struct cl_lock_descr *descr;

			descr = &lls->sub_lock.cll_descr;

			LASSERT(!descr->cld_obj);
			descr->cld_obj   = lovsub2cl(r0->lo_sub[i]);
			descr->cld_start = cl_index(descr->cld_obj, start);
			descr->cld_end   = cl_index(descr->cld_obj, end);
			descr->cld_mode  = lock->cll_descr.cld_mode;
			descr->cld_gid   = lock->cll_descr.cld_gid;
			descr->cld_enq_flags = lock->cll_descr.cld_enq_flags;
			lls->sub_stripe = i;

			/* initialize sub lock */
			result = lov_sublock_init(env, lock, lls);
			if (result < 0)
				break;

			lls->sub_initialized = 1;
			nr++;
		}
	}
	LASSERT(ergo(result == 0, nr == lovlck->lls_nr));

	if (result != 0) {
		for (i = 0; i < nr; ++i) {
			if (!lovlck->lls_sub[i].sub_initialized)
				break;

			cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock);
		}
		kvfree(lovlck);
		lovlck = ERR_PTR(result);
	}

	return lovlck;
}
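lov_lock_sub_init() above uses a common two-pass idiom: first count the intersecting stripes, then allocate the top-lock with its trailing lls_sub[] array sized via offsetof, then fill it in. A minimal userspace sketch of that sizing trick (struct names hypothetical; offsetof with a runtime index is the same GNU-style construction the kernel code relies on):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct sub { int stripe; };

struct lock {
	int nr;
	struct sub subs[];	/* flexible array member */
};

int main(void)
{
	int i, nr = 3;		/* pass 1 would count matching stripes */

	/* header plus exactly nr trailing elements, zero-filled */
	struct lock *lk = calloc(1, offsetof(struct lock, subs[nr]));
	if (lk == NULL)
		return 1;

	lk->nr = nr;
	for (i = 0; i < nr; i++)	/* pass 2: fill the sub-locks */
		lk->subs[i].stripe = i;

	printf("allocated %zu bytes for %d subs\n",
	       offsetof(struct lock, subs[nr]), lk->nr);
	free(lk);
	return 0;
}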
Example #23
/**
 * Implementation of the llog_operations::lop_write
 *
 * This function writes a new record to the llog or modifies an existing one.
 *
 * \param[in]  env		execution environment
 * \param[in]  loghandle	llog handle of the current llog
 * \param[in]  rec		llog record header. This is a real header of
 *				the full llog record to write. This is
 *				the beginning of buffer to write, the length
 *				of buffer is stored in \a rec::lrh_len
 * \param[out] reccookie	pointer to the cookie to return back if needed.
 *				It is used for further cancel of this llog
 *				record.
 * \param[in]  idx		index of the llog record. If \a idx == -1 then
 *				this is append case, otherwise \a idx is
 *				the index of record to modify
 * \param[in]  th		current transaction handle
 *
 * \retval			0 on successful write && \a reccookie == NULL
 *				1 on successful write && \a reccookie != NULL
 * \retval			negative error if write failed
 */
static int llog_osd_write_rec(const struct lu_env *env,
			      struct llog_handle *loghandle,
			      struct llog_rec_hdr *rec,
			      struct llog_cookie *reccookie,
			      int idx, struct thandle *th)
{
	struct llog_thread_info	*lgi = llog_info(env);
	struct llog_log_hdr	*llh;
	int			 reclen = rec->lrh_len;
	int			 index, rc;
	struct llog_rec_tail	*lrt;
	struct dt_object	*o;
	size_t			 left;
	bool			 header_is_updated = false;

	ENTRY;

	LASSERT(env);
	llh = loghandle->lgh_hdr;
	LASSERT(llh);
	o = loghandle->lgh_obj;
	LASSERT(o);
	LASSERT(th);

	CDEBUG(D_OTHER, "new record %x to "DFID"\n",
	       rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));

	/* the record length should not be bigger than LLOG_CHUNK_SIZE */
	if (reclen > LLOG_CHUNK_SIZE)
		RETURN(-E2BIG);

	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
	if (rc)
		RETURN(rc);

	/**
	 * The modification case.
	 * If idx is set then the record with that index must be modified.
	 * There are three possible cases:
	 * 1) the common case is the llog header update (idx == 0)
	 * 2) the llog record modification during llog processing.
	 *    This is indicated by \a loghandle::lgh_cur_idx > 0.
	 *    In that case \a loghandle::lgh_cur_offset is used as
	 *    the record offset.
	 * 3) otherwise it is assumed that the llog consists of records of
	 *    fixed size, i.e. a catalog. The llog header must have its
	 *    llh_size field equal to the record size. The record offset
	 *    is calculated just from the \a idx value.
	 *
	 * During modification we don't need an extra header update because
	 * the bitmap and record count are not changed. The record header
	 * and tail remain the same too.
	 */
	if (idx != LLOG_NEXT_IDX) {
		/* llog can be empty only when first record is being written */
		LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0));

		if (!ext2_test_bit(idx, llh->llh_bitmap)) {
			CERROR("%s: modify unset record %u\n",
			       o->do_lu.lo_dev->ld_obd->obd_name, idx);
			RETURN(-ENOENT);
		}

		if (idx != rec->lrh_index) {
			CERROR("%s: modify index mismatch %d %u\n",
			       o->do_lu.lo_dev->ld_obd->obd_name, idx,
			       rec->lrh_index);
			RETURN(-EFAULT);
		}

		if (idx == LLOG_HEADER_IDX) {
			/* llog header update */
			LASSERT(reclen == sizeof(struct llog_log_hdr));
			LASSERT(rec == &llh->llh_hdr);

			lgi->lgi_off = 0;
			lgi->lgi_buf.lb_len = reclen;
			lgi->lgi_buf.lb_buf = rec;
			rc = dt_record_write(env, o, &lgi->lgi_buf,
					     &lgi->lgi_off, th);
			RETURN(rc);
		} else if (loghandle->lgh_cur_idx > 0) {
			/**
			 * The lgh_cur_offset can be used only if index is
			 * the same.
			 */
			if (idx != loghandle->lgh_cur_idx) {
				CERROR("%s: modify index mismatch %d %d\n",
				       o->do_lu.lo_dev->ld_obd->obd_name, idx,
				       loghandle->lgh_cur_idx);
				RETURN(-EFAULT);
			}

			lgi->lgi_off = loghandle->lgh_cur_offset;
			CDEBUG(D_OTHER, "modify record "DOSTID": idx:%d, "
			       "len:%u offset %llu\n",
			       POSTID(&loghandle->lgh_id.lgl_oi), idx,
			       rec->lrh_len, (long long)lgi->lgi_off);
		} else if (llh->llh_size > 0) {
			if (llh->llh_size != rec->lrh_len) {
				CERROR("%s: wrong record size, llh_size is %u"
				       " but record size is %u\n",
				       o->do_lu.lo_dev->ld_obd->obd_name,
				       llh->llh_size, rec->lrh_len);
				RETURN(-EINVAL);
			}
			lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
		} else {
			/* This can happen when lgh_cur_idx was not set
			 * during llog processing, or when llh_size was not
			 * set to the proper record size for a fixed-record
			 * llog. Either way it is impossible to get the
			 * record offset. */
			CERROR("%s: can't get record offset, idx:%d, "
			       "len:%u.\n", o->do_lu.lo_dev->ld_obd->obd_name,
			       idx, rec->lrh_len);
			RETURN(-EFAULT);
		}

		/* update only data, header and tail remain the same */
		lgi->lgi_off += sizeof(struct llog_rec_hdr);
		lgi->lgi_buf.lb_len = REC_DATA_LEN(rec);
		lgi->lgi_buf.lb_buf = REC_DATA(rec);
		rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
		if (rc == 0 && reccookie) {
			reccookie->lgc_lgl = loghandle->lgh_id;
			reccookie->lgc_index = idx;
			rc = 1;
		}
		RETURN(rc);
	}

	/**
	 * The append case.
	 * The most common case of using llog. The new index is assigned to
	 * the new record, new bit is set in llog bitmap and llog count is
	 * incremented.
	 *
	 * Make sure that records don't cross a chunk boundary, so we can
	 * process them page-at-a-time if needed.  If it will cross a chunk
	 * boundary, write in a fake (but referenced) entry to pad the chunk.
	 */
	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
	lgi->lgi_off = lgi->lgi_attr.la_size;
	left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
	/* NOTE: padding is a record, but no bit is set */
	if (left != 0 && left != reclen &&
	    left < (reclen + LLOG_MIN_REC_SIZE)) {
		index = loghandle->lgh_last_idx + 1;
		rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
		if (rc)
			RETURN(rc);
		loghandle->lgh_last_idx++; /* for pad rec */
	}
	/* if it's the last idx in log file, then return -ENOSPC */
	if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
		RETURN(-ENOSPC);

	/* increment lgh_last_idx along with the llh_tail index; they should
	 * stay equal throughout the llog's lifetime */
	loghandle->lgh_last_idx++;
	index = loghandle->lgh_last_idx;
	llh->llh_tail.lrt_index = index;
	/**
	 * NB: the caller should make sure that only one process accesses
	 * lgh_last_idx, e.g. append should be exclusive.
	 * Otherwise it might hit the assert.
	 */
	LASSERT(index < LLOG_BITMAP_SIZE(llh));
	rec->lrh_index = index;
	lrt = rec_tail(rec);
	lrt->lrt_len = rec->lrh_len;
	lrt->lrt_index = rec->lrh_index;

	/* the lgh_hdr_lock protects llog header data from concurrent
	 * update/cancel; both llh_count and llh_bitmap are covered by it */
	spin_lock(&loghandle->lgh_hdr_lock);
	if (ext2_set_bit(index, llh->llh_bitmap)) {
		CERROR("%s: index %u already set in log bitmap\n",
		       o->do_lu.lo_dev->ld_obd->obd_name, index);
		spin_unlock(&loghandle->lgh_hdr_lock);
		LBUG(); /* should never happen */
	}
	llh->llh_count++;
	spin_unlock(&loghandle->lgh_hdr_lock);

	lgi->lgi_off = 0;
	lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len;
	lgi->lgi_buf.lb_buf = &llh->llh_hdr;
	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
	if (rc)
		GOTO(out, rc);

	header_is_updated = true;
	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
	if (rc)
		GOTO(out, rc);

	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
	lgi->lgi_off = lgi->lgi_attr.la_size;
	lgi->lgi_buf.lb_len = reclen;
	lgi->lgi_buf.lb_buf = rec;
	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
	if (rc < 0)
		GOTO(out, rc);

	CDEBUG(D_OTHER, "added record "DOSTID": idx: %u, %u\n",
	       POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
	if (reccookie != NULL) {
		reccookie->lgc_lgl = loghandle->lgh_id;
		reccookie->lgc_index = index;
		if ((rec->lrh_type == MDS_UNLINK_REC) ||
		    (rec->lrh_type == MDS_SETATTR64_REC))
			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
		else if (rec->lrh_type == OST_SZ_REC)
			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
		else
			reccookie->lgc_subsys = -1;
		rc = 1;
	}
	RETURN(rc);
out:
	/* cleanup llog for error case */
	spin_lock(&loghandle->lgh_hdr_lock);
	ext2_clear_bit(index, llh->llh_bitmap);
	llh->llh_count--;
	spin_unlock(&loghandle->lgh_hdr_lock);

	/* restore llog last_idx */
	loghandle->lgh_last_idx--;
	llh->llh_tail.lrt_index = loghandle->lgh_last_idx;

	/* restore the header on disk if it was written */
	if (header_is_updated) {
		lgi->lgi_off = 0;
		lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len;
		lgi->lgi_buf.lb_buf = &llh->llh_hdr;
		dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
	}

	RETURN(rc);
}
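The append path in llog_osd_write_rec() computes the room left in the current chunk with a mask, which works because LLOG_CHUNK_SIZE is a power of two: off & (CHUNK_SIZE - 1) is the offset within the chunk. A worked sketch of the padding decision (the constants are illustrative stand-ins, not necessarily the real Lustre values):

#include <stdio.h>

#define CHUNK_SIZE	8192	/* stands in for LLOG_CHUNK_SIZE */
#define MIN_REC_SIZE	32	/* stands in for LLOG_MIN_REC_SIZE */

int main(void)
{
	unsigned long long off = 8000;	/* current end of the log file */
	unsigned int reclen = 256;	/* record about to be appended */

	/* bytes remaining before the next chunk boundary */
	unsigned int left = CHUNK_SIZE - (off & (CHUNK_SIZE - 1));

	/* pad unless the record fits exactly or leaves room for at
	 * least one more minimal record, mirroring the test above */
	if (left != 0 && left != reclen && left < reclen + MIN_REC_SIZE)
		printf("pad %u bytes, then append at the next chunk\n", left);
	else
		printf("append %u bytes at offset %llu\n", reclen, off);
	return 0;
}

With off = 8000 the sketch pads: only 192 bytes remain in the chunk, which is neither zero, nor exactly reclen, nor enough for the record plus one minimal record.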
Example #24
/**
 * reiser4_replace_extent - replace extent and paste 1 or 2 after it
 * @un_extent: coordinate of extent to be overwritten
 * @lh: need better comment
 * @key: need better comment
 * @exts_to_add: data prepared for insertion into tree
 * @replace: need better comment
 * @flags: need better comment
 * @return_inserted_position: need better comment
 *
 * Overwrites one extent and pastes 1 or 2 more after the overwritten one.
 * If @return_inserted_position is 1, @un_extent and @lh are returned set
 * to the first of the newly inserted units; if it is 0, @un_extent and @lh
 * are returned set to the extent which was overwritten.
 */
int reiser4_replace_extent(struct replace_handle *h,
                           int return_inserted_position)
{
    int result;
    znode *orig_znode;
    /*ON_DEBUG(reiser4_extent orig_ext);*/	/* this is for debugging */

    assert("vs-990", coord_is_existing_unit(h->coord));
    assert("vs-1375", znode_is_write_locked(h->coord->node));
    assert("vs-1426", extent_get_width(&h->overwrite) != 0);
    assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
    assert("vs-1427", ergo(h->nr_new_extents == 2,
                           extent_get_width(&h->new_extents[1]) != 0));

    /* compose structure for paste */
    init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);

    coord_dup(&h->coord_after, h->coord);
    init_lh(&h->lh_after);
    copy_lh(&h->lh_after, h->lh);
    reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
    reiser4_tap_monitor(&h->watch);

    ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
    orig_znode = h->coord->node;

#if REISER4_DEBUG
    /* make sure that key is set properly */
    unit_key_by_coord(h->coord, &h->tmp);
    set_key_offset(&h->tmp,
                   get_key_offset(&h->tmp) +
                   extent_get_width(&h->overwrite) * current_blocksize);
    assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
#endif

    /* set insert point after unit to be replaced */
    h->coord->between = AFTER_UNIT;

    result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
                              &h->paste_key, &h->item, h->flags);
    if (!result) {
        /* now we have to replace the unit after which new units were
           inserted. Its position is tracked by @watch */
        reiser4_extent *ext;
        znode *node;

        node = h->coord_after.node;
        if (node != orig_znode) {
            coord_clear_iplug(&h->coord_after);
            result = zload(node);
        }

        if (likely(!result)) {
            ext = extent_by_coord(&h->coord_after);

            assert("vs-987", znode_is_loaded(node));
            assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));

            /* overwrite extent unit */
            memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
            znode_make_dirty(node);

            if (node != orig_znode)
                zrelse(node);

            if (return_inserted_position == 0) {
                /* coord and lh are to be set to overwritten
                   extent */
                assert("vs-1662",
                       WITH_DATA(node, !memcmp(&h->overwrite,
                                               extent_by_coord(
                                                   &h->coord_after),
                                               sizeof(reiser4_extent))));

                *h->coord = h->coord_after;
                done_lh(h->lh);
                copy_lh(h->lh, &h->lh_after);
            } else {
                /* h->coord and h->lh are to be set to first of
                   inserted units */
                assert("vs-1663",
                       WITH_DATA(h->coord->node,
                                 !memcmp(&h->new_extents[0],
                                         extent_by_coord(h->coord),
                                         sizeof(reiser4_extent))));
                assert("vs-1664", h->lh->node == h->coord->node);
            }
        }
    }
    reiser4_tap_done(&h->watch);

    return result;
}