Example 1
void show_tree_range(struct btree *btree, tuxkey_t start, unsigned count)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	__tux3_dbg("%i level btree at %Li:\n",
		   btree->root.depth, btree->root.block);
	if (!has_root(btree))
		return;

	struct cursor *cursor = alloc_cursor(btree, 0);
	if (!cursor) {
		tux3_err(btree->sb, "out of memory");
		return;
	}
	if (btree_probe(cursor, start)) {
		tux3_fs_error(btree->sb, "tell me why!!!");
		goto out;
	}

	struct buffer_head *buffer;
	do {
		buffer = cursor_leafbuf(cursor);
		assert((btree->ops->leaf_sniff)(btree, bufdata(buffer)));
		(btree->ops->leaf_dump)(btree, bufdata(buffer));
	} while (--count && cursor_advance(cursor));

out:
	free_cursor(cursor);
}
Example 2
File: btree.c Project: Zkin/tux3
/*
 * Cursor read root node.
 * < 0 - error
 *   0 - success
 */
static int cursor_read_root(struct cursor *cursor)
{
	struct btree *btree = cursor->btree;
	struct buffer_head *buffer;

	assert(has_root(btree));

	buffer = vol_bread(btree->sb, btree->root.block);
	if (!buffer)
		return -EIO; /* FIXME: stupid, it might have been NOMEM */
	assert(bnode_sniff(bufdata(buffer)));
	cursor_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries);
	return 0;
}
Example 3
/*
 * Cursor read root node.
 * < 0 - error
 *   0 - success
 */
static int cursor_read_root(struct cursor *cursor)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct btree *btree = cursor->btree;
	struct buffer_head *buffer;

	assert(has_root(btree));

	buffer = vol_bread(btree->sb, btree->root.block);
	if (!buffer)
		return -EIO; /* FIXME: stupid, it might have been NOMEM */
	assert(bnode_sniff(bufdata(buffer)));
	cursor_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries);
	return 0;
}
Example 4
/*
 * This is range deletion. So, instead of rebalancing space across
 * sibling nodes on each change, this just removes the range and
 * merges from right to left, even when the nodes do not share the
 * same parent.
 *
 *              +--------------- (A, B, C)--------------------+
 *              |                    |                        |
 *     +-- (AA, AB, AC) -+       +- (BA, BB, BC) -+      + (CA, CB, CC) +
 *     |        |        |       |        |       |      |       |      |
 * (AAA,AAB)(ABA,ABB)(ACA,ACB) (BAA,BAB)(BBA)(BCA,BCB)  (CAA)(CBA,CBB)(CCA)
 *
 * [less : A, AA, AAA, AAB, AB, ABA, ABB, AC, ACA, ACB, B, BA ... : greater]
 *
 * If we merged from a cousin (or re-distributed), we may have to
 * update the index all the way up to the common parent. (e.g. after
 * removing (ACB) and merging (BAA,BAB) into (ACA), we have to adjust
 * B in the root node to BB.)
 *
 * See adjust_parent_sep().
 *
 * FIXME: no re-distribution, so we don't guarantee more than 50%
 * space efficiency. And if the range is at the end of the keys
 * (truncate() case), we don't need to merge or adjust_parent_sep().
 *
 * FIXME2: we may want to split the chop work into steps, instead of
 * blocking for a long time.
 */
int btree_chop(struct btree *btree, tuxkey_t start, u64 len)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = btree->sb;
	struct btree_ops *ops = btree->ops;
	struct buffer_head **prev, *leafprev = NULL;
	struct chopped_index_info *cii;
	struct cursor *cursor;
	tuxkey_t limit;
	int ret, done = 0;

	if (!has_root(btree))
		return 0;

	/* Chop all range if len >= TUXKEY_LIMIT */
	limit = (len >= TUXKEY_LIMIT) ? TUXKEY_LIMIT : start + len;

	prev = malloc(sizeof(*prev) * btree->root.depth);
	if (prev == NULL)
		return -ENOMEM;
	memset(prev, 0, sizeof(*prev) * btree->root.depth);

	cii = malloc(sizeof(*cii) * btree->root.depth);
	if (cii == NULL) {
		ret = -ENOMEM;
		goto error_cii;
	}
	memset(cii, 0, sizeof(*cii) * btree->root.depth);

	cursor = alloc_cursor(btree, 0);
	if (!cursor) {
		ret = -ENOMEM;
		goto error_alloc_cursor;
	}

	down_write(&btree->lock);
	ret = btree_probe(cursor, start);
	if (ret)
		goto error_btree_probe;

	/* Walk leaves */
	while (1) {
		struct buffer_head *leafbuf;
		tuxkey_t this_key;

		/*
		 * FIXME: If the leaf was merged and freed later, we don't
		 * need to redirect the leaf or call leaf_chop()
		 */
		if ((ret = cursor_redirect(cursor)))
			goto out;
		leafbuf = cursor_pop(cursor);

		/* Adjust start and len for this leaf */
		this_key = cursor_level_this_key(cursor);
		if (start < this_key) {
			if (limit < TUXKEY_LIMIT)
				len -= this_key - start;
			start = this_key;
		}

		ret = ops->leaf_chop(btree, start, len, bufdata(leafbuf));
		if (ret) {
			if (ret < 0) {
				blockput(leafbuf);
				goto out;
			}
			mark_buffer_dirty_non(leafbuf);
		}

		/* Try to merge this leaf with prev */
		if (leafprev) {
			if (try_leaf_merge(btree, leafprev, leafbuf)) {
				trace(">>> can merge leaf %p into leaf %p", leafbuf, leafprev);
				remove_index(cursor, cii);
				mark_buffer_dirty_non(leafprev);
				blockput_free(sb, leafbuf);
				goto keep_prev_leaf;
			}
			blockput(leafprev);
		}
		leafprev = leafbuf;

keep_prev_leaf:

		if (cursor_level_next_key(cursor) >= limit)
			done = 1;
		/* Pop and try to merge finished nodes */
		while (done || cursor_level_finished(cursor)) {
			struct buffer_head *buf;
			int level = cursor->level;
			struct chopped_index_info *ciil = &cii[level];


			/* Get merge src buffer, and go parent level */
			buf = cursor_pop(cursor);

			/*
			 * Logging chopped indexes
			 * FIXME: If the node is freed later (e.g. merged),
			 * we don't need to log this
			 */
			if (ciil->count) {
				log_bnode_del(sb, bufindex(buf), ciil->start,
					      ciil->count);
			}
			memset(ciil, 0, sizeof(*ciil));

			/* Try to merge node with prev */
			if (prev[level]) {
				assert(level);
				if (try_bnode_merge(sb, prev[level], buf)) {
					trace(">>> can merge node %p into node %p", buf, prev[level]);
					remove_index(cursor, cii);
					mark_buffer_unify_non(prev[level]);
					blockput_free_unify(sb, buf);
					goto keep_prev_node;
				}
				blockput(prev[level]);
			}
			prev[level] = buf;
keep_prev_node:

			if (!level)
				goto chop_root;
		}

		/* Push back down to leaf level */
		do {
			ret = cursor_advance_down(cursor);
			if (ret < 0)
				goto out;
		} while (ret);
	}

chop_root:
	/* Remove depth if possible */
	while (btree->root.depth > 1 && bcount(bufdata(prev[0])) == 1) {
		trace("drop btree level");
		btree->root.block = bufindex(prev[1]);
		btree->root.depth--;
		tux3_mark_btree_dirty(btree);

		/*
		 * We know prev[0] is redirected and dirty. So here we
		 * can just cancel bnode_redirect by bfree(), instead of
		 * defer_bfree().
		 * FIXME: we could optimize by freeing the bnode without
		 * bnode_redirect, and if we did, this would no longer hold.
		 */
		bfree(sb, bufindex(prev[0]), 1);
		log_bnode_free(sb, bufindex(prev[0]));
		blockput_free_unify(sb, prev[0]);

		vecmove(prev, prev + 1, btree->root.depth);
	}
	ret = 0;

out:
	if (leafprev)
		blockput(leafprev);
	for (int i = 0; i < btree->root.depth; i++) {
		if (prev[i])
			blockput(prev[i]);
	}
	release_cursor(cursor);
error_btree_probe:
	up_write(&btree->lock);

	free_cursor(cursor);
error_alloc_cursor:
	free(cii);
error_cii:
	free(prev);

	return ret;
}
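
The block comment above btree_chop() describes merging leaves and bnodes from right to left and then fixing up the parent index (remove_index()/adjust_parent_sep()). The following is a minimal, self-contained toy sketch of that idea; it is not tux3 code, and every name in it is hypothetical. It only shows that once a right sibling has been merged into its left neighbour, the parent's entry for the emptied node has to be dropped.

/* Toy model (not tux3 code): a parent with two children holding sorted keys.
 * Merging the right child into the left one empties the right slot, so the
 * parent index must lose that entry, mirroring remove_index() above. */
#include <stdio.h>
#include <string.h>

#define MAXK 8

struct toy_node { unsigned long keys[MAXK]; int count; };
struct toy_parent { unsigned long sep[2]; struct toy_node *child[2]; int count; };

/* Merge src into dst if everything fits; returns 1 on success,
 * loosely analogous to try_bnode_merge(). */
static int toy_merge(struct toy_node *dst, struct toy_node *src)
{
	if (dst->count + src->count > MAXK)
		return 0;
	memcpy(dst->keys + dst->count, src->keys,
	       src->count * sizeof(src->keys[0]));
	dst->count += src->count;
	src->count = 0;
	return 1;
}

int main(void)
{
	struct toy_node left  = { .keys = { 10, 20 },     .count = 2 };
	struct toy_node right = { .keys = { 30, 40, 50 }, .count = 3 };
	struct toy_parent parent = {
		.sep   = { 10, 30 },	/* separator = lowest key of each child */
		.child = { &left, &right },
		.count = 2,
	};

	if (toy_merge(&left, &right)) {
		parent.count = 1;	/* drop the emptied right index entry */
		printf("merged: parent has %d entry (sep %lu), left holds %d keys\n",
		       parent.count, parent.sep[0], left.count);
	}
	return 0;
}

In the cousin-merge case sketched in the comment (keys move between nodes that do not share a parent), the surviving separator key also has to be rewritten, which is what adjust_parent_sep() handles in the real code.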
Example 5
int alloc_empty_btree(struct btree *btree)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = new_node(btree);
	if (IS_ERR(rootbuf))
		goto error;
	struct buffer_head *leafbuf = new_leaf(btree);
	if (IS_ERR(leafbuf))
		goto error_leafbuf;

	assert(!has_root(btree));
	struct bnode *rootnode = bufdata(rootbuf);
	block_t rootblock = bufindex(rootbuf);
	block_t leafblock = bufindex(leafbuf);
	trace("root at %Lx", rootblock);
	trace("leaf at %Lx", leafblock);
	bnode_init_root(rootnode, 1, leafblock, 0, 0);
	log_bnode_root(sb, rootblock, 1, leafblock, 0, 0);
	log_balloc(sb, leafblock, 1);

	mark_buffer_unify_non(rootbuf);
	blockput(rootbuf);
	mark_buffer_dirty_non(leafbuf);
	blockput(leafbuf);

	btree->root = (struct root){ .block = rootblock, .depth = 1 };
	tux3_mark_btree_dirty(btree);

	return 0;

error_leafbuf:
	(btree->ops->bfree)(sb, bufindex(rootbuf), 1);
	blockput(rootbuf);
	rootbuf = leafbuf;
error:
	return PTR_ERR(rootbuf);
}

/* FIXME: right? and this should be done by btree_chop()? */
int free_empty_btree(struct btree *btree)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct btree_ops *ops = btree->ops;

	if (!has_root(btree))
		return 0;

	assert(btree->root.depth == 1);
	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = vol_bread(sb, btree->root.block);
	if (!rootbuf)
		return -EIO;
	assert(bnode_sniff(bufdata(rootbuf)));
	/* Make the btree have no root */
	btree->root = no_root;
	tux3_mark_btree_dirty(btree);

	struct bnode *rootnode = bufdata(rootbuf);
	assert(bcount(rootnode) == 1);
	block_t leaf = be64_to_cpu(rootnode->entries[0].block);
	struct buffer_head *leafbuf = vol_find_get_block(sb, leaf);

	if (leafbuf && !leaf_need_redirect(sb, leafbuf)) {
		/*
		 * This is a redirected leaf. So here we can just
		 * cancel leaf_redirect by bfree(), instead of
		 * defer_bfree().
		 */
		bfree(sb, leaf, 1);
		log_leaf_free(sb, leaf);
		assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
		blockput_free(sb, leafbuf);
	} else {
		defer_bfree(&sb->defree, leaf, 1);
		log_bfree(sb, leaf, 1);
		if (leafbuf) {
			assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
			blockput(leafbuf);
		}
	}

	if (!bnode_need_redirect(sb, rootbuf)) {
		/*
		 * This is a redirected bnode. So here we can just
		 * cancel bnode_redirect by bfree(), instead of
		 * defer_bfree().
		 */
		bfree(sb, bufindex(rootbuf), 1);
		log_bnode_free(sb, bufindex(rootbuf));
		blockput_free_unify(sb, rootbuf);
	} else {
		defer_bfree(&sb->deunify, bufindex(rootbuf), 1);
		log_bfree_on_unify(sb, bufindex(rootbuf), 1);
		blockput(rootbuf);
	}

	return 0;
}

int replay_bnode_redirect(struct replay *rp, block_t oldblock, block_t newblock)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = rp->sb;
	struct buffer_head *newbuf, *oldbuf;
	int err = 0;

	newbuf = vol_getblk(sb, newblock);
	if (!newbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	oldbuf = vol_bread(sb, oldblock);
	if (!oldbuf) {
		err = -EIO;	/* FIXME: error code */
		goto error_put_newbuf;
	}
	assert(bnode_sniff(bufdata(oldbuf)));

	memcpy(bufdata(newbuf), bufdata(oldbuf), bufsize(newbuf));
	mark_buffer_unify_atomic(newbuf);

	blockput(oldbuf);
error_put_newbuf:
	blockput(newbuf);
error:
	return err;
}

int replay_bnode_root(struct replay *rp, block_t root, unsigned count,
		      block_t left, block_t right, tuxkey_t rkey)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = rp->sb;
	struct buffer_head *rootbuf;

	rootbuf = vol_getblk(sb, root);
	if (!rootbuf)
		return -ENOMEM;
	bnode_buffer_init(rootbuf);

	bnode_init_root(bufdata(rootbuf), count, left, right, rkey);

	mark_buffer_unify_atomic(rootbuf);
	blockput(rootbuf);

	return 0;
}

/*
 * Before this replay, the replay should already have dirtied the
 * buffer of src (e.g. by redirect).
 */
int replay_bnode_split(struct replay *rp, block_t src, unsigned pos,
		       block_t dst)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}

	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}
	bnode_buffer_init(dstbuf);

	bnode_split(bufdata(srcbuf), pos, bufdata(dstbuf));

	mark_buffer_unify_non(srcbuf);
	mark_buffer_unify_atomic(dstbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

/*
 * Before this replay, the replay should already have dirtied the
 * buffer of bnodeblock (e.g. by redirect).
 */
static int replay_bnode_change(struct sb *sb, block_t bnodeblock,
			       u64 val1, u64 val2,
			       void (*change)(struct bnode *, u64, u64))
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct buffer_head *bnodebuf;

	bnodebuf = vol_getblk(sb, bnodeblock);
	if (!bnodebuf)
		return -ENOMEM;	/* FIXME: error code */

	struct bnode *bnode = bufdata(bnodebuf);
	change(bnode, val1, val2);

	mark_buffer_unify_non(bnodebuf);
	blockput(bnodebuf);

	return 0;
}

static void add_func(struct bnode *bnode, u64 child, u64 key)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct index_entry *entry = bnode_lookup(bnode, key) + 1;
	bnode_add_index(bnode, entry, child, key);
}

int replay_bnode_add(struct replay *rp, block_t parent, block_t child,
		     tuxkey_t key)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	return replay_bnode_change(rp->sb, parent, child, key, add_func);
}

static void update_func(struct bnode *bnode, u64 child, u64 key)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	entry->block = cpu_to_be64(child);
}

int replay_bnode_update(struct replay *rp, block_t parent, block_t child,
			tuxkey_t key)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	return replay_bnode_change(rp->sb, parent, child, key, update_func);
}

int replay_bnode_merge(struct replay *rp, block_t src, block_t dst)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0, ret;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}

	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}

	ret = bnode_merge_nodes(sb, bufdata(dstbuf), bufdata(srcbuf));
	assert(ret == 1);

	mark_buffer_unify_non(dstbuf);
	mark_buffer_unify_non(srcbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

static void del_func(struct bnode *bnode, u64 key, u64 count)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	bnode_remove_index(bnode, entry, count);
}

int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key,
		     unsigned count)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	return replay_bnode_change(rp->sb, bnode, key, count, del_func);
}

static void adjust_func(struct bnode *bnode, u64 from, u64 to)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	struct index_entry *entry = bnode_lookup(bnode, from);
	assert(be64_to_cpu(entry->key) == from);
	entry->key = cpu_to_be64(to);
}

int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from,
			tuxkey_t to)
{
	if(DEBUG_MODE_K==1)
	{
		printf("\t\t\t\t%25s[K]  %25s  %4d  #in\n",__FILE__,__func__,__LINE__);
	}
	return replay_bnode_change(rp->sb, bnode, from, to, adjust_func);
}
Example 6
/*
 * DATA_BTREE_BIT is not set in the normal state. We set it only when
 * flushing the inode, so this is called when the inode is flushed.
 */
static void tux3_iattr_adjust_for_btree(struct inode *inode,
					struct tux3_iattr_data *idata)
{
	if (has_root(&tux_inode(inode)->btree))
		idata->present |= DATA_BTREE_BIT;
}
Example 7
/*
Major reconstruction of memory management for -off_cache flag
*/
void IMB_init_buffers_iter(struct comm_info* c_info, struct iter_schedule* ITERATIONS,
                           struct Bench* Bmark, MODES BMODE, int iter, int size)
/*


                      Initializes communications buffers (call set_buf)
                      Initializes iterations scheduling


Input variables:


-Bmark                (type struct Bench*)
                      (For explanation of struct Bench type:
                      describes all aspects of modes of a benchmark;
                      see [1] for more information)

                      Current benchmark

-BMODE                (type MODES)
                      aggregate / non aggregate

-iter                 (type int)
                      number of current iteration of message size loop

-size                 (type int)
                      Message size


In/out variables:

-c_info               (type struct comm_info*)
                      Collection of all base data for MPI;
                      see [1] for more information

                      Communications buffers are allocated and assigned values

-ITERATIONS           (type struct iter_schedule*)
                      Adaptive number of iterations, out of cache scheduling are
                      setup if requested



*/
/* >> IMB 3.1  */
{
    /* IMB 3.1 << */
    size_t s_len, r_len, s_alloc, r_alloc;
    int init_size, irep, i_s, i_r, x_sample;
    const int root_based = has_root(Bmark->name);


    x_sample = BMODE->AGGREGATE ? ITERATIONS->msgspersample : ITERATIONS->msgs_nonaggr;

    /* July 2002 fix V2.2.1: */
#if (defined EXT || defined MPIIO || RMA)
    if( Bmark->access==no ) x_sample=ITERATIONS->msgs_nonaggr;
#endif

    ITERATIONS->n_sample = (size > 0)
                           ? max(1, min(ITERATIONS->overall_vol / size, x_sample))
                           : x_sample;

    Bmark->sample_failure = 0;

    init_size = max(size, asize);

    if (c_info->rank < 0) {
        return;
    } else {

        if (ITERATIONS->iter_policy == imode_off) {
            ITERATIONS->n_sample = x_sample = ITERATIONS->msgspersample;
        } else if ((ITERATIONS->iter_policy == imode_multiple_np) || (ITERATIONS->iter_policy == imode_auto && root_based)) {
            /* n_sample for benchmarks with an uneven distribution of work
               must be greater than or equal to, and a multiple of, num_procs.
               The formula below is the negative leg of a hyperbola.
               It is shifted and scaled relative to the max message size
               and the initial n_sample, then rounded down to a multiple
               of num_procs.
            */
            double d_n_sample = ITERATIONS->msgspersample;
            int max_msg_size = 1<<c_info->max_msg_log;
            int tmp = (int)(d_n_sample*max_msg_size/(c_info->num_procs*init_size+max_msg_size)+0.5);
            ITERATIONS->n_sample = x_sample = max(tmp-tmp%c_info->num_procs, c_info->num_procs);
        } /* else as is */
    }
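
    /*
     * Illustrative example of the hyperbola above (numbers are hypothetical,
     * not IMB defaults): with msgspersample = 1000, max_msg_log = 22
     * (max_msg_size = 4194304), num_procs = 24 and init_size = 65536,
     * tmp = (int)(1000 * 4194304 / (24*65536 + 4194304) + 0.5) = 727, and
     * rounding down to a multiple of num_procs gives
     * n_sample = max(727 - 727 % 24, 24) = 720.
     * Small messages thus stay close to msgspersample iterations, while
     * large messages on many ranks get proportionally fewer.
     */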

    if (
#ifdef MPI1
        !strcmp(Bmark->name,"Alltoall") || !strcmp(Bmark->name,"Alltoallv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name, "Ialltoall")  || !strcmp(Bmark->name, "Ialltoall_pure")
        || !strcmp(Bmark->name, "Ialltoallv") || !strcmp(Bmark->name, "Ialltoallv_pure")
#else
        0
#endif // NBC // MPI1
    )
    {
        s_len = (size_t)c_info->num_procs * (size_t)init_size;
        r_len = (size_t)c_info->num_procs * (size_t)init_size;
    }
    else if (
#ifdef MPI1
        !strcmp(Bmark->name, "Allgather")   || !strcmp(Bmark->name, "Allgatherv")
        || !strcmp(Bmark->name, "Gather")      || !strcmp(Bmark->name, "Gatherv")
#elif defined NBC
        !strcmp(Bmark->name, "Iallgather")  || !strcmp(Bmark->name, "Iallgather_pure")
        || !strcmp(Bmark->name, "Iallgatherv") || !strcmp(Bmark->name, "Iallgatherv_pure")
        || !strcmp(Bmark->name, "Igather")     || !strcmp(Bmark->name, "Igather_pure")
        || !strcmp(Bmark->name, "Igatherv")    || !strcmp(Bmark->name, "Igatherv_pure")
#else // MPI1 // NBC
        0
#endif // MPI1 // NBC
    )
    {
        s_len = (size_t) init_size;
        r_len = (size_t) c_info->num_procs * (size_t)init_size;
    }
    else if( !strcmp(Bmark->name,"Exchange") )
    {
        s_len = 2 * (size_t)init_size;
        r_len = (size_t) init_size;
    }
    else if(
#ifdef MPI1
        !strcmp(Bmark->name,"Scatter") || !strcmp(Bmark->name,"Scatterv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name,"Iscatter")  || !strcmp(Bmark->name,"Iscatter_pure")
        || !strcmp(Bmark->name,"Iscatterv") || !strcmp(Bmark->name,"Iscatterv_pure")
#else // NBC // MPI1
        0
#endif // NBC // MPI1
    )
    {
        s_len = (size_t)c_info->num_procs * (size_t)init_size;
        r_len = (size_t)init_size;
    } else if( !strcmp(Bmark->name,"Barrier") || /*!strcmp(Bmark->name,"Window") ||*/ !strcmp(Bmark->name,"Open_Close") ) {
        s_len = r_len = 0;
    }
    else if ( ! strcmp(Bmark->name,"Exchange_put") || ! strcmp(Bmark->name,"Exchange_get") )
    {
        s_len = 2 * (size_t)init_size;
        r_len = 2 * (size_t)init_size;
    }
    else if (! strcmp(Bmark->name,"Compare_and_swap") )
    {
        /* Compare_and_swap operations require 3 buffers, so allocate space for compare
         * buffers in our r_buffer */
        s_len = (size_t)init_size;
        r_len = 3 * (size_t)init_size;
    }
    else
    {
        s_len = r_len = (size_t) init_size;
    }

    /*===============================================*/
    /* The displacement is declared as int by the MPI-1 standard.
       If c_info->num_procs*init_size exceeds INT_MAX, there is no way to run
       this sample.
     */
    if (
#ifdef MPI1
        !strcmp(Bmark->name,"Alltoallv")  ||
        !strcmp(Bmark->name,"Allgatherv") ||
        !strcmp(Bmark->name,"Scatterv")   ||
        !strcmp(Bmark->name,"Gatherv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name,"Ialltoallv")  || !strcmp(Bmark->name,"Ialltoallv_pure")  ||
        !strcmp(Bmark->name,"Iallgatherv") || !strcmp(Bmark->name,"Iallgatherv_pure") ||
        !strcmp(Bmark->name,"Iscatterv")   || !strcmp(Bmark->name,"Iscatterv_pure")   ||
        !strcmp(Bmark->name,"Igatherv")    || !strcmp(Bmark->name,"Igatherv_pure")
#else // NBC // MPI1
        0
#endif // NBC // MPI1
    )
    {
        if( s_len > INT_MAX || r_len > INT_MAX) {
            Bmark->sample_failure = SAMPLE_FAILED_INT_OVERFLOW;
            return;
        }
    }
    /*===============================================*/
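
    /*
     * Illustrative example (hypothetical numbers): with num_procs = 1024 and
     * init_size = 4 MiB, the *v collectives above get
     * r_len = 1024 * 4194304 = 4 GiB, which cannot be expressed in the int
     * displacements MPI-1 mandates, so the sample is marked
     * SAMPLE_FAILED_INT_OVERFLOW instead of overflowing at runtime.
     */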

    /* IMB 3.1: new memory management for -off_cache */
    if (BMODE->type == Sync) {
        ITERATIONS->use_off_cache=0;
        ITERATIONS->n_sample=x_sample;
    } else {
#ifdef MPIIO
        ITERATIONS->use_off_cache=0;
#else
        ITERATIONS->use_off_cache = ITERATIONS->off_cache;
#endif
        if (ITERATIONS->off_cache) {
            if ( ITERATIONS->cache_size > 0) {
                size_t cls = (size_t) ITERATIONS->cache_line_size;
                size_t ofs = ( (s_len + cls - 1) / cls + 1 ) * cls;
                ITERATIONS->s_offs = ofs;
                ITERATIONS->s_cache_iter = min(ITERATIONS->n_sample,(2*ITERATIONS->cache_size*CACHE_UNIT+ofs-1)/ofs);
                ofs = ( ( r_len + cls -1 )/cls + 1 )*cls;
                ITERATIONS->r_offs = ofs;
                ITERATIONS->r_cache_iter = min(ITERATIONS->n_sample,(2*ITERATIONS->cache_size*CACHE_UNIT+ofs-1)/ofs);
            } else {
                ITERATIONS->s_offs=ITERATIONS->r_offs=0;
                ITERATIONS->s_cache_iter=ITERATIONS->r_cache_iter=1;
            }
        }
    }

#ifdef MPIIO
    s_alloc = s_len;
    r_alloc = r_len;
#else
    if( ITERATIONS->use_off_cache ) {
        s_alloc = max(s_len,ITERATIONS->s_cache_iter*ITERATIONS->s_offs);
        r_alloc = max(r_len,ITERATIONS->r_cache_iter*ITERATIONS->r_offs);
    } else {
        s_alloc = s_len;
        r_alloc = r_len;
    }
#endif

    c_info->used_mem = 1.f*(s_alloc+r_alloc)/MEM_UNIT;

#ifdef DEBUG
    {
        size_t mx, mu;

        mx = (size_t) MEM_UNIT*c_info->max_mem;
        mu = (size_t) MEM_UNIT*c_info->used_mem;

        DBG_I3("Got send / recv lengths; iters ",s_len,r_len,ITERATIONS->n_sample);
        DBG_I2("max  / used memory ",mx,mu);
        DBG_I2("send / recv offsets ",ITERATIONS->s_offs, ITERATIONS->r_offs);
        DBG_I2("send / recv cache iterations ",ITERATIONS->s_cache_iter, ITERATIONS->r_cache_iter);
        DBG_I2("send / recv buffer allocations ",s_alloc, r_alloc);
        DBGF_I2("Got send / recv lengths ",s_len,r_len);
        DBGF_I2("max  / used memory ",mx,mu);
        DBGF_I2("send / recv offsets ",ITERATIONS->s_offs, ITERATIONS->r_offs);
        DBGF_I2("send / recv cache iterations ",ITERATIONS->s_cache_iter, ITERATIONS->r_cache_iter);
        DBGF_I2("send / recv buffer allocations ",s_alloc, r_alloc);
    }
#endif

    if( c_info->used_mem > c_info->max_mem ) {
        Bmark->sample_failure=SAMPLE_FAILED_MEMORY;
        return;
    }

    if (s_alloc > 0  && r_alloc > 0) {
        if (ITERATIONS->use_off_cache) {
            IMB_alloc_buf(c_info, "IMB_init_buffers_iter 1", s_alloc, r_alloc);
            IMB_set_buf(c_info, c_info->rank, 0, s_len-1, 0, r_len-1);

            for (irep = 1; irep < ITERATIONS->s_cache_iter; irep++) {
                i_s = irep % ITERATIONS->s_cache_iter;
                memcpy((void*)((char*)c_info->s_buffer + i_s * ITERATIONS->s_offs), c_info->s_buffer, s_len);
            }

            for (irep = 1; irep < ITERATIONS->r_cache_iter; irep++) {
                i_r = irep % ITERATIONS->r_cache_iter;
                memcpy((void*)((char*)c_info->r_buffer + i_r * ITERATIONS->r_offs), c_info->r_buffer, r_len);
            }
        } else {
            IMB_set_buf(c_info, c_info->rank, 0, s_alloc-1, 0, r_alloc-1);
        }
    }

    IMB_init_transfer(c_info, Bmark, size, (MPI_Aint) max(s_alloc, r_alloc));

    /* Determine #iterations if dynamic adaptation requested */
    if ((ITERATIONS->iter_policy == imode_dynamic) || (ITERATIONS->iter_policy == imode_auto && !root_based)) {
        double time[MAX_TIME_ID];
        int acc_rep_test, t_sample;
        int selected_n_sample = ITERATIONS->n_sample;

        memset(time, 0, sizeof(time));  /* zero the whole double array, not just MAX_TIME_ID bytes */
        if (iter == 0 || BMODE->type == Sync) {
            ITERATIONS->n_sample_prev = ITERATIONS->msgspersample;
            if (c_info->n_lens > 0) {
                memset(ITERATIONS->numiters, 0,
                       c_info->n_lens * sizeof(*ITERATIONS->numiters));
            }
        }

        /* first, run 1 iteration only */
        ITERATIONS->n_sample=1;
#ifdef MPI1
        c_info->select_source = Bmark->select_source;
#endif
        Bmark->Benchmark(c_info,size,ITERATIONS,BMODE,&time[0]);

        time[1] = time[0];

#ifdef MPIIO
        if( Bmark->access != no) {
            ierr = MPI_File_seek(c_info->fh, 0 ,MPI_SEEK_SET);
            MPI_ERRHAND(ierr);

            if( Bmark->fpointer == shared) {
                ierr = MPI_File_seek_shared(c_info->fh, 0 ,MPI_SEEK_SET);
                MPI_ERRHAND(ierr);
            }
        }
#endif /*MPIIO*/

        MPI_Allreduce(&time[1], &time[0], 1, MPI_DOUBLE, MPI_MAX, c_info->communicator);

        {   /* determine rough #repetitions for a run time of 1 sec */
            int rep_test = 1;
            if (time[0] < (1.0 / MSGSPERSAMPLE)) {
                rep_test = MSGSPERSAMPLE;
            } else if ((time[0] < 1.0)) {
                rep_test = (int)(1.0 / time[0] + 0.5);
            }

            MPI_Allreduce(&rep_test, &acc_rep_test, 1, MPI_INT, MPI_MAX, c_info->communicator);
        }

        ITERATIONS->n_sample = min(selected_n_sample, acc_rep_test);

        if (ITERATIONS->n_sample > 1) {
#ifdef MPI1
            c_info->select_source = Bmark->select_source;
#endif
            Bmark->Benchmark(c_info,size,ITERATIONS,BMODE,&time[0]);
            time[1] = time[0];
#ifdef MPIIO
            if( Bmark->access != no) {
                ierr = MPI_File_seek(c_info->fh, 0 ,MPI_SEEK_SET);
                MPI_ERRHAND(ierr);

                if ( Bmark->fpointer == shared) {
                    ierr = MPI_File_seek_shared(c_info->fh, 0 ,MPI_SEEK_SET);
                    MPI_ERRHAND(ierr);
                }
            }
#endif /*MPIIO*/

            MPI_Allreduce(&time[1], &time[0], 1, MPI_DOUBLE, MPI_MAX, c_info->communicator);
        }

        {
            float val = (float) (1+ITERATIONS->secs/time[0]);
            t_sample = (time[0] > 1.e-8 && (val <= (float) 0x7fffffff))
                       ? (int)val
                       : selected_n_sample;
        }

        if (c_info->n_lens>0 && BMODE->type != Sync) {
            // check monotonicity with msg sizes
            int i;
            for (i = 0; i < iter; i++) {
                t_sample = ( c_info->msglen[i] < size )
                           ? min(t_sample,ITERATIONS->numiters[i])
                           : max(t_sample,ITERATIONS->numiters[i]);
            }
            ITERATIONS->n_sample = ITERATIONS->numiters[iter] = min(selected_n_sample, t_sample);
        } else {
            ITERATIONS->n_sample = min(selected_n_sample,
                                       min(ITERATIONS->n_sample_prev, t_sample));
        }

        MPI_Bcast(&ITERATIONS->n_sample, 1, MPI_INT, 0, c_info->communicator);

#ifdef DEBUG
        {
            int usec = (int)(time[0] * 1000000);

            DBGF_I2("Checked time with #iters / usec ",acc_rep_test,usec);
            DBGF_I1("=> # samples, aligned with previous ",t_sample);
            DBGF_I1("final #samples ",ITERATIONS->n_sample);
        }
#endif
    } else { /*if( (ITERATIONS->iter_policy == imode_dynamic) || (ITERATIONS->iter_policy == imode_auto && !root_based) )*/
        double time[MAX_TIME_ID];
        Bmark->Benchmark(c_info,size,ITERATIONS,BMODE,&time[0]);
    }

    ITERATIONS->n_sample_prev=ITERATIONS->n_sample;

    /* >> IMB 3.1  */

}
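
The -off_cache branch above rounds each buffer stride up to a cache-line multiple plus one extra line, then computes how many disjoint copies are needed to cover twice the declared cache size (capped at n_sample). The following self-contained sketch reproduces that arithmetic with hypothetical numbers; the 16 MiB cache, 64-byte line and 70000-byte message are assumptions for illustration, not IMB defaults.

/* Sketch (not IMB code) of the -off_cache stride computation. */
#include <stdio.h>

int main(void)
{
	size_t s_len = 70000;			/* send length for this message size */
	size_t cls = 64;			/* cache line size */
	size_t cache_bytes = 2u * 16 * 1024 * 1024; /* 2 x assumed 16 MiB cache */
	size_t n_sample = 1000;

	/* Round the length up to a cache-line multiple, then add one more line,
	 * so successive copies never share a cache line. */
	size_t ofs = ((s_len + cls - 1) / cls + 1) * cls;

	/* Number of disjoint copies needed to sweep twice the cache size,
	 * capped at the number of iterations. */
	size_t iters = (cache_bytes + ofs - 1) / ofs;
	if (iters > n_sample)
		iters = n_sample;

	printf("stride %zu bytes, %zu cache iterations, %zu bytes allocated\n",
	       ofs, iters, iters * ofs);
	return 0;
}

With these numbers the stride comes out to 70080 bytes and 479 cache iterations, i.e. roughly 32 MiB of send buffer, which matches how s_alloc is grown to s_cache_iter * s_offs in the real code.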