void show_tree_range(struct btree *btree, tuxkey_t start, unsigned count)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	__tux3_dbg("%i level btree at %Li:\n",
		   btree->root.depth, btree->root.block);
	if (!has_root(btree))
		return;

	struct cursor *cursor = alloc_cursor(btree, 0);
	if (!cursor) {
		tux3_err(btree->sb, "out of memory");
		return;
	}

	if (btree_probe(cursor, start)) {
		tux3_fs_error(btree->sb, "tell me why!!!");
		goto out;
	}

	struct buffer_head *buffer;
	do {
		buffer = cursor_leafbuf(cursor);
		assert((btree->ops->leaf_sniff)(btree, bufdata(buffer)));
		(btree->ops->leaf_dump)(btree, bufdata(buffer));
	} while (--count && cursor_advance(cursor));
out:
	free_cursor(cursor);
}
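/*
 * Usage sketch (hypothetical, not from the original source): dump the
 * first few leaves of an inode's data btree starting at key 0:
 *
 *	show_tree_range(&tux_inode(inode)->btree, 0, 3);
 */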
/*
 * Cursor read root node.
 * < 0 - error
 * 0 - success
 */
static int cursor_read_root(struct cursor *cursor)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct btree *btree = cursor->btree;
	struct buffer_head *buffer;

	assert(has_root(btree));
	buffer = vol_bread(btree->sb, btree->root.block);
	if (!buffer)
		return -EIO;	/* FIXME: stupid, it might have been NOMEM */

	assert(bnode_sniff(bufdata(buffer)));
	cursor_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries);

	return 0;
}
/*
 * This is range deletion. So, instead of adjusting the balance of
 * space on sibling nodes for each change, this just removes the range
 * and merges from right to left, even when nodes do not share the
 * same parent.
 *
 *              +--------------- (A, B, C)--------------------+
 *              |                    |                        |
 *     +-- (AA, AB, AC) -+    +- (BA, BB, BC) -+      + (CA, CB, CC) +
 *     |        |        |    |       |        |      |       |      |
 * (AAA,AAB)(ABA,ABB)(ACA,ACB) (BAA,BAB)(BBA)(BCA,BCB) (CAA)(CBA,CBB)(CCA)
 *
 * [less : A, AA, AAA, AAB, AB, ABA, ABB, AC, ACA, ACB, B, BA ... : greater]
 *
 * If we merged from a cousin (or re-distributed), we may have to
 * update the index up to the common parent. (E.g. if we removed
 * (ACB), then merged (BAA,BAB) into (ACA), we have to adjust B in the
 * root node to BB.)
 *
 * See adjust_parent_sep().
 *
 * FIXME: no re-distribution, so we don't guarantee more than 50%
 * space efficiency. And if the range ends at the maximum key (the
 * truncate() case), we don't need to merge or adjust_parent_sep().
 *
 * FIXME2: we may want to split the chop work into steps, instead of
 * blocking for a long time.
 */
int btree_chop(struct btree *btree, tuxkey_t start, u64 len)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = btree->sb;
	struct btree_ops *ops = btree->ops;
	struct buffer_head **prev, *leafprev = NULL;
	struct chopped_index_info *cii;
	struct cursor *cursor;
	tuxkey_t limit;
	int ret, done = 0;

	if (!has_root(btree))
		return 0;

	/* Chop all range if len >= TUXKEY_LIMIT */
	limit = (len >= TUXKEY_LIMIT) ? TUXKEY_LIMIT : start + len;

	prev = malloc(sizeof(*prev) * btree->root.depth);
	if (prev == NULL)
		return -ENOMEM;
	memset(prev, 0, sizeof(*prev) * btree->root.depth);

	cii = malloc(sizeof(*cii) * btree->root.depth);
	if (cii == NULL) {
		ret = -ENOMEM;
		goto error_cii;
	}
	memset(cii, 0, sizeof(*cii) * btree->root.depth);

	cursor = alloc_cursor(btree, 0);
	if (!cursor) {
		ret = -ENOMEM;
		goto error_alloc_cursor;
	}

	down_write(&btree->lock);
	ret = btree_probe(cursor, start);
	if (ret)
		goto error_btree_probe;

	/* Walk leaves */
	while (1) {
		struct buffer_head *leafbuf;
		tuxkey_t this_key;

		/*
		 * FIXME: If leaf was merged and freed later, we don't
		 * need to redirect leaf and leaf_chop()
		 */
		if ((ret = cursor_redirect(cursor)))
			goto out;
		leafbuf = cursor_pop(cursor);

		/* Adjust start and len for this leaf */
		this_key = cursor_level_this_key(cursor);
		if (start < this_key) {
			if (limit < TUXKEY_LIMIT)
				len -= this_key - start;
			start = this_key;
		}

		ret = ops->leaf_chop(btree, start, len, bufdata(leafbuf));
		if (ret) {
			if (ret < 0) {
				blockput(leafbuf);
				goto out;
			}
			mark_buffer_dirty_non(leafbuf);
		}

		/* Try to merge this leaf with prev */
		if (leafprev) {
			if (try_leaf_merge(btree, leafprev, leafbuf)) {
				trace(">>> can merge leaf %p into leaf %p",
				      leafbuf, leafprev);
				remove_index(cursor, cii);
				mark_buffer_dirty_non(leafprev);
				blockput_free(sb, leafbuf);
				goto keep_prev_leaf;
			}
			blockput(leafprev);
		}
		leafprev = leafbuf;

keep_prev_leaf:
		if (cursor_level_next_key(cursor) >= limit)
			done = 1;
		/* Pop and try to merge finished nodes */
		while (done || cursor_level_finished(cursor)) {
			struct buffer_head *buf;
			int level = cursor->level;
			struct chopped_index_info *ciil = &cii[level];

			/* Get merge src buffer, and go parent level */
			buf = cursor_pop(cursor);

			/*
			 * Logging chopped indexes
			 * FIXME: If node is freed later (e.g. merged),
			 * we don't need to log this
			 */
			if (ciil->count) {
				log_bnode_del(sb, bufindex(buf), ciil->start,
					      ciil->count);
			}
			memset(ciil, 0, sizeof(*ciil));

			/* Try to merge node with prev */
			if (prev[level]) {
				assert(level);
				if (try_bnode_merge(sb, prev[level], buf)) {
					trace(">>> can merge node %p into node %p",
					      buf, prev[level]);
					remove_index(cursor, cii);
					mark_buffer_unify_non(prev[level]);
					blockput_free_unify(sb, buf);
					goto keep_prev_node;
				}
				blockput(prev[level]);
			}
			prev[level] = buf;
keep_prev_node:

			if (!level)
				goto chop_root;
		}

		/* Push back down to leaf level */
		do {
			ret = cursor_advance_down(cursor);
			if (ret < 0)
				goto out;
		} while (ret);
	}

chop_root:
	/* Remove depth if possible */
	while (btree->root.depth > 1 && bcount(bufdata(prev[0])) == 1) {
		trace("drop btree level");
		btree->root.block = bufindex(prev[1]);
		btree->root.depth--;
		tux3_mark_btree_dirty(btree);

		/*
		 * We know prev[0] is redirected and dirty. So, in
		 * here, we can just cancel bnode_redirect by bfree(),
		 * instead of defered_bfree()
		 * FIXME: we can optimize freeing bnode without
		 * bnode_redirect, and if we did, this is not true.
		 */
		bfree(sb, bufindex(prev[0]), 1);
		log_bnode_free(sb, bufindex(prev[0]));
		blockput_free_unify(sb, prev[0]);

		vecmove(prev, prev + 1, btree->root.depth);
	}
	ret = 0;

out:
	if (leafprev)
		blockput(leafprev);
	for (int i = 0; i < btree->root.depth; i++) {
		if (prev[i])
			blockput(prev[i]);
	}
	release_cursor(cursor);
error_btree_probe:
	up_write(&btree->lock);

	free_cursor(cursor);
error_alloc_cursor:
	free(cii);
error_cii:
	free(prev);

	return ret;
}
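/*
 * Usage sketch (hypothetical caller, not from this file): since
 * btree_chop() clamps the range to TUXKEY_LIMIT, a truncate()-style
 * caller can chop everything from `start` to the end of the keyspace
 * with a single call; locking and probing happen inside:
 *
 *	int err = btree_chop(&tux_inode(inode)->btree, start, TUXKEY_LIMIT);
 *	if (err)
 *		return err;
 */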
int alloc_empty_btree(struct btree *btree)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = new_node(btree);
	if (IS_ERR(rootbuf))
		goto error;
	struct buffer_head *leafbuf = new_leaf(btree);
	if (IS_ERR(leafbuf))
		goto error_leafbuf;

	assert(!has_root(btree));
	struct bnode *rootnode = bufdata(rootbuf);
	block_t rootblock = bufindex(rootbuf);
	block_t leafblock = bufindex(leafbuf);
	trace("root at %Lx", rootblock);
	trace("leaf at %Lx", leafblock);
	bnode_init_root(rootnode, 1, leafblock, 0, 0);
	log_bnode_root(sb, rootblock, 1, leafblock, 0, 0);
	log_balloc(sb, leafblock, 1);

	mark_buffer_unify_non(rootbuf);
	blockput(rootbuf);
	mark_buffer_dirty_non(leafbuf);
	blockput(leafbuf);

	btree->root = (struct root){ .block = rootblock, .depth = 1 };
	tux3_mark_btree_dirty(btree);

	return 0;

error_leafbuf:
	(btree->ops->bfree)(sb, bufindex(rootbuf), 1);
	blockput(rootbuf);
	rootbuf = leafbuf;
error:
	return PTR_ERR(rootbuf);
}

/* FIXME: right? and this should be done by btree_chop()? */
int free_empty_btree(struct btree *btree)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct btree_ops *ops = btree->ops;

	if (!has_root(btree))
		return 0;

	assert(btree->root.depth == 1);
	struct sb *sb = btree->sb;
	struct buffer_head *rootbuf = vol_bread(sb, btree->root.block);
	if (!rootbuf)
		return -EIO;
	assert(bnode_sniff(bufdata(rootbuf)));

	/* Make btree have no root */
	btree->root = no_root;
	tux3_mark_btree_dirty(btree);

	struct bnode *rootnode = bufdata(rootbuf);
	assert(bcount(rootnode) == 1);
	block_t leaf = be64_to_cpu(rootnode->entries[0].block);
	struct buffer_head *leafbuf = vol_find_get_block(sb, leaf);

	if (leafbuf && !leaf_need_redirect(sb, leafbuf)) {
		/*
		 * This is a redirected leaf. So, in here, we can just
		 * cancel leaf_redirect by bfree(), instead of
		 * defered_bfree().
		 */
		bfree(sb, leaf, 1);
		log_leaf_free(sb, leaf);
		assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
		blockput_free(sb, leafbuf);
	} else {
		defer_bfree(&sb->defree, leaf, 1);
		log_bfree(sb, leaf, 1);
		if (leafbuf) {
			assert(ops->leaf_can_free(btree, bufdata(leafbuf)));
			blockput(leafbuf);
		}
	}

	if (!bnode_need_redirect(sb, rootbuf)) {
		/*
		 * This is a redirected bnode. So, in here, we can just
		 * cancel bnode_redirect by bfree(), instead of
		 * defered_bfree().
		 */
		bfree(sb, bufindex(rootbuf), 1);
		log_bnode_free(sb, bufindex(rootbuf));
		blockput_free_unify(sb, rootbuf);
	} else {
		defer_bfree(&sb->deunify, bufindex(rootbuf), 1);
		log_bfree_on_unify(sb, bufindex(rootbuf), 1);
		blockput(rootbuf);
	}

	return 0;
}

int replay_bnode_redirect(struct replay *rp, block_t oldblock, block_t newblock)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *newbuf, *oldbuf;
	int err = 0;

	newbuf = vol_getblk(sb, newblock);
	if (!newbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	oldbuf = vol_bread(sb, oldblock);
	if (!oldbuf) {
		err = -EIO;	/* FIXME: error code */
		goto error_put_newbuf;
	}
	assert(bnode_sniff(bufdata(oldbuf)));

	memcpy(bufdata(newbuf), bufdata(oldbuf), bufsize(newbuf));
	mark_buffer_unify_atomic(newbuf);
	blockput(oldbuf);

error_put_newbuf:
	blockput(newbuf);
error:
	return err;
}

int replay_bnode_root(struct replay *rp, block_t root, unsigned count,
		      block_t left, block_t right, tuxkey_t rkey)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *rootbuf;

	rootbuf = vol_getblk(sb, root);
	if (!rootbuf)
		return -ENOMEM;

	bnode_buffer_init(rootbuf);
	bnode_init_root(bufdata(rootbuf), count, left, right, rkey);
	mark_buffer_unify_atomic(rootbuf);
	blockput(rootbuf);

	return 0;
}

/*
 * Before this replay, replay should already have dirtied the buffer
 * of src (e.g. by redirect).
 */
int replay_bnode_split(struct replay *rp, block_t src, unsigned pos,
		       block_t dst)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}
	bnode_buffer_init(dstbuf);

	bnode_split(bufdata(srcbuf), pos, bufdata(dstbuf));
	mark_buffer_unify_non(srcbuf);
	mark_buffer_unify_atomic(dstbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

/*
 * Before this replay, replay should already have dirtied the buffer
 * of bnodeblock (e.g. by redirect).
 */
static int replay_bnode_change(struct sb *sb, block_t bnodeblock,
			       u64 val1, u64 val2,
			       void (*change)(struct bnode *, u64, u64))
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct buffer_head *bnodebuf;

	bnodebuf = vol_getblk(sb, bnodeblock);
	if (!bnodebuf)
		return -ENOMEM;	/* FIXME: error code */

	struct bnode *bnode = bufdata(bnodebuf);
	change(bnode, val1, val2);

	mark_buffer_unify_non(bnodebuf);
	blockput(bnodebuf);

	return 0;
}

static void add_func(struct bnode *bnode, u64 child, u64 key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key) + 1;
	bnode_add_index(bnode, entry, child, key);
}

int replay_bnode_add(struct replay *rp, block_t parent, block_t child,
		     tuxkey_t key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, parent, child, key, add_func);
}

static void update_func(struct bnode *bnode, u64 child, u64 key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	entry->block = cpu_to_be64(child);
}

int replay_bnode_update(struct replay *rp, block_t parent, block_t child,
			tuxkey_t key)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, parent, child, key, update_func);
}

int replay_bnode_merge(struct replay *rp, block_t src, block_t dst)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct sb *sb = rp->sb;
	struct buffer_head *srcbuf, *dstbuf;
	int err = 0, ret;

	srcbuf = vol_getblk(sb, src);
	if (!srcbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error;
	}
	dstbuf = vol_getblk(sb, dst);
	if (!dstbuf) {
		err = -ENOMEM;	/* FIXME: error code */
		goto error_put_srcbuf;
	}

	ret = bnode_merge_nodes(sb, bufdata(dstbuf), bufdata(srcbuf));
	assert(ret == 1);
	mark_buffer_unify_non(dstbuf);
	mark_buffer_unify_non(srcbuf);

	blockput(dstbuf);
error_put_srcbuf:
	blockput(srcbuf);
error:
	return err;
}

static void del_func(struct bnode *bnode, u64 key, u64 count)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, key);
	assert(be64_to_cpu(entry->key) == key);
	bnode_remove_index(bnode, entry, count);
}

int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key,
		     unsigned count)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, bnode, key, count, del_func);
}

static void adjust_func(struct bnode *bnode, u64 from, u64 to)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	struct index_entry *entry = bnode_lookup(bnode, from);
	assert(be64_to_cpu(entry->key) == from);
	entry->key = cpu_to_be64(to);
}

int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from,
			tuxkey_t to)
{
	if (DEBUG_MODE_K == 1)
		printf("\t\t\t\t%25s[K] %25s %4d #in\n",
		       __FILE__, __func__, __LINE__);

	return replay_bnode_change(rp->sb, bnode, from, to, adjust_func);
}
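/*
 * Note (editorial summary, not from the original source): the replay
 * helpers above share one pattern. Each bnode log record type maps to
 * a small callback (add_func, update_func, del_func, adjust_func)
 * that is applied through replay_bnode_change(), which centralizes
 * buffer lookup, dirtying, and release. For example, the
 * log_bnode_del() records emitted by btree_chop() are replayed by
 * replay_bnode_del(), which delegates to del_func via
 * replay_bnode_change().
 */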
/*
 * DATA_BTREE_BIT is not set in the normal state. We set it only when
 * flushing the inode. So, this is called at inode flush time.
 */
static void tux3_iattr_adjust_for_btree(struct inode *inode,
					struct tux3_iattr_data *idata)
{
	if (has_root(&tux_inode(inode)->btree))
		idata->present |= DATA_BTREE_BIT;
}
/* Major reconstruction of memory management for -off_cache flag */
void IMB_init_buffers_iter(struct comm_info* c_info,
                           struct iter_schedule* ITERATIONS,
                           struct Bench* Bmark, MODES BMODE,
                           int iter, int size)
/*

                      Initializes communications buffers (call set_buf)
                      Initializes iterations scheduling

Input variables:

-Bmark                (type struct Bench*)
                      (For explanation of struct Bench type:
                      describes all aspects of modes of a benchmark;
                      see [1] for more information)
                      Current benchmark

-BMODE                (type MODES)
                      aggregate / non aggregate

-iter                 (type int)
                      number of current iteration of message size loop

-size                 (type int)
                      Message size

In/out variables:

-c_info               (type struct comm_info*)
                      Collection of all base data for MPI;
                      see [1] for more information
                      Communications buffers are allocated and assigned values

-ITERATIONS           (type struct iter_schedule*)
                      Adaptive number of iterations and out-of-cache
                      scheduling are set up if requested

*/
/* >> IMB 3.1 */
{
/* IMB 3.1 << */
    size_t s_len, r_len, s_alloc, r_alloc;
    int init_size, irep, i_s, i_r, x_sample;
    const int root_based = has_root(Bmark->name);
#ifdef MPIIO
    int ierr;   /* declaration added: the MPIIO branches below use it */
#endif

    x_sample = BMODE->AGGREGATE ? ITERATIONS->msgspersample
                                : ITERATIONS->msgs_nonaggr;

    /* July 2002 fix V2.2.1: */
#if (defined EXT || defined MPIIO || RMA)
    if (Bmark->access == no)
        x_sample = ITERATIONS->msgs_nonaggr;
#endif

    ITERATIONS->n_sample = (size > 0)
                           ? max(1, min(ITERATIONS->overall_vol / size, x_sample))
                           : x_sample;

    Bmark->sample_failure = 0;

    init_size = max(size, asize);

    if (c_info->rank < 0)
        return;

    if (ITERATIONS->iter_policy == imode_off) {
        ITERATIONS->n_sample = x_sample = ITERATIONS->msgspersample;
    } else if ((ITERATIONS->iter_policy == imode_multiple_np) ||
               (ITERATIONS->iter_policy == imode_auto && root_based)) {
        /* n_sample for benchmarks with an uneven distribution of work
           must be greater than or equal to, and a multiple of,
           num_procs. The formula below is the negative branch of a
           hyperbola, shifted and scaled relative to the max message
           size and the initial n_sample, then rounded down to a
           multiple of num_procs. */
        double d_n_sample = ITERATIONS->msgspersample;
        int max_msg_size = 1 << c_info->max_msg_log;
        int tmp = (int)(d_n_sample * max_msg_size
                        / (c_info->num_procs * init_size + max_msg_size) + 0.5);
        ITERATIONS->n_sample = x_sample =
            max(tmp - tmp % c_info->num_procs, c_info->num_procs);
    }
    /* else as is */

    if (
#ifdef MPI1
        !strcmp(Bmark->name, "Alltoall") ||
        !strcmp(Bmark->name, "Alltoallv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name, "Ialltoall") ||
        !strcmp(Bmark->name, "Ialltoall_pure") ||
        !strcmp(Bmark->name, "Ialltoallv") ||
        !strcmp(Bmark->name, "Ialltoallv_pure")
#else
        0
#endif // NBC // MPI1
       ) {
        s_len = (size_t)c_info->num_procs * (size_t)init_size;
        r_len = (size_t)c_info->num_procs * (size_t)init_size;
    } else if (
#ifdef MPI1
        !strcmp(Bmark->name, "Allgather") ||
        !strcmp(Bmark->name, "Allgatherv") ||
        !strcmp(Bmark->name, "Gather") ||
        !strcmp(Bmark->name, "Gatherv")
#elif defined NBC
        !strcmp(Bmark->name, "Iallgather") ||
        !strcmp(Bmark->name, "Iallgather_pure") ||
        !strcmp(Bmark->name, "Iallgatherv") ||
        !strcmp(Bmark->name, "Iallgatherv_pure") ||
        !strcmp(Bmark->name, "Igather") ||
        !strcmp(Bmark->name, "Igather_pure") ||
        !strcmp(Bmark->name, "Igatherv") ||
        !strcmp(Bmark->name, "Igatherv_pure")
#else // MPI1 // NBC
        0
#endif // MPI1 // NBC
       ) {
        s_len = (size_t)init_size;
        r_len = (size_t)c_info->num_procs * (size_t)init_size;
    } else if (!strcmp(Bmark->name, "Exchange")) {
        s_len = 2 * (size_t)init_size;
        r_len = (size_t)init_size;
    } else if (
#ifdef MPI1
        !strcmp(Bmark->name, "Scatter") ||
        !strcmp(Bmark->name, "Scatterv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name, "Iscatter") ||
        !strcmp(Bmark->name, "Iscatter_pure") ||
        !strcmp(Bmark->name, "Iscatterv") ||
        !strcmp(Bmark->name, "Iscatterv_pure")
#else // NBC // MPI1
        0
#endif // NBC // MPI1
       ) {
        s_len = (size_t)c_info->num_procs * (size_t)init_size;
        r_len = (size_t)init_size;
    } else if (!strcmp(Bmark->name, "Barrier") ||
               /*!strcmp(Bmark->name,"Window") ||*/
               !strcmp(Bmark->name, "Open_Close")) {
        s_len = r_len = 0;
    } else if (!strcmp(Bmark->name, "Exchange_put") ||
               !strcmp(Bmark->name, "Exchange_get")) {
        s_len = 2 * (size_t)init_size;
        r_len = 2 * (size_t)init_size;
    } else if (!strcmp(Bmark->name, "Compare_and_swap")) {
        /* Compare_and_swap operations require 3 buffers, so allocate
           space for the compare buffers in our r_buffer */
        s_len = (size_t)init_size;
        r_len = 3 * (size_t)init_size;
    } else {
        s_len = r_len = (size_t)init_size;
    }

    /*===============================================*/
    /* the displ is declared as int by the MPI-1 standard.
       If c_info->num_procs*init_size exceeds INT_MAX,
       there is no way to run this sample */
    if (
#ifdef MPI1
        !strcmp(Bmark->name, "Alltoallv") ||
        !strcmp(Bmark->name, "Allgatherv") ||
        !strcmp(Bmark->name, "Scatterv") ||
        !strcmp(Bmark->name, "Gatherv")
#elif defined NBC // MPI1
        !strcmp(Bmark->name, "Ialltoallv") ||
        !strcmp(Bmark->name, "Ialltoallv_pure") ||
        !strcmp(Bmark->name, "Iallgatherv") ||
        !strcmp(Bmark->name, "Iallgatherv_pure") ||
        !strcmp(Bmark->name, "Iscatterv") ||
        !strcmp(Bmark->name, "Iscatterv_pure") ||
        !strcmp(Bmark->name, "Igatherv") ||
        !strcmp(Bmark->name, "Igatherv_pure")
#else // NBC // MPI1
        0
#endif // NBC // MPI1
       ) {
        if (s_len > INT_MAX || r_len > INT_MAX) {
            Bmark->sample_failure = SAMPLE_FAILED_INT_OVERFLOW;
            return;
        }
    }
    /*===============================================*/

    /* IMB 3.1: new memory management for -off_cache */
    if (BMODE->type == Sync) {
        ITERATIONS->use_off_cache = 0;
        ITERATIONS->n_sample = x_sample;
    } else {
#ifdef MPIIO
        ITERATIONS->use_off_cache = 0;
#else
        ITERATIONS->use_off_cache = ITERATIONS->off_cache;
#endif
        if (ITERATIONS->off_cache) {
            if (ITERATIONS->cache_size > 0) {
                size_t cls = (size_t)ITERATIONS->cache_line_size;
                size_t ofs;

                ofs = ((s_len + cls - 1) / cls + 1) * cls;
                ITERATIONS->s_offs = ofs;
                ITERATIONS->s_cache_iter =
                    min(ITERATIONS->n_sample,
                        (2 * ITERATIONS->cache_size * CACHE_UNIT + ofs - 1) / ofs);
                ofs = ((r_len + cls - 1) / cls + 1) * cls;
                ITERATIONS->r_offs = ofs;
                ITERATIONS->r_cache_iter =
                    min(ITERATIONS->n_sample,
                        (2 * ITERATIONS->cache_size * CACHE_UNIT + ofs - 1) / ofs);
            } else {
                ITERATIONS->s_offs = ITERATIONS->r_offs = 0;
                ITERATIONS->s_cache_iter = ITERATIONS->r_cache_iter = 1;
            }
        }
    }

#ifdef MPIIO
    s_alloc = s_len;
    r_alloc = r_len;
#else
    if (ITERATIONS->use_off_cache) {
        s_alloc = max(s_len, ITERATIONS->s_cache_iter * ITERATIONS->s_offs);
        r_alloc = max(r_len, ITERATIONS->r_cache_iter * ITERATIONS->r_offs);
    } else {
        s_alloc = s_len;
        r_alloc = r_len;
    }
#endif

    c_info->used_mem = 1.f * (s_alloc + r_alloc) / MEM_UNIT;

#ifdef DEBUG
    {
        size_t mx, mu;

        mx = (size_t)MEM_UNIT * c_info->max_mem;
        mu = (size_t)MEM_UNIT * c_info->used_mem;

        DBG_I3("Got send / recv lengths; iters ", s_len, r_len, ITERATIONS->n_sample);
        DBG_I2("max / used memory ", mx, mu);
        DBG_I2("send / recv offsets ", ITERATIONS->s_offs, ITERATIONS->r_offs);
        DBG_I2("send / recv cache iterations ", ITERATIONS->s_cache_iter, ITERATIONS->r_cache_iter);
        DBG_I2("send / recv buffer allocations ", s_alloc, r_alloc);
        DBGF_I2("Got send / recv lengths ", s_len, r_len);
        DBGF_I2("max / used memory ", mx, mu);
        DBGF_I2("send / recv offsets ", ITERATIONS->s_offs, ITERATIONS->r_offs);
        DBGF_I2("send / recv cache iterations ", ITERATIONS->s_cache_iter, ITERATIONS->r_cache_iter);
        DBGF_I2("send / recv buffer allocations ", s_alloc, r_alloc);
    }
#endif

    if (c_info->used_mem > c_info->max_mem) {
        Bmark->sample_failure = SAMPLE_FAILED_MEMORY;
        return;
    }

    if (s_alloc > 0 && r_alloc > 0) {
        if (ITERATIONS->use_off_cache) {
            IMB_alloc_buf(c_info, "IMB_init_buffers_iter 1", s_alloc, r_alloc);
            IMB_set_buf(c_info, c_info->rank, 0, s_len - 1, 0, r_len - 1);

            for (irep = 1; irep < ITERATIONS->s_cache_iter; irep++) {
                i_s = irep % ITERATIONS->s_cache_iter;
                memcpy((void*)((char*)c_info->s_buffer + i_s * ITERATIONS->s_offs),
                       c_info->s_buffer, s_len);
            }
            for (irep = 1; irep < ITERATIONS->r_cache_iter; irep++) {
                i_r = irep % ITERATIONS->r_cache_iter;
                memcpy((void*)((char*)c_info->r_buffer + i_r * ITERATIONS->r_offs),
                       c_info->r_buffer, r_len);
            }
        } else {
            IMB_set_buf(c_info, c_info->rank, 0, s_alloc - 1, 0, r_alloc - 1);
        }
    }

    IMB_init_transfer(c_info, Bmark, size, (MPI_Aint)max(s_alloc, r_alloc));

    /* Determine #iterations if dynamic adaptation requested */
    if ((ITERATIONS->iter_policy == imode_dynamic) ||
        (ITERATIONS->iter_policy == imode_auto && !root_based)) {
        double time[MAX_TIME_ID];
        int acc_rep_test, t_sample;
        int selected_n_sample = ITERATIONS->n_sample;

        /* Fix: clear the whole array; the original memset only cleared
           MAX_TIME_ID bytes, not MAX_TIME_ID doubles */
        memset(time, 0, sizeof(time));

        if (iter == 0 || BMODE->type == Sync) {
            ITERATIONS->n_sample_prev = ITERATIONS->msgspersample;
            if (c_info->n_lens > 0) {
                /* Fix: clear n_lens elements, not n_lens bytes */
                memset(ITERATIONS->numiters, 0,
                       c_info->n_lens * sizeof(*ITERATIONS->numiters));
            }
        }

        /* first, run 1 iteration only */
        ITERATIONS->n_sample = 1;
#ifdef MPI1
        c_info->select_source = Bmark->select_source;
#endif
        Bmark->Benchmark(c_info, size, ITERATIONS, BMODE, &time[0]);

        time[1] = time[0];

#ifdef MPIIO
        if (Bmark->access != no) {
            ierr = MPI_File_seek(c_info->fh, 0, MPI_SEEK_SET);
            MPI_ERRHAND(ierr);

            if (Bmark->fpointer == shared) {
                ierr = MPI_File_seek_shared(c_info->fh, 0, MPI_SEEK_SET);
                MPI_ERRHAND(ierr);
            }
        }
#endif /*MPIIO*/

        MPI_Allreduce(&time[1], &time[0], 1, MPI_DOUBLE, MPI_MAX,
                      c_info->communicator);

        {
            /* determine rough #repetitions for a run time of 1 sec */
            int rep_test = 1;
            if (time[0] < (1.0 / MSGSPERSAMPLE)) {
                rep_test = MSGSPERSAMPLE;
            } else if (time[0] < 1.0) {
                rep_test = (int)(1.0 / time[0] + 0.5);
            }

            MPI_Allreduce(&rep_test, &acc_rep_test, 1, MPI_INT, MPI_MAX,
                          c_info->communicator);
        }

        ITERATIONS->n_sample = min(selected_n_sample, acc_rep_test);

        if (ITERATIONS->n_sample > 1) {
#ifdef MPI1
            c_info->select_source = Bmark->select_source;
#endif
            Bmark->Benchmark(c_info, size, ITERATIONS, BMODE, &time[0]);
            time[1] = time[0];
#ifdef MPIIO
            if (Bmark->access != no) {
                ierr = MPI_File_seek(c_info->fh, 0, MPI_SEEK_SET);
                MPI_ERRHAND(ierr);

                if (Bmark->fpointer == shared) {
                    ierr = MPI_File_seek_shared(c_info->fh, 0, MPI_SEEK_SET);
                    MPI_ERRHAND(ierr);
                }
            }
#endif /*MPIIO*/

            MPI_Allreduce(&time[1], &time[0], 1, MPI_DOUBLE, MPI_MAX,
                          c_info->communicator);
        }

        {
            float val = (float)(1 + ITERATIONS->secs / time[0]);
            t_sample = (time[0] > 1.e-8 && (val <= (float)0x7fffffff))
                       ? (int)val
                       : selected_n_sample;
        }

        if (c_info->n_lens > 0 && BMODE->type != Sync) {
            // check monotonicity with msg sizes
            int i;
            for (i = 0; i < iter; i++) {
                t_sample = (c_info->msglen[i] < size)
                           ? min(t_sample, ITERATIONS->numiters[i])
                           : max(t_sample, ITERATIONS->numiters[i]);
            }
            ITERATIONS->n_sample = ITERATIONS->numiters[iter] =
                min(selected_n_sample, t_sample);
        } else {
            ITERATIONS->n_sample = min(selected_n_sample,
                                       min(ITERATIONS->n_sample_prev, t_sample));
        }

        MPI_Bcast(&ITERATIONS->n_sample, 1, MPI_INT, 0, c_info->communicator);

#ifdef DEBUG
        {
            /* Fix: `time` is an array; the original `int usec=time*1000000;`
               would not compile */
            int usec = (int)(time[0] * 1000000);

            DBGF_I2("Checked time with #iters / usec ", acc_rep_test, usec);
            DBGF_I1("=> # samples, aligned with previous ", t_sample);
            DBGF_I1("final #samples ", ITERATIONS->n_sample);
        }
#endif
    } else { /* NOT ((iter_policy == imode_dynamic) ||
                     (iter_policy == imode_auto && !root_based)) */
        double time[MAX_TIME_ID];
        Bmark->Benchmark(c_info, size, ITERATIONS, BMODE, &time[0]);
    }

    ITERATIONS->n_sample_prev = ITERATIONS->n_sample;

/* >> IMB 3.1 */
}
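/*
 * Worked example (editorial, not from the original source) for the
 * off-cache offset rounding above, assuming cache_line_size = 64 and
 * s_len = 100 bytes:
 *
 *     ofs = ((100 + 64 - 1) / 64 + 1) * 64 = (2 + 1) * 64 = 192
 *
 * i.e. the length is rounded up to a whole number of cache lines plus
 * one extra line, so each replicated buffer copy starts on a fresh
 * cache line and successive off-cache iterations touch disjoint lines.
 */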