/********************************bss_malloc.c********************************** Function: bss_stats() Input : Output: Return: Description: *********************************bss_malloc.c*********************************/ void bss_stats(void) { int oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD}; int vals[sizeof(oprs)/sizeof(oprs[0])-1]; int work[sizeof(oprs)/sizeof(oprs[0])-1]; vals[0]=vals[1]=vals[2]=bss_req; giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs); vals[2]/=num_nodes; if (!my_id) { printf("%d :: bss_malloc stats:\n",my_id); printf("%d :: bss_req min = %d\n",my_id,vals[0]); printf("%d :: bss_req max = %d\n",my_id,vals[1]); printf("%d :: bss_req ave = %d\n",my_id,vals[2]); printf("%d :: bss_req me = %d\n",my_id,bss_req); } #ifdef DEBUG /* check to make sure that malloc and free calls are balanced */ if (num_bss_frees+num_bss_req) { printf("%d :: bss # frees = %d\n",my_id,-1*num_bss_frees); printf("%d :: bss # calls = %d\n",my_id,num_bss_req); } fflush(stdout); #endif }
/* set_pairwise - build the pairwise-exchange data structures for gs.
 *
 * Input:  gs with pw_nghs (bit mask of pairwise neighbors), ngh_buf,
 *         len_pw_list, and elms (pairwise elements flagged with TOP_BIT)
 *         already filled in by get_ngh_buf().
 * Output: gs->pw_elm_list, pair_list, msg_sizes, node_list, message-id
 *         arrays, and the pairwise send/receive buffers; also the global
 *         min/max/avg node-pair statistics (collective via giop()).
 *
 * Fix vs. original: the giop() computing gs->max_pairs ignored its
 * PetscErrorCode return; it is now checked like every other call.
 */
static PetscErrorCode set_pairwise(gs_id *gs)
{
  PetscInt       i, j;
  PetscInt       p_mask_size;
  PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt       *ngh_buf, *buf2;
  PetscInt       offset;
  PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt       *pairwise_elm_list, len_pair_list=0;
  PetscInt       *iptr, t1, i_start, nel, *elms;
  PetscInt       ct;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks (sizes here are in bytes) */
  p_mask_size   = len_bit_mask(num_nodes);
  p_mask        = (PetscInt*) malloc(p_mask_size);
  tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

  /* set mask to my my_id's bit mask */
  ierr = set_bit_mask(p_mask,p_mask_size,my_id);CHKERRQ(ierr);

  /* from here on p_mask_size counts ints, not bytes */
  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));

  /* allocate space for gs_gop() info */
  gs->pair_list = msg_list  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
  gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
  gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));

  /* init msg_size list */
  ierr = ivec_zero(msg_size,nprs);CHKERRQ(ierr);

  /* expand from bit mask list to int list */
  ierr = bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr);

  /* keep list of elements being handled pairwise; clear the TOP_BIT flag
     that get_ngh_buf() set on them.  List is -1 terminated. */
  for (i=j=0;i<nel;i++) {
    if (elms[i] & TOP_BIT) {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
  }
  pairwise_elm_list[j] = -1;

  /* one request slot per neighbor plus a MPI_REQUEST_NULL sentinel */
  gs->msg_ids_out       = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

  /* find who goes to each processor */
  for (i_start=i=0;i<nprs;i++) {
    /* processor i's mask */
    ierr = set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr);

    /* det # going to processor i (elements whose share-mask intersects it) */
    for (ct=j=0;j<len_pair_list;j++) {
      buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
      ierr = ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
      if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) {ct++;}
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start,ct);

    /* space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

    /* second pass records which pairwise-list indices go to processor i;
       list is -1 terminated */
    for (j=0;j<len_pair_list;j++) {
      buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
      ierr = ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
      if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) {*iptr++ = j;}
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  /* collective statistics over the per-rank max message length */
  j  = gs->loc_node_pairs = i_start;
  t1 = GL_MAX;
  ierr = giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  ierr = giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  ierr = giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
  gs->avg_node_pairs = i_start/num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  /* BUGFIX: this giop() return value was previously unchecked */
  ierr = giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = ivec_sum(gs->msg_sizes,nprs);
  gs->out       = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
  gs->in        = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

  /* reset malloc pool */
  free((void*)p_mask);
  free((void*)tmp_proc_mask);
  PetscFunctionReturn(0);
}
/* get_ngh_buf - determine, for every element this rank owns, which other
 * ranks share it, and decide pairwise vs. tree handling for each.
 *
 * Builds a per-element processor bit mask in gs->ngh_buf via one or more
 * collective bitwise-OR exchanges (giop with GL_B_OR), capped by msg_buf
 * so memory stays bounded.  Elements shared by <= gs->level ranks are
 * flagged for pairwise exchange (TOP_BIT set in elms[]); the rest are
 * placed in the tree via place_in_tree().
 *
 * Outputs (in gs): ngh_buf, nghs (int list), pw_nghs, len_pw_list,
 * num_nghs/num_pw_nghs/max_nghs, mask_sz, num_loads, tree_map_sz,
 * max_left_over.  Collective over all ranks.
 */
static PetscErrorCode get_ngh_buf(gs_id *gs)
{
  PetscInt       i, j, npw=0, ntree_map=0;
  PetscInt       p_mask_size, ngh_buf_size, buf_size;
  PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt       *ngh_buf, *buf1, *buf2;
  PetscInt       offset, per_load, num_loads, or_ct, start, end;
  PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt       oper=GL_B_OR;
  PetscInt       *ptr3, *t_mask, level, ct1, ct2;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to my_id */
  p_mask = (PetscInt*) malloc(p_mask_size=len_bit_mask(num_nodes));
  ierr   = set_bit_mask(p_mask,p_mask_size,my_id);CHKERRQ(ierr);

  /* allocate space for masks and info bufs */
  /* NOTE(review): gs->nghs is reassigned to a fresh malloc near the end of
     this function; the original allocation is freed below via the
     sh_proc_mask alias, so this is not a leak. */
  gs->nghs       = sh_proc_mask    = (PetscInt*) malloc(p_mask_size);
  gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size    = p_mask_size*nel;
  /* NOTE(review): t_mask is allocated and freed but never otherwise used here */
  t_mask         = (PetscInt*) malloc(p_mask_size);
  gs->ngh_buf    = ngh_buf         = (PetscInt*) malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */
  /* default is one pass */
  per_load      = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i             = p_mask_size*negl;
  /* possible overflow on buffer size */
  /* overflow hack: clamp a wrapped (negative) product to INT_MAX */
  if (i<0) {i=INT_MAX;}
  buf_size = PetscMin(msg_buf,i);

  /* can we do it? need at least one whole mask per exchange */
  if (p_mask_size>buf_size) SETERRQ2(PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

  /* get giop buf space ... make *only* one malloc (buf1 holds both halves) */
  buf1 = (PetscInt*) malloc(buf_size<<1);

  /* more than one gior exchange needed? split the global element range
     into num_loads chunks of per_load elements each */
  if (buf_size!=i) {
    per_load      = buf_size/p_mask_size;
    buf_size      = per_load*p_mask_size;
    gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
  }

  /* convert buf sizes from #bytes to #ints - 32 bit only! */
  p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);

  /* find giop work space (second half of the single allocation) */
  buf2 = buf1+buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz=p_mask_size;

  /* init buffers */
  ierr = ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr);
  ierr = ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr);
  ierr = ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr);

  /* HACK reset tree info (file-scope globals used by place_in_tree) */
  tree_buf=NULL;
  tree_buf_sz=ntree=0;

  /* ok do it: one collective OR exchange per load.
     ptr1 walks ngh_buf (my owned elements' masks), ptr2 walks elms,
     [start,end) is the global-id window covered by this load. */
  for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    ivec_zero(buf1,buf_size);

    /* load msg buffer: stamp my processor mask at each of my elements
       whose global id falls in this window.
       NOTE(review): ivec_zero/ivec_copy returns unchecked here, unlike
       the ierr-checked calls above — presumably intentional for speed;
       confirm before "fixing". */
    for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
      offset = (offset-start)*p_mask_size;
      ivec_copy(buf1+offset,p_mask,p_mask_size);
    }

    /* GLOBAL: pass buffer */
    ierr = giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr);

    /* unload buffer into ngh_buf: ptr3 walks the OR-ed masks, j is the
       global id, ptr2 re-walks my elements for this window */
    ptr2=(elms+i_start);
    for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
      /* I own it ... may have to pairwise it */
      if (j==*ptr2) {
        /* do i share it w/anyone? */
        ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
        /* guess not */
        if (ct1<2) {ptr2++; ptr1+=p_mask_size; continue;}

        /* i do ... so keep info and turn off my bit */
        ivec_copy(ptr1,ptr3,p_mask_size);
        ierr = ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr);
        ierr = ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);

        /* is it to be done pairwise? (shared by at most `level` others) */
        if (--ct1<=level) {
          npw++;
          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          ierr = ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
          ptr1+=p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++; ptr1+=p_mask_size; ntree_map++;
      }
      /* i don't but still might be involved in tree */
      else {
        /* shared by how many? */
        ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
        /* none! */
        if (ct1<2) continue;
        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1<=level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      ierr = place_in_tree(j);CHKERRQ(ierr);
    }
  }

  free((void*)t_mask);
  free((void*)buf1);

  gs->len_pw_list=npw;
  gs->num_nghs = ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
  /* NOTE(review): bm_to_proc return unchecked here (checked elsewhere) */
  bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

  gs->num_pw_nghs = ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

  /* GLOBAL: largest neighbor count over all ranks */
  oper = GL_MAX;
  ct1 = gs->num_nghs;
  ierr = giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr);
  gs->max_nghs = ct1;

  gs->tree_map_sz  = ntree_map;
  gs->max_left_over=ntree;

  free((void*)p_mask);
  free((void*)sh_proc_mask);
  PetscFunctionReturn(0);
}
/* gsi_check_args - validate the caller's element list and build the core
 * gs template.
 *
 * Input:  in_elms - global ids, nel entries; entries equal to 0 are ignored
 *                   (caller's way of opting elements out); negative ids are
 *                   invalid and rejected collectively below.
 *         nel     - number of entries in in_elms (must be >= 0)
 *         level   - pairwise/tree crossover level (clamped to [0,num_nodes])
 * Return: newly allocated gs_id with sorted/compressed element list,
 *         companion (inverse) map, local-reduction lists, and the global
 *         min/max/sum statistics filled in.  Collective (calls giop()).
 *
 * Fix vs. original: the "system empty or neg" SETERRABORT message contained
 * a dangling %d conversion with no matching argument; the %d is removed.
 */
static gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt       i, j, k, t2;
  PetscInt       *companion, *elms, *unique, *iptr;
  PetscInt       num_local=0, *num_to_reduce, **local_reduce;
  PetscInt       oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
  PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
  PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
  gs_id          *gs;
  PetscErrorCode ierr;

  if (!in_elms) {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");}
  if (nel<0)    {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");}
  if (nel==0)   {ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99 */
  /* caller can set global ids that don't participate to 0 */
  /* gs_init ignores all zeros in elm list */
  /* negative global ids are still invalid */
  for (i=j=0;i<nel;i++) {if (in_elms[i]!=0) {j++;}}
  k   = nel;   /* k keeps the original length for the copy loop below */
  nel = j;     /* nel is now the count of participating (nonzero) ids */

  /* copy over in_elms list and create inverse map (companion[i] is the
     caller's original index of elms[i]); +1 slot for the sentinel */
  elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
  companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
  for (i=j=0;i<k;i++) {
    if (in_elms[i]!=0) {elms[j] = in_elms[i]; companion[j++] = i;}
  }
  if (j!=nel) {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");}

  /* pre-pass ... check to see if sorted (sentinel INT_MAX stops the walk) */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms+1;
  j         = 0;
  while (*iptr!=INT_MAX) {
    if (*iptr++>*unique++) {j=1; break;}
  }

  /* set up inverse map: sort elms (carrying companion along) only if needed */
  if (j) {
    ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
    ierr = SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
  } else {
    ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
  }
  elms[nel] = INT_MIN;   /* new sentinel terminates the clumping loops */

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i=k=0;i<nel;i+=j) {
    t2 = elms[i];
    j  = ++i;
    /* clump 'em for now */
    while (elms[j]==t2) {j++;}
    /* how many together and num local: j becomes the extra-copy count */
    if (j-=i) {num_local++; k+=j;}
  }

  /* how many unique elements? (k counts the repeats removed) */
  gs->repeats = k;
  gs->nel     = nel-k;

  /* number of repeats? two extra slots for the partition-sort hack below */
  gs->num_local        = num_local;
  num_local           += 2;
  gs->local_reduce     = local_reduce  = (PetscInt **)malloc(num_local*sizeof(PetscInt*));
  gs->num_local_reduce = num_to_reduce = (PetscInt*) malloc(num_local*sizeof(PetscInt));

  unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compess map as well as keep track of local ops: collapse runs of equal
     ids to one entry and record, per run, the companion indices to reduce */
  for (num_local=i=j=0;i<gs->nel;i++) {
    k            = j;
    t2           = unique[i] = elms[j];
    companion[i] = companion[j];
    while (elms[j]==t2) {j++;}
    if ((t2=(j-k))>1) {
      /* number together */
      num_to_reduce[num_local] = t2++;
      iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;
      /* note that we're skipping the first one */
      while (++k<j) {*(iptr++) = companion[k];}
      *iptr = -1;   /* -1 terminates each reduction list */
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel>0) {
    vals[3] = unique[0];           /* local min global id  */
    vals[4] = unique[gs->nel-1];   /* local max global id  */
  } else {
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out (min/max/sum of nel, global id range, etc.) */
  ierr = giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3]<0) {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");}
  if (vals[4]==INT_MAX) {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");}

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4]-vals[3]+1;

  /* BUGFIX: message previously ended ":: %d\n" with no matching argument */
  if (gs->negl<=0) {SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg\n");}

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5]<0)              {vals[5]=0;}
  else if (vals[5]>num_nodes) {vals[5]=num_nodes;}
  gs->level = vals[5];

  return(gs);
}