void armci_memoffset_table_newentry(void *ptr, size_t seg_size) { void **ptr_arr; void *master_addr = NULL; size_t tile_size=0, offset=0; if(!ptr) armci_die("armci_memoffset_table_newentry : null ptr",0); if(seg_count >= MAX_SEGS) /* CHECK: make it dynamic */ armci_die("armci_cary_allocate: Increase MAX_SEGS > 512", armci_me); if(armci_me == armci_master) master_addr = ptr; armci_msg_brdcst(&master_addr, sizeof(void*), armci_master); ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); armci_cray_gettilesize(ptr, ptr_arr, &tile_size); offset = (size_t)((char*)master_addr - (char*)ptr_arr[armci_master]); /* enter in memoffset table */ armci_memoffset_table[seg_count].seg_addr = ptr_arr[armci_master]; armci_memoffset_table[seg_count].seg_size = seg_size; armci_memoffset_table[seg_count].tile_size = tile_size; armci_memoffset_table[seg_count].mem_offset = offset; #if DEBUG_ printf("%d: addr=%p seg_size=%ld tile_size=%ld offset=%ld\n", armci_me, ptr_arr[armci_master], seg_size, tile_size, offset); #endif ++seg_count; free(ptr_arr); }
void test_brdcst(int datatype) { void *a[6]; int len[6] = {1, 10, 100, 1000, 10000, 100000}; int datatype_size = 0; int i, j; switch(datatype) { case ARMCI_INT: datatype_size = sizeof(int); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((int *) a[i])[j] = (int) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_LONG: datatype_size = sizeof(long); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((long *) a[i])[j] = (long) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_FLOAT: datatype_size = sizeof(float); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((float *) a[i])[j] = (float) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_DOUBLE: datatype_size = sizeof(double); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((double *) a[i])[j] = (double) j; else memset(a[i], 0x0, len[i] * datatype_size); break; default: break; } for(i = 0; i < 6; i++) armci_msg_brdcst(a[i], len[i] * datatype_size, 0); switch(datatype) { case ARMCI_INT: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((int *) a[i])[j] != (int) j) { printf("ERROR a[%d][%d] = %d != %d\n", i, j, ((int *) a[i])[j], (int) j); ARMCI_Error("armci_brdcst failed (int)\n",0); } break; case ARMCI_LONG: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((long *) a[i])[j] != (long) j) { printf("ERROR a[%d][%d] = %ld != %ld\n", i, j, ((long *) a[i])[j], (long) j); ARMCI_Error("armci_brdcst failed (long)\n",0); } break; case ARMCI_FLOAT: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((float *) a[i])[j] != (float) j) { printf("ERROR a[%d][%d] = %f != %f\n", i, j, ((float *) a[i])[j], (float) j); ARMCI_Error("armci_brdcst failed (float)\n",0); 
} break; case ARMCI_DOUBLE: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((double *) a[i])[j] != (double) j) { printf("ERROR a[%d][%d] = %f != %f\n", i, j, ((double *) a[i])[j], (double) j); ARMCI_Error("armci_brdcst failed (double)\n",0); } break; default: break; } for(i = 0; i < 6; i++) free(a[i]); }
void TestGlobals() { #define MAXLENG 256*1024 double *dtest; int *itest; long *ltest; int len; int ifrom=nproc-1,lfrom=1,dfrom=1; if (me == 0) { printf("Global test ... broadcast and reduction for int, long, double\n----------\n"); fflush(stdout); } if (!(dtest = (double *) malloc((unsigned) (MAXLENG*sizeof(double))))) ARMCI_Error("TestGlobals: failed to allocated dtest", MAXLENG); if (!(ltest = (long *) malloc((unsigned) (MAXLENG*sizeof(long))))) ARMCI_Error("TestGlobals: failed to allocated ltest", MAXLENG); if (!(itest = (int *) malloc((unsigned) (MAXLENG*sizeof(int))))) ARMCI_Error("TestGlobals: failed to allocated itest", MAXLENG); for (len=1; len<MAXLENG; len*=2) { int ilen = len*sizeof(int); int dlen = len*sizeof(double); int llen = len*sizeof(long); int i; ifrom = (ifrom+1)%nproc; lfrom = (lfrom+1)%nproc; dfrom = (lfrom+1)%nproc; #if 0 printf("%d:ifrom=%d lfrom=%d dfrom=%d\n",me,ifrom,lfrom,dfrom);fflush(stdout); #endif if (me == 0) { printf("Test length = %d ... ", len); fflush(stdout); } if(me == ifrom)for (i=0; i<len; i++)itest[i]=i; else for (i=0; i<len; i++)itest[i]=0; if(me == lfrom)for (i=0; i<len; i++)ltest[i]=(long)i; else for (i=0; i<len; i++)ltest[i]=0L; if(me == dfrom)for (i=0; i<len; i++)dtest[i]=(double)i; else for (i=0; i<len; i++)dtest[i]=0.0; /* Test broadcast */ armci_msg_brdcst(itest, ilen, ifrom); armci_msg_brdcst(ltest, llen, lfrom); armci_msg_brdcst(dtest, dlen, dfrom); for (i=0; i<len; i++){ if (itest[i] != i) armci_die2("int broadcast failed", i,itest[i]); if (ltest[i] != (long)i) armci_die2("long broadcast failed", i,(int)ltest[i]); if (dtest[i] != (double)i) armci_die2("double broadcast failed", i,(int)dtest[i]); } if (me == 0) { printf("broadcast OK ..."); fflush(stdout); } /* Test global sum */ for (i=0; i<len; i++) { itest[i] = i*me; ltest[i] = (long) itest[i]; dtest[i] = (double) itest[i]; } armci_msg_igop(itest, len, "+"); armci_msg_lgop(ltest, len, "+"); armci_msg_dgop(dtest, len, "+"); for (i=0; i<len; i++) { int iresult = 
i*nproc*(nproc-1)/2; if (itest[i] != iresult || ltest[i] != (long)iresult || dtest[i] != (double) iresult) ARMCI_Error("TestGlobals: global sum failed", (int) i); } if (me == 0) { printf("global sums OK\n"); fflush(stdout); } } /* now we get timing data */ time_gop(dtest,MAXLENG); time_reduce(dtest,MAXLENG); free((char *) itest); free((char *) ltest); free((char *) dtest); }
int main(int argc, char *argv[]) { int ch; extern char *optarg; int i, j, r; thread_t threads[MAX_TPP]; /* init MP */ MP_INIT(argc,argv); MP_PROCS(&size); MP_MYID(&rank); while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) { switch(ch) { case 't': /* # of threads */ tpp = atoi(optarg); if (tpp < 1 || tpp > MAX_TPP) { PRINTF0("\"%s\" is improper value for -t, should be a " "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP); usage(); } break; case 'i': /* # of iterations */ iters = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -t, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 's': /* # of elements in the array */ asize = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -s, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 'd': delay = atoi(optarg); break; /* delay before start */ case 'h': usage(); break; /* print usage info */ } } #ifdef NOTHREADS tpp = 1; PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n"); #endif th_size = size * tpp; PRINTF0("\nTest of multi-threaded capabilities:\n" "%d threads per process (%d threads total),\n" "%d array elements of size %d,\n" "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters); if (delay) { printf("%d: %d\n", rank, getpid()); fflush(stdout); sleep(delay); MP_BARRIER(); } TH_INIT(size,tpp); for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i; #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fname[10] = '0' + th_rank[i] / 100; fname[11] = '0' + th_rank[i] % 100 / 10; fname[12] = '0' + th_rank[i] % 10; dbg[i] = fopen(fname, "w"); } #endif for (i = 0; i < tpp; i++) prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]); /* init ARMCI */ ARMCI_Init(); /* set global seed (to ensure same random sequence across procs) */ time_seed = (unsigned)time(NULL); armci_msg_brdcst(&time_seed, sizeof(time_seed), 0); srand(time_seed); rand(); prndbg(0, "seed = %u\n", 
time_seed); /* random pairs */ pairs = calloc(th_size, sizeof(int)); for (i = 0; i < th_size; i++) pairs[i] = -1; for (i = 0; i < th_size; i++) { if (pairs[i] != -1) continue; r = RND(0, th_size); while (i == r || pairs[r] != -1 ) r = RND(0, th_size); pairs[i] = r; pairs[r] = i; } for (i = 0, cbufl = 0; i < th_size; i++) cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d", i, pairs[i], pairs[i], pairs[pairs[i]]); prndbg(0, "random pairs:%s\n", cbuf); /* random targets */ rnd_tgts = calloc(th_size, sizeof(int)); for (i = 0, cbufl = 0; i < th_size; i++) { rnd_tgts[i] = RND(0, th_size); if (rnd_tgts[i] == i) { i--; continue; } cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]); } prndbg(0, "random targets:%s\n", cbuf); /* random one */ rnd_one = RND(0, th_size); prndbg(0, "random one = %d\n", rnd_one); assert(ptrs1 = calloc(th_size, sizeof(void *))); assert(ptrs2 = calloc(th_size, sizeof(void *))); #ifdef NOTHREADS thread_main((void *)(long)0); #else for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i); for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL); #endif MP_BARRIER(); PRINTF0("Tests Completed\n"); /* clean up */ #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) fclose(dbg[i]); #endif ARMCI_Finalize(); TH_FINALIZE(); MP_FINALIZE(); return 0; }
/*
 * Broadcast `len` bytes of `buffer` from `root` over `comm`.
 *
 * When `comm` compares MPI_IDENT to MPI_COMM_WORLD the call is handed to
 * armci_msg_brdcst(); for any other communicator MPI_Bcast() is used
 * directly with MPI_BYTE elements.
 */
void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm)
{
    int cmp;

    MPI_Comm_compare(comm, MPI_COMM_WORLD, &cmp);

    if (cmp != MPI_IDENT) {
        MPI_Bcast(buffer, len, MPI_BYTE, root, (MPI_Comm)comm);
        return;
    }

    armci_msg_brdcst(buffer, len, root);
}
static void process_hostlist(char *names) { #ifdef CLUSTER int i, cluster=0; char *s,*master; int len, root=0; /******** inspect list of machine names to determine locality ********/ if (armci_me==0){ /* first find out how many cluster nodes we got */ armci_nclus =1; s=master=names; for(i=1; i < armci_nproc; i++){ s += strlen(s)+1; if(strcmp(s,master)){ /* we found a new machine name on the list */ master = s; armci_nclus++; /*fprintf(stderr,"new name %s len =%d\n",master, strlen(master));*/ } } /* allocate memory */ armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); /* fill the data structure -- go through the list again */ s=names; master="*-"; /* impossible hostname */ cluster =0; for(i=0; i < armci_nproc; i++){ if(strcmp(s,master)){ /* we found a new machine name on the list */ master = s; armci_clus_info[cluster].nslave=1; armci_clus_info[cluster].master=i; strcpy(armci_clus_info[cluster].hostname, master); #ifdef CHECK_NODE_NAMES /* need consecutive task id allocated on the same node * the current test only compares hostnames against first cluster */ if(cluster) if(!strcmp(master,armci_clus_info[0].hostname)){ /* we have seen that hostname before */ fprintf(stderr, "\nIt appears that tasks allocated on the same"); fprintf(stderr, " host machine do not have\n"); fprintf(stderr, "consecutive message-passing IDs/numbers. 
"); fprintf(stderr,"This is not acceptable \nto the ARMCI library "); fprintf(stderr,"as it prevents SMP optimizations and would\n"); fprintf(stderr,"lead to poor resource utilization.\n\n"); fprintf(stderr,"Please contact your System Administrator "); fprintf(stderr,"or, if you can, modify the "); # if defined(MPI) fprintf(stderr,"MPI"); # elif defined(TCGMSG) fprintf(stderr,"TCGMSG"); # elif defined(PVM) fprintf(stderr,"PVM"); # endif fprintf(stderr,"\nmessage-passing job startup configuration.\n\n"); #ifdef HITACHI fprintf(stderr,"On Hitachi it can be done by setting environment variable MPIR_RANK_NO_ROUND, for example\n setenv MPIR_RANK_NO_ROUND yes\n\n"); #endif sleep(1); armci_die("Cannot run: improper task to host mapping!",0); } #endif cluster++; }else{ /* the process is still on the same host */ armci_clus_info[cluster-1].nslave++; } s += strlen(s)+1; } if(armci_nclus != cluster) armci_die("inconsistency processing clusterinfo",armci_nclus); } /******** process 0 got all data ********/ /* now broadcast locality info struct to all processes * two steps are needed because of the unknown length of hostname list */ len = sizeof(int); armci_msg_brdcst(&armci_nclus, len, root); if(armci_me){ /* allocate memory */ armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); } len = sizeof(armci_clus_t)*armci_nclus; armci_msg_brdcst(armci_clus_info, len, root); /******** all processes 0 got all data ********/ /* now determine current cluster node id by comparing me to master */ armci_clus_me = armci_nclus-1; for(i =0; i< armci_nclus-1; i++) if(armci_me < armci_clus_info[i+1].master){ armci_clus_me=i; break; } #else armci_clus_me=0; armci_nclus=1; armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); strcpy(armci_clus_info[0].hostname, names); armci_clus_info[0].master=0; 
armci_clus_info[0].nslave=armci_nproc; #endif armci_clus_first = armci_clus_info[armci_clus_me].master; armci_clus_last = armci_clus_first +armci_clus_info[armci_clus_me].nslave-1; }
static int sparse_initialize(int *n, int *non_zero, int **row_ind, int **col_ind, double **values, double **vec, double **svec) { int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL; double *tmp_values=NULL; unsigned long len; FILE *fp=NULL; /* Broadcast order of matrix */ if(me==0) { if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL) ARMCI_Error("Error: Input file not found", me); fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */ fscanf(fp, "%d", n); if(*n%nproc) ARMCI_Error("# of rows is not divisible by # of processors", nproc); if(*n > ROW) ARMCI_Error("order is greater than defined variable ROW", ROW); } len = sizeof(int); armci_msg_brdcst(n, len, 0); /* Broad cast number of non_zeros */ if(me==0) fscanf(fp, "%d", non_zero); armci_msg_brdcst(non_zero, len, 0); /* Broadcast row indices */ len = (*n+1)*sizeof(int); row_ind_tmp = (int *)malloc(len); if(me==0)for(i=0; i<*n+1; i++) { fscanf(fp, "%d", &row_ind_tmp[i]); if(fortran_indexing) --row_ind_tmp[i]; } armci_msg_brdcst(row_ind_tmp, len, 0); load_balance(*n, *non_zero, row_ind_tmp); /* find how much temporary storage is needed at the maximum */ if(me==0) { for(max=-1,j=0; j<nproc; j++) if(max<proc_nz_list[j]) max=proc_nz_list[j]; if(max<0) ARMCI_Error(" max cannot be negative", max); } /* Broadcast the maximum number of elements */ len = sizeof(int); armci_msg_brdcst(&max, len, 0); /* create the Sparse MAtrix Array */ if(me==0) printf(" Creating ValueArray (CompressedSparseMatrix) ...\n\n"); create_array((void**)col_ind, sizeof(int), 1, &max); /* create the column subscript array */ if(me==0) printf(" Creating Column Subscript Array ... \n\n"); create_array((void**)values, sizeof(double), 1, &max); /* create the x-vector and the solution vector */ if(me==0) printf(" Creating Vectors ... 
\n\n"); create_array((void**)vec, sizeof(double),1, &max); create_array((void**)svec, sizeof(double),1, &max); armci_msg_barrier(); /* Process 0 distributes the column indices and non_zero values to respective processors*/ if(me == 0) { tmp_indices = (int *)malloc(max*sizeof(int)); tmp_values = (double *)malloc(max*sizeof(double)); for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) { fscanf(fp, "%d", &tmp_indices[i]); if(fortran_indexing) --tmp_indices[i]; } /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */ if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]); if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j))) ARMCI_Error("armci_nbput failed\n",rc); } } ARMCI_AllFence(); armci_msg_barrier(); ARMCI_AllFence(); /* initializing x-vector */ if(me==0) for(i=0; i<proc_nz_list[me]; i++) vec[me][i] = (i+1); else for(i=0; i<proc_nz_list[me]; i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1); #if 0 if(me==0) { printf("max = %d\n", max); for(i=0; i<max; i++) printf("%.1f ", values[me][i]); printf("\n"); } #endif *row_ind = row_ind_tmp; if(me==0) { free(tmp_indices); free(tmp_values); fclose(fp); } return 0; }
static void process_hostlist(char *names) { #ifdef CLUSTER int i, cluster=0; char *s,*master; int len, root=0; /******** inspect list of machine names to determine locality ********/ if (armci_me==0){ /* first find out how many cluster nodes we got */ armci_nclus =1; s=master=names; for(i=1; i < armci_nproc; i++){ s += strlen(s)+1; if(strcmp(s,master)){ /* we found a new machine name on the list */ master = s; armci_nclus++; /*fprintf(stderr,"new name %s len =%d\n",master, strlen(master));*/ } } /* allocate memory */ armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info) armci_die("malloc failed for clusinfo",armci_nclus); /* fill the data structure -- go through the list again */ s=names; master="*-"; /* impossible hostname */ cluster =0; for(i=0; i < armci_nproc; i++){ if(strcmp(s,master)){ /* we found a new machine name on the list */ master = s; armci_clus_info[cluster].nslave=1; armci_clus_info[cluster].master=i; strcpy(armci_clus_info[cluster].hostname, master); #ifdef CHECK_NODE_NAMES /* need consecutive task id allocated on the same node * the current test only compares hostnames against first cluster */ if(cluster) if(!strcmp(master,armci_clus_info[0].hostname)){ /* we have seen that hostname before */ fprintf(stderr, "ARMCI supports block process mapping only\n"); armci_die("Cannot run: improper task to host mapping!",0); } #endif cluster++; } else{ /* the process is still on the same host */ armci_clus_info[cluster-1].nslave++; } s += strlen(s)+1; } if(armci_nclus != cluster) armci_die("inconsistency processing clusterinfo",armci_nclus); } /******** process 0 got all data ********/ /* now broadcast locality info struct to all processes * two steps are needed because of the unknown length of hostname list */ len = sizeof(int); armci_msg_brdcst(&armci_nclus, len, root); if(armci_me){ /* allocate memory */ armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info) 
armci_die("malloc failed for clusinfo",armci_nclus); } len = sizeof(armci_clus_t)*armci_nclus; armci_msg_brdcst(armci_clus_info, len, root); /******** all processes 0 got all data ********/ /* now determine current cluster node id by comparing me to master */ armci_clus_me = armci_nclus-1; for(i =0; i< armci_nclus-1; i++) if(armci_me < armci_clus_info[i+1].master){ armci_clus_me=i; break; } #else armci_clus_me=0; armci_nclus=1; armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); if(!armci_clus_info) armci_die("malloc failed for clusinfo",armci_nclus); strcpy(armci_clus_info[0].hostname, names); armci_clus_info[0].master=0; armci_clus_info[0].nslave=armci_nproc; #endif /* Starting process ID on my node */ armci_clus_first = armci_clus_info[armci_clus_me].master; /* Last process ID on my node */ armci_clus_last = armci_clus_first + armci_clus_info[armci_clus_me].nslave-1; }