void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; double t1; proc_owner = block_owner(I, J); edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); t1 = armci_timer(); #ifdef MPI2_ONESIDED { int target_disp = ( ((char*)(a[I+J*nblocks])) - ((char*)(ptr[proc_owner])) ); if(target_disp<0) { printf("ERROR!: target disp is < 0, target_disp= %d\n", target_disp); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc_owner, 0, win); MPI_Get(buf, size, MPI_CHAR, proc_owner, target_disp, size, MPI_CHAR, win); MPI_Win_unlock(proc_owner, win); } #else ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); #endif comm_time += armci_timer() - t1; get_cntr++; }
static void sparse_multiply(int n, int non_zero, int *row_ind, int **col_ind, double **values, double **vec, double **svec) { int i, j, k, num_elements, offset, *tmp_indices; double start_time, comm_time, comp_time, v, vec_local[COL]; int start, end, prev, nrows, idx; #if 0 /* ---- Sequential Case ---- */ for(i=0; i<n; i++) { svec[me][i] = 0; for(k=row_ind[i]; k<row_ind[i+1]; k++) { j = col_ind[me][k]; v = values[me][k]; svec[me][i] += v*vec[me][j]; printf("%.1f %.1f\n", v, vec[me][j]); } } for(i=0; i<n; i++) printf("%.1f ", svec[me][i]); printf("\n"); #else num_elements = proc_nz_list[me]; printf("num_elements = %d\n", num_elements); tmp_indices = (int *)malloc(num_elements*sizeof(int)); for(i=0; i<num_elements; i++) tmp_indices[i] = col_ind[me][i]; qsort(tmp_indices, num_elements, sizeof(int), compare); start_time = armci_timer(); /* get the required portion of vector you need to local array */ start = prev = tmp_indices[0]; for(i=1; i<num_elements; i++) { if(tmp_indices[i]>prev+1) { end = prev; get_data(n, start, end, vec_local, vec); start = prev = tmp_indices[i]; } else prev = tmp_indices[i]; } get_data(n, start, prev, vec_local, vec); #if 1 if(count>=0) for(i=0; i<=count; i++) ARMCI_Wait(&gHandle[i]); #endif comm_time = armci_timer() - start_time; start_time = armci_timer(); /* Perform Matrix-Vector multiply and store the result in solution vector - "svec[]" */ if(me==0) { nrows = proc_row_list[me]; offset = row_ind[0]; } else { nrows = proc_row_list[me]-proc_row_list[me-1]; offset = row_ind[proc_row_list[me-1]]; } /* printf("%d: My total Work = %d\n", me, nrows); */ for(i=0; i<nrows; i++) { /* loop over rows owned by me */ svec[me][i] = 0; if(me==0) idx = i; else idx = proc_row_list[me-1] + i; for(k=row_ind[idx]; k<row_ind[idx+1]; k++) { j = col_ind[me][k-offset]; v = values[me][k-offset]; svec[me][i] += v*vec_local[j]; } } comp_time = armci_timer()-start_time; printf("%d: %f + %f = %f (count = %d)\n", me, comm_time, comp_time, comm_time+comp_time, count+1); #endif }
void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array(ddst_put, 2, elems); create_array(ddst_get, 2, elems); create_array(dsrc, 1, &elems[1]); for (i = 0; i < elems[1]; i++) { dsrc[me][i] = i * 1.001 * (me + 1); } for (i = 0; i < elems[0]*elems[1]; i++) { ddst_put[me][i] = 0.0; ddst_get[me][i] = 0.0; } ARMCI_Barrier(); /* only proc 0 does the work */ if (me == 0) { if (!dryrun) { printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); } /* initializing non-blocking handles */ for (i = 0; i < elems[1]; i++) { ARMCI_INIT_HANDLE(&hdl_put[i]); } for (i = 0; i < elems[1]; i++) { ARMCI_INIT_HANDLE(&hdl_get[i]); } /* aggregate handles */ for (i = 0; i < nproc; i++) { ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); } for (i = 0; i < nproc; i++) { ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); } for (i = 0; i < nproc; i++) { ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); } for (i = 0; i < nproc; i++) { ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); } bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time = armci_timer(); start = 0; end = elems[1]; for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1] + j], i, &hdl_put[j]); } for (j = start; j < end; j++) { ARMCI_Wait(&hdl_put[j]); } } if (!dryrun) { printf("%d: Value Put time = %.2es\n", me, armci_timer() - start_time); } /* vector put */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1] + j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if ((rc = ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) { ARMCI_Error("armci_nbputv failed\n", rc); } } for (i = 1; i < nproc; i++) { ARMCI_Wait(&hdl_put[i]); } if (!dryrun) { printf("%d: Vector Put time = %.2es\n", me, armci_timer() - start_time); } /* regular put */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { if ((rc = ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1] + j], bytes, i, &hdl_put[j]))) { ARMCI_Error("armci_nbput failed\n", rc); } } for (j = start; j < end; j++) { ARMCI_Wait(&hdl_put[j]); } } if (!dryrun) { printf("%d: Regular Put time = %.2es\n", me, armci_timer() - start_time); } /* aggregate put */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { if ((rc = ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1] + j], bytes, i, &aggr_hdl_put[i]))) { ARMCI_Error("armci_nbput failed\n", rc); } } } for (i = 1; i < nproc; i++) { ARMCI_Wait(&aggr_hdl_put[i]); } if (!dryrun) { printf("%d: Aggregate Put time = %.2es\n\n", me, armci_timer() - start_time); } /* **************** GET **************** */ /* vector get */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1] + j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if ((rc = ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) { ARMCI_Error("armci_nbgetv failed\n", rc); } ARMCI_Wait(&hdl_get[i]); } if (!dryrun) { printf("%d: Vector Get time = %.2es\n", me, armci_timer() - start_time); } /* regular get */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { if ((rc = ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1] + j], bytes, i, &hdl_get[j]))) { ARMCI_Error("armci_nbget failed\n", rc); } } for (j = start; j < end; j++) { ARMCI_Wait(&hdl_get[j]); } } if (!dryrun) { printf("%d: Regular Get time = %.2es\n", me, armci_timer() - start_time); } /* aggregate get */ start_time = armci_timer(); for (i = 1; i < nproc; i++) { for (j = start; j < end; j++) { ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1] + j], bytes, i, &aggr_hdl_get[i]); } } for (i = 1; i < nproc; i++) { ARMCI_Wait(&aggr_hdl_get[i]); } if (!dryrun) { printf("%d: Aggregate Get time = %.2es\n", me, armci_timer() - start_time); } } ARMCI_Barrier(); ARMCI_AllFence(); ARMCI_Barrier(); /* Verify */ if (!(me == 0)) for (j = 0; j < elems[1]; j++) { if (ARMCI_ABS(ddst_put[me][j] - j * 1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } ARMCI_Barrier(); if (!dryrun)if (me == 0) { printf("\n aggregate put ..O.K.\n"); } fflush(stdout); if (me == 0) { for (i = 1; i < nproc; i++) { for (j = 0; j < elems[1]; j++) { if (ARMCI_ABS(ddst_get[me][i*elems[1] + j] - j * 1.001 *(i + 1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } ARMCI_Barrier(); if (!dryrun)if (me == 0) { printf(" aggregate get ..O.K.\n"); } fflush(stdout); ARMCI_AllFence(); ARMCI_Barrier(); if (!dryrun)if (me == 0) { printf("O.K.\n"); fflush(stdout); } destroy_array(ddst_put); destroy_array(ddst_get); destroy_array(dsrc); }