/** wrapper for realloc */
static void *ss_realloc (void *p, SS_INT n, size_t size, SS_INT *ok)
{
    void *pnew ;
    pnew = realloc (p, SS_MAX (n,1) * size) ;   /* realloc the block */
    *ok = (pnew != NULL) ;                      /* realloc fails if pnew is NULL */
    return ((*ok) ? pnew : p) ;                 /* return original p if failure */
}
/** allocate a sparse matrix (triplet form or compressed-column form) */
static cs *ss_spalloc (SS_INT m, SS_INT n, SS_INT nzmax, SS_INT values,
    SS_INT triplet)
{
    cs *A = ss_calloc (1, sizeof (cs)) ;    /* allocate the cs struct */
    if (!A) return (NULL) ;                 /* out of memory */
    A->id = M_SPT ;
    A->m = m ;                              /* define dimensions and nzmax */
    A->n = n ;
    A->nzmax = nzmax = SS_MAX (nzmax, 1) ;
    A->nz = triplet ? 0 : -1 ;              /* allocate triplet or comp.col */
    A->p = ss_malloc (triplet ? nzmax : n+1, sizeof (SS_INT)) ;
    A->i = ss_malloc (nzmax, sizeof (SS_INT)) ;
    A->x = values ? ss_malloc (nzmax, sizeof (SS_ENTRY)) : NULL ;
    A->nref = calloc (1, sizeof (long)) ;   /* shared reference count */
    if (A->nref) A->nref [0] = 1 ;          /* guard against failed calloc */
    return ((!A->p || !A->i || !A->nref || (values && !A->x))
        ? ss_spfree (A) : A) ;
}
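/* A hedged usage sketch (this helper is hypothetical, not part of the
 * library): allocate a 10-by-10 triplet matrix with room for 4 numerical
 * entries via ss_spalloc, then double its capacity with ss_realloc. On a
 * failed realloc, ok is cleared and the original block is kept, so T stays
 * valid either way. */
static cs *example_grow_triplet (void)
{
    SS_INT ok ;
    cs *T = ss_spalloc (10, 10, 4, 1, 1) ;  /* values = 1, triplet = 1 */
    if (!T) return (NULL) ;
    T->i = ss_realloc (T->i, 2*T->nzmax, sizeof (SS_INT), &ok) ;
    if (ok) T->p = ss_realloc (T->p, 2*T->nzmax, sizeof (SS_INT), &ok) ;
    if (ok) T->x = ss_realloc (T->x, 2*T->nzmax, sizeof (SS_ENTRY), &ok) ;
    if (ok) T->nzmax = 2*T->nzmax ;         /* commit only if all succeeded */
    return (T) ;
}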
static int * p_distribute_parts(
  sptensor_t * const ttbuf,
  char const * const pfname,
  rank_info * const rinfo)
{
  /* root may have more than target_nnz */
  idx_t const target_nnz = rinfo->global_nnz / rinfo->npes;
  int * parts = (int *) splatt_malloc(
      SS_MAX(ttbuf->nnz, target_nnz) * sizeof(int));

  if(rinfo->rank == 0) {
    FILE * fin = open_f(pfname, "r");

    /* send to all other ranks */
    for(int p=1; p < rinfo->npes; ++p) {
      /* read into buffer; fscanf() returns EOF (not 0) when the stream runs
       * dry, so test for anything other than one successful conversion */
      for(idx_t n=0; n < target_nnz; ++n) {
        if(fscanf(fin, "%d", &(parts[n])) != 1) {
          fprintf(stderr, "SPLATT ERROR: not enough elements in '%s'\n",
              pfname);
          exit(1);
        }
      }
      MPI_Send(parts, target_nnz, MPI_INT, p, 0, rinfo->comm_3d);
    }

    /* now read my own part info */
    for(idx_t n=0; n < ttbuf->nnz; ++n) {
      if(fscanf(fin, "%d", &(parts[n])) != 1) {
        fprintf(stderr, "SPLATT ERROR: not enough elements in '%s'\n",
            pfname);
        exit(1);
      }
    }
    fclose(fin);

  } else {
    /* receive part info */
    MPI_Recv(parts, ttbuf->nnz, MPI_INT, 0, 0, rinfo->comm_3d,
        &(rinfo->status));
  }
  return parts;
}
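/* A hedged companion sketch (hypothetical, not part of SPLATT; the format is
 * inferred from the reader above): emit a parts file consumable by
 * p_distribute_parts -- plain text, one integer rank id per nonzero, in the
 * same order as the tensor's nonzeros. */
static void sketch_write_parts(
  char const * const pfname,
  int const * const parts,
  idx_t const nnz)
{
  FILE * fout = open_f(pfname, "w");
  for(idx_t n=0; n < nnz; ++n) {
    fprintf(fout, "%d\n", parts[n]);
  }
  fclose(fout);
}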
sptensor_t * mpi_simple_distribute(
  char const * const ifname,
  MPI_Comm comm)
{
  int rank, npes;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &npes);

  sptensor_t * tt = NULL;

  FILE * fin = NULL;
  if(rank == 0) {
    fin = open_f(ifname, "r");
  }

  switch(get_file_type(ifname)) {
  case SPLATT_FILE_TEXT_COORD:
    tt = p_tt_mpi_read_file(fin, comm);
    break;
  case SPLATT_FILE_BIN_COORD:
    tt = p_tt_mpi_read_binary_file(fin, comm);
    break;
  }

  if(rank == 0) {
    fclose(fin);
  }

  /* set dims info */
  #pragma omp parallel for schedule(static, 1)
  for(idx_t m=0; m < tt->nmodes; ++m) {
    idx_t const * const inds = tt->ind[m];
    idx_t dim = 1 + inds[0];
    for(idx_t n=1; n < tt->nnz; ++n) {
      dim = SS_MAX(dim, 1 + inds[n]);
    }
    tt->dims[m] = dim;
  }

  return tt;
}
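/* A hedged usage sketch (not part of the library; assumes the SPLATT
 * internal headers are included and "example.tns" is a placeholder path):
 * naively distribute a tensor across MPI_COMM_WORLD and report each rank's
 * local nonzero count. */
int sketch_main(int argc, char ** argv)
{
  MPI_Init(&argc, &argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  sptensor_t * tt = mpi_simple_distribute("example.tns", MPI_COMM_WORLD);
  printf("rank %d owns %"SPLATT_PF_IDX" nonzeros\n", rank, tt->nnz);

  tt_free(tt);
  MPI_Finalize();
  return 0;
}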
void mpi_write_mats(
  matrix_t ** mats,
  permutation_t const * const perm,
  rank_info const * const rinfo,
  char const * const basename,
  idx_t const nmodes)
{
  char * fname;
  idx_t const nfactors = mats[0]->J;

  MPI_Status status;

  idx_t maxdim = 0;
  idx_t maxlocaldim = 0;
  matrix_t * matbuf = NULL;
  val_t * vbuf = NULL;
  idx_t * loc_iperm = NULL;

  for(idx_t m=0; m < nmodes; ++m) {
    maxdim = SS_MAX(maxdim, rinfo->global_dims[m]);
    maxlocaldim = SS_MAX(maxlocaldim, mats[m]->I);
  }

  /* get the largest local dim */
  if(rinfo->rank == 0) {
    MPI_Reduce(MPI_IN_PLACE, &maxlocaldim, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
        rinfo->comm_3d);
  } else {
    MPI_Reduce(&maxlocaldim, NULL, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
        rinfo->comm_3d);
  }

  if(rinfo->rank == 0) {
    matbuf = mat_alloc(maxdim, nfactors);
    loc_iperm = (idx_t *) splatt_malloc(maxdim * sizeof(idx_t));
    vbuf = (val_t *) splatt_malloc(maxdim * nfactors * sizeof(val_t));
  }

  for(idx_t m=0; m < nmodes; ++m) {
    /* root handles the writing */
    if(rinfo->rank == 0) {
      asprintf(&fname, "%s%"SPLATT_PF_IDX".mat", basename, m+1);
      matbuf->I = rinfo->global_dims[m];

      /* copy root's matrix to buffer */
      for(idx_t i=0; i < mats[m]->I; ++i) {
        idx_t const gi = rinfo->layer_starts[m] + perm->iperms[m][i];
        for(idx_t f=0; f < nfactors; ++f) {
          matbuf->vals[f + (gi*nfactors)] = mats[m]->vals[f+(i*nfactors)];
        }
      }

      /* receive matrix from each rank */
      for(int p=1; p < rinfo->npes; ++p) {
        idx_t layerstart;
        idx_t nrows;
        MPI_Recv(&layerstart, 1, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d,
            &status);
        MPI_Recv(&nrows, 1, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d, &status);
        MPI_Recv(vbuf, nrows * nfactors, SPLATT_MPI_VAL, p, 0,
            rinfo->comm_3d, &status);
        MPI_Recv(loc_iperm, nrows, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d,
            &status);

        /* permute buffer and copy into matbuf */
        for(idx_t i=0; i < nrows; ++i) {
          idx_t const gi = layerstart + loc_iperm[i];
          for(idx_t f=0; f < nfactors; ++f) {
            matbuf->vals[f + (gi*nfactors)] = vbuf[f+(i*nfactors)];
          }
        }
      }

      /* write the factor matrix to disk */
      mat_write(matbuf, fname);

      /* clean up */
      free(fname);

    } else {
      /* send matrix to root */
      MPI_Send(&(rinfo->layer_starts[m]), 1, SPLATT_MPI_IDX, 0, 0,
          rinfo->comm_3d);
      MPI_Send(&(mats[m]->I), 1, SPLATT_MPI_IDX, 0, 0, rinfo->comm_3d);
      MPI_Send(mats[m]->vals, mats[m]->I * mats[m]->J, SPLATT_MPI_VAL, 0, 0,
          rinfo->comm_3d);
      MPI_Send(perm->iperms[m] + rinfo->mat_start[m], mats[m]->I,
          SPLATT_MPI_IDX, 0, 0, rinfo->comm_3d);
    }
  } /* foreach mode */

  if(rinfo->rank == 0) {
    mat_free(matbuf);
    free(vbuf);
    free(loc_iperm);
  }
}
sptensor_t * mpi_tt_read(
  char const * const ifname,
  char const * const pfname,
  rank_info * const rinfo)
{
  timer_start(&timers[TIMER_IO]);

  /* first just make sure it exists */
  FILE * fin;
  if((fin = fopen(ifname, "r")) == NULL) {
    if(rinfo->rank == 0) {
      fprintf(stderr, "SPLATT ERROR: failed to open '%s'\n", ifname);
    }
    return NULL;
  }
  fclose(fin);

  /* first naively distribute tensor nonzeros for analysis */
  sptensor_t * ttbuf = mpi_simple_distribute(ifname, MPI_COMM_WORLD);
  rinfo->nmodes = ttbuf->nmodes;
  MPI_Allreduce(&(ttbuf->nnz), &(rinfo->global_nnz), 1, SPLATT_MPI_IDX,
      MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(ttbuf->dims, rinfo->global_dims, ttbuf->nmodes,
      SPLATT_MPI_IDX, MPI_MAX, MPI_COMM_WORLD);

  /* compute MPI dimensions if not specified by the user */
  if(rinfo->decomp == DEFAULT_MPI_DISTRIBUTION) {
    rinfo->decomp = SPLATT_DECOMP_MEDIUM;
    p_get_best_mpi_dim(rinfo);
  }

  mpi_setup_comms(rinfo);

  /* count # nonzeros found in each index */
  idx_t * ssizes[MAX_NMODES];
  for(idx_t m=0; m < ttbuf->nmodes; ++m) {
    ssizes[m] = (idx_t *) calloc(rinfo->global_dims[m], sizeof(idx_t));
  }
  p_fill_ssizes(ttbuf, ssizes, rinfo);

  /* actually parse tensor */
  sptensor_t * tt = NULL;
  switch(rinfo->decomp) {
  case SPLATT_DECOMP_COARSE:
    tt = p_read_tt_1d(ifname, ssizes, ttbuf->nmodes, rinfo);
    /* now fix tt->dims */
    for(idx_t m=0; m < tt->nmodes; ++m) {
      tt->dims[m] = 0;
      for(idx_t n=0; n < tt->nnz; ++n) {
        tt->dims[m] = SS_MAX(tt->dims[m], tt->ind[m][n] + 1);
      }
    }
    break;

  case SPLATT_DECOMP_MEDIUM:
    tt = p_rearrange_medium(ttbuf, ssizes, rinfo);

    /* now map tensor indices to local (layer) coordinates and fill in dims */
    #pragma omp parallel for schedule(static, 1)
    for(idx_t m=0; m < ttbuf->nmodes; ++m) {
      tt->dims[m] = rinfo->layer_ends[m] - rinfo->layer_starts[m];
      for(idx_t n=0; n < tt->nnz; ++n) {
        assert(tt->ind[m][n] >= rinfo->layer_starts[m]);
        assert(tt->ind[m][n] < rinfo->layer_ends[m]);
        tt->ind[m][n] -= rinfo->layer_starts[m];
      }
    }
    break;

  case SPLATT_DECOMP_FINE:
    tt = p_rearrange_fine(ttbuf, pfname, ssizes, rinfo);

    /* now fix tt->dims */
    for(idx_t m=0; m < tt->nmodes; ++m) {
      tt->dims[m] = rinfo->global_dims[m];
      rinfo->layer_ends[m] = tt->dims[m];
    }
    break;
  }

  for(idx_t m=0; m < ttbuf->nmodes; ++m) {
    free(ssizes[m]);
  }
  tt_free(ttbuf);

  timer_stop(&timers[TIMER_IO]);
  return tt;
}
/**
* @brief Find the boundaries for a process layer.
*
* @param ssizes The number of nonzeros found in each index (of each mode).
*               ssizes[1][5] is the number of nonzeros in X(:,5,:).
* @param mode Which mode to work on.
* @param rinfo MPI rank information.
*/
static void p_find_layer_boundaries(
  idx_t ** const ssizes,
  idx_t const mode,
  rank_info * const rinfo)
{
  idx_t const * const dims = rinfo->global_dims;
  idx_t const nnz = rinfo->global_nnz;
  idx_t const m = mode;

  /* find start/end slices for my partition */
  int const layer_dim = rinfo->dims_3d[m];
  idx_t pnnz = nnz / layer_dim; /* nnz in a layer */

  /* current processor */
  int currp = 0;
  idx_t lastn = 0;
  idx_t nnzcnt = ssizes[m][0];

  /* initialize layer_ptrs */
  rinfo->layer_ptrs[m] =
      splatt_malloc((layer_dim+1) * sizeof(**(rinfo->layer_ptrs)));
  rinfo->layer_ptrs[m][currp++] = 0;
  rinfo->layer_ptrs[m][layer_dim] = dims[m];

  if(layer_dim == 1) {
    goto CLEANUP;
  }

  /* foreach slice */
  for(idx_t s=1; s < dims[m]; ++s) {
    /* if we have passed the next layer boundary */
    if(nnzcnt >= lastn + pnnz) {
      /* choose this slice or the previous, whichever is closer */
      idx_t const thisdist = nnzcnt - (lastn + pnnz);
      idx_t const prevdist = (lastn + pnnz) - (nnzcnt - ssizes[m][s-1]);
      if(prevdist < thisdist) {
        lastn = nnzcnt - ssizes[m][s-1];
        /* see below comment */
        //rinfo->layer_ptrs[m][currp++] = s-1;
      } else {
        lastn = nnzcnt;
        //rinfo->layer_ptrs[m][currp++] = s;
      }

      /* Always choosing s but marking lastn with s-1 leads to better balance
       * and communication volume. This is totally a heuristic. */
      rinfo->layer_ptrs[m][currp++] = s;

      /* exit early if we placed the last rank */
      if(currp == layer_dim) {
        break;
      }

      /* adjust target nnz based on what is left */
      pnnz = (nnz - lastn) / SS_MAX(1, layer_dim - (currp-1));
    }
    nnzcnt += ssizes[m][s];
  }

CLEANUP:
  /* store layer boundaries in layer_{starts,ends} */
  rinfo->layer_starts[m] = rinfo->layer_ptrs[m][rinfo->coords_3d[m]];
  rinfo->layer_ends[m] = rinfo->layer_ptrs[m][rinfo->coords_3d[m] + 1];

  /* it is possible to have a very small dimension and too many ranks */
  if(rinfo->dims_3d[m] > 1 &&
        rinfo->layer_ends[m] - rinfo->layer_starts[m] == dims[m]) {
    fprintf(stderr, "SPLATT: rank: %d too many MPI ranks for mode %"
        SPLATT_PF_IDX".\n", rinfo->rank, m+1);
    rinfo->layer_starts[m] = dims[m];
    rinfo->layer_ends[m] = dims[m];
  }
}
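/* A self-contained sketch of the same greedy chunking heuristic, separated
 * from rank_info for illustration (this helper is hypothetical, not part of
 * SPLATT). Given per-slice nonzero counts, it fills ptrs (length nlayers+1)
 * so that each of the nlayers chunks holds roughly nnz/nlayers nonzeros,
 * closing a chunk as soon as its quota is met and re-targeting on what
 * remains. */
static void sketch_layer_ptrs(
  idx_t const * const slice_nnz,
  idx_t const nslices,
  idx_t const nnz,
  int const nlayers,
  idx_t * const ptrs)
{
  int currp = 0;
  idx_t lastn = 0;
  idx_t pnnz = nnz / nlayers;   /* target nnz per chunk */
  idx_t nnzcnt = slice_nnz[0];

  ptrs[currp++] = 0;
  ptrs[nlayers] = nslices;

  for(idx_t s=1; s < nslices && currp < nlayers; ++s) {
    if(nnzcnt >= lastn + pnnz) {
      lastn = nnzcnt;
      ptrs[currp++] = s;
      /* re-target based on what remains */
      pnnz = (nnz - lastn) / SS_MAX(1, nlayers - (currp-1));
    }
    nnzcnt += slice_nnz[s];
  }
  while(currp < nlayers) {
    ptrs[currp++] = nslices;    /* ran out of slices: empty trailing chunks */
  }
  /* e.g., slice_nnz = {10,10,10,10}, nnz = 40, nlayers = 2
   * yields ptrs = {0, 2, 4}: two chunks of 20 nonzeros each. */
}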
static void p_find_my_slices_1d(
  idx_t ** const ssizes,
  idx_t const nmodes,
  idx_t const nnz,
  rank_info * const rinfo)
{
  idx_t const * const dims = rinfo->global_dims;

  /* find start/end slices for my partition */
  for(idx_t m=0; m < nmodes; ++m) {
    /* current processor */
    int currp = 0;
    idx_t lastn = 0;
    idx_t nnzcnt = 0;
    idx_t pnnz = nnz / rinfo->npes;

    rinfo->layer_starts[m] = 0;
    rinfo->layer_ends[m] = dims[m];
    rinfo->mat_start[m] = 0;
    rinfo->mat_end[m] = dims[m];

    for(idx_t s=0; s < dims[m]; ++s) {
      if(nnzcnt >= lastn + pnnz) {
        /* choose this slice or the previous, whichever is closer */
        if(s > 0) {
          idx_t const thisdist = nnzcnt - (lastn + pnnz);
          idx_t const prevdist = (lastn + pnnz) - (nnzcnt - ssizes[m][s-1]);
          if(prevdist < thisdist) {
            lastn = nnzcnt - ssizes[m][s-1];
          } else {
            lastn = nnzcnt;
          }
        } else {
          lastn = nnzcnt;
        }

        ++currp;

        /* adjust target nnz based on what is left */
        pnnz = (nnz - lastn) / SS_MAX(1, rinfo->npes - currp);

        if(currp == rinfo->rank) {
          rinfo->mat_start[m] = s;
        } else if(currp == rinfo->rank+1 && currp != rinfo->npes) {
          /* only set mat_end if we aren't at the end of the tensor */
          rinfo->mat_end[m] = s;
          break;
        }
      }
      nnzcnt += ssizes[m][s];

      if(rinfo->rank == rinfo->npes-1) {
        assert(rinfo->mat_end[m] == rinfo->global_dims[m]);
      }
    }

    /* it is possible to have a very small dimension and too many ranks */
    if(rinfo->npes > 1 &&
          rinfo->mat_start[m] == 0 && rinfo->mat_end[m] == dims[m]) {
      fprintf(stderr, "SPLATT: rank: %d too many MPI ranks for mode %"
          SPLATT_PF_IDX".\n", rinfo->rank, m+1);
      rinfo->mat_start[m] = dims[m];
      rinfo->mat_end[m] = dims[m];
    }
  }
}
idx_t * tt_densetile(
  sptensor_t * const tt,
  idx_t const * const tile_dims)
{
  timer_start(&timers[TIMER_TILE]);
  idx_t const nmodes = tt->nmodes;

  /*
   * Count tiles and compute their dimensions.
   */
  idx_t ntiles = 1;
  for(idx_t m=0; m < nmodes; ++m) {
    ntiles *= tile_dims[m];
  }

  /* the actual number of indices to place in each tile */
  idx_t tsizes[MAX_NMODES];
  for(idx_t m=0; m < nmodes; ++m) {
    tsizes[m] = SS_MAX(tt->dims[m] / tile_dims[m], 1);
  }

  /* We'll copy the newly tiled non-zeros into this one, then copy back */
  sptensor_t * newtt = tt_alloc(tt->nnz, tt->nmodes);

  /*
   * Count of non-zeros per tile. We use +1 because after a prefix sum, this
   * becomes a pointer into the non-zeros for each tile (e.g., csr->row_ptr).
   */
  idx_t * tcounts_global = splatt_malloc(
      (ntiles+1) * sizeof(*tcounts_global));
  for(idx_t t=0; t < ntiles+1; ++t) {
    tcounts_global[t] = 0;
  }

  /*
   * A matrix of thread-local counters.
   */
  int const nthreads = splatt_omp_get_max_threads();
  idx_t * * tcounts_thread = splatt_malloc(
      (nthreads+1) * sizeof(*tcounts_thread));
  /* After the prefix sum, the global counter will have the sum of all nnz in
   * each tile (across threads), and thus can be returned. */
  tcounts_thread[nthreads] = tcounts_global;

  /* partition the non-zeros */
  idx_t * thread_parts = partition_simple(tt->nnz, nthreads);

  #pragma omp parallel
  {
    int const tid = splatt_omp_get_thread_num();
    idx_t const nnz_start = thread_parts[tid];
    idx_t const nnz_end = thread_parts[tid+1];

    /* allocate / initialize thread-local counters */
    tcounts_thread[tid] = splatt_malloc(ntiles * sizeof(**tcounts_thread));
    for(idx_t tile=0; tile < ntiles; ++tile) {
      tcounts_thread[tid][tile] = 0;
    }

    #pragma omp barrier

    /* offset by 1 to make prefix sum easy */
    idx_t * tcounts_local = tcounts_thread[tid+1];

    /* count tile sizes (in nnz) */
    idx_t coord[MAX_NMODES];
    for(idx_t x=nnz_start; x < nnz_end; ++x) {
      for(idx_t m=0; m < nmodes; ++m) {
        /* capping at dims-1 fixes overflow when dims don't divide evenly */
        coord[m] = SS_MIN(tt->ind[m][x] / tsizes[m], tile_dims[m]-1);
      }
      idx_t const id = get_tile_id(tile_dims, nmodes, coord);
      assert(id < ntiles);
      ++tcounts_local[id];
    }

    #pragma omp barrier
    #pragma omp single
    {
      /* prefix sum for each tile */
      for(idx_t tile=0; tile < ntiles; ++tile) {
        for(int thread=0; thread < nthreads; ++thread) {
          tcounts_thread[thread+1][tile] += tcounts_thread[thread][tile];
        }
        /* carry over to next tile */
        if(tile < (ntiles-1)) {
          tcounts_thread[0][tile+1] += tcounts_thread[nthreads][tile];
        }
      }
    } /* implied barrier */

    /* grab my starting indices now */
    tcounts_local = tcounts_thread[tid];

    /*
     * Rearrange old tensor into new tiled one.
     */
    for(idx_t x=nnz_start; x < nnz_end; ++x) {
      for(idx_t m=0; m < nmodes; ++m) {
        coord[m] = SS_MIN(tt->ind[m][x] / tsizes[m], tile_dims[m]-1);
      }
      idx_t const id = get_tile_id(tile_dims, nmodes, coord);
      assert(id < ntiles);

      idx_t const newidx = tcounts_local[id]++;
      newtt->vals[newidx] = tt->vals[x];
      for(idx_t m=0; m < nmodes; ++m) {
        newtt->ind[m][newidx] = tt->ind[m][x];
      }
    }

    splatt_free(tcounts_local);
  } /* end omp parallel */

  /* copy tiled data into old struct */
  par_memcpy(tt->vals, newtt->vals, tt->nnz * sizeof(*tt->vals));
  for(idx_t m=0; m < nmodes; ++m) {
    par_memcpy(tt->ind[m], newtt->ind[m], tt->nnz * sizeof(**tt->ind));
  }

  /* shift counts to the right by 1 to make a proper pointer */
  memmove(tcounts_global+1, tcounts_global, ntiles * sizeof(*tcounts_global));
  tcounts_global[0] = 0;
  assert(tcounts_global[ntiles] == tt->nnz);

  tt_free(newtt);
  splatt_free(tcounts_thread);
  splatt_free(thread_parts);

  timer_stop(&timers[TIMER_TILE]);
  return tcounts_global;
}
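/* A hedged usage sketch (this helper is hypothetical, not part of the
 * library): tile a 3-mode tensor into a 2x2x2 grid and walk the nonzeros of
 * each tile through the returned CSR-style pointer array, as described in
 * the tcounts_global comment above. */
void sketch_iterate_tiles(sptensor_t * const tt)
{
  idx_t const tile_dims[] = {2, 2, 2};
  idx_t const ntiles = 8;

  idx_t * tptr = tt_densetile(tt, tile_dims);

  for(idx_t tile=0; tile < ntiles; ++tile) {
    /* this tile's nonzeros now occupy tt->vals[tptr[tile] .. tptr[tile+1]) */
    for(idx_t x=tptr[tile]; x < tptr[tile+1]; ++x) {
      /* process tt->vals[x] ... */
    }
  }
  splatt_free(tptr);
}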
/** wrapper for calloc */
static void *ss_calloc (SS_INT n, size_t size)
{
    return (calloc (SS_MAX (n,1), size)) ;
}
/** wrapper for malloc */
static void *ss_malloc (SS_INT n, size_t size)
{
    return (malloc (SS_MAX (n,1) * size)) ;
}
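/* A hedged variant (not part of the original wrappers): ss_malloc and
 * ss_realloc multiply SS_MAX(n,1) by size without guarding against
 * overflow, so a pathological n could wrap and under-allocate. A sketch of
 * an overflow-checked wrapper that returns NULL when the product would
 * wrap ((size_t)-1 stands in for SIZE_MAX to avoid an extra include): */
static void *ss_malloc_checked (SS_INT n, size_t size)
{
    size_t nn = (size_t) SS_MAX (n,1) ;
    if (size != 0 && nn > ((size_t) -1) / size) return (NULL) ; /* would wrap */
    return (malloc (nn * size)) ;
}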