Exemplo n.º 1
0
/**
 * Wrapper for realloc that resizes a block to hold max(n,1) items.
 *
 * @param p     Block to resize, passed straight to realloc.
 * @param n     Requested number of items; values below 1 are treated as 1.
 * @param size  Size of one item in bytes.
 * @param ok    Output flag: set to nonzero on success, 0 on failure.
 * @return The resized block on success, or the original p on failure.
 */
static void *ss_realloc (void *p, SS_INT n, size_t size, SS_INT *ok)
{
    size_t count = (size_t) SS_MAX (n, 1) ;
    void *pnew ;
    /* refuse requests whose byte count does not fit in size_t: the product
     * would wrap and realloc would hand back an undersized block */
    if (size != 0 && count > ((size_t) -1) / size)
    {
        *ok = 0 ;
        return (p) ;
    }
    pnew = realloc (p, count * size) ;      /* realloc the block */
    *ok = (pnew != NULL) ;                  /* realloc fails if pnew is NULL */
    return ((*ok) ? pnew : p) ;             /* return original p if failure */
}
Exemplo n.º 2
0
/**
 * Allocate a sparse matrix (triplet form or compressed-column form).
 *
 * @param m        Number of rows stored in the struct.
 * @param n        Number of columns stored in the struct.
 * @param nzmax    Requested entry capacity; values below 1 are raised to 1.
 * @param values   Nonzero to allocate the numerical array x, 0 to leave it NULL.
 * @param triplet  Nonzero for triplet form (nz = 0, p has nzmax entries),
 *                 0 for compressed-column form (nz = -1, p has n+1 entries).
 * @return The new matrix with its reference counter set to 1, or NULL if the
 *         struct or the reference counter cannot be allocated. If p, i or x
 *         cannot be allocated, the result of ss_spfree (A) is returned.
 */
static cs *ss_spalloc (SS_INT m, SS_INT n, SS_INT nzmax, SS_INT values, SS_INT triplet)
{
    cs *A = ss_calloc (1, sizeof (cs)) ;    /* allocate the cs struct */
    if (!A) return (NULL) ;                 /* out of memory */
    A->id = M_SPT;
    A->m = m ;                              /* define dimensions and nzmax */
    A->n = n ;
    A->nzmax = nzmax = SS_MAX (nzmax, 1) ;
    A->nz = triplet ? 0 : -1 ;              /* allocate triplet or comp.col */
    A->p = ss_malloc (triplet ? nzmax : n+1, sizeof (SS_INT)) ;
    A->i = ss_malloc (nzmax, sizeof (SS_INT)) ;
    A->x = values ? ss_malloc (nzmax, sizeof (SS_ENTRY)) : NULL ;
    A->nref = calloc (1, sizeof (long)) ;
    if (!A->nref)
    {
        /* the reference counter could not be allocated: release everything
         * here rather than dereferencing NULL.
         * NOTE(review): assumes ss_malloc/ss_calloc memory may be released
         * with free() -- confirm against their definitions. */
        free (A->p) ;
        free (A->i) ;
        free (A->x) ;
        free (A) ;
        return (NULL) ;
    }
    A->nref[0] = 1 ;
    return ((!A->p || !A->i || (values && !A->x)) ? ss_spfree (A) : A) ;
}
Exemplo n.º 3
0
/**
* @brief Read partition ids from 'pfname' on rank 0 and hand each rank its
*        share.
*
*        Rank 0 reads (global_nnz / npes) integers for each other rank in turn
*        and sends them with tag 0 over rinfo->comm_3d, then reads ttbuf->nnz
*        integers for itself. Every other rank receives ttbuf->nnz integers
*        from rank 0.
*
* @param ttbuf The locally held nonzeros; only its nnz count is read here.
* @param pfname Name of the partition file (opened on rank 0 only).
* @param rinfo MPI rank information; its status field is written by MPI_Recv.
*
* @return A buffer allocated with splatt_malloc() holding this rank's
*         partition ids. It is not freed here.
*/
static int * p_distribute_parts(
  sptensor_t * const ttbuf,
  char const * const pfname,
  rank_info * const rinfo)
{
  /* root may have more than target_nnz */
  idx_t const target_nnz = rinfo->global_nnz / rinfo->npes;
  int * parts = (int *) splatt_malloc(SS_MAX(ttbuf->nnz, target_nnz) * sizeof(int));

  if(rinfo->rank == 0) {
    FILE * fin = open_f(pfname, "r");

    /* send to all other ranks */
    for(int p=1; p < rinfo->npes; ++p) {
      /* read into buffer */
      for(idx_t n=0; n < target_nnz; ++n) {
        /* fscanf returns EOF (negative) at end of input and 0 on a matching
         * failure, so anything other than one conversion is an error */
        if(fscanf(fin, "%d", &(parts[n])) != 1) {
          fprintf(stderr, "SPLATT ERROR: not enough elements in '%s'\n",
              pfname);
          exit(1);
        }
      }
      MPI_Send(parts, target_nnz, MPI_INT, p, 0, rinfo->comm_3d);
    }

    /* now read my own part info */
    for(idx_t n=0; n < ttbuf->nnz; ++n) {
      if(fscanf(fin, "%d", &(parts[n])) != 1) {
        fprintf(stderr, "SPLATT ERROR: not enough elements in '%s'\n",
            pfname);
        exit(1);
      }
    }
    fclose(fin);
  } else {
    /* receive part info */
    MPI_Recv(parts, ttbuf->nnz, MPI_INT, 0, 0, rinfo->comm_3d,
        &(rinfo->status));
  }
  return parts;
}
Exemplo n.º 4
0
/**
* @brief Read a tensor from 'ifname' and distribute its nonzeros among the
*        ranks of 'comm', then fill in tt->dims from the local indices.
*
*        Only rank 0 opens the file; the reader is chosen from
*        get_file_type(ifname).
*
* @param ifname Name of the tensor file.
* @param comm Communicator over which the nonzeros are distributed.
*
* @return The locally held tensor, or NULL if the file type is unsupported or
*         the reader returned NULL. Each dims[m] is one more than the largest
*         local index in mode m, or 0 when this rank holds no nonzeros.
*/
sptensor_t * mpi_simple_distribute(
  char const * const ifname,
  MPI_Comm comm)
{
  int rank, npes;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &npes);

  sptensor_t * tt = NULL;

  FILE * fin = NULL;
  if(rank == 0) {
    fin = open_f(ifname, "r");
  }

  switch(get_file_type(ifname)) {
  case SPLATT_FILE_TEXT_COORD:
    tt = p_tt_mpi_read_file(fin, comm);
    break;
  case SPLATT_FILE_BIN_COORD:
    tt = p_tt_mpi_read_binary_file(fin, comm);
    break;
  default:
    /* unsupported file type: tt stays NULL and is reported below */
    break;
  }

  if(rank == 0) {
    fclose(fin);
  }

  /* nothing was read: bail out instead of dereferencing NULL */
  if(tt == NULL) {
    if(rank == 0) {
      fprintf(stderr, "SPLATT ERROR: could not read tensor from '%s'\n",
          ifname);
    }
    return NULL;
  }

  /* set dims info */
  #pragma omp parallel for schedule(static, 1)
  for(idx_t m=0; m < tt->nmodes; ++m) {
    idx_t const * const inds = tt->ind[m];
    /* start from 0 so a rank holding no nonzeros never reads inds[0] */
    idx_t dim = 0;
    for(idx_t n=0; n < tt->nnz; ++n) {
      dim = SS_MAX(dim, 1 + inds[n]);
    }
    tt->dims[m] = dim;
  }


  return tt;
}
Exemplo n.º 5
0
/**
* @brief Gather each mode's factor matrix on rank 0 and write it to the file
*        "<basename><mode+1>.mat".
*
*        For every mode, each non-root rank sends its layer start, row count,
*        values and a slice of its inverse permutation to rank 0. Rank 0
*        places each row i at global row (layer start + iperm[i]) of a buffer
*        with global_dims[m] rows and passes the buffer to mat_write().
*
* @param mats The local factor matrices, one per mode.
* @param perm The permutation whose iperms[m] arrays map local rows.
* @param rinfo MPI rank information (rank, npes, comm_3d, layer_starts, ...).
* @param basename Prefix of the output file names.
* @param nmodes The number of modes (matrices) to write.
*/
void mpi_write_mats(
  matrix_t ** mats,
  permutation_t const * const perm,
  rank_info const * const rinfo,
  char const * const basename,
  idx_t const nmodes)
{
  char * fname = NULL;
  idx_t const nfactors = mats[0]->J;

  MPI_Status status;

  idx_t maxdim = 0;
  idx_t maxlocaldim = 0;
  matrix_t * matbuf = NULL;
  val_t * vbuf = NULL;
  idx_t * loc_iperm = NULL;

  for(idx_t m=0; m < nmodes; ++m) {
    maxdim = SS_MAX(maxdim, rinfo->global_dims[m]);
    maxlocaldim = SS_MAX(maxlocaldim, mats[m]->I);
  }

  /* get the largest local dim */
  if(rinfo->rank == 0) {
    MPI_Reduce(MPI_IN_PLACE, &maxlocaldim, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
      rinfo->comm_3d);
  } else {
    MPI_Reduce(&maxlocaldim, NULL, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
      rinfo->comm_3d);
  }

  /* root's buffers are sized by the largest global dimension */
  if(rinfo->rank == 0) {
    matbuf = mat_alloc(maxdim, nfactors);
    loc_iperm = (idx_t *) splatt_malloc(maxdim * sizeof(idx_t));
    vbuf = (val_t *) splatt_malloc(maxdim * nfactors * sizeof(val_t));
  }

  for(idx_t m=0; m < nmodes; ++m) {
    /* root handles the writing */
    if(rinfo->rank == 0) {
      /* on failure asprintf leaves fname undefined, so reset it and skip the
       * write below; the receives still run so the senders are not blocked */
      if(asprintf(&fname, "%s%"SPLATT_PF_IDX".mat", basename, m+1) < 0) {
        fname = NULL;
      }
      matbuf->I = rinfo->global_dims[m];

      /* copy root's matrix to buffer */
      for(idx_t i=0; i < mats[m]->I; ++i) {
        idx_t const gi = rinfo->layer_starts[m] + perm->iperms[m][i];
        for(idx_t f=0; f < nfactors; ++f) {
          matbuf->vals[f + (gi*nfactors)] = mats[m]->vals[f+(i*nfactors)];
        }
      }

      /* receive matrix from each rank */
      for(int p=1; p < rinfo->npes; ++p) {
        idx_t layerstart;
        idx_t nrows;
        MPI_Recv(&layerstart, 1, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d, &status);
        MPI_Recv(&nrows, 1, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d, &status);
        MPI_Recv(vbuf, nrows * nfactors, SPLATT_MPI_VAL, p, 0, rinfo->comm_3d,
            &status);
        MPI_Recv(loc_iperm, nrows, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d, &status);

        /* permute buffer and copy into matbuf */
        for(idx_t i=0; i < nrows; ++i) {
          idx_t const gi = layerstart + loc_iperm[i];
          for(idx_t f=0; f < nfactors; ++f) {
            matbuf->vals[f + (gi*nfactors)] = vbuf[f+(i*nfactors)];
          }
        }
      }

      if(fname != NULL) {
        /* write the factor matrix to disk */
        mat_write(matbuf, fname);

        /* clean up */
        free(fname);
        fname = NULL;
      } else {
        fprintf(stderr, "SPLATT ERROR: failed to build file name for mode %"\
            SPLATT_PF_IDX".\n", m+1);
      }
    } else {
      /* send matrix to root */
      MPI_Send(&(rinfo->layer_starts[m]), 1, SPLATT_MPI_IDX, 0, 0, rinfo->comm_3d);
      MPI_Send(&(mats[m]->I), 1, SPLATT_MPI_IDX, 0, 0, rinfo->comm_3d);
      MPI_Send(mats[m]->vals, mats[m]->I * mats[m]->J, SPLATT_MPI_VAL, 0, 0,
          rinfo->comm_3d);
      MPI_Send(perm->iperms[m] + rinfo->mat_start[m], mats[m]->I, SPLATT_MPI_IDX,
          0, 0, rinfo->comm_3d);
    }
  } /* foreach mode */


  if(rinfo->rank == 0) {
    mat_free(matbuf);
    free(vbuf);
    free(loc_iperm);
  }
}
Exemplo n.º 6
0
/**
* @brief Read a tensor from 'ifname' and distribute it according to
*        rinfo->decomp.
*
*        The nonzeros are first distributed with mpi_simple_distribute() to
*        compute the global nonzero count, the global dimensions and the
*        per-index nonzero counts. They are then re-read or rearranged for the
*        coarse, medium or fine decomposition. If the decomposition is
*        DEFAULT_MPI_DISTRIBUTION, medium is selected and the MPI dimensions
*        are chosen by p_get_best_mpi_dim().
*
* @param ifname Name of the tensor file.
* @param pfname Name of the partition file; only passed on for the fine
*               decomposition.
* @param rinfo MPI rank information; nmodes, global_nnz, global_dims and
*              possibly decomp and layer_ends are written here.
*
* @return The distributed tensor, or NULL if the file cannot be opened, the
*         initial distribution fails, or the decomposition is not one of the
*         three handled cases.
*/
sptensor_t * mpi_tt_read(
  char const * const ifname,
  char const * const pfname,
  rank_info * const rinfo)
{
  timer_start(&timers[TIMER_IO]);

  /* first just make sure it exists */
  FILE * fin;
  if((fin = fopen(ifname, "r")) == NULL) {
    if(rinfo->rank == 0) {
      fprintf(stderr, "SPLATT ERROR: failed to open '%s'\n", ifname);
    }
    /* do not leave the I/O timer running on the error path */
    timer_stop(&timers[TIMER_IO]);
    return NULL;
  }
  fclose(fin);

  /* first naively distribute tensor nonzeros for analysis */
  sptensor_t * ttbuf = mpi_simple_distribute(ifname, MPI_COMM_WORLD);
  if(ttbuf == NULL) {
    timer_stop(&timers[TIMER_IO]);
    return NULL;
  }

  rinfo->nmodes = ttbuf->nmodes;
  MPI_Allreduce(&(ttbuf->nnz), &(rinfo->global_nnz), 1, SPLATT_MPI_IDX,
      MPI_SUM, MPI_COMM_WORLD);
  /* pass the dims storage itself, which is correct whether global_dims is an
   * array member or a pointer */
  MPI_Allreduce(ttbuf->dims, rinfo->global_dims, ttbuf->nmodes,
      SPLATT_MPI_IDX, MPI_MAX, MPI_COMM_WORLD);


  /* first compute MPI dimension if not specified by the user */
  if(rinfo->decomp == DEFAULT_MPI_DISTRIBUTION) {
    rinfo->decomp = SPLATT_DECOMP_MEDIUM;
    p_get_best_mpi_dim(rinfo);
  }

  mpi_setup_comms(rinfo);

  /* count # nonzeros found in each index */
  idx_t * ssizes[MAX_NMODES];
  for(idx_t m=0; m < ttbuf->nmodes; ++m) {
    ssizes[m] = (idx_t *) calloc(rinfo->global_dims[m], sizeof(idx_t));
  }
  p_fill_ssizes(ttbuf, ssizes, rinfo);

  /* actually parse tensor */
  sptensor_t * tt = NULL;
  switch(rinfo->decomp) {
  case SPLATT_DECOMP_COARSE:
    tt = p_read_tt_1d(ifname, ssizes, ttbuf->nmodes, rinfo);
    /* now fix tt->dims */
    for(idx_t m=0; m < tt->nmodes; ++m) {
      tt->dims[m] = 0;
      for(idx_t n=0; n < tt->nnz; ++n) {
        tt->dims[m] = SS_MAX(tt->dims[m], tt->ind[m][n] + 1);
      }
    }
    break;

  case SPLATT_DECOMP_MEDIUM:
    tt = p_rearrange_medium(ttbuf, ssizes, rinfo);

    /* now map tensor indices to local (layer) coordinates and fill in dims */
    #pragma omp parallel for schedule(static, 1)
    for(idx_t m=0; m < ttbuf->nmodes; ++m) {
      tt->dims[m] = rinfo->layer_ends[m] - rinfo->layer_starts[m];
      for(idx_t n=0; n < tt->nnz; ++n) {
        assert(tt->ind[m][n] >= rinfo->layer_starts[m]);
        assert(tt->ind[m][n] < rinfo->layer_ends[m]);
        tt->ind[m][n] -= rinfo->layer_starts[m];
      }
    }
    break;

  case SPLATT_DECOMP_FINE:
    tt = p_rearrange_fine(ttbuf, pfname, ssizes, rinfo);
    /* now fix tt->dims */
    for(idx_t m=0; m < tt->nmodes; ++m) {
      tt->dims[m] = rinfo->global_dims[m];
      rinfo->layer_ends[m] = tt->dims[m];
    }
    break;
  }

  for(idx_t m=0; m < ttbuf->nmodes; ++m) {
    free(ssizes[m]);
  }

  tt_free(ttbuf);
  timer_stop(&timers[TIMER_IO]);
  return tt;
}
Exemplo n.º 7
0
/**
* @brief Find the boundaries for a process layer.
*
*        Allocates rinfo->layer_ptrs[mode] with dims_3d[mode]+1 entries, fills
*        it with slice boundaries chosen so that each layer holds roughly the
*        same number of nonzeros, and stores this rank's own range in
*        rinfo->layer_starts[mode] / rinfo->layer_ends[mode].
*
* @param ssizes The number of nonzeros found in each index (of each mode).
*               ssizes[1][5] is the number of nonzeros in X(:,5,:).
* @param mode Which mode to work on.
* @param rinfo MPI rank information.
*/
static void p_find_layer_boundaries(
  idx_t ** const ssizes,
  idx_t const mode,
  rank_info * const rinfo)
{
  idx_t const * const dims = rinfo->global_dims;
  idx_t const nnz = rinfo->global_nnz;
  idx_t const m = mode;

  /* find start/end slices for my partition */
  int const layer_dim = rinfo->dims_3d[m];
  idx_t pnnz = nnz / layer_dim; /* nnz in a layer */

  /* current processor */
  int currp  = 0;
  idx_t lastn = 0;
  idx_t nnzcnt = ssizes[m][0];

  /* initialize layer_ptrs */
  rinfo->layer_ptrs[m]
      = splatt_malloc((layer_dim+1) * sizeof(**(rinfo->layer_ptrs)));
  rinfo->layer_ptrs[m][currp++] = 0;
  rinfo->layer_ptrs[m][layer_dim] = dims[m];

  /* a single layer owns everything; there are no interior boundaries */
  if(layer_dim != 1) {
    /* foreach slice */
    for(idx_t s=1; s < dims[m]; ++s) {
      /* if we have passed the next layer boundary */
      if(nnzcnt >= lastn + pnnz) {

        /* mark lastn with this slice or the previous, whichever is closer */
        idx_t const thisdist = nnzcnt - (lastn + pnnz);
        idx_t const prevdist = (lastn + pnnz) - (nnzcnt - ssizes[m][s-1]);
        if(prevdist < thisdist) {
          lastn = nnzcnt - ssizes[m][s-1];
        } else {
          lastn = nnzcnt;
        }

        /* Always choosing s but marking lastn with s-1 leads to better balance
         * and communication volume. This is totally a heuristic. */
        rinfo->layer_ptrs[m][currp++] = s;

        /* exit early if we placed the last rank */
        if(currp == layer_dim) {
          break;
        }

        /* adjust target nnz based on what is left */
        pnnz = (nnz - lastn) / SS_MAX(1, layer_dim - (currp-1));
      }
      nnzcnt += ssizes[m][s];
    }
  }

  /* Boundaries the loop never placed would otherwise be read uninitialized
   * below; close them at dims[m] so the corresponding layers are empty. */
  while(currp < layer_dim) {
    rinfo->layer_ptrs[m][currp++] = dims[m];
  }

  /* store layer boundaries in layer_{starts, ends} */
  rinfo->layer_starts[m] = rinfo->layer_ptrs[m][rinfo->coords_3d[m]];
  rinfo->layer_ends[m] = rinfo->layer_ptrs[m][rinfo->coords_3d[m] + 1];

  /* it is possible to have a very small dimension and too many ranks */
  if(rinfo->dims_3d[m] > 1 &&
        rinfo->layer_ends[m] - rinfo->layer_starts[m] == dims[m]) {
    fprintf(stderr, "SPLATT: rank: %d too many MPI ranks for mode %"\
        SPLATT_PF_IDX".\n", rinfo->rank, m+1);
    rinfo->layer_starts[m] = dims[m];
    rinfo->layer_ends[m] = dims[m];
  }
}
Exemplo n.º 8
0
/**
* @brief Choose, for every mode, the contiguous range of slices owned by this
*        rank in a 1D decomposition and store it in rinfo->mat_start[m] /
*        rinfo->mat_end[m]. layer_starts/layer_ends are set to the full range.
*
* @param ssizes The number of nonzeros found in each index (of each mode).
* @param nmodes The number of modes.
* @param nnz The total number of nonzeros used as the balancing target.
* @param rinfo MPI rank information; the ranges are written here.
*/
static void p_find_my_slices_1d(
  idx_t ** const ssizes,
  idx_t const nmodes,
  idx_t const nnz,
  rank_info * const rinfo)
{
  idx_t const * const dims = rinfo->global_dims;

  for(idx_t m=0; m < nmodes; ++m) {
    idx_t const * const counts = ssizes[m];

    int nplaced = 0;          /* boundaries placed so far */
    idx_t boundary_nnz = 0;   /* nonzeros accounted for at the last boundary */
    idx_t seen = 0;           /* nonzeros in the slices before s */
    idx_t target = nnz / rinfo->npes;

    /* default to owning everything */
    rinfo->layer_starts[m] = 0;
    rinfo->layer_ends[m] = dims[m];
    rinfo->mat_start[m] = 0;
    rinfo->mat_end[m] = dims[m];

    for(idx_t s=0; s < dims[m]; ++s) {
      if(seen >= boundary_nnz + target) {
        if(s == 0) {
          boundary_nnz = seen;
        } else {
          /* account up to this slice or the previous, whichever is closer */
          idx_t const overshoot = seen - (boundary_nnz + target);
          idx_t const undershoot =
              (boundary_nnz + target) - (seen - counts[s-1]);
          boundary_nnz = (undershoot < overshoot) ? (seen - counts[s-1]) : seen;
        }

        ++nplaced;

        /* spread what is left over the remaining ranks */
        target = (nnz - boundary_nnz) / SS_MAX(1, rinfo->npes - nplaced);

        if(nplaced == rinfo->rank) {
          rinfo->mat_start[m] = s;
        } else if(nplaced == rinfo->rank+1 && nplaced != rinfo->npes) {
          /* the last rank keeps mat_end at the end of the tensor */
          rinfo->mat_end[m] = s;
          break;
        }
      }
      seen += counts[s];

      if(rinfo->rank == rinfo->npes-1) {
        assert(rinfo->mat_end[m] == rinfo->global_dims[m]);
      }
    }

    /* a tiny dimension with too many ranks leaves this rank with nothing */
    if(rinfo->npes > 1 && rinfo->mat_start[m] == 0
        && rinfo->mat_end[m] == dims[m]) {
      fprintf(stderr, "SPLATT: rank: %d too many MPI ranks for mode %"
          SPLATT_PF_IDX ".\n", rinfo->rank, m+1);
      rinfo->mat_start[m] = dims[m];
      rinfo->mat_end[m] = dims[m];
    }
  }
}
Exemplo n.º 9
0
/**
* @brief Reorder the nonzeros of 'tt' in place so that all nonzeros belonging
*        to the same tile are stored contiguously.
*
*        Each nonzero is assigned to a tile from its index in every mode
*        divided by that mode's tile width. Per-thread, per-tile counts are
*        prefix-summed into write offsets, the nonzeros are scattered into a
*        temporary tensor, and the result is copied back into 'tt'.
*
* @param tt The tensor to tile; its vals and ind arrays are overwritten.
* @param tile_dims tile_dims[m] is the number of tiles along mode m.
*
* @return An array of (number of tiles + 1) entries in which entry t is the
*         position of the first nonzero of tile t and the last entry equals
*         tt->nnz. It is allocated with splatt_malloc() and not freed here.
*/
idx_t * tt_densetile(
  sptensor_t * const tt,
  idx_t const * const tile_dims)
{
  timer_start(&timers[TIMER_TILE]);

  idx_t const nmodes = tt->nmodes;

  /*
   * Count tiles and compute their dimensions.
   */
  idx_t ntiles = 1;
  for(idx_t m=0; m < nmodes; ++m) {
    ntiles *= tile_dims[m];
  }
  /* the actual number of indices to place in each tile */
  idx_t tsizes[MAX_NMODES];
  for(idx_t m=0; m < nmodes; ++m) {
    tsizes[m] = SS_MAX(tt->dims[m] / tile_dims[m], 1);
  }

  /* We'll copy the newly tiled non-zeros into this one, then copy back */
  sptensor_t * newtt = tt_alloc(tt->nnz, tt->nmodes);

  /*
   * Count of non-zeros per tile. We use +1 because after a prefix sum, this
   * becomes a pointer into the non-zeros for each tile (e.g., csr->row_ptr).
   */
  idx_t * tcounts_global = splatt_malloc((ntiles+1) * sizeof(*tcounts_global));
  for(idx_t t=0; t < ntiles+1; ++t) {
    tcounts_global[t] = 0;
  }

  /*
   * A matrix of thread-local counters: one row per thread plus a final row
   * that aliases the global counter.
   */
  int const nthreads = splatt_omp_get_max_threads();
  idx_t * * tcounts_thread = splatt_malloc(
      (nthreads+1) * sizeof(*tcounts_thread));

  /* After the prefix sum, the global counter will have the sum of all nnz in
   * each tile (across threads), and thus can be returned. */
  tcounts_thread[nthreads] = tcounts_global;

  /* partition the non-zeros */
  idx_t * thread_parts = partition_simple(tt->nnz, nthreads);

  /* NOTE(review): rows 0..nthreads-1 of tcounts_thread are only allocated by
   * the threads that actually run below, while the prefix sum visits all
   * nthreads rows -- this looks like it relies on the region running with
   * exactly nthreads threads; confirm. */
  #pragma omp parallel
  {
    int const tid = splatt_omp_get_thread_num();
    idx_t const nnz_start = thread_parts[tid];
    idx_t const nnz_end   = thread_parts[tid+1];

    /* allocate / initialize thread-local counters */
    tcounts_thread[tid] = splatt_malloc(ntiles * sizeof(**tcounts_thread));
    for(idx_t tile=0; tile < ntiles; ++tile) {
      tcounts_thread[tid][tile] = 0;
    }
    /* row tid+1 is written below, so every row must exist first */
    #pragma omp barrier

    /* offset by 1 to make prefix sum easy */
    idx_t * tcounts_local = tcounts_thread[tid+1];

    /* count tile sizes (in nnz) */
    idx_t coord[MAX_NMODES];
    for(idx_t x=nnz_start; x < nnz_end; ++x) {
      for(idx_t m=0; m < nmodes; ++m) {
        /* capping at dims-1 fixes overflow when dims don't divide evenly */
        coord[m] = SS_MIN(tt->ind[m][x] / tsizes[m], tile_dims[m]-1);
      }
      idx_t const id = get_tile_id(tile_dims, nmodes, coord);
      assert(id < ntiles);
      ++tcounts_local[id];
    }

    #pragma omp barrier
    #pragma omp single
    {
      /* prefix sum for each tile: afterwards row t holds the write offset of
       * thread t for that tile, and the last (global) row holds the tile's
       * end position */
      for(idx_t tile=0; tile < ntiles; ++tile) {
        for(int thread=0; thread < nthreads; ++thread) {
          tcounts_thread[thread+1][tile] += tcounts_thread[thread][tile];
        }

        /* carry over to next tile */
        if(tile < (ntiles-1)) {
          tcounts_thread[0][tile+1] += tcounts_thread[nthreads][tile];
        }
      }
    } /* implied barrier */

    /* grab my starting indices now */
    tcounts_local = tcounts_thread[tid];

    /*
     * Rearrange old tensor into new tiled one.
     */
    for(idx_t x=nnz_start; x < nnz_end; ++x) {
      for(idx_t m=0; m < nmodes; ++m) {
        coord[m] = SS_MIN(tt->ind[m][x] / tsizes[m], tile_dims[m]-1);
      }
      idx_t const id = get_tile_id(tile_dims, nmodes, coord);
      assert(id < ntiles);

      /* claim the next free slot of this thread's range within the tile */
      idx_t const newidx = tcounts_local[id]++;
      newtt->vals[newidx] = tt->vals[x];
      for(idx_t m=0; m < nmodes; ++m) {
        newtt->ind[m][newidx] = tt->ind[m][x];
      }
    }

    /* releases this thread's own row, tcounts_thread[tid] */
    splatt_free(tcounts_local);
  } /* end omp parallel */

  /* copy tiled data into old struct */
  par_memcpy(tt->vals, newtt->vals, tt->nnz * sizeof(*tt->vals));
  for(idx_t m=0; m < nmodes; ++m) {
    par_memcpy(tt->ind[m], newtt->ind[m], tt->nnz * sizeof(**tt->ind));
  }

  /* shift counts to the right by 1 to make proper pointer */
  memmove(tcounts_global+1, tcounts_global, ntiles * sizeof(*tcounts_global));
  tcounts_global[0] = 0;
  assert(tcounts_global[ntiles] == tt->nnz);

  tt_free(newtt);
  /* only the row table is freed; the per-thread rows were freed above and the
   * global row is returned */
  splatt_free(tcounts_thread);
  splatt_free(thread_parts);

  timer_stop(&timers[TIMER_TILE]);
  return tcounts_global;
}
Exemplo n.º 10
0
/**
 * Wrapper for calloc that allocates zero-initialized storage for
 * max(n,1) items.
 *
 * @param n     Requested number of items; values below 1 are treated as 1.
 * @param size  Size of one item in bytes.
 * @return Whatever calloc returns for that request.
 */
static void *ss_calloc (SS_INT n, size_t size)
{
    SS_INT count = SS_MAX (n, 1) ;          /* never ask for zero items */
    void *block = calloc (count, size) ;
    return (block) ;
}
Exemplo n.º 11
0
/**
 * Wrapper for malloc that allocates storage for max(n,1) items.
 *
 * @param n     Requested number of items; values below 1 are treated as 1.
 * @param size  Size of one item in bytes.
 * @return The block returned by malloc, or NULL if the total byte count does
 *         not fit in size_t.
 */
static void *ss_malloc (SS_INT n, size_t size)
{
    size_t count = (size_t) SS_MAX (n, 1) ;
    /* refuse requests whose byte count would wrap around: malloc would
     * otherwise succeed with a block smaller than the caller expects */
    if (size != 0 && count > ((size_t) -1) / size) return (NULL) ;
    return (malloc (count * size)) ;
}