Esempio n. 1
0
/* For each of the `count` global vertex ids in vertex_p, store the result of
 * VERTEX_OWNER in owner_p and the result of VERTEX_LOCAL in local_p at the
 * same index.  The three arrays are accessed through restrict-qualified
 * aliases, so they must not overlap. */
void get_vertex_distribution_for_pred(size_t count, const int64_t* vertex_p, int* owner_p, size_t* local_p) {
  const int64_t* restrict src = vertex_p;
  int* restrict owner_out = owner_p;
  size_t* restrict local_out = local_p;
  const ptrdiff_t n = (ptrdiff_t)count;
  ptrdiff_t idx;
  /* Iterations are independent: each one writes only its own output slots. */
#pragma omp parallel for
  for (idx = 0; idx < n; ++idx) {
    const int64_t v = src[idx];
    owner_out[idx] = VERTEX_OWNER(v);
    local_out[idx] = VERTEX_LOCAL(v);
  }
}
Esempio n. 2
0
/* Validate a distributed BFS predecessor map.
 *
 * Parameters:
 *   g        - local part of the CSR graph (rowstarts/column plus the
 *              nlocalverts, nlocaledges and nglobalverts counts are read).
 *   root     - global id of the BFS root.
 *   pred     - local predecessor map, one entry per local vertex; -1 means
 *              "not visited".
 *   nvisited - number of visited vertices claimed by the BFS (global count).
 *
 * Returns 1 when every check passed on every rank (the local flags are
 * combined with a logical AND over MPI_COMM_WORLD) and 0 otherwise.
 * Diagnostics are written to stderr.
 *
 * Checks performed:
 *   1. the root is its own parent, and no other vertex is;
 *   2. every predecessor lies in [-1, nglobalverts);
 *   3. the number of entries != -1 matches nvisited;
 *   4. every visited vertex has a graph edge to its predecessor;
 *   5. following predecessors yields consistent depths (a tree);
 *   6. the endpoints of every edge differ in depth by at most one.
 * Checks 4-6 are skipped on all ranks if any rank saw an out-of-range value.
 *
 * This function is collective: every rank must call it, since it issues
 * MPI_Allreduce / MPI_Alltoall / MPI_Alltoallv on MPI_COMM_WORLD.  It relies
 * on `rank` and `size`, which are defined outside this function. */
int validate_bfs_result(const csr_graph* const g, const int64_t root, const int64_t* const pred, const int64_t nvisited) {
  int validation_passed = 1;
  int root_is_mine = (VERTEX_OWNER(root) == rank);

  const size_t nlocalverts = g->nlocalverts;
  const size_t nlocaledges = g->nlocaledges;
  const int64_t nglobalverts = g->nglobalverts;

  /* Check that root is its own parent. */
  if (root_is_mine) {
    if (pred[VERTEX_LOCAL(root)] != root) {
      fprintf(stderr, "%d: Validation error: parent of root vertex %" PRId64 " is %" PRId64 ", not the root itself.\n", rank, root, pred[VERTEX_LOCAL(root)]);
      validation_passed = 0;
    }
  }

  /* Check that nothing else is its own parent, and check for in-range
   * values. */
  int any_range_errors = 0;
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    int64_t v = VERTEX_TO_GLOBAL(i);
    assert (VERTEX_OWNER(v) == rank);
    assert (VERTEX_LOCAL(v) == i);
    if (v != root && pred[i] == v) {
      fprintf(stderr, "%d: Validation error: parent of non-root vertex %" PRId64 " is itself.\n", rank, v);
      validation_passed = 0;
    }
    if (pred[i] < -1 || pred[i] >= nglobalverts) {
      fprintf(stderr, "%d: Validation error: parent of vertex %" PRId64 " is out-of-range value %" PRId64 ".\n", rank, v, pred[i]);
      validation_passed = 0;
      any_range_errors = 1;
    }
  }
  /* Make the range-error flag global so that all ranks take the same branch
   * below (the skipped section contains collectives). */
  MPI_Allreduce(MPI_IN_PLACE, &any_range_errors, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);

  /* Check that nvisited is correct. */
  int64_t nvisited_actual = 0;
  for (i = 0; i < nlocalverts; ++i) {
    if (pred[i] != -1) ++nvisited_actual;
  }
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_actual, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  if (nvisited_actual != nvisited) {
    fprintf(stderr, "%d: Validation error: claimed visit count %" PRId64 " is different from actual count %" PRId64 ".\n", rank, nvisited, nvisited_actual);
    validation_passed = 0;
  }

  if (!any_range_errors) { /* Other parts of validation assume in-range values */

    /* Check that there is an edge from each vertex to its claimed
     * predecessor. */
    size_t i;
    for (i = 0; i < nlocalverts; ++i) {
      int64_t v = VERTEX_TO_GLOBAL(i);
      int64_t p = pred[i];
      if (p == -1) continue;
      int found_pred_edge = 0;
      if (v == p) found_pred_edge = 1; /* Root vertex */
      size_t ei, ei_end = g->rowstarts[i + 1];
      for (ei = g->rowstarts[i]; ei < ei_end; ++ei) {
        int64_t w = g->column[ei];
        if (w == p) {
          found_pred_edge = 1;
          break;
        }
      }
      if (!found_pred_edge) {
        fprintf(stderr, "%d: Validation error: no graph edge from vertex %" PRId64 " to its parent %" PRId64 ".\n", rank, v, p);
        validation_passed = 0;
      }
    }

    /* Create a vertex depth map to use for later validation. */
    int64_t* depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t));
    { /* Scope some code that has a lot of temporary variables. */
      int64_t* pred_depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex */
      size_t i;
      /* INT64_MAX marks "depth not known yet". */
      for (i = 0; i < nlocalverts; ++i) depth[i] = INT64_MAX;
      if (root_is_mine) depth[VERTEX_LOCAL(root)] = 0;
      /* Send each vertex that appears in the local part of the predecessor map
       * to its owner; record the original locations so we can put the answers
       * into pred_depth. */
      /* Do a histogram sort by owner (this same kind of sort is used other
       * places as well).  First, count the number of vertices going to each
       * destination.  Unvisited entries (-1) are routed to the last rank,
       * which answers them with INT64_MAX. */
      int* num_preds_per_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
      for (i = 0; i < nlocalverts; ++i) {
        ++num_preds_per_owner[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
      }
      int64_t* preds_per_owner = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Predecessors sorted by owner */
      int64_t* preds_per_owner_results_offsets = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Indices into pred_depth to write */
      /* Second, do a prefix sum to get the displacements of the different
       * owners in the outgoing array. */
      int* pred_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
      pred_owner_displs[0] = 0;
      int r;
      for (r = 0; r < size; ++r) {
        pred_owner_displs[r + 1] = pred_owner_displs[r] + num_preds_per_owner[r];
      }
      /* Last, put the vertices into the correct positions in the array, based
       * on their owners and the counts and displacements computed earlier. */
      int* pred_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
      memcpy(pred_owner_offsets, pred_owner_displs, (size + 1) * sizeof(int));
      for (i = 0; i < nlocalverts; ++i) {
        int* offset_ptr = &pred_owner_offsets[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
        preds_per_owner[*offset_ptr] = pred[i];
        preds_per_owner_results_offsets[*offset_ptr] = i;
        ++*offset_ptr;
      }
      for (r = 0; r < size; ++r) {
        assert (pred_owner_offsets[r] == pred_owner_displs[r + 1]);
      }
      free(pred_owner_offsets);

      /* Send around the number of vertices that will be sent to each destination. */
      int* num_my_preds_per_sender = (int*)xmalloc(size * sizeof(int));
      MPI_Alltoall(num_preds_per_owner, 1, MPI_INT,
                   num_my_preds_per_sender, 1, MPI_INT,
                   MPI_COMM_WORLD);
      int* my_preds_per_sender_displs = (int*)xmalloc((size + 1) * sizeof(int));
      my_preds_per_sender_displs[0] = 0;
      for (r = 0; r < size; ++r) {
        my_preds_per_sender_displs[r + 1] = my_preds_per_sender_displs[r] + num_my_preds_per_sender[r];
      }
      /* Send around the actual vertex data (list of depth requests that will
       * be responded to at each BFS iteration). */
      int64_t* my_depth_requests = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      int64_t* my_depth_replies = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      MPI_Alltoallv(preds_per_owner, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                    my_depth_requests, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                    MPI_COMM_WORLD);

      int64_t* pred_depth_raw = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex, ordered by source proc */

      /* Do a mini-BFS (naively) over just the predecessor graph (hopefully a
       * tree) produced by the real BFS; fill in the depth map.  The loop ends
       * when a round changes no depth on any rank. */
      while (1) {
        int any_changed = 0;
        int i;
        /* Create and send the depth values requested by other nodes.  The list
         * of requests is sent once, and are stored on the receiver so the
         * replies can be sent (possibly with updated depth values) at every
         * iteration. */
        for (i = 0; i < my_preds_per_sender_displs[size]; ++i) {
          my_depth_replies[i] = (my_depth_requests[i] == -1 ? INT64_MAX : depth[VERTEX_LOCAL(my_depth_requests[i])]);
        }
        MPI_Alltoallv(my_depth_replies, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                      pred_depth_raw, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);
        {
          size_t i;
          /* Put the received depths into the local array. */
          for (i = 0; i < nlocalverts; ++i) {
            pred_depth[preds_per_owner_results_offsets[i]] = pred_depth_raw[i];
          }
          /* Check those values to determine if they violate any correctness
           * conditions. */
          for (i = 0; i < nlocalverts; ++i) {
            int64_t v = VERTEX_TO_GLOBAL(i);
            if (v == root) {
              /* The depth and predecessor for this were checked earlier. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] == INT64_MAX) {
              /* OK -- depth should be filled in later. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] != INT64_MAX) {
              depth[i] = pred_depth[i] + 1;
              any_changed = 1;
            } else if (depth[i] != pred_depth[i] + 1) {
              fprintf(stderr, "%d: Validation error: BFS predecessors do not form a tree; see vertices %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ").\n", rank, v, depth[i], pred[i], pred_depth[i]);
              validation_passed = 0;
            } else {
              /* Vertex already has its correct depth value. */
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &any_changed, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
        if (!any_changed) break;
      }

      free(num_preds_per_owner);
      free(num_my_preds_per_sender);
      free(preds_per_owner);
      free(preds_per_owner_results_offsets);
      free(my_preds_per_sender_displs);
      free(my_depth_requests);
      free(my_depth_replies);
      free(pred_owner_displs);
      free(pred_depth);
      free(pred_depth_raw);
    }

    /* Check that all edges connect vertices whose depths differ by at most
     * one. */
    {
      /* Reduce an explicit int64_t copy: nlocaledges is a size_t, which need
       * not have the representation INT64_T_MPI_TYPE describes. */
      int64_t nlocaledges_i64 = (int64_t)nlocaledges;
      int64_t maxlocaledges = 0;
      MPI_Allreduce(&nlocaledges_i64, &maxlocaledges, 1, INT64_T_MPI_TYPE, MPI_MAX, MPI_COMM_WORLD);
      /* We break the total list of overall edges into chunks to reduce the
       * amount of data to be sent at a time (since we are using MPI_Alltoallv
       * to send data collectively).  Every rank runs the same number of
       * chunks (derived from the global maximum) so the collectives match;
       * ranks with fewer edges process empty chunks. */
      const int edge_chunk_size = (1 << 23); /* Reduce memory usage */
      int num_edge_groups = (maxlocaledges + edge_chunk_size - 1) / edge_chunk_size;
      int eg;
      for (eg = 0; eg < num_edge_groups; ++eg) {
        /* Multiply in size_t: the product eg * edge_chunk_size does not fit
         * in an int once eg reaches 256 (2^31 edges). */
        size_t first_edge_index = (size_t)eg * (size_t)edge_chunk_size;
        if (first_edge_index > nlocaledges) first_edge_index = nlocaledges;
        size_t last_edge_index = ((size_t)eg + 1) * (size_t)edge_chunk_size;
        if (last_edge_index > nlocaledges) last_edge_index = nlocaledges;
        /* Sort the edge targets in this chunk by their owners (histogram
         * sort); see the BFS code above for details of the steps of the
         * algorithm. */
        int* num_edge_targets_by_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
        size_t ei;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          ++num_edge_targets_by_owner[VERTEX_OWNER(g->column[ei])];
        }
        int* edge_targets_by_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
        edge_targets_by_owner_displs[0] = 0;
        int i;
        for (i = 0; i < size; ++i) {
          edge_targets_by_owner_displs[i + 1] = edge_targets_by_owner_displs[i] + num_edge_targets_by_owner[i];
        }
        int64_t* edge_targets_by_owner = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t));
        int64_t* edge_targets_by_owner_indices = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t)); /* Source indices for where to write the targets */
        int* edge_targets_by_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
        memcpy(edge_targets_by_owner_offsets, edge_targets_by_owner_displs, (size + 1) * sizeof(int));
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          edge_targets_by_owner[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = g->column[ei];
          edge_targets_by_owner_indices[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = ei;
          ++edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])];
        }
        for (i = 0; i < size; ++i) {
          assert (edge_targets_by_owner_offsets[i] == edge_targets_by_owner_displs[i + 1]);
        }
        free(edge_targets_by_owner_offsets);

        /* Send around the number of data elements that will be sent later. */
        int* num_incoming_targets_by_src = (int*)xmalloc(size * sizeof(int));
        MPI_Alltoall(num_edge_targets_by_owner, 1, MPI_INT,
                     num_incoming_targets_by_src, 1, MPI_INT,
                     MPI_COMM_WORLD);
        int* incoming_targets_by_src_displs = (int*)xmalloc((size + 1) * sizeof(int));
        incoming_targets_by_src_displs[0] = 0;
        for (i = 0; i < size; ++i) {
          incoming_targets_by_src_displs[i + 1] = incoming_targets_by_src_displs[i] + num_incoming_targets_by_src[i];
        }

        int64_t* target_depth_requests = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));
        int64_t* target_depth_replies = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));

        /* Send the actual requests for the depths of edge targets. */
        MPI_Alltoallv(edge_targets_by_owner, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      target_depth_requests, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);

        free(edge_targets_by_owner);

        /* Fill in the replies for the requests sent to me. */
        for (i = 0; i < incoming_targets_by_src_displs[size]; ++i) {
          assert (VERTEX_OWNER(target_depth_requests[i]) == rank);
          target_depth_replies[i] = depth[VERTEX_LOCAL(target_depth_requests[i])];
        }

        free(target_depth_requests);

        int64_t* target_depth_raw = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));

        /* Send back the replies. */
        MPI_Alltoallv(target_depth_replies, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      target_depth_raw, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);

        free(target_depth_replies);
        free(num_incoming_targets_by_src);
        free(num_edge_targets_by_owner);
        free(incoming_targets_by_src_displs);
        free(edge_targets_by_owner_displs);

        int64_t* target_depth = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));

        /* Put the replies into the proper order (original order of the edges).
         * */
        for (ei = 0; ei < last_edge_index - first_edge_index; ++ei) {
          target_depth[edge_targets_by_owner_indices[ei] - first_edge_index] = target_depth_raw[ei];
        }

        free(target_depth_raw);
        free(edge_targets_by_owner_indices);

        /* Check the depth relationship of the endpoints of each edge in the
         * current chunk.  src_i tracks the local row that contains edge ei. */
        size_t src_i = 0;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          while (ei >= g->rowstarts[src_i + 1]) {
            ++src_i;
          }
          int64_t src = VERTEX_TO_GLOBAL(src_i);
          int64_t src_depth = depth[src_i];
          int64_t tgt = g->column[ei];
          int64_t tgt_depth = target_depth[ei - first_edge_index];
          if (src_depth != INT64_MAX && tgt_depth == INT64_MAX) {
            fprintf(stderr, "%d: Validation error: edge connects vertex %" PRId64 " in the BFS tree (depth %" PRId64 ") to vertex %" PRId64 " outside the tree.\n", rank, src, src_depth, tgt);
            validation_passed = 0;
          } else if (src_depth == INT64_MAX && tgt_depth != INT64_MAX) {
            /* Skip this for now; this problem will be caught when scanning
             * reversed copy of this edge.  Set the failure flag, though,
             * just in case. */
            validation_passed = 0;
          } else if (src_depth - tgt_depth < -1 ||
                     src_depth - tgt_depth > 1) {
            fprintf(stderr, "%d: Validation error: depths of edge endpoints %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ") are too far apart (abs. val. > 1).\n", rank, src, src_depth, tgt, tgt_depth);
            validation_passed = 0;
          }
        }
        free(target_depth);
      }
    }

    free(depth);

  } /* End of part skipped by range errors */

  /* Collect the global validation result. */
  MPI_Allreduce(MPI_IN_PLACE, &validation_passed, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
  return validation_passed;
}
Esempio n. 3
0
/* Breadth-first search whose frontier queues are bitmaps and whose updates
 * are performed with MPI one-sided operations (MPI_Accumulate inside
 * MPI_Win_fence epochs).  According to the original author it is not much
 * faster than the standard version on the machines it was tested on, but
 * might do better on systems with good RDMA hardware and one-sided MPI
 * implementations; it might also be a good candidate for translation to UPC,
 * Co-array Fortran, SHMEM, or GASNet.
 *
 * g        - local part of the CSR graph.
 * root     - global id of the start vertex.
 * pred     - output: predecessor of each local vertex, -1 if not reached.
 * nvisited - output: number of visited vertices, summed over all ranks.
 *
 * Collective over MPI_COMM_WORLD. */
void run_mpi_bfs(const csr_graph* const g, int64_t root, int64_t* pred, int64_t* nvisited) {
  const size_t nlocal = g->nlocalverts;
  const int64_t nglobal = g->nglobalverts;
  int64_t visited_here = 0;

  /* Two predecessor maps: one is read while the other is being modified.
   * The caller's buffer is remembered because the two get swapped. */
  int64_t* const caller_pred = pred;
  int64_t* cur_pred = pred;
  int64_t* next_pred = (int64_t*)xMPI_Alloc_mem(nlocal * sizeof(int64_t));

  /* Both queues (current and next level) are bitmaps in which one bit stands
   * for elts_per_queue_bit consecutive predecessor-map entries.  The bitmap
   * overapproximates the real queue: MPI_Accumulate reports nothing about
   * whether an update took effect, so bits may be set for vertices that were
   * already black.  The predecessor map therefore has to be consulted before
   * a vertex is actually processed. */
  const int elts_per_queue_bit = 4;
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  int64_t queue_nbits = (nlocal + elts_per_queue_bit - 1) / elts_per_queue_bit;
  int64_t queue_nwords = (queue_nbits + ulong_bits - 1) / ulong_bits;
  unsigned long* cur_queue = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  unsigned long* next_queue = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  memset(cur_queue, 0, queue_nwords * sizeof(unsigned long));

  /* Global ids of the local vertices; these serve as origin buffers for
   * MPI_Accumulate. */
  int64_t* local_vertices = (int64_t*)xMPI_Alloc_mem(nlocal * sizeof(int64_t));
  for (size_t k = 0; k < nlocal; ++k) {
    local_vertices[k] = VERTEX_TO_GLOBAL(k);
  }

  /* One single-bit mask per bit position of an unsigned long; also origin
   * buffers for MPI_Accumulate. */
  unsigned long masks[ulong_bits];
  for (int b = 0; b < ulong_bits; ++b) {
    masks[b] = (1UL << b);
  }

  /* Encoding used in the predecessor maps while the search runs:
   *   white (not visited): INT64_MAX
   *   grey  (in queue):    0 .. nglobal-1
   *   black (done):        -nglobal .. -1 */

  /* Everything starts out white. */
  for (size_t k = 0; k < nlocal; ++k) {
    cur_pred[k] = INT64_MAX;
  }

  /* The owner of the root colours it grey and enqueues it. */
  if (VERTEX_OWNER(root) == rank) {
    cur_pred[VERTEX_LOCAL(root)] = root;
    cur_queue[VERTEX_LOCAL(root) / elts_per_queue_bit / ulong_bits] |= (1UL << ((VERTEX_LOCAL(root) / elts_per_queue_bit) % ulong_bits));
  }

  /* Expose both predecessor maps and both queues as MPI windows. */
  MPI_Win cur_pred_win, next_pred_win, cur_queue_win, next_queue_win;
  MPI_Win_create(cur_pred, nlocal * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &cur_pred_win);
  MPI_Win_create(next_pred, nlocal * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &next_pred_win);
  MPI_Win_create(cur_queue, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &cur_queue_win);
  MPI_Win_create(next_queue, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &next_queue_win);

  for (;;) {
    /* Start the level with an empty next-level queue. */
    memset(next_queue, 0, queue_nwords * sizeof(unsigned long));

    /* next_pred := cur_pred with every grey vertex turned black. */
    memcpy(next_pred, cur_pred, nlocal * sizeof(int64_t));
    for (int64_t k = 0; k < (int64_t)nlocal; ++k) {
      if (next_pred[k] >= 0 && next_pred[k] < nglobal) {
        next_pred[k] -= nglobal;
      }
    }

    /* Open the one-sided epoch for this level. */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, next_pred_win);
    MPI_Win_fence(MPI_MODE_NOPRECEDE, next_queue_win);

    /* Walk the current queue word by word, bit by bit. */
    for (int64_t word = 0; word < queue_nwords; ++word) {
      const unsigned long bits = cur_queue[word];
      if (bits == 0) continue; /* nothing queued in this word */

      for (int bitnum = 0; bitnum < ulong_bits; ++bitnum) {
        const size_t group_start = (size_t)((word * ulong_bits + bitnum) * elts_per_queue_bit);
        if (group_start >= nlocal) break;
        const int bit_is_set = (int)((bits >> bitnum) & 1);
        if (!bit_is_set) continue;

        /* Visit the predecessor-map entries covered by this bit. */
        for (int slot = 0; slot < elts_per_queue_bit; ++slot) {
          const size_t v_local = group_start + slot;
          if (v_local >= nlocal) continue;
          /* The queue overapproximates, so only proceed for vertices that
           * are really grey. */
          if (cur_pred[v_local] < 0 || cur_pred[v_local] >= nglobal) continue;

          ++visited_here;
          const size_t edge_end = g->rowstarts[v_local + 1];
          for (size_t e = g->rowstarts[v_local]; e < edge_end; ++e) {
            const int64_t w = g->column[e];
            if (w == VERTEX_TO_GLOBAL(v_local)) continue; /* Self-loop */
            /* Propose this vertex as predecessor of w.  MPI_MIN together
             * with the colour encoding decides whether the proposal
             * replaces the stored value. */
            MPI_Accumulate(&local_vertices[v_local], 1, INT64_T_MPI_TYPE, VERTEX_OWNER(w), VERTEX_LOCAL(w), 1, INT64_T_MPI_TYPE, MPI_MIN, next_pred_win);
            /* Flag w in its owner's next-level queue.  The min above may
             * have been a no-op, which is another reason the queue is only
             * an overapproximation. */
            MPI_Accumulate(&masks[((VERTEX_LOCAL(w) / elts_per_queue_bit) % ulong_bits)], 1, MPI_UNSIGNED_LONG, VERTEX_OWNER(w), VERTEX_LOCAL(w) / elts_per_queue_bit / ulong_bits, 1, MPI_UNSIGNED_LONG, MPI_BOR, next_queue_win);
          }
        }
      }
    }

    /* Close the one-sided epoch. */
    MPI_Win_fence(MPI_MODE_NOSUCCEED, next_queue_win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, next_pred_win);

    /* Finish once the next-level queue is empty on every rank. */
    int any_set = 0;
    for (int64_t word = 0; word < queue_nwords && !any_set; ++word) {
      any_set = (next_queue[word] != 0);
    }
    MPI_Allreduce(MPI_IN_PLACE, &any_set, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
    if (!any_set) break;

    /* Exchange the roles of the two queues and the two predecessor maps
     * (buffers and their windows alike). */
    {
      MPI_Win win_tmp = cur_queue_win;
      cur_queue_win = next_queue_win;
      next_queue_win = win_tmp;
    }
    {
      unsigned long* queue_tmp = cur_queue;
      cur_queue = next_queue;
      next_queue = queue_tmp;
    }
    {
      MPI_Win win_tmp = cur_pred_win;
      cur_pred_win = next_pred_win;
      next_pred_win = win_tmp;
    }
    {
      int64_t* pred_tmp = cur_pred;
      cur_pred = next_pred;
      next_pred = pred_tmp;
    }
  }

  MPI_Win_free(&cur_pred_win);
  MPI_Win_free(&next_pred_win);
  MPI_Win_free(&cur_queue_win);
  MPI_Win_free(&next_queue_win);
  MPI_Free_mem(local_vertices);
  MPI_Free_mem(cur_queue);
  MPI_Free_mem(next_queue);

  /* The final state is in next_pred.  The surrounding code does not allow
   * the BFS to change the predecessor map pointer, so copy the result back
   * if it ended up in the scratch buffer, and release the scratch buffer
   * either way. */
  if (next_pred == caller_pred) {
    MPI_Free_mem(cur_pred);
  } else {
    memcpy(caller_pred, next_pred, nlocal * sizeof(int64_t));
    MPI_Free_mem(next_pred);
  }

  /* Translate the internal colour encoding into the form the benchmark
   * requires: black entries become plain vertex ids, white becomes -1. */
  for (size_t k = 0; k < nlocal; ++k) {
    if (caller_pred[k] < 0) {
      caller_pred[k] += nglobal;
    } else if (caller_pred[k] == INT64_MAX) {
      caller_pred[k] = -1;
    }
  }

  /* Report the global number of visited vertices. */
  MPI_Allreduce(MPI_IN_PLACE, &visited_here, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  *nvisited = visited_here;
}
Esempio n. 4
0
/* This version is the traditional level-synchronized BFS using two queues.  A
 * bitmap is used to indicate which vertices have been visited.  Messages are
 * sent and processed asynchronously throughout the code to hopefully overlap
 * communication with computation.
 *
 * NOTE(review): the body visible here communicates only through blocking
 * collectives (MPI_Allreduce / MPI_Allgather); the sentence about
 * asynchronous messages above may be inherited from another variant.
 *
 * root - global id of the start vertex.
 * pred - output, one entry per local vertex: -1 if the vertex was not
 *        reached; for the root, the root id itself; otherwise the low 48
 *        bits of the parent id OR'ed with the BFS level shifted left by 48.
 *
 * The graph and the work buffers are not parameters: the function reads the
 * file-scope `g` and the g_in_queue / g_in_queue_summary / g_out_queue /
 * g_out_queue_summary / g_visited pointers, which are defined outside this
 * block (presumably set up by allocate_memory() -- confirm there). */
void run_bfs(int64_t root, int64_t* pred) {
  allocate_memory();
  const ptrdiff_t nlocalverts = g.nlocalverts;
  const size_t* const restrict rowstarts = g.rowstarts;
  const int64_t* const restrict column = g.column;
  int64_t maxlocalverts = g.max_nlocalverts;

  /* Set up the visited bitmap. */
  /* Sizes are counted in unsigned long words.  The per-rank queue is rounded
   * up so that it holds a whole number of summary words, where each summary
   * bit stands for one queue word. */
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  const int ulong_bits_squared = ulong_bits * ulong_bits;
  int64_t local_queue_summary_size = (maxlocalverts + ulong_bits_squared - 1) / ulong_bits_squared;
  int64_t local_queue_size = local_queue_summary_size * ulong_bits;
  int lg_local_queue_size = lg_int64_t(local_queue_size);
  int64_t global_queue_summary_size = MUL_SIZE(local_queue_summary_size);
  int64_t global_queue_size = MUL_SIZE(local_queue_size);

/* Map a global vertex id to its bit position in the gathered in_queue:
 * (owner << lg_local_queue_size) * ulong_bits combined with the local index.
 * The macro captures the locals lg_local_queue_size and ulong_bits. */
#define SWIZZLE_VERTEX(c) ((VERTEX_OWNER(c) << lg_local_queue_size) * ulong_bits | VERTEX_LOCAL(c))
#if 0
  int64_t* restrict column_swizzled = (int64_t*)xmalloc(nlocaledges * sizeof(int64_t));
  {
    size_t i;
    for (i = 0; i < nlocaledges; ++i) {
      int64_t c = column[i];
      column_swizzled[i] = SWIZZLE_VERTEX(c);
    }
  }
#endif

  /* in_queue / in_queue_summary are sized for all ranks (they are the
   * receive buffers of the MPI_Allgather calls below); out_queue,
   * out_queue_summary and visited cover only this rank's vertices. */
  unsigned long* restrict in_queue = g_in_queue;
  memset(in_queue, 0, global_queue_size * sizeof(unsigned long));
  unsigned long* restrict in_queue_summary = g_in_queue_summary;
  memset(in_queue_summary, 0, global_queue_summary_size * sizeof(unsigned long));
  unsigned long* restrict out_queue = g_out_queue;
  unsigned long* restrict out_queue_summary = g_out_queue_summary;
  unsigned long* restrict visited = g_visited;
  memset(visited, 0, local_queue_size * sizeof(unsigned long));

/* SET_IN(v): mark global vertex v in in_queue and set the summary bit of the
 * word that holds it.
 * TEST_IN(vs): vs is an already swizzled index; the summary bit is tested
 * first and the queue word is only read when that bit is set.
 * TEST_VISITED_LOCAL(v) / SET_VISITED_LOCAL(v): v is a local index; setting
 * marks the vertex in both visited and out_queue.  The active
 * SET_VISITED_LOCAL uses plain (non-atomic) ORs; the commented-out variant
 * uses __sync_fetch_and_or instead. */
#define SET_IN(v) do {int64_t vs = SWIZZLE_VERTEX(v); size_t word_idx = vs / ulong_bits; int bit_idx = vs % ulong_bits; unsigned long mask = (1UL << bit_idx); in_queue_summary[word_idx / ulong_bits] |= (1UL << (word_idx % ulong_bits)); in_queue[word_idx] |= mask;} while (0)
#define TEST_IN(vs) (((in_queue_summary[vs / ulong_bits / ulong_bits] & (1UL << ((vs / ulong_bits) % ulong_bits))) != 0) && ((in_queue[vs / ulong_bits] & (1UL << (vs % ulong_bits))) != 0))
#define TEST_VISITED_LOCAL(v) ((visited[(v) / ulong_bits] & (1UL << ((v) % ulong_bits))) != 0)
// #define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); __sync_fetch_and_or(&visited[word_idx], mask); __sync_fetch_and_or(&out_queue[word_idx], mask);} while (0)
#define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); visited[word_idx] |= mask; out_queue[word_idx] |= mask;} while (0)

  /* Every rank puts the root into its copy of in_queue; only the owner
   * records the root's predecessor and marks it visited. */
  SET_IN(root);
  {ptrdiff_t i; _Pragma("omp parallel for schedule(static)") for (i = 0; i < nlocalverts; ++i) pred[i] = -1;}
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    SET_VISITED_LOCAL(VERTEX_LOCAL(root));
  }
  uint16_t cur_level = 0;
  /* One iteration per BFS level.  Each unvisited local vertex scans its
   * adjacency list for a neighbour that is in in_queue (the previous level's
   * frontier) and, if it finds one, adopts it as parent. */
  while (1) {
    ++cur_level;
#if 0
    if (rank == 0) fprintf(stderr, "BFS level %" PRIu16 "\n", cur_level);
#endif
    memset(out_queue, 0, local_queue_size * sizeof(unsigned long));
    // memset(out_queue_summary, 0, local_queue_summary_size * sizeof(unsigned long));
    ptrdiff_t i, ii;
#if 0
#pragma omp parallel for schedule(static)
    for (i = 0; i < global_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        if (in_queue[i * ulong_bits + j]) val |= mask;
      }
      in_queue_summary[i] = val;
    }
#endif
    /* Becomes nonzero when this rank discovers at least one vertex. */
    unsigned long not_done = 0;
    /* The parallel loop advances ulong_bits vertices per iteration, so each
     * iteration touches exactly one word of visited/out_queue; that is what
     * lets SET_VISITED_LOCAL use non-atomic updates here. */
#pragma omp parallel for schedule(static) reduction(|:not_done)
    for (ii = 0; ii < nlocalverts; ii += ulong_bits) {
      /* This inner `i` shadows the ptrdiff_t `i` declared above. */
      size_t i, i_end = ii + ulong_bits;
      if (i_end > nlocalverts) i_end = nlocalverts;
      for (i = ii; i < i_end; ++i) {
        if (!TEST_VISITED_LOCAL(i)) {
          size_t j, j_end = rowstarts[i + 1];
          for (j = rowstarts[i]; j < j_end; ++j) {
            int64_t v1 = column[j];
            int64_t v1_swizzled = SWIZZLE_VERTEX(v1);
            if (TEST_IN(v1_swizzled)) {
              /* Parent id in the low 48 bits, BFS level above them. */
              pred[i] = (v1 & INT64_C(0xFFFFFFFFFFFF)) | ((int64_t)cur_level << 48);
              not_done |= 1;
              SET_VISITED_LOCAL(i);
              break;
            }
          }
        }
      }
    }
#if 1
    /* Build the summary of out_queue: bit j of summary word i is set when
     * queue word i * ulong_bits + j is nonzero.  The same pass ORs out_queue
     * into visited (SET_VISITED_LOCAL has already set those bits). */
#pragma omp parallel for schedule(static)
    for (i = 0; i < local_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        unsigned long full_val = out_queue[i * ulong_bits + j];
        visited[i * ulong_bits + j] |= full_val;
        if (full_val) val |= mask;
      }
      out_queue_summary[i] = val;
      // not_done |= val;
    }
#endif
    /* Stop when no rank discovered a vertex at this level. */
    MPI_Allreduce(MPI_IN_PLACE, &not_done, 1, MPI_UNSIGNED_LONG, MPI_BOR, MPI_COMM_WORLD);
    if (not_done == 0) break;
    /* Concatenate every rank's out_queue (and its summary) into in_queue
     * (and in_queue_summary) for the next level. */
    MPI_Allgather(out_queue, local_queue_size, MPI_UNSIGNED_LONG, in_queue, local_queue_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
    MPI_Allgather(out_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, in_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
  }
  deallocate_memory();
}
Esempio n. 5
0
/* Read this rank's part of a CSR graph from one binary file using MPI-IO.
 *
 * File layout as consumed by this function:
 *   vertex count n   (read as MPI_UINT32_T, sizeof(vertex_id_t) bytes)
 *   edge count m     (read as MPI_UINT64_T, sizeof(edge_id_t) bytes)
 *   directed flag    (MPI_C_BOOL)
 *   align byte       (MPI_UINT8_T; read but not otherwise used here)
 *   n + 1 row offsets (edge_id_t each)
 *   m edge targets    (vertex_id_t each)
 *   m edge weights    (weight_t each, read as MPI_DOUBLE)
 *
 * G        - graph to fill in.  Sets rank, nproc, filename, n, m, directed,
 *            local_n, local_m, and allocates rowsIndices, endV and weights
 *            with malloc (they are not freed here).
 * filename - path of the graph file.
 *
 * A vertex i is kept by the rank for which VERTEX_OWNER(i, n, size) equals
 * the rank; local rows are stored in increasing order of i and rowsIndices is
 * rebuilt as a local prefix sum starting at 0.
 *
 * Must be called by every rank of MPI_COMM_WORLD (MPI_File_open is
 * collective).  If the file cannot be opened the job is terminated with
 * MPI_Abort.
 *
 * NOTE(review): the filename is copied into G->filename with sprintf; the
 * capacity of that buffer is not visible here -- confirm it is large enough.
 * The malloc results are not checked. */
void readGraph_singleFile_MPI(graph_t *G, char *filename)
{
    uint8_t align;
    int rank, size;
    /* File positions are kept in MPI_Offset so that files larger than
     * INT_MAX bytes are addressed correctly. */
    MPI_Offset offset, offset_row, offset_col, offset_weight;
    edge_id_t my_edges[2];
    int local_n=0;
    /* 64-bit edge counts are accumulated in edge_id_t rather than int. */
    edge_id_t local_m=0;
    int k;
    uint32_t TotVertices;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    G->rank = rank;
    G->nproc = size;
    G->filename[0] = '\0';
    sprintf(G->filename, "%s", filename);

    MPI_File fh;
    MPI_Status status;
    /* Reading through a handle that failed to open would leave G->n and G->m
     * unset and drive the allocations below with garbage, so stop here. */
    if (MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh) != MPI_SUCCESS) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* Fixed-size header. */
    MPI_File_read(fh, &G->n, 1, MPI_UINT32_T, &status);
    offset = (MPI_Offset)sizeof(vertex_id_t);
    TotVertices = G->n;

    MPI_File_read_at(fh, offset, &G->m, 1, MPI_UINT64_T, &status);
    offset += (MPI_Offset)sizeof(edge_id_t);

    MPI_File_read_at(fh, offset, &G->directed, 1, MPI_C_BOOL, &status);
    offset += (MPI_Offset)sizeof(bool);

    MPI_File_read_at(fh, offset, &align, 1, MPI_UINT8_T, &status);
    offset += (MPI_Offset)sizeof(uint8_t);
    offset_row = offset;

    /* First pass: count the vertices and edges owned by this rank.  Each
     * read fetches row offsets i and i+1, whose difference is the degree. */
    for( uint32_t i = 0; i < G->n; i++ ) {
        if( rank == VERTEX_OWNER(i,TotVertices,size) ) {
            MPI_File_read_at(fh, offset_row + (MPI_Offset)i * (MPI_Offset)sizeof(edge_id_t), &my_edges[0], 2, MPI_UINT64_T, &status);
            local_n++;
            local_m += my_edges[1] - my_edges[0];
        }
    }
    G->local_n = local_n;
    G->local_m = local_m;
    offset_col = offset_row + ((MPI_Offset)G->n + 1) * (MPI_Offset)sizeof(edge_id_t);
    offset_weight = offset_col + (MPI_Offset)G->m * (MPI_Offset)sizeof(vertex_id_t);

    G->rowsIndices = (edge_id_t *)malloc((G->local_n + 1) * sizeof(edge_id_t) );
    G->endV = (vertex_id_t *)malloc((G->local_m)*sizeof(vertex_id_t));
    G->weights = (weight_t *)malloc((G->local_m)*sizeof(weight_t));
    G->rowsIndices[0] = 0;
    k = 1;

    /* Second pass: for every owned vertex append its adjacency list and the
     * matching weights to the local arrays. */
    for( uint32_t i = 0; i < G->n; i++ ) {
        if( rank == VERTEX_OWNER(i,TotVertices,size) ) {
            MPI_File_read_at(fh, offset_row + (MPI_Offset)i * (MPI_Offset)sizeof(edge_id_t), &my_edges[0], 2, MPI_UINT64_T, &status);
            G->rowsIndices[k] = G->rowsIndices[k-1] + my_edges[1] - my_edges[0];
            MPI_File_read_at(fh, offset_col + (MPI_Offset)my_edges[0] * (MPI_Offset)sizeof(vertex_id_t), &G->endV[G->rowsIndices[k-1]], G->rowsIndices[k]-G->rowsIndices[k-1], MPI_UINT32_T, &status);
            MPI_File_read_at(fh, offset_weight + (MPI_Offset)my_edges[0] * (MPI_Offset)sizeof(weight_t), &G->weights[G->rowsIndices[k-1]], G->rowsIndices[k]-G->rowsIndices[k-1], MPI_DOUBLE, &status);
            k++;
        }
    }

    MPI_File_close(&fh);
}