Пример #1
0
/* Opens the given file, reads it to the end, and computes the crc32 of
 * its contents.
 *
 * filename - path of the file to checksum
 * crc      - output location for the computed crc32 value
 *
 * Returns SCR_SUCCESS if the file was read without error.  Returns
 * SCR_FAILURE if crc is NULL, the file cannot be opened, or a read
 * reports an error; after a read error *crc only covers the bytes that
 * were read before the failure and should not be trusted. */
int scr_crc32(const char* filename, uLong* crc)
{
  /* check that we got a variable to write our answer to */
  if (crc == NULL) {
    return SCR_FAILURE;
  }

  /* initialize our crc value */
  *crc = crc32(0L, Z_NULL, 0);

  /* open the file for reading */
  int fd = scr_open(filename, O_RDONLY);
  if (fd < 0) {
    scr_dbg(1, "Failed to open file to compute crc: %s errno=%d @ file %s:%d",
            filename, errno, __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* read the file in fixed-size chunks, folding each chunk into the crc;
   * a chunk shorter than the buffer (end of file or error) ends the loop */
  int nread = 0;
  unsigned long buffer_size = 1024*1024;
  char buf[buffer_size];
  do {
    nread = scr_read(filename, fd, buf, buffer_size);
    if (nread > 0) {
      *crc = crc32(*crc, (const Bytef*) buf, (uInt) nread);
    }
  } while (nread > 0 && (unsigned long) nread == buffer_size);

  /* if we got a read error, report it, release the descriptor through
   * the same wrapper used on the success path, and bail out */
  if (nread < 0) {
    scr_dbg(1, "Error while reading file to compute crc: %s @ file %s:%d",
            filename, __FILE__, __LINE__
    );
    scr_close(filename, fd);
    return SCR_FAILURE;
  }

  /* close the file */
  scr_close(filename, fd);

  return SCR_SUCCESS;
}
Пример #2
0
/* Opens the specified file and waits on an exclusive (write) lock before
 * returning the file descriptor.
 *
 * file  - path of the file to open
 * flags - open flags, passed through to scr_open
 * mode  - creation mode, passed through to scr_open
 *
 * Returns the open, locked file descriptor on success.  Returns a
 * negative value if the file cannot be opened or the lock cannot be
 * acquired; in the latter case the descriptor is closed first. */
int scr_open_with_lock(const char* file, int flags, mode_t mode)
{
  /* open the file */
  int fd = scr_open(file, flags, mode);
  if (fd < 0) {
    scr_err("Opening file for write: scr_open(%s) errno=%d %s @ %s:%d",
            file, errno, strerror(errno), __FILE__, __LINE__
    );
    return fd;
  }

  /* acquire an exclusive file lock */
  int ret = scr_file_lock_write(file, fd);
  if (ret != SCR_SUCCESS) {
    /* the lock status is an SCR return code, not a descriptor; handing
     * it back could be mistaken by the caller for a valid (non-negative)
     * file descriptor, so report the failure as an invalid descriptor */
    close(fd);
    return -1;
  }

  /* return the opened file descriptor */
  return fd;
}
Пример #3
0
/* Exchanges files with partner ranks by copying: the outgoing file is
 * read and sent to rank_send while the incoming file is received from
 * rank_recv and written to a newly created / truncated file_recv.  The
 * outgoing file is left in place.
 *
 * have_outgoing - non-zero if this process has a file to send
 * file_send     - path of the file to send
 * meta_send     - meta data of the file to send; its crc32 field is set
 *                 here if scr_crc_on_copy is enabled and it has none yet
 * rank_send     - rank in comm to send our file to
 * crc32_send    - running crc32 of the bytes sent (updated only when
 *                 scr_crc_on_copy is enabled)
 * have_incoming - non-zero if this process will receive a file
 * file_recv     - path of the file to write received data to
 * meta_recv     - not referenced by this function
 * rank_recv     - rank in comm to receive a file from
 * crc32_recv    - running crc32 of the bytes received (updated only when
 *                 scr_crc_on_copy is enabled)
 * comm          - communicator used for the exchange
 *
 * The end of a file is signalled by a message shorter than
 * scr_mpi_buf_size bytes.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if reading the outgoing file
 * reported an error (the partner then receives a truncated file). */
static int scr_swap_files_copy(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];
  MPI_Status  status[2];

  /* allocate MPI send buffer */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
              scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
              scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* open the file to send: read-only mode */
  int fd_send = -1;
  if (have_outgoing) {
    fd_send = scr_open(file_send, O_RDONLY);
    if (fd_send < 0) {
      scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
              file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* open the file to recv: truncate, write-only mode */
  int fd_recv = -1;
  if (have_incoming) {
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd_recv < 0) {
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
              file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks */
  int nread, nwrite;
  int sending   = have_outgoing ? 1 : 0;
  int receiving = have_incoming ? 1 : 0;
  while (sending || receiving) {
    /* if we are still receiving a file, post a receive */
    if (receiving) {
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    /* if we are still sending a file, read a chunk, send it, and wait */
    if (sending) {
      nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      if (nread < 0) {
        /* the read failed: remember the error so the caller learns the
         * transfer is incomplete, but still send a zero-length message
         * so the partner's matching receive completes and it stops */
        rc = SCR_FAILURE;
        nread = 0;
      }
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    /* if we are still receiving a file,
     * wait on our receive to complete and write the data */
    if (receiving) {
      MPI_Wait(&request[0], &status[0]);
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }
      /* NOTE(review): the result of scr_write is not checked here; its
       * return contract is not visible from this function -- confirm and
       * set rc on failure */
      scr_write(file_recv, fd_recv, buf_recv, nwrite);
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close the files */
  if (have_outgoing) {
    scr_close(file_send, fd_send);
  }
  if (have_incoming) {
    scr_close(file_recv, fd_recv);
  }

  /* set crc field on our file if it hasn't been set already;
   * skip this if the read failed, since the crc would only cover part
   * of the file */
  if (scr_crc_on_copy && have_outgoing && rc == SCR_SUCCESS) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      scr_meta_set_crc32(meta_send, *crc32_send);
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
Пример #4
0
/* Exchanges files with partner ranks by moving: when this process both
 * sends and receives, the incoming data overwrites the outgoing file in
 * place (reads always stay at or ahead of writes in units of whole
 * messages), and the file is then truncated and renamed to file_recv.
 * When it only sends, the outgoing file is deleted afterwards; when it
 * only receives, file_recv is created from scratch.
 *
 * have_outgoing - non-zero if this process has a file to send
 * file_send     - path of the file to send
 * meta_send     - meta data of the file to send; its crc32 field is set
 *                 here if scr_crc_on_copy is enabled and it has none yet
 * rank_send     - rank in comm to send our file to
 * crc32_send    - running crc32 of the bytes sent (updated only when
 *                 scr_crc_on_copy is enabled)
 * have_incoming - non-zero if this process will receive a file
 * file_recv     - final path of the received file
 * meta_recv     - not referenced by this function
 * rank_recv     - rank in comm to receive a file from
 * crc32_recv    - running crc32 of the bytes received (updated only when
 *                 scr_crc_on_copy is enabled)
 * comm          - communicator used for the exchange
 *
 * The end of a file is signalled by a message shorter than
 * scr_mpi_buf_size bytes.
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if reading the outgoing file
 * reported an error or the final truncate / rename failed. */
static int scr_swap_files_move(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];
  MPI_Status  status[2];

  /* allocate MPI send buffer */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
              scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
              scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* since we'll overwrite our send file in place with the recv file,
   * which may be larger, we need to keep track of how many bytes we've
   * sent and whether we've sent them all */
  unsigned long filesize_send = 0;

  /* open our file */
  int fd = -1;
  if (have_outgoing) {
    /* we'll overwrite our send file (or just read it if there is no incoming) */
    filesize_send = scr_file_size(file_send);
    fd = scr_open(file_send, O_RDWR);
    if (fd < 0) {
      /* TODO: skip writes and return error? */
      scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d",
              file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  } else if (have_incoming) {
    /* if we're in this branch, then we only have an incoming file,
     * so we'll write our recv file from scratch */
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd < 0) {
      /* TODO: skip writes and return error? */
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
              file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks */
  int sending   = have_outgoing ? 1 : 0;
  int receiving = have_incoming ? 1 : 0;
  int nread, nwrite;
  off_t read_pos = 0, write_pos = 0;
  while (sending || receiving) {
    if (receiving) {
      /* prepare a buffer to receive up to scr_mpi_buf_size bytes */
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    if (sending) {
      /* compute number of bytes to read */
      unsigned long count = filesize_send - read_pos;
      if (count > scr_mpi_buf_size) {
        count = scr_mpi_buf_size;
      }

      /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */
      lseek(fd, read_pos, SEEK_SET); /* seek to read position */
      nread = scr_read(file_send, fd, buf_send, count);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      if (nread < 0) {
        /* the read failed: remember the error, and fall through to send
         * a zero-length message so the partner's receive completes */
        rc = SCR_FAILURE;
        nread = 0;
      }
      read_pos += (off_t) nread; /* update read pointer */

      /* send chunk (if nread is smaller than scr_mpi_buf_size,
       * then we've read the whole file) */
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);

      /* the receiver stops as soon as it sees a message shorter than
       * scr_mpi_buf_size, so we must stop on exactly the same condition;
       * in the error-free case this is reached once the whole file has
       * been sent, and after a read error or short read it keeps us from
       * sending messages nobody will receive */
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    if (receiving) {
      /* count the number of bytes received */
      MPI_Wait(&request[0], &status[0]);
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }

      /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size,
       * then we've received the whole file) */
      /* NOTE(review): the result of scr_write is not checked here; its
       * return contract is not visible from this function -- confirm and
       * set rc on failure */
      lseek(fd, write_pos, SEEK_SET); /* seek to write position */
      scr_write(file_recv, fd, buf_recv, nwrite);
      write_pos += (off_t) nwrite; /* update write pointer */

      /* if nwrite is smaller than scr_mpi_buf_size,
       * then assume we've received the whole file */
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close file and cleanup */
  if (have_outgoing && have_incoming) {
    /* sent and received a file; close it, truncate it to correct size, rename it */
    scr_close(file_send, fd);
    if (truncate(file_send, write_pos) != 0) {
      /* the file may still carry trailing bytes of the old contents */
      rc = SCR_FAILURE;
    }
    if (rename(file_send, file_recv) != 0) {
      /* the received data is still stored under the old name */
      rc = SCR_FAILURE;
    }
  } else if (have_outgoing) {
    /* only sent a file; close it, delete it, and remove its completion marker */
    scr_close(file_send, fd);
    scr_file_unlink(file_send);
  } else if (have_incoming) {
    /* only received a file; just need to close it */
    scr_close(file_recv, fd);
  }

  /* record the crc of the sent file in its meta data if none is set yet;
   * skip this if the read failed, since the crc would only cover part
   * of the file */
  if (scr_crc_on_copy && have_outgoing && rc == SCR_SUCCESS) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      /* we transfer this meta data across below,
       * so may as well update these fields so we can use them */
      scr_meta_set_crc32(meta_send, *crc32_send);
      /* do not complete file send, we just deleted it above */
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
Пример #5
0
/* given a filename, its meta data, its list of segments, and list of destination containers,
 * copy file to container files */
static int scr_flush_file_to_containers(
  const char* file,
  scr_meta* meta,
  scr_hash* segments,
  const char* dst_dir)
{
  /* check that we got something for a source file */
  if (file == NULL || strcmp(file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that our other arguments are valid */
  if (meta == NULL || segments == NULL) {
    scr_err("Invalid metadata or segments @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open the file for reading */
  int fd_src = scr_open(file, O_RDONLY);
  if (fd_src < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

#if !defined(__APPLE__)
  /* TODO:
  posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
  that tells the kernel that you don't ever need the pages
  from the file again, and it won't bother keeping them in the page cache.
  */
  posix_fadvise(fd_src, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

  /* get the buffer size we'll use to write to the file */
  unsigned long buf_size = scr_file_buf_size;

  /* allocate buffer to read in file chunks */
  char* buf = (char*) SCR_MALLOC(buf_size);

  /* initialize crc value */
  uLong crc;
  if (scr_crc_on_flush) {
    crc = crc32(0L, Z_NULL, 0);
  }

  int rc = SCR_SUCCESS;

  /* write out each segment */
  scr_hash_sort_int(segments, SCR_HASH_SORT_ASCENDING);
  scr_hash_elem* elem;
  for (elem = scr_hash_elem_first(segments);
       elem != NULL;
       elem = scr_hash_elem_next(elem))
  {
    /* get the container info for this segment */
    scr_hash* hash = scr_hash_elem_hash(elem);

    /* get the offset into the container and the length of the segment (both in bytes) */
    char* container_name;
    unsigned long container_offset, segment_length;
    if (scr_container_get_name_offset_length(hash,
      &container_name, &container_offset, &segment_length) != SCR_SUCCESS)
    {
      scr_err("Failed to get segment offset and length @ %s:%d",
              __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

    /* build full name to destination file */
    scr_path* dst_path = scr_path_from_str(dst_dir);
    scr_path_append_str(dst_path, container_name);
    scr_path_reduce(dst_path);
    char* dst_file = scr_path_strdup(dst_path);

    /* open container file for writing -- we don't truncate here because more than one
     * process may be writing to the same file */
    int fd_container = scr_open(dst_file, O_WRONLY);
    if (fd_container < 0) {
      scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
        dst_file, errno, strerror(errno), __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

#if !defined(__APPLE__)
    /* TODO:
    posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL)
    that tells the kernel that you don't ever need the pages
    from the file again, and it won't bother keeping them in the page cache.
    */
    posix_fadvise(fd_container, 0, 0, POSIX_FADV_DONTNEED | POSIX_FADV_SEQUENTIAL);
#endif

    /* seek to offset within container */
    off_t pos = (off_t) container_offset;
    if (lseek(fd_container, pos, SEEK_SET) == (off_t)-1) {
      /* our seek failed, return an error */
      scr_err("Failed to seek to byte %lu in %s @ %s:%d",
        pos, dst_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
      break;
    }

    /* copy data from file into container in chunks */
    unsigned long remaining = segment_length;
    while (remaining > 0) {
      /* read / write up to buf_size bytes at a time from file */
      unsigned long count = remaining;
      if (count > buf_size) {
        count = buf_size;
      }

      /* attempt to read buf_size bytes from file */
      int nread = scr_read_attempt(file, fd_src, buf, count);

      /* if we read some bytes, write them out */
      if (nread > 0) {
        /* optionally compute crc value as we go */
        if (scr_crc_on_flush) {
          crc = crc32(crc, (const Bytef*) buf, (uInt) nread);
        }

        /* write our nread bytes out */
        int nwrite = scr_write_attempt(dst_file, fd_container, buf, nread);

        /* check for a write error or a short write */
        if (nwrite != nread) {
          /* write had a problem, stop copying and return an error */
          rc = SCR_FAILURE;
          break;
        }

        /* subtract the bytes we've processed from the number remaining */
        remaining -= (unsigned long) nread;
      }

      /* assume a short read is an error */
      if (nread < count) {
        /* read had a problem, stop copying and return an error */
        rc = SCR_FAILURE;
        break;
      }

      /* check for a read error, stop copying and return an error */
      if (nread < 0) {
        /* read had a problem, stop copying and return an error */
        rc = SCR_FAILURE;
        break;
      }
    }

    /* close container */
    if (scr_close(dst_file, fd_container) != SCR_SUCCESS) {
      rc = SCR_FAILURE;
    }

    /* free the container file name and path */
    scr_free(&dst_file);
    scr_path_delete(&dst_path);
  }

  /* close the source file */
  if (scr_close(file, fd_src) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* free buffer */
  scr_free(&buf);

  /* verify / set crc value */
  if (rc == SCR_SUCCESS) {
    uLong crc2;
    if (scr_crc_on_flush) {
      if (scr_meta_get_crc32(meta, &crc2) == SCR_SUCCESS) {
        /* if a crc is already set in the meta data, check that we computed the same value */
        if (crc != crc2) {
          scr_err("CRC32 mismatch detected when flushing file %s @ %s:%d",
            file, __FILE__, __LINE__
          );
          rc = SCR_FAILURE;
        }
      } else {
        /* if there is no crc set, let's set it now */
        scr_meta_set_crc32(meta, crc);
      }
    }
  }

  return rc;
}
Пример #6
0
/* Read in halt file (which program may have changed), update internal data structure,
 * set & unset any fields, and write out halt file all while locked.
 *
 * file - path of the halt file; created if it does not exist
 * args - which fields to set / unset and the values to set them to
 * data - hash that receives the current file contents, is updated
 *        according to args, and is then written back to the file
 *
 * The process umask is cleared for the duration of the call and restored
 * before returning on every path.
 *
 * Returns SCR_SUCCESS if the updated data was written back; SCR_FAILURE
 * if the file cannot be opened, rewound, written, or truncated; or the
 * status of the lock / unlock call if one of those fails. */
int scr_halt_sync_and_set(const char* file, struct arglist* args, scr_hash* data)
{
  /* set the mode on the file to be readable/writable by all
   * (enables a sysadmin to halt a user's job via scr_halt --all) */
  mode_t old_mode = umask(0000);

  /* TODO: sleep and try the open several times if the first fails */
  /* open the halt file for reading and writing */
  int fd = scr_open(file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
  if (fd < 0) {
    scr_err("Opening file for write: scr_open(%s) errno=%d %m @ %s:%d",
            file, errno, __FILE__, __LINE__
    );
    /* restore the normal file mask */
    umask(old_mode);
    return SCR_FAILURE;
  }

  /* acquire an exclusive file lock before reading */
  int ret = scr_write_lock(file,fd);
  if (ret != SCR_SUCCESS){
     scr_close(file,fd);
     umask(old_mode);
     return ret;
  }

  /* read in the current data from the file */
  scr_hash_read_fd(file, fd, data);

  /* set / unset values in file */
  if (args->set_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
    scr_hash_set_kv(data, SCR_HALT_KEY_EXIT_REASON, args->value_reason);
  } else if (args->unset_reason) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_REASON);
  }

  if (args->set_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_CHECKPOINTS, args->value_checkpoints);
  } else if (args->unset_checkpoints) {
    scr_hash_unset(data, SCR_HALT_KEY_CHECKPOINTS);
  }

  if (args->set_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_BEFORE, args->value_before);
  } else if (args->unset_before) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_BEFORE);
  }

  if (args->set_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_EXIT_AFTER, args->value_after);
  } else if (args->unset_after) {
    scr_hash_unset(data, SCR_HALT_KEY_EXIT_AFTER);
  }

  if (args->set_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
    scr_hash_setf(data, NULL, "%s %lu", SCR_HALT_KEY_SECONDS, args->value_seconds);
  } else if (args->unset_seconds) {
    scr_hash_unset(data, SCR_HALT_KEY_SECONDS);
  }

  /* write the updated data back, tracking whether every step worked */
  int rc = SCR_SUCCESS;

  /* wind file pointer back to the start of the file; if this fails we
   * must not write, since the data would land after the old contents */
  if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
    scr_err("Seeking to start of file: lseek(%s) errno=%d %m @ %s:%d",
            file, errno, __FILE__, __LINE__
    );
    rc = SCR_FAILURE;
  } else {
    /* write our updated data */
    ssize_t bytes_written = scr_hash_write_fd(file, fd, data);
    if (bytes_written < 0) {
      scr_err("Writing halt file: scr_hash_write_fd(%s) @ %s:%d",
              file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else if (ftruncate(fd, (off_t) bytes_written) != 0) {
      /* truncating to the correct size failed (the file may have been
       * larger before), so stale bytes may follow the new data */
      scr_err("Truncating halt file: ftruncate(%s) errno=%d %m @ %s:%d",
              file, errno, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }
  }

  /* release the file lock; an unlock failure is reported to the caller
   * with the status returned by the unlock call */
  ret = scr_unlock(file, fd);
  if (ret != SCR_SUCCESS){
     rc = ret;
  }

  /* close file */
  scr_close(file, fd);

  /* restore the normal file mask */
  umask(old_mode);

  return rc;
}
Пример #7
0
/* Entry point of the scr_transfer daemon.
 *
 * Takes exactly one argument, the path of the transfer file.  While
 * keep_running is set, the program repeatedly reads the transfer file,
 * sleeps as required by the bandwidth (bytes_per_second) and runtime
 * (percent_runtime) limits, and, while in the RUNNING state, copies one
 * buffer-sized chunk at a time from the current source file to the
 * current destination file, recording progress in the transfer file.
 *
 * Returns 0 on normal shutdown, 1 on a usage or allocation error. */
int main (int argc, char *argv[])
{
  /* check that we were given exactly one argument
   * (the transfer file name) */
  if (argc != 2) {
    printf("Usage: scr_transfer <transferfile>\n");
    return 1;
  }

  /* record the name of the transfer file */
  scr_transfer_file = strdup(argv[1]);
  if (scr_transfer_file == NULL) {
    scr_err("scr_transfer: Copying transfer file name @ %s:%d",
            __FILE__, __LINE__
    );
    return 1;
  }

  /* initialize our tracking variables */
  read_params();

  /* get file io mode */
  mode_t mode_file = scr_getmode(1, 1, 0);

  /* we cache the opened file descriptors to avoid extra opens,
   * seeks, and closes */
  int fd_src = -1;
  int fd_dst = -1;

  char* new_file_src = NULL;
  char* old_file_src = NULL;
  char* new_file_dst = NULL;
  char* old_file_dst = NULL;

  off_t new_position = 0;
  off_t old_position = 0;

  /* start in the stopped state */
  state = STOPPED;
  set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 0);

  /* TODO: enable this value to be set from config file */
  /* TODO: page-align this buffer for faster performance */
  /* allocate our file copy buffer */
  size_t bufsize = scr_file_buf_size;
  char* buf = malloc(bufsize);
  if (buf == NULL) {
    scr_err("scr_transfer: Failed to allocate %llu bytes for file copy buffer @ %s:%d",
            (unsigned long long) bufsize, __FILE__, __LINE__
    );
    /* release the transfer file name before bailing out */
    free(scr_transfer_file);
    scr_transfer_file = NULL;
    return 1;
  }

  int nread = 0;
  double secs_run   = 0.0;
  double secs_slept = 0.0;
  double secs_run_start  = scr_seconds();
  double secs_run_end    = secs_run_start;
  double secs_last_write = secs_run_start;
  scr_hash* hash = scr_hash_new();
  while (keep_running) {
    /* loop here sleeping and checking transfer file periodically
     * until state changes and / or some time elapses */
    /* reset our timer for our last write */
    double secs_remain = scr_transfer_secs;
    while (keep_running && (state == STOPPED || secs_remain > 0.0)) {
      /* remember our current state before reading transfer file */
      int old_state = state;

      /* read the transfer file, which fills in our hash and
       * also updates state and bytes_per_second */
      scr_hash_delete(&hash);
      hash = read_transfer_file();

      /* compute time we should sleep before writing more data based
       * on bandwidth and percent of runtime limits */
      if (state == RUNNING) {
        /* get the current time */
        double secs_now = scr_seconds();

        /* based on the amount we last wrote and our allocated bandwidth,
         * compute time we need to sleep before attempting our next write */
        double secs_remain_bw = 0.0;
        if (nread > 0 && bytes_per_second > 0.0) {
          double secs_to_wait_bw = (double) nread / bytes_per_second;
          double secs_waited_bw = secs_now - secs_last_write;
          secs_remain_bw = secs_to_wait_bw - secs_waited_bw;
        }

        /* based on the percentage of time we are allowed to be running,
         * compute time we need to sleep before attempting our next write */
        double secs_remain_runtime = 0.0;
        if (percent_runtime > 0.0) {
          /* stop the run clock, add to the run time,
           * and restart the run clock */
          secs_run_end = secs_now;
          secs_run += secs_run_end - secs_run_start;
          secs_run_start = secs_run_end;

          /* compute our total time, and the time we need to sleep */
          double secs_total = secs_run + secs_slept;
          secs_remain_runtime = secs_run / percent_runtime - secs_total;
        }

        /* take the maximum of these two values */
        secs_remain = secs_remain_bw;
        if (secs_remain_runtime > secs_remain) {
          secs_remain = secs_remain_runtime;
        }
      }

      /* check for a state transition */
      if (state != old_state) {
        if (state == RUNNING) {
          /* if we switched to RUNNING, kick out without sleeping and
           * reset the total run and sleep times */
          secs_remain = 0.0;
          secs_run    = 0.0;
          secs_slept  = 0.0;
        } else if (state == STOPPED) {
          /* if we switched to STOPPED, close our files if open */
          close_files(new_file_src, &fd_src, new_file_dst, &fd_dst);
          clear_parameters(&new_file_src, &new_file_dst, &new_position);
          clear_parameters(&old_file_src, &old_file_dst, &old_position);

          /* after closing our files, update our state in the transfer file */
          set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 0);
        }
      }

      /* assume we can sleep for the full remainder of the time */
      double secs = secs_remain;

      /* if we're not running, always sleep for the full time */
      if (state != RUNNING) {
        secs = scr_transfer_secs;
      }

      /* set a maximum time to sleep before we read the hash file again
       * (ensures some responsiveness) */
      if (secs > scr_transfer_secs) {
        secs = scr_transfer_secs;
      }

      /* sleep if we need to */
      if (secs > 0.0) {
        /* stop the run clock and add to the total run time */
        secs_run_end = scr_seconds();
        secs_run += secs_run_end - secs_run_start;

        /* sleep */
        usleep((unsigned long) (secs * 1000000.0));
        secs_slept += secs;
        secs_remain -= secs;

        /* restart the run clock */
        secs_run_start = scr_seconds();
      }
    }

    /* write data out */
    if (state == RUNNING) {
      /* look for a new file to transfer */
      off_t filesize = 0;
      find_file(hash, &new_file_src, &new_file_dst, &new_position, &filesize);

      /* if we got a new source file, close the old one (if open),
       * open the new file */
      if (bool_diff_files(new_file_src, old_file_src)) {
        /* close the old descriptor if it's open */
        if (fd_src >= 0) {
          scr_close(old_file_src, fd_src);
          fd_src = -1;
        }

        /* delete the old file name if we have one */
        if (old_file_src != NULL) {
          free(old_file_src);
          old_file_src = NULL;
        }

        /* reset our position counter */
        old_position = 0;

        /* open the file and remember the filename if we have one */
        if (new_file_src != NULL) {
          fd_src = scr_open(new_file_src, O_RDONLY);
          if (fd_src < 0) {
            /* no copy can happen without a source; make the failure visible */
            scr_err("scr_transfer: Failed to open source file %s @ %s:%d",
                    new_file_src, __FILE__, __LINE__
            );
          }
          old_file_src = strdup(new_file_src);
          /* TODO: check for errors here */
        }
      }

      /* if we got a new destination file, close the old one (if open),
       * open the new file */
      if (bool_diff_files(new_file_dst, old_file_dst)) {
        /* close the old descriptor if it's open */
        if (fd_dst >= 0) {
          scr_close(old_file_dst, fd_dst);
          fd_dst = -1;
        }

        /* delete the old file name if we have one */
        if (old_file_dst != NULL) {
          free(old_file_dst);
          old_file_dst = NULL;
        }

        /* reset our position counter */
        old_position = 0;

        /* open the file and remember the filename if we have one */
        if (new_file_dst != NULL) {
          fd_dst = scr_open(new_file_dst, O_RDWR | O_CREAT, mode_file);
          if (fd_dst < 0) {
            /* no copy can happen without a destination; make the failure visible */
            scr_err("scr_transfer: Failed to open destination file %s @ %s:%d",
                    new_file_dst, __FILE__, __LINE__
            );
          }
          old_file_dst = strdup(new_file_dst);
          /* TODO: check for errors here */
        }
      }

      /* we may have the same file, but perhaps the position changed
       * (may need to seek) */
      if (new_position != old_position) {
        if (fd_src >= 0) {
          lseek(fd_src, new_position, SEEK_SET);
          /* TODO: check for errors here */
        }

        if (fd_dst >= 0) {
          lseek(fd_dst, new_position, SEEK_SET);
          /* TODO: check for errors here */
        }

        /* remember the new position */
        old_position = new_position;
      }

      /* if we have two open files,
       * copy a chunk from source file to destination file */
      nread = 0;
      if (fd_src >= 0 && fd_dst >= 0) {
        /* compute number of bytes to read from file */
        size_t count = (size_t) (filesize - new_position);
        if (count > bufsize) {
          count = bufsize;
        }

        /* read a chunk */
        nread = scr_read(new_file_src, fd_src, buf, count);

        /* if we read data, write it out */
        if (nread > 0) {
          /* record the time of our write */
          secs_last_write = scr_seconds();

          /* write the chunk and force it out with an fsync */
          scr_write(new_file_dst, fd_dst, buf, nread);
          fsync(fd_dst);

          /* update our position */
          new_position += (off_t) nread;
          old_position = new_position;

          /* record the updated position in the transfer file */
          update_transfer_file(new_file_src, new_file_dst, new_position);
        }

        /* if we've written all of the bytes, close the files */
        if (new_position == filesize) {
          close_files(new_file_src, &fd_src, new_file_dst, &fd_dst);
          clear_parameters(&new_file_src, &new_file_dst, &new_position);
          clear_parameters(&old_file_src, &old_file_dst, &old_position);
        }
      } else {
        /* TODO: we may have an error
         * (failed to open the source or dest file) */
        /* if we found no file to transfer, move to a STOPPED state */
        if (new_file_src == NULL) {
          state = STOPPED;
          set_transfer_file_state(SCR_TRANSFER_KEY_STATE_STOP, 1);
        }
      }
    }
  }

  /* we were asked to shut down: close any files that are still open and
   * release the cached file names, using the same sequence as the
   * transition to the STOPPED state above */
  close_files(new_file_src, &fd_src, new_file_dst, &fd_dst);
  clear_parameters(&new_file_src, &new_file_dst, &new_position);
  clear_parameters(&old_file_src, &old_file_dst, &old_position);

  /* free the hash holding the last contents of the transfer file */
  scr_hash_delete(&hash);

  /* free our file copy buffer */
  free(buf);
  buf = NULL;

  /* free the strdup'd transfer file name */
  free(scr_transfer_file);
  scr_transfer_file = NULL;

  return 0;
}
Пример #8
0
/* Rebuilds the XOR file and user files of one missing member of an XOR set.
 *
 * Usage: scr_rebuild_xor <size> <root> <missing_xor_filename> <ordered_remaining_xor_filenames>
 *   size                 - number of members in the XOR set (must be > 0)
 *   root                 - index of the missing member within the set
 *                          (0 <= root < size)
 *   missing_xor_filename - name of the XOR file to recreate
 *   remaining filenames  - size-1 XOR files of the other members, expected
 *                          in order of XOR segment number
 *
 * The headers of the existing XOR files are read to build the header and
 * file list of the missing member.  The missing member's user files and
 * XOR file are then regenerated by XOR-ing data read from the other
 * members, file sizes and crc values are checked, and a filemap named
 * "fmap.<rank>.scr" under ".scr" is written for the rebuilt rank.
 *
 * Returns 0 on success and 1 on any failure. */
int main(int argc, char* argv[])
{
  int i, j;
  int index = 1;

  /* print usage if not enough arguments were given,
   * we always need at least <size> <root> <missing_xor_filename>,
   * reading past argc would hand NULL to strtol/strdup below */
  if (argc < 4) {
    printf("Usage: scr_rebuild_xor <size> <root> <missing_xor_filename> <ordered_remaining_xor_filenames>\n");
    return 1;
  }

  /* TODO: want to pass this on command line? */
  /* get current working directory */
  char dsetdir[SCR_MAX_FILENAME];
  scr_getcwd(dsetdir, sizeof(dsetdir));

  /* create and reduce path for dataset */
  scr_path* path_dset = scr_path_from_str(dsetdir);
  scr_path_reduce(path_dset);

  /* allocate buffers */
  char* buffer_A = malloc(buffer_size * sizeof(char));
  char* buffer_B = malloc(buffer_size * sizeof(char));
  if (buffer_A == NULL || buffer_B == NULL) {
    scr_err("Failed to allocate buffer memory @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* read in the size of the XOR set */
  int xor_set_size = (int) strtol(argv[index++], (char **)NULL, 10);
  if (xor_set_size <= 0) {
    scr_err("Invalid XOR set size argument %s @ %s:%d",
      argv[index-1], __FILE__, __LINE__
    );
    return 1;
  }

  /* besides <size> and <root>, we consume one XOR file name per member
   * of the set (the missing one plus xor_set_size-1 remaining ones),
   * check that all of them are really present on the command line
   * (compare against argc-3 to avoid overflow in 3+xor_set_size) */
  if (xor_set_size > argc - 3) {
    scr_err("Expected %d XOR filenames but only %d were given @ %s:%d",
      xor_set_size, argc - 3, __FILE__, __LINE__
    );
    return 1;
  }

  /* allocate memory for data structures based on the XOR set size */
  int*   num_files  = malloc(xor_set_size * sizeof(int));
  int*   offsets    = malloc(xor_set_size * sizeof(int));
  char** xor_files  = malloc(xor_set_size * sizeof(char*));
  int*   xor_fds    = malloc(xor_set_size * sizeof(int));
  scr_hash** xor_headers = malloc(xor_set_size * sizeof(scr_hash*));
  if (num_files == NULL || offsets == NULL || xor_files == NULL || xor_fds == NULL || xor_headers == NULL) {
    scr_err("Failed to allocate buffer memory @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* read in the rank of the missing process (the root) */
  int root = (int) strtol(argv[index++], (char **)NULL, 10);
  if (root < 0 || root >= xor_set_size) {
    scr_err("Invalid root argument %s @ %s:%d",
      argv[index-1], __FILE__, __LINE__
    );
    return 1;
  }

  /* read in the missing xor filename */
  xor_files[0] = strdup(argv[index++]);
  if (xor_files[0] == NULL) {
    scr_err("Failed to dup XOR filename @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* read in the xor filenames (expected to be in order of XOR segment number) */
  /* we order ranks so that root is index 0, the rank to the right of root is index 1, and so on */
  for (i=0; i < xor_set_size; i++) {
    xor_headers[i] = scr_hash_new();

    /* we'll get the XOR file name for root from the header stored in the XOR file of the partner */
    if (i == root) {
      continue;
    }

    /* adjust the index relative to root */
    j = i - root;
    if (j < 0) {
      j += xor_set_size;
    }

    /* copy the XOR file name */
    xor_files[j] = strdup(argv[index++]);
    if (xor_files[j] == NULL) {
      scr_err("Failed to dup XOR filename @ %s:%d",
        __FILE__, __LINE__
      );
      return 1;
    }
  }

  /* open each of the xor files and read in the headers */
  for (i=1; i < xor_set_size; i++) {
    /* open each xor file for reading */
    xor_fds[i] = scr_open(xor_files[i], O_RDONLY);
    if (xor_fds[i] < 0) {
      scr_err("Opening xor segment file: scr_open(%s) errno=%d %s @ %s:%d",
        xor_files[i], errno, strerror(errno), __FILE__, __LINE__
      );
      return 1;
    }

    /* read the header from this xor file */
    if (scr_hash_read_fd(xor_files[i], xor_fds[i], xor_headers[i]) < 0) {
      scr_err("Failed to read XOR header from %s @ %s:%d",
        xor_files[i], __FILE__, __LINE__
      );
      return 1;
    }
  }

  /* build header for missing XOR file */
  int partner_rank = -1;
  if (xor_set_size >= 2) {
    scr_hash_merge(xor_headers[0], xor_headers[1]);

    /* fetch our own file list from rank to our right */
    scr_hash* rhs_hash = scr_hash_get(xor_headers[1], SCR_KEY_COPY_XOR_PARTNER);
    scr_hash* current_hash = scr_hash_new();
    scr_hash_merge(current_hash, rhs_hash);
    scr_hash_set(xor_headers[0], SCR_KEY_COPY_XOR_CURRENT, current_hash);

    /* we are the partner to the rank to our left */
    scr_hash* lhs_hash = scr_hash_get(xor_headers[xor_set_size-1], SCR_KEY_COPY_XOR_CURRENT);
    scr_hash* partner_hash = scr_hash_new();
    scr_hash_merge(partner_hash, lhs_hash);
    scr_hash_set(xor_headers[0], SCR_KEY_COPY_XOR_PARTNER, partner_hash);

    /* get global rank of partner */
    if (scr_hash_util_get_int(lhs_hash, SCR_KEY_COPY_XOR_RANK, &partner_rank) != SCR_SUCCESS) {
      scr_err("Failed to read partner rank from XOR file header in %s @ %s:%d",
        xor_files[xor_set_size-1], __FILE__, __LINE__
      );
      return 1;
    }
  }

  /* get a pointer to the current hash for the missing rank */
  scr_hash* missing_current_hash = scr_hash_get(xor_headers[0], SCR_KEY_COPY_XOR_CURRENT);

  /* read the rank */
  int my_rank = -1;
  if (scr_hash_util_get_int(missing_current_hash, SCR_KEY_COPY_XOR_RANK, &my_rank) != SCR_SUCCESS) {
    scr_err("Failed to read rank from XOR file header in %s @ %s:%d",
      xor_files[0], __FILE__, __LINE__
    );
    return 1;
  }

  /* get the dataset */
  scr_dataset* dataset = scr_hash_get(xor_headers[0], SCR_KEY_COPY_XOR_DATASET);

  /* read the dataset id */
  int dset_id = -1;
  if (scr_dataset_get_id(dataset, &dset_id) != SCR_SUCCESS) {
    scr_err("Failed to read dataset id from XOR file header in %s @ %s:%d",
      xor_files[0], __FILE__, __LINE__
    );
    return 1;
  }

  /* read the ranks */
  int num_ranks = -1;
  if (scr_hash_util_get_int(xor_headers[0], SCR_KEY_COPY_XOR_RANKS, &num_ranks) != SCR_SUCCESS) {
    scr_err("Failed to read ranks from XOR file header in %s @ %s:%d",
      xor_files[0], __FILE__, __LINE__
    );
    return 1;
  }

  /* get name of partner's fmap */
  scr_path* path_partner_map = scr_path_from_str(".scr");
  scr_path_append_strf(path_partner_map, "fmap.%d.scr", partner_rank);

  /* extract partner's flush descriptor */
  scr_hash* flushdesc = scr_hash_new();
  scr_filemap* partner_map = scr_filemap_new();
  scr_filemap_read(path_partner_map, partner_map);
  scr_filemap_get_flushdesc(partner_map, dset_id, partner_rank, flushdesc);
  scr_filemap_delete(&partner_map);

  /* delete partner map path */
  scr_path_delete(&path_partner_map);

  /* determine whether we should preserve user directories */
  int preserve_dirs = 0;
  scr_hash_util_get_int(flushdesc, SCR_SCAVENGE_KEY_PRESERVE, &preserve_dirs);

  /* read the chunk size */
  unsigned long chunk_size = 0;
  if (scr_hash_util_get_unsigned_long(xor_headers[0], SCR_KEY_COPY_XOR_CHUNK, &chunk_size) != SCR_SUCCESS) {
    scr_err("Failed to read chunk size from XOR file header in %s @ %s:%d",
      xor_files[0], __FILE__, __LINE__
    );
    return 1;
  }

  /* determine number of files each member wrote in XOR set */
  for (i=0; i < xor_set_size; i++) {
    /* record the number of files for this rank */
    scr_hash* current_hash = scr_hash_get(xor_headers[i], SCR_KEY_COPY_XOR_CURRENT);
    if (scr_hash_util_get_int(current_hash, SCR_KEY_COPY_XOR_FILES, &num_files[i]) != SCR_SUCCESS) {
      scr_err("Failed to read number of files from %s @ %s:%d",
        xor_files[i], __FILE__, __LINE__
      );
      return 1;
    }
  }
  
  /* count the total number of files and set the offsets array */
  int total_num_files = 0;
  for (i=0; i < xor_set_size; i++) {
    offsets[i] = total_num_files;
    total_num_files += num_files[i];
  }

  /* allocate space for a file descriptor, file name pointer, and filesize for each user file */
  int* user_fds                 = (int*)           malloc(total_num_files * sizeof(int));
  char** user_files             = (char**)         malloc(total_num_files * sizeof(char*));
  char** user_rel_files         = (char**)         malloc(total_num_files * sizeof(char*));
  unsigned long* user_filesizes = (unsigned long*) malloc(total_num_files * sizeof(unsigned long));
  if (user_fds == NULL || user_files == NULL || user_rel_files == NULL || user_filesizes == NULL) {
    scr_err("Failed to allocate buffer memory @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* get file name, file size, and open each of the user files that we have */
  for (i=0; i < xor_set_size; i++) {
    scr_hash* current_hash = scr_hash_get(xor_headers[i], SCR_KEY_COPY_XOR_CURRENT);

    /* for each file belonging to this rank, get filename, filesize, and open file */
    for (j=0; j < num_files[i]; j++) {
      int offset = offsets[i] + j;

      /* get the meta data for this file */
      scr_meta* meta = scr_hash_get_kv_int(current_hash, SCR_KEY_COPY_XOR_FILE, j);
      if (meta == NULL) {
        scr_err("Failed to read meta data for file %d in %s @ %s:%d",
          j, xor_files[i], __FILE__, __LINE__
        );
        return 1;
      }

      /* record the filesize of this file */
      if (scr_meta_get_filesize(meta, &user_filesizes[offset]) != SCR_SUCCESS) {
        scr_err("Failed to read filesize field for file %d in %s @ %s:%d",
          j, xor_files[i], __FILE__, __LINE__
        );
        return 1;
      }

      /* get filename */
      char* origname;
      if (scr_meta_get_origname(meta, &origname) != SCR_SUCCESS) {
        scr_err("Failed to read original name for file %d in %s @ %s:%d",
          j, xor_files[i], __FILE__, __LINE__
        );
        return 1;
      }

      /* construct full path to user file */
      scr_path* path_user_full = scr_path_from_str(origname);
      if (preserve_dirs) {
        /* get original path of file */
        char* origpath;
        if (scr_meta_get_origpath(meta, &origpath) != SCR_SUCCESS) {
          scr_err("Failed to read original path for file %d in %s @ %s:%d",
            j, xor_files[i], __FILE__, __LINE__
          );
          return 1;
        }

        /* construct full path to file */
        scr_path_prepend_str(path_user_full, origpath);
      } else {
        /* construct full path to file */
        scr_path_prepend(path_user_full, path_dset);
      }

      /* reduce path to user file */
      scr_path_reduce(path_user_full);

      /* make a copy of the full path */
      user_files[offset] = scr_path_strdup(path_user_full);

      /* make a copy of relative path */
      scr_path* path_user_rel = scr_path_relative(path_dset, path_user_full);
      user_rel_files[offset] = scr_path_strdup(path_user_rel);
      scr_path_delete(&path_user_rel);

      /* free the full path */
      scr_path_delete(&path_user_full);

      /* open the file */
      if (i == 0) {
        /* create directory for file */
        scr_path* user_dir_path = scr_path_from_str(user_files[offset]);
        scr_path_reduce(user_dir_path);
        scr_path_dirname(user_dir_path);
        if (! scr_path_is_null(user_dir_path)) {
          char* user_dir = scr_path_strdup(user_dir_path);
          mode_t mode_dir = scr_getmode(1, 1, 1);
          if (scr_mkdir(user_dir, mode_dir) != SCR_SUCCESS) {
            scr_err("Failed to create directory for user file %s @ %s:%d",
              user_dir, __FILE__, __LINE__
            );
            return 1;
          }
          scr_free(&user_dir);
        }
        scr_path_delete(&user_dir_path);

        /* open missing file for writing */
        mode_t mode_file = scr_getmode(1, 1, 0);
        user_fds[offset] = scr_open(user_files[offset], O_WRONLY | O_CREAT | O_TRUNC, mode_file);
        if (user_fds[offset] < 0) {
          scr_err("Opening user file for writing: scr_open(%s) errno=%d %s @ %s:%d",
            user_files[offset], errno, strerror(errno), __FILE__, __LINE__
          );
          return 1;
        }
      } else {
        /* open existing file for reading */
        user_fds[offset] = scr_open(user_files[offset], O_RDONLY);
        if (user_fds[offset] < 0) {
          scr_err("Opening user file for reading: scr_open(%s) errno=%d %s @ %s:%d",
            user_files[offset], errno, strerror(errno), __FILE__, __LINE__
          );
          return 1;
        }
      }
    }
  }

  /* finally, open the xor file for the missing rank */
  mode_t mode_file = scr_getmode(1, 1, 0);
  xor_fds[0] = scr_open(xor_files[0], O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (xor_fds[0] < 0) {
    scr_err("Opening xor file to be reconstructed: scr_open(%s) errno=%d %s @ %s:%d",
      xor_files[0], errno, strerror(errno), __FILE__, __LINE__
    );
    return 1;
  }

  int rc = 0;

  /* write the header to the XOR file of the missing rank */
  if (scr_hash_write_fd(xor_files[0], xor_fds[0], xor_headers[0]) < 0) {
    rc = 1;
  }

  /* this offset array records the current position we are in the logical file for each rank */
  unsigned long* offset = malloc(xor_set_size * sizeof(unsigned long));
  if (offset == NULL) {
    scr_err("Failed to allocate buffer memory @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }
  for (i=0; i < xor_set_size; i++) {
    offset[i] = 0;
  }

  unsigned long write_pos = 0;
  int chunk_id;
  for (chunk_id = 0; chunk_id < xor_set_size && rc == 0; chunk_id++) {
    size_t nread = 0;
    while (nread < chunk_size && rc == 0) {
      /* byte index used when merging buffers, same type as count */
      size_t k;

      /* read upto buffer_size bytes at a time */
      size_t count = chunk_size - nread;
      if (count > buffer_size) {
        count = buffer_size;
      }

      /* clear our buffer */
      memset(buffer_A, 0, count);

      /* read a segment from each rank and XOR it into our buffer */
      for (i=1; i < xor_set_size; i++) {
        /* read the next set of bytes for this chunk from my file into send_buf */
        if (chunk_id != ((i + root) % xor_set_size)) {
          /* read chunk from the logical file for this rank */
          if (scr_read_pad_n(num_files[i], &user_files[offsets[i]], &user_fds[offsets[i]],
                             buffer_B, count, offset[i], &user_filesizes[offsets[i]]) != SCR_SUCCESS)
          {
            /* our read failed, set the return code to an error,
             * zeroing count skips the merge and ends the loops via rc */
            rc = 1;
            count = 0;
          }
          offset[i] += count;
        } else {
          /* read chunk from the XOR file for this rank */
          if (scr_read_attempt(xor_files[i], xor_fds[i], buffer_B, count) != count) {
            /* our read failed, set the return code to an error */
            rc = 1;
            count = 0;
          }
        }

        /* TODO: XORing with unsigned long would be faster here (if chunk size is multiple of this size) */
        /* merge the blocks via xor operation */
        for (k = 0; k < count; k++) {
          buffer_A[k] ^= buffer_B[k];
        }
      }

      /* at this point, we have the data from the missing rank, write it out */
      if (chunk_id != root) {
        /* write chunk to logical file for the missing rank */
        if (scr_write_pad_n(num_files[0], &user_files[0], &user_fds[0],
                            buffer_A, count, write_pos, &user_filesizes[0]) != SCR_SUCCESS)
        {
          /* our write failed, set the return code to an error */
          rc = 1;
        }
        write_pos += count;
      } else {
        /* write chunk to xor file for the missing rank */
        if (scr_write_attempt(xor_files[0], xor_fds[0], buffer_A, count) != count) {
          /* our write failed, set the return code to an error */
          rc = 1;
        }
      }

      nread += count;
    }
  }

  /* close each of the user files */
  for (i=0; i < total_num_files; i++) {
    if (scr_close(user_files[i], user_fds[i]) != SCR_SUCCESS) {
      rc = 1;
    }
  }

  /* close each of the XOR files */
  for (i=0; i < xor_set_size; i++) {
    if (scr_close(xor_files[i], xor_fds[i]) != SCR_SUCCESS) {
      rc = 1;
    }
  }

  /* if the write failed, delete the files we just wrote, and return an error,
   * the missing rank's files occupy the first num_files[0] slots (offsets[0] is 0) */
  if (rc != 0) {
    for (j=0; j < num_files[0]; j++) {
      scr_file_unlink(user_files[j]);
    }
    scr_file_unlink(xor_files[0]);
    return 1;
  }

  /* check that filesizes are correct */
  unsigned long filesize;
  for (j=0; j < num_files[0]; j++) {
    filesize = scr_file_size(user_files[j]);
    if (filesize != user_filesizes[j]) {
      /* the filesize check failed, so delete the file */
      scr_file_unlink(user_files[j]);

      /* mark the file as incomplete */
      scr_meta* meta = scr_hash_get_kv_int(missing_current_hash, SCR_KEY_COPY_XOR_FILE, j);
      scr_meta_set_complete(meta, 0);

      rc = 1;
    }
  }
  /* TODO: we didn't record the filesize of the XOR file for the missing rank anywhere */

  /* create a filemap for this rank */
  scr_filemap* map = scr_filemap_new();
  if (map == NULL) {
    scr_err("Failed to allocate filemap @ %s:%d",
      __FILE__, __LINE__
    );
    return 1;
  }

  /* record the dataset information in the filemap */
  scr_filemap_set_dataset(map, dset_id, my_rank, dataset);

  /* write meta data for each of the user files and add each one to the filemap */
  for (j=0; j < num_files[0]; j++) {
    /* add user file to filemap and record meta data */
    char* user_file_relative = user_rel_files[j];
    scr_filemap_add_file(map, dset_id, my_rank, user_file_relative);
    scr_meta* meta = scr_hash_get_kv_int(missing_current_hash, SCR_KEY_COPY_XOR_FILE, j);
    scr_filemap_set_meta(map, dset_id, my_rank, user_file_relative, meta);
  }

  /* write meta data for xor file and add it to the filemap */
  scr_filemap_add_file(map, dset_id, my_rank, xor_files[0]);
  unsigned long full_chunk_filesize = scr_file_size(xor_files[0]);
  int missing_complete = 1;
  scr_meta* meta_chunk = scr_meta_new();
  scr_meta_set_filename(meta_chunk, xor_files[0]);
  scr_meta_set_filetype(meta_chunk, SCR_META_FILE_XOR);
  scr_meta_set_filesize(meta_chunk, full_chunk_filesize);
  /* TODO: remove this from meta file, for now it's needed in scr_index.c */
  scr_meta_set_ranks(meta_chunk, num_ranks);
  scr_meta_set_complete(meta_chunk, missing_complete);
  scr_filemap_set_meta(map, dset_id, my_rank, xor_files[0], meta_chunk);

  /* set expected number of files for the missing rank */
  int expected_num_files = scr_filemap_num_files(map, dset_id, my_rank);
  scr_filemap_set_expected_files(map, dset_id, my_rank, expected_num_files);

  /* compute, check, and store crc values with files */
  for (j=0; j < num_files[0]; j++) {
    /* compute crc on user file */
    char* user_file_relative = user_rel_files[j];
    if (scr_compute_crc(map, dset_id, my_rank, user_file_relative) != SCR_SUCCESS) {
      /* the crc check failed, so delete the file */
      scr_file_unlink(user_files[j]);
      rc = 1;
    }
  }
  if (scr_compute_crc(map, dset_id, my_rank, xor_files[0]) != SCR_SUCCESS) {
    /* the crc check failed, so delete the file */
    scr_file_unlink(xor_files[0]);
    rc = 1;
  }

  /* store flush descriptor */
  scr_filemap_set_flushdesc(map, dset_id, my_rank, flushdesc);

  /* write filemap for this rank */
  scr_path* path_map = scr_path_from_str(".scr");
  scr_path_append_strf(path_map, "fmap.%d.scr", my_rank);
  if (scr_filemap_write(path_map, map) != SCR_SUCCESS) {
    rc = 1;
  }
  scr_path_delete(&path_map);

  /* delete the map */
  scr_filemap_delete(&map);

  scr_meta_delete(&meta_chunk);

  /* delete the flush/scavenge descriptor */
  scr_hash_delete(&flushdesc);

  scr_free(&offset);

  for (i=0; i < total_num_files; i++) {
    scr_free(&user_rel_files[i]);
    scr_free(&user_files[i]);
  }

  scr_free(&user_filesizes);
  scr_free(&user_rel_files);
  scr_free(&user_files);
  scr_free(&user_fds);

  for (i=0; i < xor_set_size; i++) {
    scr_hash_delete(&xor_headers[i]);
  }

  for (i=0; i < xor_set_size; i++) {
    scr_free(&xor_files[i]);
  }

  scr_free(&xor_headers);
  scr_free(&xor_fds);
  scr_free(&xor_files);
  scr_free(&offsets);
  scr_free(&num_files);

  scr_free(&buffer_B);
  scr_free(&buffer_A);

  scr_path_delete(&path_dset);

  return rc;
}
Пример #9
0
/* Copies the contents of src_file to dst_file, reading and writing in
 * chunks of up to buf_size bytes.
 *
 *   src_file - name of existing file to read (must be non-empty)
 *   dst_file - name of file to create or truncate (must be non-empty)
 *   buf_size - size in bytes of the temporary copy buffer (must be > 0)
 *   crc      - if not NULL, receives the crc32 of the bytes read
 *
 * If the copy loop or a close fails, the destination file is unlinked.
 * Returns SCR_SUCCESS if the whole file was copied, SCR_FAILURE otherwise. */
int scr_file_copy(
  const char* src_file,
  const char* dst_file,
  unsigned long buf_size,
  uLong* crc)
{
  /* check that we got something for a source file */
  if (src_file == NULL || strcmp(src_file, "") == 0) {
    scr_err("Invalid source file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* check that we got something for a destination file */
  if (dst_file == NULL || strcmp(dst_file, "") == 0) {
    scr_err("Invalid destination file @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* a zero-sized buffer can never make progress: every read would
   * return 0 bytes, which is never "short" relative to buf_size,
   * so reject it before we touch the destination file */
  if (buf_size == 0) {
    scr_err("Invalid buffer size of 0 for file copy @ %s:%d",
      __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open src_file for reading */
  int src_fd = scr_open(src_file, O_RDONLY);
  if (src_fd < 0) {
    scr_err("Opening file to copy: scr_open(%s) errno=%d %s @ %s:%d",
      src_file, errno, strerror(errno), __FILE__, __LINE__
    );
    return SCR_FAILURE;
  }

  /* open dest_file for writing */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int dst_fd = scr_open(dst_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (dst_fd < 0) {
    scr_err("Opening file for writing: scr_open(%s) errno=%d %s @ %s:%d",
      dst_file, errno, strerror(errno), __FILE__, __LINE__
    );
    scr_close(src_file, src_fd);
    return SCR_FAILURE;
  }

  /* hint that both files are accessed sequentially; the advice values
   * of posix_fadvise are distinct constants rather than bit flags, so
   * POSIX_FADV_DONTNEED must not be OR'd in here */
  /* TODO: issue POSIX_FADV_DONTNEED after the copy if we want the kernel
   * to drop these pages from the page cache */
  posix_fadvise(src_fd, 0, 0, POSIX_FADV_SEQUENTIAL);
  posix_fadvise(dst_fd, 0, 0, POSIX_FADV_SEQUENTIAL);

  /* allocate buffer to read in file chunks */
  char* buf = (char*) malloc(buf_size);
  if (buf == NULL) {
    scr_err("Allocating memory: malloc(%lu) errno=%d %s @ %s:%d",
      buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
    scr_close(dst_file, dst_fd);
    scr_close(src_file, src_fd);
    return SCR_FAILURE;
  }

  /* initialize crc values */
  if (crc != NULL) {
    *crc = crc32(0L, Z_NULL, 0);
  }

  int rc = SCR_SUCCESS;

  /* write chunks */
  int copying = 1;
  while (copying) {
    /* attempt to read buf_size bytes from file */
    ssize_t nread = scr_read_attempt(src_file, src_fd, buf, buf_size);

    /* check for a read error first, so that a negative count is
     * never compared against the unsigned buffer size */
    if (nread < 0) {
      /* read had a problem, stop copying and return an error */
      rc = SCR_FAILURE;
      break;
    }

    /* if we read some bytes, write them out */
    if (nread > 0) {
      /* optionally compute crc value as we go */
      if (crc != NULL) {
        *crc = crc32(*crc, (const Bytef*) buf, (uInt) nread);
      }

      /* write our nread bytes out */
      ssize_t nwrite = scr_write_attempt(dst_file, dst_fd, buf, nread);

      /* check for a write error or a short write */
      if (nwrite != nread) {
        /* write had a problem, stop copying and return an error */
        rc = SCR_FAILURE;
        break;
      }
    }

    /* assume a short read means we hit the end of the file */
    if ((unsigned long) nread < buf_size) {
      copying = 0;
    }
  }

  /* free buffer */
  scr_free(&buf);

  /* close source and destination files */
  if (scr_close(dst_file, dst_fd) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }
  if (scr_close(src_file, src_fd) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* unlink the file if the copy failed */
  if (rc != SCR_SUCCESS) {
    unlink(dst_file);
  }

  return rc;
}
Пример #10
0
/* Reads the summary data for the dataset stored in summary_dir and records
 * it in file_list.  Every rank of scr_comm_world must call this together,
 * since it broadcasts and exchanges data over that communicator.
 *
 * Rank 0 checks the directory and reads ".scr/summary.scr"; the dataset
 * entry of that header is copied into file_list.  The "rank2file.scr" map
 * is then read and its per-rank entries are exchanged so that each rank
 * merges the entries sent to it into file_list, after which a PATH value
 * is recorded for each file.
 *
 * Returns SCR_SUCCESS or SCR_FAILURE. */
static int scr_fetch_summary(
  const char* summary_dir,
  scr_hash* file_list)
{
  int rc = SCR_SUCCESS;

  /* objects released in the cleanup sections at the bottom */
  scr_path* dataset_path   = NULL;
  scr_path* meta_path      = NULL;
  scr_path* rank2file_path = NULL;
  scr_hash* header         = NULL;
  scr_hash* send           = NULL;
  scr_hash* recv           = NULL;

  /* description of the rank2file segment this rank should read */
  int valid            = 0;
  char* file           = NULL;
  unsigned long offset = 0;

  scr_hash_elem* elem = NULL;

  /* only rank 0 verifies that the summary directory can be read */
  if (scr_my_rank_world == 0 &&
      scr_file_is_readable(summary_dir) != SCR_SUCCESS)
  {
    scr_err("Failed to access summary directory %s @ %s:%d",
      summary_dir, __FILE__, __LINE__
    );
    rc = SCR_FAILURE;
  }

  /* everyone learns the outcome from rank 0,
   * nothing has been allocated yet so we can simply return */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    return rc;
  }

  /* remember the dataset directory in the file list */
  scr_hash_util_set_str(file_list, SCR_KEY_PATH, summary_dir);

  /* path to the dataset and to its ".scr" meta data directory */
  dataset_path = scr_path_from_str(summary_dir);
  meta_path    = scr_path_dup(dataset_path);
  scr_path_append_str(meta_path, ".scr");
  scr_path_reduce(meta_path);

  /* hash to hold the contents of the summary file */
  header = scr_hash_new();

  /* rank 0 loads the summary file into the header hash */
  if (scr_my_rank_world == 0) {
    /* name of the summary file within the meta data directory */
    scr_path* summary_path = scr_path_dup(meta_path);
    scr_path_append_str(summary_path, "summary.scr");
    const char* summary_file = scr_path_strdup(summary_path);

    int summary_fd = scr_open(summary_file, O_RDONLY);
    if (summary_fd < 0) {
      scr_err("Failed to open summary file %s @ %s:%d",
        summary_file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* a negative size signals that the hash could not be read */
      if (scr_hash_read_fd(summary_file, summary_fd, header) < 0) {
        rc = SCR_FAILURE;
      }

      /* TODO: check that the version is correct */

      scr_close(summary_file, summary_fd);
    }

    /* release the file name string and its path */
    scr_free(&summary_file);
    scr_path_delete(&summary_path);
  }

  /* everyone learns from rank 0 whether the summary was read */
  MPI_Bcast(&rc, 1, MPI_INT, 0, scr_comm_world);
  if (rc != SCR_SUCCESS) {
    goto cleanup;
  }

  /* distribute the summary header to all ranks */
  scr_hash_bcast(header, 0, scr_comm_world);

  /* copy the dataset entry of the header into the file list */
  scr_hash* dataset_copy = scr_hash_new();
  scr_dataset* dataset = scr_hash_get(header, SCR_SUMMARY_6_KEY_DATASET);
  scr_hash_merge(dataset_copy, dataset);
  scr_hash_set(file_list, SCR_SUMMARY_6_KEY_DATASET, dataset_copy);

  /* name of the rank2file map within the meta data directory */
  rank2file_path = scr_path_dup(meta_path);
  scr_path_append_str(rank2file_path, "rank2file.scr");

  /* initially rank 0 is the only rank holding a file to read */
  if (scr_my_rank_world == 0) {
    valid  = 1;
    file   = scr_path_strdup(rank2file_path);
    offset = 0;
  }

  /* look up the file name and offset holding our file hash data */
  int map_rc = scr_fetch_rank2file_map(dataset_path, 1, &valid, &file, &offset);
  if (map_rc != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* hashes used to exchange data between ranks */
  send = scr_hash_new();
  recv = scr_hash_new();

  /* ranks holding a valid file/offset pair read their segment */
  if (valid) {
    int map_fd = scr_open(file, O_RDONLY);
    if (map_fd < 0) {
      scr_err("Failed to open rank2file map %s @ %s:%d",
        file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    } else {
      /* read the hash stored at our offset of the file */
      scr_hash* contents = scr_hash_new();
      scr_lseek(file, map_fd, offset, SEEK_SET);
      if (scr_hash_read_fd(file, map_fd, contents) < 0) {
        scr_err("Failed to read rank2file map file %s @ %s:%d",
          file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* the rank count stored in the file must match this run */
      int file_ranks = 0;
      scr_hash_util_get_int(contents, SCR_SUMMARY_6_KEY_RANKS, &file_ranks);
      if (file_ranks != scr_ranks_world) {
        scr_err("Invalid number of ranks in %s, got %d expected %d @ %s:%d",
          file, file_ranks, scr_ranks_world, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* replace the empty send hash with the rank entries taken out
       * of the file contents, then drop what remains of the contents */
      scr_hash_delete(&send);
      send = scr_hash_extract(contents, SCR_SUMMARY_6_KEY_RANK);
      scr_hash_delete(&contents);

      scr_close(file, map_fd);
    }

    /* the file name string is no longer needed */
    scr_free(&file);
  }

  /* stop here unless every rank got its data without error */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto cleanup_hashes;
  }

  /* pass the rank entries on to the ranks they belong to */
  scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT);

  /* merge everything we received into our file list; the key of each
   * element is the sending rank, the data of interest is its hash */
  elem = scr_hash_elem_first(recv);
  while (elem != NULL) {
    scr_hash* received = scr_hash_elem_hash(elem);

    /* an entry lacking a file hash is treated as an error */
    if (scr_hash_get(received, SCR_SUMMARY_6_KEY_FILE) == NULL) {
      rc = SCR_FAILURE;
    } else {
      /* TODO: parse summary file format */
      scr_hash_merge(file_list, received);
    }

    elem = scr_hash_elem_next(elem);
  }

  /* record a PATH entry for each file in our list */
  if (rc == SCR_SUCCESS) {
    scr_hash* files = scr_hash_get(file_list, SCR_KEY_FILE);
    elem = scr_hash_elem_first(files);
    while (elem != NULL) {
      /* the element key is the name of the file */
      char* name = scr_hash_elem_key(elem);

      /* join dataset directory and file name, then strip the last
       * component so that only the directory part remains */
      scr_path* full = scr_path_dup(dataset_path);
      scr_path_append_str(full, name);
      scr_path_dirname(full);
      char* dir = scr_path_strdup(full);

      /* store the directory in the hash of this file */
      scr_hash* info = scr_hash_elem_hash(elem);
      scr_hash_util_set_str(info, SCR_KEY_PATH, dir);

      /* release the string and the path */
      scr_free(&dir);
      scr_path_delete(&full);

      elem = scr_hash_elem_next(elem);
    }
  }

  /* agree on the final result across all ranks,
   * either way we continue with the cleanup right below */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
  }

cleanup_hashes:
  /* release the exchange hashes */
  scr_hash_delete(&recv);
  scr_hash_delete(&send);

  /* release the path of the rank2file map */
  scr_path_delete(&rank2file_path);

cleanup:
  /* release the summary header */
  scr_hash_delete(&header);

  /* release the meta data and dataset directory paths */
  scr_path_delete(&meta_path);
  scr_path_delete(&dataset_path);

  return rc;
}
Пример #11
0
/* Reads one level of the rank2file map and forwards, to other processes,
 * the file name and offset each of them should read at the next level.
 * Calls itself again while the level id read from the map is greater
 * than 1.
 *
 *   dataset_path - path that is prepended to the file name received
 *                  from another process to build the next file to read
 *   depth        - passed as depth+1 to the recursive call; not used
 *                  for anything else in this function
 *   ptr_valid    - IN:  non-zero if this process should read *ptr_file
 *                  OUT: 1 if this process received an entry in the
 *                       exchange, 0 otherwise
 *   ptr_file     - IN:  name of file to read (freed here when valid)
 *                  OUT: newly allocated path of next file to read,
 *                       or NULL
 *   ptr_offset   - IN:  offset within *ptr_file to start reading from
 *                  OUT: offset to start reading from in the next file
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if either scr_alltrue check
 * fails.  The level that fails leaves *ptr_valid, *ptr_file, and
 * *ptr_offset as it found them.
 *
 * NOTE: this function calls scr_alltrue, scr_hash_exchange_direction,
 * and MPI_Bcast on scr_comm_world unconditionally, so presumably every
 * process must call it together -- confirm against callers. */
static int scr_fetch_rank2file_map(
  const scr_path* dataset_path,
  int             depth,
  int*            ptr_valid,
  char**          ptr_file,
  unsigned long*  ptr_offset)
{
  int rc = SCR_SUCCESS;

  /* get local variables so we don't have to dereference everything */
  int valid            = *ptr_valid;
  char* file           = *ptr_file;
  unsigned long offset = *ptr_offset;

  /* create a hash to hold section of file */
  scr_hash* hash = scr_hash_new();

  /* if we can read from file do it */
  if (valid) {
    /* open file if we haven't already */
    int fd = scr_open(file, O_RDONLY);
    if (fd >= 0) {
      /* read our segment from the file */
      /* NOTE(review): the result of scr_lseek is not checked here */
      scr_lseek(file, fd, offset, SEEK_SET);
      ssize_t read_rc = scr_hash_read_fd(file, fd, hash);
      if (read_rc < 0) {
        scr_err("Failed to read from %s @ %s:%d",
          file, __FILE__, __LINE__
        );
        rc = SCR_FAILURE;
      }

      /* close the file */
      scr_close(file, fd);
    } else {
      scr_err("Failed to open rank2file map %s @ %s:%d",
        file, __FILE__, __LINE__
      );
      rc = SCR_FAILURE;
    }
  }

  /* check for read errors */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    rc = SCR_FAILURE;
    goto cleanup;
  }

  /* create hashes to exchange data */
  scr_hash* send = scr_hash_new();
  scr_hash* recv = scr_hash_new();

  /* copy rank data into send hash */
  if (valid) {
    scr_hash* rank_hash = scr_hash_get(hash, SCR_SUMMARY_6_KEY_RANK);
    scr_hash_merge(send, rank_hash);
  }

  /* exchange hashes */
  scr_hash_exchange_direction(send, recv, scr_comm_world, SCR_HASH_EXCHANGE_RIGHT);

  /* see if anyone sent us anything */
  int newvalid = 0;
  char* newfile = NULL;
  unsigned long newoffset = 0;
  scr_hash_elem* elem = scr_hash_elem_first(recv);
  if (elem != NULL) {
    /* got something, so now we'll read in the next step */
    newvalid = 1;

    /* get file name we should read */
    scr_hash* elem_hash = scr_hash_elem_hash(elem);
    char* value;
    if (scr_hash_util_get_str(elem_hash, SCR_SUMMARY_6_KEY_FILE, &value)
        == SCR_SUCCESS)
    {
      /* return string of full path to file to caller */
      scr_path* newpath = scr_path_dup(dataset_path);
      scr_path_append_str(newpath, value);
      newfile = scr_path_strdup(newpath);
      scr_path_delete(&newpath);
    } else {
      rc = SCR_FAILURE;
    }

    /* get offset we should start reading from */
    if (scr_hash_util_get_bytecount(elem_hash, SCR_SUMMARY_6_KEY_OFFSET, &newoffset)
        != SCR_SUCCESS)
    {
      rc = SCR_FAILURE;
    }
  }

  /* free the send and receive hashes */
  scr_hash_delete(&recv);
  scr_hash_delete(&send);

  /* get level id, and broadcast it from rank 0,
   * which we assume to be a reader in all steps */
  int level_id = -1;
  if (valid) {
    if (scr_hash_util_get_int(hash, SCR_SUMMARY_6_KEY_LEVEL, &level_id)
        != SCR_SUCCESS)
    {
      rc = SCR_FAILURE;
    }
  }
  MPI_Bcast(&level_id, 1, MPI_INT, 0, scr_comm_world);

  /* check for read errors */
  if (! scr_alltrue(rc == SCR_SUCCESS)) {
    /* the path string we built above is not handed to the caller on
     * this path, so release it here to avoid leaking it */
    if (newfile != NULL) {
      scr_free(&newfile);
    }
    rc = SCR_FAILURE;
    goto cleanup;
  }

  /* set parameters for output or next iteration,
   * the file name we were given is no longer needed once we have
   * read from it, so free it before overwriting the pointer */
  if (valid) {
    scr_free(ptr_file);
  }
  *ptr_valid  = newvalid;
  *ptr_file   = newfile;
  *ptr_offset = newoffset;

  /* recurse if we still have levels to read */
  if (level_id > 1) {
    rc = scr_fetch_rank2file_map(dataset_path, depth+1, ptr_valid, ptr_file, ptr_offset);
  }

cleanup:
  /* free the hash */
  scr_hash_delete(&hash);

  return rc;
}
Пример #12
0
/* apply XOR redundancy scheme to dataset files
 *
 *   map - filemap that is read for this process's file list, dataset,
 *         and file meta data, and that is updated with the left-hand
 *         partner's redundancy descriptor, the name of the XOR chunk
 *         file, and the chunk file's meta data
 *   c   - redundancy descriptor; its copy_state is interpreted as a
 *         scr_reddesc_xor, and its comm, rank, ranks, and group_id
 *         fields drive the exchange and the chunk file name
 *   id  - dataset id used for all filemap lookups and to locate the
 *         hidden cache directory
 *
 * Returns SCR_SUCCESS, or SCR_FAILURE if reading a piece of the dataset
 * files, writing the chunk file, or closing the chunk file failed.
 * Calls scr_abort if a buffer cannot be allocated, a file cannot be
 * opened, or the chunk file name does not fit in SCR_MAX_FILENAME.
 *
 * NOTE(review): the chunk size computation divides by (c->ranks - 1),
 * so c->ranks is assumed to be greater than 1 (see TODO below). */
static int scr_reddesc_apply_xor(scr_filemap* map, const scr_reddesc* c, int id)
{
  int rc = SCR_SUCCESS;
  int i;

  /* get pointer to XOR state structure */
  scr_reddesc_xor* state = (scr_reddesc_xor*) c->copy_state;

  /* allocate buffer to read a piece of my file */
  char* send_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (send_buf == NULL) {
    scr_abort(-1, "Allocating memory for send buffer: malloc(%d) errno=%d %s @ %s:%d",
            scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* allocate buffer to read a piece of the received chunk file */
  char* recv_buf = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
  if (recv_buf == NULL) {
    scr_abort(-1, "Allocating memory for recv buffer: malloc(%d) errno=%d %s @ %s:%d",
            scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* count the number of files I have and allocate space in structures for each of them */
  int num_files = scr_filemap_num_files(map, id, scr_my_rank_world);
  int* fds = (int*) SCR_MALLOC(num_files * sizeof(int));
  char** filenames = (char**) SCR_MALLOC(num_files * sizeof(char*));
  unsigned long* filesizes = (unsigned long*) SCR_MALLOC(num_files * sizeof(unsigned long));

  /* record partner's redundancy descriptor hash in our filemap */
  scr_hash* lhs_desc_hash = scr_hash_new();
  scr_hash* my_desc_hash  = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm);
  scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash);
  scr_hash_delete(&my_desc_hash);
  scr_hash_delete(&lhs_desc_hash);

  /* allocate a new xor file header hash */
  scr_hash* header = scr_hash_new();

  /* record the global ranks of the processes in our xor group */
  scr_hash_merge(header, state->group_map);

  /* record dataset in header */
  scr_hash* dataset = scr_hash_new();
  scr_filemap_get_dataset(map, id, scr_my_rank_world, dataset);
  scr_hash_set(header, SCR_KEY_COPY_XOR_DATASET, dataset);

  /* open each file, get the filesize of each, and read the meta data of each */
  scr_hash* current_files = scr_hash_new();
  int file_count = 0;
  unsigned long my_bytes = 0;
  scr_hash_elem* file_elem;
  for (file_elem = scr_filemap_first_file(map, id, scr_my_rank_world);
       file_elem != NULL;
       file_elem = scr_hash_elem_next(file_elem))
  {
    /* get the filename */
    filenames[file_count] = scr_hash_elem_key(file_elem);

    /* get the filesize of this file and add the byte count to the total */
    filesizes[file_count] = scr_file_size(filenames[file_count]);
    my_bytes += filesizes[file_count];

    /* read the meta data for this file and insert it into the current_files hash */
    scr_meta* file_hash = scr_meta_new();
    scr_filemap_get_meta(map, id, scr_my_rank_world, filenames[file_count], file_hash);
    scr_hash_setf(current_files, file_hash, "%d", file_count);

    /* open the file */
    fds[file_count]  = scr_open(filenames[file_count], O_RDONLY);
    if (fds[file_count] < 0) {
      /* TODO: try again? */
      scr_abort(-1, "Opening checkpoint file for copying: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
                filenames[file_count], errno, strerror(errno), __FILE__, __LINE__
      );
    }

    file_count++;
  }

  /* set total number of files we have, plus our rank */
  scr_hash* current_hash = scr_hash_new();
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_RANK,  scr_my_rank_world);
  scr_hash_set_kv_int(current_hash, SCR_KEY_COPY_XOR_FILES, file_count);
  scr_hash_set(current_hash, SCR_KEY_COPY_XOR_FILE, current_files);

  /* exchange file info with partners and add data to our header */
  scr_hash* partner_hash = scr_hash_new();
  scr_hash_sendrecv(current_hash, state->rhs_rank, partner_hash, state->lhs_rank, c->comm);
  scr_hash_set(header, SCR_KEY_COPY_XOR_CURRENT, current_hash);
  scr_hash_set(header, SCR_KEY_COPY_XOR_PARTNER, partner_hash);

  /* allreduce to get maximum filesize */
  unsigned long max_bytes;
  MPI_Allreduce(&my_bytes, &max_bytes, 1, MPI_UNSIGNED_LONG, MPI_MAX, c->comm);

  /* TODO: use unsigned long integer arithmetic (with proper byte padding) instead of char to speed things up */

  /* compute chunk size according to maximum file length and number of ranks in xor set */
  /* if filesize doesn't divide evenly, then add one byte to chunk_size */
  /* TODO: check that ranks > 1 for this divide to be safe (or at partner selection time) */
  size_t chunk_size = max_bytes / (unsigned long) (c->ranks - 1);
  if ((c->ranks - 1) * chunk_size < max_bytes) {
    chunk_size++;
  }

  /* TODO: need something like this to handle 0-byte files? */
  if (chunk_size == 0) {
    chunk_size++;
  }

  /* record the dataset id and the chunk size in the xor chunk header */
  scr_hash_util_set_bytecount(header, SCR_KEY_COPY_XOR_CHUNK, chunk_size);

  /* set chunk filenames of form:  xor.<group_id>_<xor_rank+1>_of_<xor_ranks>.scr,
   * use a bounded write so that a long cache directory path cannot overrun the buffer */
  char my_chunk_file[SCR_MAX_FILENAME];
  char* dir = scr_cache_dir_hidden_get(c, id);
  int name_len = snprintf(my_chunk_file, sizeof(my_chunk_file), "%s/xor.%d_%d_of_%d.scr",
                          dir, c->group_id, c->rank+1, c->ranks);
  scr_free(&dir);
  if (name_len < 0 || (size_t) name_len >= sizeof(my_chunk_file)) {
    /* a truncated name would refer to the wrong file, so treat it as fatal */
    scr_abort(-1, "XOR chunk file name is too long or could not be formatted @ %s:%d",
            __FILE__, __LINE__
    );
  }

  /* record chunk file in filemap before creating it */
  scr_filemap_add_file(map, id, scr_my_rank_world, my_chunk_file);
  scr_filemap_write(scr_map_file, map);

  /* open my chunk file */
  mode_t mode_file = scr_getmode(1, 1, 0);
  int fd_chunk = scr_open(my_chunk_file, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
  if (fd_chunk < 0) {
    /* TODO: try again? */
    scr_abort(-1, "Opening XOR chunk file for writing: scr_open(%s) errno=%d %s @ %s:%d",
            my_chunk_file, errno, strerror(errno), __FILE__, __LINE__
    );
  }

  /* write out the xor chunk header,
   * deleting the header is the only place the dataset, current, and
   * partner hashes attached to it above are released in this function */
  scr_hash_write_fd(my_chunk_file, fd_chunk, header);
  scr_hash_delete(&header);

  MPI_Request request[2];
  MPI_Status  status[2];

  /* XOR Reduce_scatter,
   * process the chunk in pieces of at most scr_mpi_buf_size bytes */
  size_t nread = 0;
  while (nread < chunk_size) {
    size_t count = chunk_size - nread;
    if (count > scr_mpi_buf_size) {
      count = scr_mpi_buf_size;
    }

    /* step through ranks-1 forwarding rounds followed by one final
     * round (chunk_id == 0) in which we write the accumulated result */
    int chunk_id;
    for(chunk_id = c->ranks-1; chunk_id >= 0; chunk_id--) {
      /* read the next set of bytes for this chunk from my file into send_buf */
      if (chunk_id > 0) {
        /* convert chunk_id to an index into my own data, indices above
         * my rank are shifted down by one
         * NOTE(review): presumably because my data spans only ranks-1
         * chunks -- confirm */
        int chunk_id_rel = (c->rank + c->ranks + chunk_id) % c->ranks;
        if (chunk_id_rel > c->rank) {
          chunk_id_rel--;
        }
        unsigned long offset = chunk_size * (unsigned long) chunk_id_rel + nread;
        if (scr_read_pad_n(num_files, filenames, fds,
                           send_buf, count, offset, filesizes) != SCR_SUCCESS)
        {
          rc = SCR_FAILURE;
        }
      } else {
        /* final round contributes no data of our own */
        memset(send_buf, 0, count);
      }

      /* TODO: XORing with unsigned long would be faster here (if chunk size is multiple of this size) */
      /* merge the blocks via xor operation,
       * skipped in the first round since nothing has been received yet */
      if (chunk_id < c->ranks-1) {
        /* index with size_t to match the type of count */
        size_t b;
        for (b = 0; b < count; b++) {
          send_buf[b] ^= recv_buf[b];
        }
      }

      if (chunk_id > 0) {
        /* not our chunk to write, forward it on and get the next */
        MPI_Irecv(recv_buf, count, MPI_BYTE, state->lhs_rank, 0, c->comm, &request[0]);
        MPI_Isend(send_buf, count, MPI_BYTE, state->rhs_rank, 0, c->comm, &request[1]);
        MPI_Waitall(2, request, status);
      } else {
        /* write send block to send chunk file */
        if (scr_write_attempt(my_chunk_file, fd_chunk, send_buf, count) != count) {
          rc = SCR_FAILURE;
        }
      }
    }

    nread += count;
  }

  /* close my chunkfile, with fsync */
  if (scr_close(my_chunk_file, fd_chunk) != SCR_SUCCESS) {
    rc = SCR_FAILURE;
  }

  /* close my dataset files */
  for (i=0; i < num_files; i++) {
    scr_close(filenames[i], fds[i]);
  }

  /* free the buffers */
  scr_free(&filesizes);
  /* in this case, we don't free each name, since we copied the pointer to the string in the filemap */
  scr_free(&filenames);
  scr_free(&fds);
  scr_align_free(&send_buf);
  scr_align_free(&recv_buf);

  /* TODO: need to check for errors */
  /* write meta file for xor chunk */
  unsigned long my_chunk_file_size = scr_file_size(my_chunk_file);
  scr_meta* meta = scr_meta_new();
  scr_meta_set_filename(meta, my_chunk_file);
  scr_meta_set_filetype(meta, SCR_META_FILE_XOR);
  scr_meta_set_filesize(meta, my_chunk_file_size);
  scr_meta_set_complete(meta, 1);
  /* TODODSET: move the ranks field elsewhere, for now it's needed by scr_index.c */
  scr_meta_set_ranks(meta, scr_ranks_world);
  scr_filemap_set_meta(map, id, scr_my_rank_world, my_chunk_file, meta);
  scr_filemap_write(scr_map_file, map);
  scr_meta_delete(&meta);

  /* if crc_on_copy is set, compute and store CRC32 value for chunk file */
  if (scr_crc_on_copy) {
    scr_compute_crc(map, id, scr_my_rank_world, my_chunk_file);
    /* TODO: would be nice to save this CRC in our partner's XOR file so we can check correctness on a rebuild */
  }

  return rc;
}