Example #1
0
/*
 * Drive the write phase of a run.
 *
 * Builds the time-zero problem object, attaches it to main_obj under the key
 * "problem", then calls the selected interface's dump function once per
 * requested dump ("clargs/num_dumps"), timing each call. Per-dump and overall
 * bandwidth for this task are logged; rank 0 additionally logs the figures
 * aggregated over all tasks.
 *
 * argi, argc, argv  forwarded unchanged to the interface's dump function
 * main_obj          main JSON object; "clargs/exercise_scr", "clargs/num_dumps"
 *                   and "clargs/interface" are read from it
 *
 * Returns 0.
 */
static int
main_write(int argi, int argc, char **argv, json_object *main_obj)
{
    int rank = 0, dumpNum = 0, dumpCount = 0;
    unsigned long long problem_nbytes, dumpBytes = 0, summedBytes = 0;
    char nbytes_str[32], seconds_str[32], bandwidth_str[32];
    double dumpTime = 0;
    double bandwidth, summedBandwidth;
    MACSIO_TIMING_GroupMask_t main_wr_grp = MACSIO_TIMING_GroupMask("main_write");
    double dump_loop_start, dump_loop_end;
    double min_dump_loop_start, max_dump_loop_end;
    int exercise_scr = JsonGetInt(main_obj, "clargs/exercise_scr");

    /* Sanity check args */

    /* Generate a static problem object to dump on each dump */
    json_object *problem_obj = MACSIO_DATA_GenerateTimeZeroDumpObject(main_obj,0);
    problem_nbytes = (unsigned long long) json_object_object_nbytes(problem_obj, JSON_C_FALSE);

#warning MAKE JSON OBJECT KEY CASE CONSISTENT
    json_object_object_add(main_obj, "problem", problem_obj);

    /* Just here for debugging for the moment */
    if (MACSIO_LOG_DebugLevel >= 2)
    {
        char outfName[256];
        FILE *outf;
        int json_c_print_flags = JSON_C_TO_STRING_PRETTY | JSON_C_TO_STRING_SPACED;

        if (MACSIO_LOG_DebugLevel < 3)
            json_c_print_flags |= JSON_C_TO_STRING_NO_EXTARR_VALS;

        snprintf(outfName, sizeof(outfName), "main_obj_write_%03d.json", MACSIO_MAIN_Rank);
        outf = fopen(outfName, "w");
        if (outf)
        {
            fprintf(outf, "\"%s\"\n", json_object_to_json_string_ext(main_obj, json_c_print_flags));
            fclose(outf);
        }
        else
        {
            /* The debug dump is best-effort; a failure must not stop the run */
            MACSIO_LOG_MSG(Info, ("Unable to open debug file \"%s\"", outfName));
        }
    }

#warning WERE NOT GENERATING OR WRITING ANY METADATA STUFF

#warning MAKE THIS LOOP MORE LIKE A MAIN SIM LOOP WITH SIMPLE COMPUTE AND COMM STEP
    dump_loop_start = MT_Time();
    dumpTime = 0.0;
    for (dumpNum = 0; dumpNum < json_object_path_get_int(main_obj, "clargs/num_dumps"); dumpNum++)
    {
        double dt = 0.0;
        int scr_need_checkpoint_flag = 1;
        /* NOTE(review): scr_valid is never set to non-zero, so every checkpoint
           is reported to SCR with a zero validity flag -- confirm intent */
        int scr_valid = 0;
        MACSIO_TIMING_TimerId_t heavy_dump_tid;

#warning ADD OPTION TO UNLINK OLD FILE SETS

#ifdef HAVE_SCR
        if (exercise_scr)
            SCR_Need_checkpoint(&scr_need_checkpoint_flag);
#endif

        const MACSIO_IFACE_Handle_t *iface = MACSIO_IFACE_GetByName(
            json_object_path_get_string(main_obj, "clargs/interface"));

        /* log dump start */

        /* When SCR says no checkpoint is due, nothing is written and nothing is
           timed, so there is nothing to account for or report for this dump. */
        if (exercise_scr && !scr_need_checkpoint_flag)
            continue;

#ifdef HAVE_SCR
        if (exercise_scr)
            SCR_Start_checkpoint();
#endif

        /* Start dump timer */
        heavy_dump_tid = MT_StartTimer("heavy dump", main_wr_grp, dumpNum);

#warning REPLACE DUMPN AND DUMPT WITH A STATE TUPLE
#warning SHOULD HAVE PLUGIN RETURN FILENAMES SO MACSIO CAN STAT FOR TOTAL BYTES ON DISK
        /* do the dump */
        (*(iface->dumpFunc))(argi, argc, argv, main_obj, dumpNum, dumpTime);
#ifdef HAVE_MPI
        mpi_errno = 0;
#endif
        errno = 0;

        /* stop timer */
        dt = MT_StopTimer(heavy_dump_tid);

#ifdef HAVE_SCR
        if (exercise_scr)
            SCR_Complete_checkpoint(scr_valid);
#endif

        dumpTime += dt;
        dumpBytes += problem_nbytes;
        dumpCount += 1;

        /* log dump timing */
        MACSIO_LOG_MSG(Info, ("Dump %02d BW: %s/%s = %s", dumpNum,
            MU_PrByts(problem_nbytes, 0, nbytes_str, sizeof(nbytes_str)),
            MU_PrSecs(dt, 0, seconds_str, sizeof(seconds_str)),
            MU_PrBW(problem_nbytes, dt, 0, bandwidth_str, sizeof(bandwidth_str))));
    }

    dump_loop_end = MT_Time();

    MACSIO_LOG_MSG(Info, ("Overall BW: %s/%s = %s",
        MU_PrByts(dumpBytes, 0, nbytes_str, sizeof(nbytes_str)),
        MU_PrSecs(dumpTime, 0, seconds_str, sizeof(seconds_str)),
        MU_PrBW(dumpBytes, dumpTime, 0, bandwidth_str, sizeof(bandwidth_str))));

    /* Avoid dividing by zero when no dump time was accumulated */
    bandwidth = dumpTime > 0.0 ? dumpBytes / dumpTime : 0.0;

    /* Single-task defaults; the reductions below overwrite them on rank 0 */
    summedBandwidth = bandwidth;
    summedBytes = dumpBytes;
    min_dump_loop_start = dump_loop_start;
    max_dump_loop_end = dump_loop_end;

#ifdef HAVE_MPI
    MPI_Comm_rank(MACSIO_MAIN_Comm, &rank);
    MPI_Reduce(&bandwidth, &summedBandwidth, 1, MPI_DOUBLE, MPI_SUM, 0, MACSIO_MAIN_Comm);
    MPI_Reduce(&dumpBytes, &summedBytes, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MACSIO_MAIN_Comm);
    MPI_Reduce(&dump_loop_start, &min_dump_loop_start, 1, MPI_DOUBLE, MPI_MIN, 0, MACSIO_MAIN_Comm);
    MPI_Reduce(&dump_loop_end, &max_dump_loop_end, 1, MPI_DOUBLE, MPI_MAX, 0, MACSIO_MAIN_Comm);
#endif

    if (rank == 0)
    {
        MACSIO_LOG_MSG(Info, ("Summed  BW: %s",
            MU_PrBW(summedBandwidth, 1.0, 0, bandwidth_str, sizeof(bandwidth_str))));
        MACSIO_LOG_MSG(Info, ("Total Bytes: %s; Last finisher - First starter = %s; BW = %s",
            MU_PrByts(summedBytes, 0, nbytes_str, sizeof(nbytes_str)),
            MU_PrSecs(max_dump_loop_end - min_dump_loop_start, 0, seconds_str, sizeof(seconds_str)),
            MU_PrBW(summedBytes, max_dump_loop_end - min_dump_loop_start, 0, bandwidth_str, sizeof(bandwidth_str))));
    }

    return 0;
}
Example #2
0
/*
 * Write one complete SCR checkpoint consisting of num_files files.
 *
 * Brackets the writes with SCR_Start_checkpoint()/SCR_Complete_checkpoint(),
 * routes each file name through SCR_Route_file(), writes, fsyncs and closes
 * each file, and reports to SCR whether every file was written without error.
 * Rank 0 prints a completion line. The file-level `timestep` counter is
 * incremented once per call.
 *
 * Returns the number of files that were opened successfully.
 */
static int
write_checkpoint_set(int rank, int num_files, char **files, char **bufs,
                     const size_t *filesizes)
{
  int i;
  int opened = 0;
  int all_valid = 1;

  int scr_retval = SCR_Start_checkpoint();
  if (scr_retval != SCR_SUCCESS) {
    printf("%d: failed calling SCR_Start_checkpoint(): %d: @%s:%d\n",
           rank, scr_retval, __FILE__, __LINE__
    );
  }

  for (i = 0; i < num_files; i++) {
    int valid = 0;
    /* start empty so that a failed route leads to a clean open() failure */
    char file[2094] = "";
    scr_retval = SCR_Route_file(files[i], file);
    if (scr_retval != SCR_SUCCESS) {
      printf("%d: failed calling SCR_Route_file(): %d: @%s:%d\n",
             rank, scr_retval, __FILE__, __LINE__
      );
    }

    /* open() reports failure with -1; descriptor 0 is a valid result */
    int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
    if (fd_me >= 0) {
      opened++;
      valid = 1;

      /* write the checkpoint */
      if (write_checkpoint(fd_me, timestep, bufs[i], filesizes[i]) < 0) { valid = 0; }

      /* force the data to storage */
      if (fsync(fd_me) < 0) { valid = 0; }

      /* make sure the close is without error */
      if (close(fd_me) < 0) { valid = 0; }
    }
    if (!valid) { all_valid = 0; }
  }

  scr_retval = SCR_Complete_checkpoint(all_valid);
  if (scr_retval != SCR_SUCCESS) {
    printf("%d: failed calling SCR_Complete_checkpoint(): %d: @%s:%d\n",
           rank, scr_retval, __FILE__, __LINE__
    );
  }
  if (rank == 0) { printf("Completed checkpoint %d.\n", timestep); fflush(stdout); }

  timestep++;
  return opened;
}

/*
 * SCR API test driver in which each rank writes (rank % 4) checkpoint files.
 *
 * Usage: test_api_file [filesize times sleep_secs path_to_stdout]
 *    or: test_api_file [path_to_stdout]
 *
 * Steps: redirect stdout to the given file, time SCR_Init(), read back and
 * verify any existing checkpoint files, check that all ranks agree on the
 * timestep, write one untimed checkpoint, then write `times` timed
 * checkpoints and report min/max/average bandwidth on rank 0.
 *
 * Exits with 1 on bad usage; returns -1 if SCR_Init() fails, 1 on a detected
 * corruption, timestep disagreement or setup failure, and 0 otherwise.
 */
int main (int argc, char* argv[])
{
  char *path_to_stdout = NULL;
  int scr_retval;
  /* check that we got an appropriate number of arguments */
  if (argc == 2) {
    path_to_stdout = argv[1];
  }
  else if(argc == 5){
    filesize = (size_t) atol(argv[1]);
    times = atoi(argv[2]);
    seconds = atoi(argv[3]);
    path_to_stdout = argv[4];
  }
  else{
    printf("Usage: test_api_file [filesize times sleep_secs path_to_stdout]\n");
    printf("OR: test_api_file [ path_to_stdout]\n");
    exit(1);
  }
  
  MPI_Init(&argc, &argv);

  int rank = -1, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* open file for stdout */
  printf("new stdout filename: \"%s\"\n", path_to_stdout);
  fflush(stdout);
  if (freopen(path_to_stdout, "a+", stdout) == NULL) {
    /* stdout is unusable after a failed freopen, so report on stderr */
    fprintf(stderr, "%d: failed to reopen stdout as \"%s\"\n", rank, path_to_stdout);
    MPI_Abort(MPI_COMM_WORLD, 1);
    return 1;
  }
  MPI_Barrier(MPI_COMM_WORLD);

  /* time how long it takes to get through init */
  MPI_Barrier(MPI_COMM_WORLD);

  double init_start = MPI_Wtime();
  if (SCR_Init() != SCR_SUCCESS){
    printf("FAILED INITIALIZING SCR\n");
    fclose(stdout);
    return -1;
  }
  double init_end = MPI_Wtime();
  double secs = init_end - init_start;

  MPI_Barrier(MPI_COMM_WORLD);

  double secsmin, secsmax, secssum;
  MPI_Reduce(&secs, &secsmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
  MPI_Reduce(&secs, &secsmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&secs, &secssum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  if (rank == 0) { printf("Init: Min %8.6f s\tMax %8.6f s\tAvg %8.6f s\n", secsmin, secsmax, secssum/size); }

  MPI_Barrier(MPI_COMM_WORLD);

  /* each rank handles between 0 and 3 files */
  int num_files = rank % 4;
  char** files = NULL;
  char** bufs  = NULL;
  size_t* filesizes = NULL;
  char* buf = NULL;
  if (num_files > 0) {
    files = (char**) malloc(num_files * sizeof(char*));
    bufs  = (char**) malloc(num_files * sizeof(char*));
    filesizes = (size_t*) malloc(num_files * sizeof(size_t));
    if (files == NULL || bufs == NULL || filesizes == NULL) {
      printf("%d: failed to allocate checkpoint bookkeeping arrays\n", rank);
      fflush(stdout);
      MPI_Abort(MPI_COMM_WORLD, 1);
      return 1;
    }
  }

  int i;
  for (i=0; i < num_files; i++) {
    // name and size each checkpoint file, and allocate its data buffer
    char name[256];
    snprintf(name, sizeof(name), "rank_%d.%d.ckpt", rank, i);
    files[i] = strdup(name);
    filesizes[i] = filesize + rank + 2*i;
    bufs[i] = (char*) malloc(filesizes[i]);
    if (files[i] == NULL || bufs[i] == NULL) {
      printf("%d: failed to allocate buffers for checkpoint file %d\n", rank, i);
      fflush(stdout);
      MPI_Abort(MPI_COMM_WORLD, 1);
      return 1;
    }
  }
  if (num_files > 0) {
    // sizes grow with i, so a buffer sized for the last file fits every file
    buf = (char*) malloc(filesizes[num_files-1]);
    if (buf == NULL) {
      printf("%d: failed to allocate read buffer\n", rank);
      fflush(stdout);
      MPI_Abort(MPI_COMM_WORLD, 1);
      return 1;
    }
  }

  // check each of our checkpoint files
  int found_checkpoint = 1;
  for (i=0; i < num_files; i++) {
    // start empty so that a failed route leads to a clean read failure
    char file[2094] = "";
    scr_retval = SCR_Route_file(files[i], file);
    if (scr_retval != SCR_SUCCESS) {
      printf("%d: failed calling SCR_Route_file(): %d: @%s:%d\n",
             rank, scr_retval, __FILE__, __LINE__
      );
    }
    if (read_checkpoint(file, &timestep, buf, filesizes[i])) {
      // check that contents are good
      if (!check_buffer(buf, filesizes[i], rank + 2*i, timestep)) {
        printf("!!!!CORRUPTION!!!! Rank %d, File %s: Invalid value in buffer\n", rank, file);
        fflush(stdout);
        fclose(stdout);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
      }
    } else {
      found_checkpoint = 0;
    }
  }

  // check that everyone found their checkpoint files ok
  int all_found_checkpoint = 0;
  MPI_Allreduce(&found_checkpoint, &all_found_checkpoint, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
  if (!all_found_checkpoint && rank == 0) {
    printf("At least one rank (perhaps all) did not find its checkpoint\n");
    fflush(stdout);
  }

  // check that everyone is at the same timestep: the bitwise AND and the
  // bitwise OR over all contributions are equal only if all of them match
  int timestep_and, timestep_or;
  int timestep_a, timestep_o;
  if (num_files > 0) {
    timestep_a = timestep;
    timestep_o = timestep;
  } else {
    // ranks without files contribute the identity of each operation
    timestep_a = ~0;
    timestep_o = 0;
  }
  MPI_Allreduce(&timestep_a, &timestep_and, 1, MPI_INT, MPI_BAND, MPI_COMM_WORLD);
  MPI_Allreduce(&timestep_o, &timestep_or,  1, MPI_INT, MPI_BOR,  MPI_COMM_WORLD);
  if (timestep_and != timestep_or) {
    printf("%d: Timesteps don't agree: timestep %d\n", rank, timestep);
    fflush(stdout);
    fclose(stdout);
    return 1;
  }
  timestep = timestep_and;

  // make up some data for the next checkpoint
  for (i=0; i < num_files; i++) {
    init_buffer(bufs[i], filesizes[i], rank + 2*i, timestep);
  }

  timestep++;

  // prime system once before timing
  (void) write_checkpoint_set(rank, num_files, files, bufs, filesizes);
  MPI_Barrier(MPI_COMM_WORLD);

  if (times > 0) {
    int t;
    int count = 0;
    double time_start = MPI_Wtime();
    for(t=0; t < times; t++) {
      count += write_checkpoint_set(rank, num_files, files, bufs, filesizes);

      if (seconds > 0) {
        if (rank == 0) { printf("Sleeping for %d seconds... \n", seconds); fflush(stdout); }
        sleep(seconds);
      }
    }
    double time_end = MPI_Wtime();

    // compute in floating point so sub-megabyte totals are not truncated to 0
    double megabytes = ((double) filesize * count) / (1024.0 * 1024.0);
    double bw = megabytes / (time_end - time_start);
    
    MPI_Barrier(MPI_COMM_WORLD);
    
    double bwmin, bwmax, bwsum;
    MPI_Reduce(&bw, &bwmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&bw, &bwmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&bw, &bwsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0) { printf("FileIO: Min %7.2f MB/s\tMax %7.2f MB/s\tAvg %7.2f MB/s\n", bwmin, bwmax, bwsum/size); }
  }

  // release everything; free(NULL) is a no-op so no guards are needed
  free(buf);
  buf = NULL;
  for (i=0; i < num_files; i++) {
    free(bufs[i]);
    bufs[i] = NULL;
    free(files[i]);
    files[i] = NULL;
  }
  free(files);
  files = NULL;
  free(bufs);
  bufs = NULL;
  free(filesizes);
  filesizes = NULL;

  scr_retval = SCR_Finalize();
  if (scr_retval != SCR_SUCCESS) {
    printf("%d: failed calling SCR_Finalize(): %d: @%s:%d\n",
           rank, scr_retval, __FILE__, __LINE__
    );
  }
  MPI_Finalize();

  fclose(stdout);
  return 0;
}
Example #3
0
/*
 * Run nn relaxation sweeps over the interior points of the pressure grid p.
 *
 * Each sweep computes the stencil update into wrk2, copies wrk2 back into p,
 * exchanges data through sendp(), and (when SCR_ENABLE is defined and SCR
 * asks for one) writes a checkpoint of p. Compute and communication times
 * are printed to stderr by the process with id 0.
 *
 * nn  number of sweeps to perform
 *
 * Returns the sum over all ranks (MPI_Allreduce with MPI_SUM) of the squared
 * residuals from the last sweep, or 0 when nn <= 0.
 */
float
jacobi(int nn)
{
  int i,j,k,n;
  float gosa = 0.0f;   /* defined result even if the loop body never runs */
  float wgosa,s0,ss;
  double s, cpu, com;

  for(n=0 ; n<nn ; ++n){
    gosa = 0.0;
    wgosa= 0.0;

    /* --- compute phase --- */
    s = MPI_Wtime();
    for(i=1 ; i<imax-1 ; ++i)
      for(j=1 ; j<jmax-1 ; ++j)
        for(k=1 ; k<kmax-1 ; ++k){
          s0 = a[0][i][j][k] * p[i+1][j  ][k  ]
             + a[1][i][j][k] * p[i  ][j+1][k  ]
             + a[2][i][j][k] * p[i  ][j  ][k+1]
             + b[0][i][j][k] * ( p[i+1][j+1][k  ] - p[i+1][j-1][k  ]
                               - p[i-1][j+1][k  ] + p[i-1][j-1][k  ] )
             + b[1][i][j][k] * ( p[i  ][j+1][k+1] - p[i  ][j-1][k+1]
                               - p[i  ][j+1][k-1] + p[i  ][j-1][k-1] )
             + b[2][i][j][k] * ( p[i+1][j  ][k+1] - p[i-1][j  ][k+1]
                               - p[i+1][j  ][k-1] + p[i-1][j  ][k-1] )
             + c[0][i][j][k] * p[i-1][j  ][k  ]
             + c[1][i][j][k] * p[i  ][j-1][k  ]
             + c[2][i][j][k] * p[i  ][j  ][k-1]
             + wrk1[i][j][k];

          ss = ( s0 * a[3][i][j][k] - p[i][j][k] ) * bnd[i][j][k];
          wgosa += ss*ss;

          /* write into wrk2 so this sweep only ever reads old values of p */
          wrk2[i][j][k] = p[i][j][k] + omega * ss;
        }

    for(i=1 ; i<imax-1 ; ++i)
      for(j=1 ; j<jmax-1 ; ++j)
        for(k=1 ; k<kmax-1 ; ++k)
          p[i][j][k] = wrk2[i][j][k];

    cpu = MPI_Wtime() - s;

    /* --- communication phase --- */
    s = MPI_Wtime();
    sendp(ndx,ndy,ndz);
    com = MPI_Wtime() - s;

    if (id == 0) {
      /* argument order must follow the format: sweep index first, then times */
      fprintf(stderr, "%d: time: %f cpu: %f com: %f nn:%d, imax:%d, jmax:%d, kmax:%d\n", n, cpu + com, cpu, com, nn, imax, jmax, kmax);
    }

#ifdef SCR_ENABLE
    int flag;
    char SCR_testFileName[SCR_MAX_FILENAME];
    char     testFileName[SCR_MAX_FILENAME];
    double gs, ge, start, end, dump, encoding;
    gs = start = MPI_Wtime();
    SCR_Need_checkpoint(&flag);
    if(flag){
      SCR_Start_checkpoint();
      snprintf(testFileName, sizeof(testFileName), "testfile.%d", id);
      /* pre-fill the routed name with the plain name before asking SCR */
      strcpy(SCR_testFileName, testFileName);
      SCR_Route_file(testFileName, SCR_testFileName);
      if(id == 0)  printf(" ***** SCR_ROUTE_FILE %s\n", SCR_testFileName);
      /* sizeof first, so the whole product is evaluated in size_t */
      file_dump(SCR_testFileName, p, sizeof(float) * MIMAX * MJMAX * MKMAX);
      end = MPI_Wtime();
      dump = end - start;
      start = MPI_Wtime();
      SCR_Complete_checkpoint(1);
      ge = end = MPI_Wtime();
      encoding = end - start;

      if (id == 0) fprintf(stderr, "time: %f (write: %f , enco:  %f)\n", ge - gs, dump, encoding);
    }
#endif

    /* combine the per-rank residual sums into gosa on every rank */
    MPI_Allreduce(&wgosa,
                  &gosa,
                  1,
                  MPI_FLOAT,
                  MPI_SUM,
                  MPI_COMM_WORLD);
  } /* end n loop */

  return(gosa);
}
Example #4
0
/*
 * Write `times` SCR checkpoints of one file each and measure the bandwidth.
 *
 * Every iteration starts an SCR checkpoint, routes "timestep.<N>/<name>"
 * through SCR_Route_file(), writes, fsyncs and closes the file, completes
 * the checkpoint with a validity flag, increments the file-level `timestep`
 * counter, and optionally sleeps for `seconds`. SCR_Need_checkpoint() is not
 * consulted: a checkpoint is written on every iteration.
 *
 * name   file name used inside each timestep directory
 * buf    data passed to write_checkpoint()
 * size   number of bytes in buf
 * times  number of checkpoints to write
 *
 * Returns size * (files opened successfully) in MB divided by the elapsed
 * wall-clock seconds, or 0.0 when times <= 0.
 */
double getbw(char* name, char* buf, size_t size, int times)
{
  /* start empty so that a failed route leads to a clean open() failure */
  char file[SCR_MAX_FILENAME] = "";
  double bw = 0.0;

  if (times > 0) {
    /* start the timer */
    double time_start = MPI_Wtime();

    /* write the checkpoint file */
    int i, count = 0;
    for(i=0; i < times; i++) {
      int rc;
      int valid = 0;

      /* instruct SCR we are starting the next checkpoint */
      SCR_Start_checkpoint();

      /* get the file name to write our checkpoint file to */
      char newname[SCR_MAX_FILENAME];
      snprintf(newname, sizeof(newname), "timestep.%d/%s", timestep, name);
      SCR_Route_file(newname, file);

      /* open the file and write the checkpoint;
       * open() reports failure with -1, descriptor 0 is a valid result */
      int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
      if (fd_me >= 0) {
        count++;
        valid = 1;

        /* write the checkpoint data */
        rc = write_checkpoint(fd_me, timestep, buf, size);
        if (rc < 0) {
          valid = 0;
          printf("%d: Error writing to %s\n", rank, file);
        }

        /* force the data to storage */
        rc = fsync(fd_me);
        if (rc < 0) {
          valid = 0;
          printf("%d: Error fsync %s\n", rank, file);
        }

        /* make sure the close is without error */
        rc = close(fd_me);
        if (rc < 0) {
          valid = 0;
          printf("%d: Error closing %s\n", rank, file);
        }
      }
      else {
        printf("%d: Could not open file %s\n", rank, file);
      }

      /* mark this checkpoint as complete */
      SCR_Complete_checkpoint(valid);
      if (rank == 0) {
        printf("Completed checkpoint %d.\n", timestep);
        fflush(stdout);
      }

      /* increase the timestep counter */
      timestep++;

      /* optionally sleep for some time */
      if (seconds > 0) {
        if (rank == 0) { printf("Sleeping for %d seconds... \n", seconds); fflush(stdout); }
        sleep(seconds);
      }
    }

    /* stop the timer and compute the bandwidth in floating point, so that
     * totals below one megabyte are not truncated to zero */
    double time_end = MPI_Wtime();
    double megabytes = ((double) size * count) / (1024.0 * 1024.0);
    bw = megabytes / (time_end - time_start);
  }

  return bw;
}