Пример #1
0
extern void getGCStats( GCStats *s )
{
    nat total_collections = 0;
    nat g;
    Time gc_cpu = 0;
    Time gc_elapsed = 0;
    Time current_elapsed = 0;
    Time current_cpu = 0;

    getProcessTimes(&current_cpu, &current_elapsed);

    /* EZY: static inline'ify these */
    for (g = 0; g < RtsFlags.GcFlags.generations; g++)
        total_collections += generations[g].collections;

    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        gc_cpu     += GC_coll_cpu[g];
        gc_elapsed += GC_coll_elapsed[g];
    }

    s->bytes_allocated = GC_tot_alloc*(StgWord64)sizeof(W_);
    s->num_gcs = total_collections;
    s->num_byte_usage_samples = residency_samples;
    s->max_bytes_used = max_residency*sizeof(W_);
    s->cumulative_bytes_used = cumulative_residency*(StgWord64)sizeof(W_);
    s->peak_megabytes_allocated = (StgWord64)(peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L));
    s->bytes_copied = GC_tot_copied*(StgWord64)sizeof(W_);
    s->max_bytes_slop = max_slop*(StgWord64)sizeof(W_);
    s->current_bytes_used = current_residency*(StgWord64)sizeof(W_);
    s->current_bytes_slop = current_slop*(StgWord64)sizeof(W_);
    /*
    s->init_cpu_seconds = TimeToSecondsDbl(get_init_cpu());
    s->init_wall_seconds = TimeToSecondsDbl(get_init_elapsed());
    */
    s->mutator_cpu_seconds = TimeToSecondsDbl(current_cpu - end_init_cpu - gc_cpu - PROF_VAL(RP_tot_time + HC_tot_time));
    s->mutator_wall_seconds = TimeToSecondsDbl(current_elapsed- end_init_elapsed - gc_elapsed);
    s->gc_cpu_seconds = TimeToSecondsDbl(gc_cpu);
    s->gc_wall_seconds = TimeToSecondsDbl(gc_elapsed);
    /* EZY: Being consistent with incremental output, but maybe should also discount init */
    s->cpu_seconds = TimeToSecondsDbl(current_cpu);
    s->wall_seconds = TimeToSecondsDbl(current_elapsed - end_init_elapsed);
    s->par_tot_bytes_copied = GC_par_tot_copied*(StgWord64)sizeof(W_);
    s->par_max_bytes_copied = GC_par_max_copied*(StgWord64)sizeof(W_);
}
Пример #2
0
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fputs("We need 3 arguments\n", stdout);
        return 1;
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

    rebuild_target = atoi(argv[1]);
    const char *store_dir = argv[2];
    const char *data_file = argv[3];

    int ntargets = mpi_world_size - 1;
    if (ntargets > MAX_STORAGE_TARGETS)
        return 1;

    if (rebuild_target < 0 || rebuild_target > ntargets)
        return 1;
    helper = 1;
    while (helper == rebuild_target)
        helper += 1;
    if (helper == rebuild_target)
        return 1;

    PROF_START(total);
    PROF_START(init);

    int last_run_fd = -1;
    RunData last_run;
    memset(&last_run, 0, sizeof(RunData));
    if (mpi_rank == 0) {
        last_run_fd = open(data_file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
        read(last_run_fd, &last_run, sizeof(RunData));
    }

    /* Create mapping from storage targets to ranks, and vice versa */
    Target targetIDs[MAX_STORAGE_TARGETS] = {{0,0}};
    Target targetID = {0,0};
    if (mpi_rank != 0)
    {
        int store_fd = open(store_dir, O_DIRECTORY | O_RDONLY);
        int target_ID_fd = openat(store_fd, "targetNumID", O_RDONLY);
        char targetID_s[20] = {0};
        read(target_ID_fd, targetID_s, sizeof(targetID_s));
        close(target_ID_fd);
        close(store_fd);
        targetID.id = atoi(targetID_s);
        targetID.rank = mpi_rank;
    }
    MPI_Gather(
            &targetID, sizeof(Target), MPI_BYTE,
            targetIDs, sizeof(Target), MPI_BYTE,
            0,
            MPI_COMM_WORLD);
    if (mpi_rank == 0) {
        if (last_run.ntargets != ntargets) {
            /* ERROR - new number of targets */
            assert(0);
        }
        for (int i = 0; i < ntargets; i++)
            targetIDs[i] = targetIDs[i+1];
        for (int i = 0; i < ntargets; i++)
            last_run.targetIDs[i].rank = -1;
        for (int i = 0; i < ntargets; i++) {
            Target target = targetIDs[i];
            int j = 0;
            int found = 0;
            for (; j < ntargets; j++)
                if (last_run.targetIDs[j].id == target.id) {
                    last_run.targetIDs[j] = target;
                    found = 1;
                }
            if (!found) {
                /* ERROR - new target introduced */
                printf(" > %d, %d\n", target.id, target.rank);
                assert(0);
            }
        }
        rank2st[0] = -1;
        for (int i = 0; i < ntargets; i++)
        {
            st2rank[i] = last_run.targetIDs[i].rank;
            rank2st[st2rank[i]] = i;
        }
    }
    MPI_Bcast(st2rank, sizeof(st2rank), MPI_BYTE, 0, MPI_COMM_WORLD);
    MPI_Bcast(rank2st, sizeof(rank2st), MPI_BYTE, 0, MPI_COMM_WORLD);

    PROF_END(init);

    if (mpi_rank == 0)
        printf("%d(rank=%d), %d(rank=%d)\n", rebuild_target, st2rank[rebuild_target], helper, st2rank[helper]);

    PROF_START(main_work);

    memset(&pr_sender, 0, sizeof(pr_sender));

    if (mpi_rank != 0 && rank2st[mpi_rank] != rebuild_target)
    {
        PersistentDB *pdb = pdb_init();
        pdb_iterate(pdb, do_file);
        pdb_term(pdb);

        if (rank2st[mpi_rank] == helper) {
            int dummy;
            MPI_Ssend((void*)&dummy, sizeof(dummy), MPI_BYTE, st2rank[rebuild_target], 0, MPI_COMM_WORLD);
        }
        pr_add_tmp_to_total(&pr_sample);
        pr_report_progress(&pr_sender, pr_sample);
        pr_report_done(&pr_sender);
    }
    else if (rank2st[mpi_rank] == rebuild_target)
    {
        int helper_rank = st2rank[helper];
        MPI_Status stat;
        int count;
        FileInfo fi;
        MPI_Recv(&fi, sizeof(FileInfo), MPI_BYTE, helper_rank, 0, MPI_COMM_WORLD, &stat);
        MPI_Get_count(&stat, MPI_BYTE, &count);
        while (count == sizeof(FileInfo)) {
            char key[200];
            MPI_Recv(key, sizeof(key), MPI_BYTE, helper_rank, 0, MPI_COMM_WORLD, &stat);
            int keylen;
            MPI_Get_count(&stat, MPI_BYTE, &keylen);
            key[keylen] = '\0';
            do_file(key, keylen, &fi);

            MPI_Recv(&fi, sizeof(FileInfo), MPI_BYTE, helper_rank, 0, MPI_COMM_WORLD, &stat);
            MPI_Get_count(&stat, MPI_BYTE, &count);
        }
        pr_add_tmp_to_total(&pr_sample);
        pr_report_progress(&pr_sender, pr_sample);
        pr_report_done(&pr_sender);
    }
    else if (mpi_rank == 0)
    {
        printf("st - total files   | data read     | data written  | disk I/O\n");
        pr_receive_loop(ntargets-1);
    }

    PROF_END(main_work);
    PROF_END(total);

    if (mpi_rank == 0) {
        printf("Overall timings: \n");
        printf("init       | %9.2f ms\n", 1e3*PROF_VAL(init));
        printf("main_work  | %9.2f ms\n", 1e3*PROF_VAL(main_work));
        printf("total      | %9.2f ms\n", 1e3*PROF_VAL(total));
    }

    MPI_Barrier(MPI_COMM_WORLD);
    char *iter = hs.corrupt;
    for (size_t i = 0; i < hs.corrupt_count; i++)
    {
        printf("Potentially corrupt chunk: '%s'\n", iter);
        iter += strlen(iter);
    }

    MPI_Finalize();
}
Пример #3
0
Файл: Stats.c Проект: Sciumo/ghc
double
mut_user_time_until( Ticks t )
{
    return TICK_TO_DBL(t - GC_tot_cpu - PROF_VAL(RP_tot_time));
}
Пример #4
0
void
stat_exit (void)
{
    generation *gen;
    Time gc_cpu = 0;
    Time gc_elapsed = 0;
    Time init_cpu = 0;
    Time init_elapsed = 0;
    Time mut_cpu = 0;
    Time mut_elapsed = 0;
    Time exit_cpu = 0;
    Time exit_elapsed = 0;
    W_ tot_alloc;
    W_ alloc;

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {

	char temp[BIG_STRING_LEN];
	Time tot_cpu;
	Time tot_elapsed;
	nat i, g, total_collections = 0;

	getProcessTimes( &tot_cpu, &tot_elapsed );
	tot_elapsed -= start_init_elapsed;

        tot_alloc = calcTotalAllocated();

        // allocated since the last GC
        alloc = tot_alloc - GC_tot_alloc;
        GC_tot_alloc = tot_alloc;

	/* Count total garbage collections */
	for (g = 0; g < RtsFlags.GcFlags.generations; g++)
	    total_collections += generations[g].collections;

	/* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
	if (tot_cpu  == 0.0)  tot_cpu = 1;
	if (tot_elapsed == 0.0) tot_elapsed = 1;
	
	if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
	    statsPrintf("%9" FMT_SizeT " %9.9s %9.9s", (W_)alloc*sizeof(W_), "", "");
	    statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
	}

        for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
            gc_cpu     += GC_coll_cpu[i];
            gc_elapsed += GC_coll_elapsed[i];
        }

        // heapCensus() is called by the GC, so RP and HC time are
        // included in the GC stats.  We therefore subtract them to
        // obtain the actual GC cpu time.
        gc_cpu     -= PROF_VAL(RP_tot_time + HC_tot_time);
        gc_elapsed -= PROF_VAL(RPe_tot_time + HCe_tot_time);

        init_cpu     = get_init_cpu();
        init_elapsed = get_init_elapsed();

        exit_cpu     = end_exit_cpu - start_exit_cpu;
        exit_elapsed = end_exit_elapsed - start_exit_elapsed;

        mut_elapsed = start_exit_elapsed - end_init_elapsed - gc_elapsed;

        mut_cpu = start_exit_cpu - end_init_cpu - gc_cpu
            - PROF_VAL(RP_tot_time + HC_tot_time);
        if (mut_cpu < 0) { mut_cpu = 0; }

	if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
	    showStgWord64(GC_tot_alloc*sizeof(W_), 
				 temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes allocated in the heap\n", temp);

	    showStgWord64(GC_tot_copied*sizeof(W_), 
				 temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes copied during GC\n", temp);

            if ( residency_samples > 0 ) {
		showStgWord64(max_residency*sizeof(W_), 
				     temp, rtsTrue/*commas*/);
		statsPrintf("%16s bytes maximum residency (%" FMT_Word " sample(s))\n",
			temp, residency_samples);
	    }

	    showStgWord64(max_slop*sizeof(W_), temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes maximum slop\n", temp);

	    statsPrintf("%16" FMT_SizeT " MB total memory in use (%" FMT_SizeT " MB lost due to fragmentation)\n\n", 
                        (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
                        (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));

	    /* Print garbage collections in each gen */
            statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
            for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                gen = &generations[g];
                statsPrintf("  Gen %2d     %5d colls, %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
                            gen->no,
                            gen->collections,
                            gen->par_collections,
                            TimeToSecondsDbl(GC_coll_cpu[g]),
                            TimeToSecondsDbl(GC_coll_elapsed[g]),
                            gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
                            TimeToSecondsDbl(GC_coll_max_pause[g]));
            }

#if defined(THREADED_RTS)
            if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) {
                statsPrintf("\n  Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n", 
                            100 * (((double)GC_par_tot_copied / (double)GC_par_max_copied) - 1)
                                / (n_capabilities - 1)
                    );
            }
#endif
            statsPrintf("\n");

#if defined(THREADED_RTS)
            statsPrintf("  TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
                        taskCount, taskCount - workerCount,
                        peakWorkerCount, workerCount,
                        n_capabilities);

	    statsPrintf("\n");

            {
                nat i;
                SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
                for (i = 0; i < n_capabilities; i++) {
                    sparks.created   += capabilities[i]->spark_stats.created;
                    sparks.dud       += capabilities[i]->spark_stats.dud;
                    sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
                    sparks.converted += capabilities[i]->spark_stats.converted;
                    sparks.gcd       += capabilities[i]->spark_stats.gcd;
                    sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
                }

                statsPrintf("  SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
                            sparks.created + sparks.dud + sparks.overflowed,
                            sparks.converted, sparks.overflowed, sparks.dud,
                            sparks.gcd, sparks.fizzled);
            }
#endif

	    statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));

            statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
            statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
                        TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));

#ifdef PROFILING
	    statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
		    TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
	    statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
		    TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
#endif 
	    statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
		    TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
	    statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
		    TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
#ifndef THREADED_RTS
	    statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
		    TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
		    TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
#endif

            if (mut_cpu == 0) {
		showStgWord64(0, temp, rtsTrue/*commas*/);
            } else {
		showStgWord64(
                    (StgWord64)((GC_tot_alloc*sizeof(W_)) / TimeToSecondsDbl(mut_cpu)),
                    temp, rtsTrue/*commas*/);
            }

	    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
	
	    statsPrintf("  Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
                    TimeToSecondsDbl(tot_cpu - gc_cpu -
				PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 
		    / TimeToSecondsDbl(tot_cpu), 
                    TimeToSecondsDbl(tot_cpu - gc_cpu -
                                PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
                    / TimeToSecondsDbl(tot_elapsed));

            /*
            TICK_PRINT(1);
            TICK_PRINT(2);
	    REPORT(TOTAL_CALLS);
            TICK_PRINT_TOT(1);
            TICK_PRINT_TOT(2);
            */

#if USE_PAPI
            papi_stats_report();
#endif
#if defined(THREADED_RTS) && defined(PROF_SPIN)
            {
                nat g;
                
                statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
                statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
                for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                    statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
                }
            }
#endif
	}

	if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
      char *fmt1, *fmt2;
      if (RtsFlags.MiscFlags.machineReadable) {
          fmt1 = " [(\"bytes allocated\", \"%llu\")\n";
          fmt2 = " ,(\"num_GCs\", \"%d\")\n"
                 " ,(\"average_bytes_used\", \"%ld\")\n"
                 " ,(\"max_bytes_used\", \"%ld\")\n"
                 " ,(\"num_byte_usage_samples\", \"%ld\")\n"
                 " ,(\"peak_megabytes_allocated\", \"%lu\")\n"
                 " ,(\"init_cpu_seconds\", \"%.3f\")\n"
                 " ,(\"init_wall_seconds\", \"%.3f\")\n"
                 " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
                 " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
                 " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
                 " ,(\"GC_wall_seconds\", \"%.3f\")\n"
                 " ]\n";
      }
      else {
          fmt1 = "<<ghc: %llu bytes, ";
          fmt2 = "%d GCs, %ld/%ld avg/max bytes residency (%ld samples), %luM in use, %.3f INIT (%.3f elapsed), %.3f MUT (%.3f elapsed), %.3f GC (%.3f elapsed) :ghc>>\n";
      }
	  /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
	  statsPrintf(fmt1, GC_tot_alloc*(StgWord64)sizeof(W_));
	  statsPrintf(fmt2,
		    total_collections,
		    residency_samples == 0 ? 0 : 
		        cumulative_residency*sizeof(W_)/residency_samples, 
		    max_residency*sizeof(W_), 
		    residency_samples,
		    (unsigned long)(peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
		    TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
		    TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
		    TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
	}

	statsFlush();
	statsClose();
    }

    if (GC_coll_cpu) {
      stgFree(GC_coll_cpu);
      GC_coll_cpu = NULL;
    }
    if (GC_coll_elapsed) {
      stgFree(GC_coll_elapsed);
      GC_coll_elapsed = NULL;
    }
    if (GC_coll_max_pause) {
      stgFree(GC_coll_max_pause);
      GC_coll_max_pause = NULL;
    }
}
Пример #5
0
Файл: Stats.c Проект: Sciumo/ghc
void
stat_exit(int alloc)
{
    generation *gen;
    Ticks gc_cpu = 0;
    Ticks gc_elapsed = 0;
    Ticks init_cpu = 0;
    Ticks init_elapsed = 0;
    Ticks mut_cpu = 0;
    Ticks mut_elapsed = 0;
    Ticks exit_cpu = 0;
    Ticks exit_elapsed = 0;

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {

	char temp[BIG_STRING_LEN];
	Ticks tot_cpu;
	Ticks tot_elapsed;
	nat i, g, total_collections = 0;

	getProcessTimes( &tot_cpu, &tot_elapsed );
	tot_elapsed -= start_init_elapsed;

	GC_tot_alloc += alloc;

	/* Count total garbage collections */
	for (g = 0; g < RtsFlags.GcFlags.generations; g++)
	    total_collections += generations[g].collections;

	/* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
	if (tot_cpu  == 0.0)  tot_cpu = 1;
	if (tot_elapsed == 0.0) tot_elapsed = 1;
	
	if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
	    statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", "");
	    statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0);
	}

        for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
            gc_cpu     += GC_coll_cpu[i];
            gc_elapsed += GC_coll_elapsed[i];
        }

        init_cpu     = get_init_cpu();
        init_elapsed = get_init_elapsed();

        exit_cpu     = end_exit_cpu - start_exit_cpu;
        exit_elapsed = end_exit_elapsed - start_exit_elapsed;

        mut_elapsed = start_exit_elapsed - end_init_elapsed - gc_elapsed;

        mut_cpu = start_exit_cpu - end_init_cpu - gc_cpu
            - PROF_VAL(RP_tot_time + HC_tot_time);
        if (mut_cpu < 0) { mut_cpu = 0; }

	if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
	    showStgWord64(GC_tot_alloc*sizeof(W_), 
				 temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes allocated in the heap\n", temp);

	    showStgWord64(GC_tot_copied*sizeof(W_), 
				 temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes copied during GC\n", temp);

            if ( residency_samples > 0 ) {
		showStgWord64(max_residency*sizeof(W_), 
				     temp, rtsTrue/*commas*/);
		statsPrintf("%16s bytes maximum residency (%ld sample(s))\n",
			temp, residency_samples);
	    }

	    showStgWord64(max_slop*sizeof(W_), temp, rtsTrue/*commas*/);
	    statsPrintf("%16s bytes maximum slop\n", temp);

	    statsPrintf("%16ld MB total memory in use (%ld MB lost due to fragmentation)\n\n", 
                        peak_mblocks_allocated * MBLOCK_SIZE_W / (1024 * 1024 / sizeof(W_)),
                        (peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));

	    /* Print garbage collections in each gen */
            statsPrintf("                                    Tot time (elapsed)  Avg pause  Max pause\n");
            for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                gen = &generations[g];
                statsPrintf("  Gen %2d     %5d colls, %5d par   %5.2fs   %5.2fs     %3.4fs    %3.4fs\n",
                            gen->no,
                            gen->collections,
                            gen->par_collections,
                            TICK_TO_DBL(GC_coll_cpu[g]),
                            TICK_TO_DBL(GC_coll_elapsed[g]),
                            gen->collections == 0 ? 0 : TICK_TO_DBL(GC_coll_elapsed[g] / gen->collections),
                            TICK_TO_DBL(GC_coll_max_pause[g]));
            }

#if defined(THREADED_RTS)
            if (RtsFlags.ParFlags.parGcEnabled) {
                statsPrintf("\n  Parallel GC work balance: %.2f (%ld / %ld, ideal %d)\n", 
                            (double)GC_par_avg_copied / (double)GC_par_max_copied,
                            (lnat)GC_par_avg_copied, (lnat)GC_par_max_copied,
                            RtsFlags.ParFlags.nNodes
                    );
            }
#endif
            statsPrintf("\n");

#if defined(THREADED_RTS)
	    {
		nat i;
		Task *task;
                statsPrintf("                        MUT time (elapsed)       GC time  (elapsed)\n");
		for (i = 0, task = all_tasks; 
		     task != NULL; 
		     i++, task = task->all_link) {
		    statsPrintf("  Task %2d %-8s :  %6.2fs    (%6.2fs)     %6.2fs    (%6.2fs)\n",
				i,
				(task->worker) ? "(worker)" : "(bound)",
				TICK_TO_DBL(task->mut_time),
				TICK_TO_DBL(task->mut_etime),
				TICK_TO_DBL(task->gc_time),
				TICK_TO_DBL(task->gc_etime));
		}
	    }

	    statsPrintf("\n");

            {
                nat i;
                SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
                for (i = 0; i < n_capabilities; i++) {
                    sparks.created   += capabilities[i].spark_stats.created;
                    sparks.dud       += capabilities[i].spark_stats.dud;
                    sparks.overflowed+= capabilities[i].spark_stats.overflowed;
                    sparks.converted += capabilities[i].spark_stats.converted;
                    sparks.gcd       += capabilities[i].spark_stats.gcd;
                    sparks.fizzled   += capabilities[i].spark_stats.fizzled;
                }

                statsPrintf("  SPARKS: %ld (%ld converted, %ld overflowed, %ld dud, %ld GC'd, %ld fizzled)\n\n",
                            sparks.created + sparks.dud + sparks.overflowed,
                            sparks.converted, sparks.overflowed, sparks.dud,
                            sparks.gcd, sparks.fizzled);
            }
#endif

	    statsPrintf("  INIT    time  %6.2fs  (%6.2fs elapsed)\n",
                        TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed));

            statsPrintf("  MUT     time  %6.2fs  (%6.2fs elapsed)\n",
                        TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed));
            statsPrintf("  GC      time  %6.2fs  (%6.2fs elapsed)\n",
                        TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed));

#ifdef PROFILING
	    statsPrintf("  RP      time  %6.2fs  (%6.2fs elapsed)\n",
		    TICK_TO_DBL(RP_tot_time), TICK_TO_DBL(RPe_tot_time));
	    statsPrintf("  PROF    time  %6.2fs  (%6.2fs elapsed)\n",
		    TICK_TO_DBL(HC_tot_time), TICK_TO_DBL(HCe_tot_time));
#endif 
	    statsPrintf("  EXIT    time  %6.2fs  (%6.2fs elapsed)\n",
		    TICK_TO_DBL(exit_cpu), TICK_TO_DBL(exit_elapsed));
	    statsPrintf("  Total   time  %6.2fs  (%6.2fs elapsed)\n\n",
		    TICK_TO_DBL(tot_cpu), TICK_TO_DBL(tot_elapsed));
#ifndef THREADED_RTS
	    statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
		    TICK_TO_DBL(gc_cpu)*100/TICK_TO_DBL(tot_cpu),
		    TICK_TO_DBL(gc_elapsed)*100/TICK_TO_DBL(tot_elapsed));
#endif

	    if (tot_cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time) == 0)
		showStgWord64(0, temp, rtsTrue/*commas*/);
	    else
		showStgWord64(
		    (StgWord64)((GC_tot_alloc*sizeof(W_))/
			     TICK_TO_DBL(tot_cpu - GC_tot_cpu - 
					 PROF_VAL(RP_tot_time + HC_tot_time))),
		    temp, rtsTrue/*commas*/);
	    
	    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
	
	    statsPrintf("  Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
		    TICK_TO_DBL(tot_cpu - GC_tot_cpu - 
				PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 
		    / TICK_TO_DBL(tot_cpu), 
		    TICK_TO_DBL(tot_cpu - GC_tot_cpu - 
				PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 
		    / TICK_TO_DBL(tot_elapsed));

            /*
            TICK_PRINT(1);
            TICK_PRINT(2);
	    REPORT(TOTAL_CALLS);
            TICK_PRINT_TOT(1);
            TICK_PRINT_TOT(2);
            */

#if USE_PAPI
            papi_stats_report();
#endif
#if defined(THREADED_RTS) && defined(PROF_SPIN)
            {
                nat g;
                
                statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
                statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
                for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
                    statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
                }
            }
#endif
	}

	if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
      char *fmt1, *fmt2;
      if (RtsFlags.MiscFlags.machineReadable) {
          fmt1 = " [(\"bytes allocated\", \"%llu\")\n";
          fmt2 = " ,(\"num_GCs\", \"%d\")\n"
                 " ,(\"average_bytes_used\", \"%ld\")\n"
                 " ,(\"max_bytes_used\", \"%ld\")\n"
                 " ,(\"num_byte_usage_samples\", \"%ld\")\n"
                 " ,(\"peak_megabytes_allocated\", \"%lu\")\n"
                 " ,(\"init_cpu_seconds\", \"%.2f\")\n"
                 " ,(\"init_wall_seconds\", \"%.2f\")\n"
                 " ,(\"mutator_cpu_seconds\", \"%.2f\")\n"
                 " ,(\"mutator_wall_seconds\", \"%.2f\")\n"
                 " ,(\"GC_cpu_seconds\", \"%.2f\")\n"
                 " ,(\"GC_wall_seconds\", \"%.2f\")\n"
                 " ]\n";
      }
      else {
          fmt1 = "<<ghc: %llu bytes, ";
          fmt2 = "%d GCs, %ld/%ld avg/max bytes residency (%ld samples), %luM in use, %.2f INIT (%.2f elapsed), %.2f MUT (%.2f elapsed), %.2f GC (%.2f elapsed) :ghc>>\n";
      }
	  /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
	  statsPrintf(fmt1, GC_tot_alloc*(StgWord64)sizeof(W_));
	  statsPrintf(fmt2,
		    total_collections,
		    residency_samples == 0 ? 0 : 
		        cumulative_residency*sizeof(W_)/residency_samples, 
		    max_residency*sizeof(W_), 
		    residency_samples,
		    (unsigned long)(peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
		    TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed),
		    TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed),
		    TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed));
	}

	statsFlush();
	statsClose();
    }

    if (GC_coll_cpu) {
      stgFree(GC_coll_cpu);
      GC_coll_cpu = NULL;
    }
    if (GC_coll_elapsed) {
      stgFree(GC_coll_elapsed);
      GC_coll_elapsed = NULL;
    }
    if (GC_coll_max_pause) {
      stgFree(GC_coll_max_pause);
      GC_coll_max_pause = NULL;
    }
}