diff --git a/src/benchmarks/gc/README.md b/src/benchmarks/gc/README.md index 5c2815719f..5440b12434 100644 --- a/src/benchmarks/gc/README.md +++ b/src/benchmarks/gc/README.md @@ -1,12 +1,14 @@ # About -`dotnet-gc-infra` lets you run GC performance tests and analyze and chart statistics. +This program lets you run GC performance tests and analyze and chart statistics. Command examples in this document use Bash/PowerShell syntax. If using Windows CMD, replace `/` with `\`. -The general workflow when using `dotnet-gc-infra` is: +The general workflow when using the GC infra is: -* For testing your changes to coreclr, get a master branch build of coreclr, and also your own build. (It can also be used to compare different configurations on just the master branch.) +* For testing your changes to coreclr, get a master branch build of coreclr, and also your own build. + (You can of course use any version of coreclr, not just master. + You can also test with just a single coreclr.) * Write a benchfile. (Or generate default ones with `suite-create` as in the tutorial.) This will reference the coreclrs and list the tests to be run. * Run the benchfile and collect traces. * Run analysis on the output. @@ -102,7 +104,7 @@ On non-Windows systems, you'll need [`dotnet-trace`](https://github.com/dotnet/d On non-Windows systems, to run container tests, you'll need `cgroup-tools` installed. You should have builds of coreclr available for use in the next step. -Finally, run `py . setup` from the root of dotnet-gc-infra. +Finally, run `py . setup` from the same directory as this README. This will read information about your system that's relevant to performance analysis (such as cache sizes) and save to `bench/host_info.yaml`. It will also install some necessary dependencies on Windows. @@ -123,13 +125,13 @@ The benchfiles can exist anywhere. This example will use the local directory `be To avoid writing benchfiles yourself, `suite-create` can generate a few: ```sh -cd path/to/dotnet-gc-infra py . suite-create bench/suite --coreclrs path_to_coreclr0 path_to_coreclr1 ``` `path_to_coreclr0` is the path to a [Core_Root](#Core_Root). -`path_to_coreclr1` should be a different Core_Root. (it can be the same, but the point is to compare performance of two different builds.) +`path_to_coreclr1` should be a different Core_Root. (It can be the same, but the point is to compare performance of two different builds.) +You can omit this if you just intend to test a single coreclr. If you made a mistake, you can run `suite-create` again and pass `--overwrite`, which clears the output directory (`bench/suite` in this example) first.
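To make the benchfile step concrete: a benchfile is a YAML document. The sketch below writes a minimal one programmatically; the field names are assumptions for illustration only, since `suite-create` emits the real schema (documented in `docs/bench_file.md`).

```python
# Illustrative sketch only: the field names here are assumed, not taken from
# this diff; run `suite-create` to generate benchfiles with the real schema.
import yaml  # pip install pyyaml

benchfile = {
    "coreclrs": {
        "master": {"core_root": "path_to_coreclr0"},
        "my_build": {"core_root": "path_to_coreclr1"},
    },
    "common_config": {"complus_gcserver": True},
    "benchmarks": {"example": {"arguments": {"tc": 8}}},
}

with open("example_benchfile.yaml", "w") as f:
    yaml.safe_dump(benchfile, f)
```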
diff --git a/src/benchmarks/gc/docs/bench_file.md b/src/benchmarks/gc/docs/bench_file.md index 804d88a586..0724e8cfb8 100644 --- a/src/benchmarks/gc/docs/bench_file.md +++ b/src/benchmarks/gc/docs/bench_file.md @@ -211,6 +211,94 @@ complus_threadpool_forcemaxworkerthreads: `int | None` complus_tieredcompilation: `bool | None` Set to true to enable tiered compilation +complus_bgcfltuningenabled: `bool | None` + Set to true to enable https://github.com/dotnet/coreclr/pull/26695 + +complus_bgcmemgoal: `int | None` + See comment on https://github.com/dotnet/coreclr/pull/26695 + +complus_bgcmemgoalslack: `int | None` + See comment on https://github.com/dotnet/coreclr/pull/26695 + +complus_gcconcurrentfinalization: `bool | None` + Enable concurrent finalization (not available in normal coreclr builds) + +container: `[TestConfigContainer](#TestConfigContainer) | None` + Set to run the test in a container. + A container is a job object on Windows, or cgroups / docker container on non-Windows. + +affinitize: `bool | None` + If true, this will be run in a job object affinitized to a single core. + Only works on Windows. + See `run_in_job.c`'s `--affinitize` option. + +memory_load: `[MemoryLoadOptions](#MemoryLoadOptions) | None` + If set, the test runner will launch a second process that ensures this percentage of the system's memory is consumed. + +coreclr_specific: `Mapping[str, [ConfigOptions](#ConfigOptions)] | None` + Maps coreclr name to config options for only that coreclr. + If present, should have an entry for every coreclr. + + + +## ConfigOptions + +complus_gcserver: `bool | None` + Set to true to use server GC. + +complus_gcconcurrent: `bool | None` + Set to true to allow background GCs. + +complus_gcgen0size: `int | None` + gen0size in bytes. (decimal) + +complus_gcgen0maxbudget: `int | None` + Max gen0 budget in bytes. (decimal) + +complus_gcheapaffinitizeranges: `str | None` + On non-Windows, this should look like: 1,3,5,7-9,12 + On Windows, this should include group numbers, like: 0:1,0:3,0:5,1:7-9,1:12 + +complus_gcheapcount: `int | None` + Number of heaps. (decimal) + Only has effect when complus_gcserver is set. + +complus_gcheaphardlimit: `int | None` + Hard limit on heap size, in bytes. (decimal) + +complus_gclargepages: `bool | None` + Set to true to enable large pages. + +complus_gcnoaffinitize: `bool | None` + Set to true to prevent affinitizing GC threads to cpu cores. + +complus_gccpugroup: `bool | None` + Set to true to enable CPU groups. + +complus_gcnumaaware: `bool | None` + Set to false to disable NUMA-awareness in GC + +complus_thread_useallcpugroups: `bool | None` + Set to true to automatically distribute threads across CPU Groups + +complus_threadpool_forcemaxworkerthreads: `int | None` + Overrides the MaxThreads setting for the ThreadPool worker pool + +complus_tieredcompilation: `bool | None` + Set to true to enable tiered compilation + +complus_bgcfltuningenabled: `bool | None` + Set to true to enable https://github.com/dotnet/coreclr/pull/26695 + +complus_bgcmemgoal: `int | None` + See comment on https://github.com/dotnet/coreclr/pull/26695 + +complus_bgcmemgoalslack: `int | None` + See comment on https://github.com/dotnet/coreclr/pull/26695 + +complus_gcconcurrentfinalization: `bool | None` + Enable concurrent finalization (not available in normal coreclr builds) + container: `[TestConfigContainer](#TestConfigContainer) | None` Set to run the test in a container. A container is a job object on Windows, or cgroups / docker container on non-Windows. 
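To make the `coreclr_specific` shape concrete, here is a minimal self-contained sketch. The dataclasses are stand-ins mirroring a few of the fields documented above (the real classes live in `src/commonlib/bench_file.py`), and the option values are purely illustrative; see dotnet/coreclr#26695 for what `complus_bgcmemgoal` means.

```python
from dataclasses import dataclass
from typing import Mapping, Optional

# Stand-in for the repo's ConfigOptions; the real class has many more fields.
@dataclass(frozen=True)
class ConfigOptions:
    complus_gcserver: Optional[bool] = None
    complus_bgcfltuningenabled: Optional[bool] = None
    complus_bgcmemgoal: Optional[int] = None

# A coreclr_specific-style mapping: one entry per coreclr, as required above.
# Here only one of the two builds gets the BGC FL tuning options.
coreclr_specific: Mapping[str, ConfigOptions] = {
    "master": ConfigOptions(complus_gcserver=True),
    "bgc_tuning": ConfigOptions(
        complus_gcserver=True,
        complus_bgcfltuningenabled=True,
        complus_bgcmemgoal=75,  # illustrative value
    ),
}

print(coreclr_specific["bgc_tuning"])
```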
@@ -220,7 +308,7 @@ affinitize: `bool | None` Only works on Windows. See `run_in_job.c`'s `--affinitize` option. -memory_load_percent: `float | None` +memory_load: `[MemoryLoadOptions](#MemoryLoadOptions) | None` If set, the test runner will launch a second process that ensures this percentage of the system's memory is consumed. @@ -290,6 +378,17 @@ allocType: `"simple" | "reference"` testKind: `"time" | "highSurvival"` +## MemoryLoadOptions + +percent: `float` + The memory load process will allocate memory until the system's memory load is this high. + +no_readjust: `bool | None` + If true, the memory load process will never allocate or free any more memory after it's started. + If false, it will allocate or free in order to keep the system's memory at `percent`. + + + ## ScoreElement weight: `float` diff --git a/src/benchmarks/gc/docs/metrics.md b/src/benchmarks/gc/docs/metrics.md index a426d94b20..70d88dc9cb 100644 --- a/src/benchmarks/gc/docs/metrics.md +++ b/src/benchmarks/gc/docs/metrics.md @@ -224,6 +224,9 @@ IsNonBackground IsNonConcurrent ReasonIs_alloc_loh ReasonIs_alloc_soh +ReasonIs_bgc_stepping +ReasonIs_bgc_tuning_loh +ReasonIs_bgc_tuning_soh ReasonIs_empty ReasonIs_gcstress ReasonIs_induced @@ -246,6 +249,7 @@ UsesPromotion ## float metrics AllocRateMBSec +AllocedMBAccumulated AllocedSinceLastGCMB BGCFinalPauseMSec BGCLohConcurrentRevisitedPages @@ -301,6 +305,7 @@ LastPerHeapHistToEndMSec MaxBGCWaitMSec MbAllocatedOnLOHSinceLastGen2Gc MbAllocatedOnSOHSinceLastSameGenGc +MemoryPressure Number PauseDurationMSec PauseDurationSeconds @@ -342,6 +347,8 @@ Gen0Size Gen1CollectionCount Gen2CollectionCount InternalSecondsTaken +NumCreatedWithFinalizers +NumFinalized ThreadCount TotalSecondsTaken @@ -349,9 +356,8 @@ TotalSecondsTaken FinalYoungestDesiredMB FirstEventToFirstGCSeconds -FirstToLastEventSeconds FirstToLastGCSeconds -NumHeaps +HeapCount PctTimePausedInGC TotalAllocatedMB TotalLOHAllocatedMB diff --git a/src/benchmarks/gc/jupyter_notebook.py b/src/benchmarks/gc/jupyter_notebook.py index ad3b9bec9e..cd2c004ec6 100644 --- a/src/benchmarks/gc/jupyter_notebook.py +++ b/src/benchmarks/gc/jupyter_notebook.py @@ -165,6 +165,7 @@ def show_summary(trace: ProcessedTrace) -> None: single_heap_metrics=parse_single_heap_metrics_arg(("InMB", "OutMB")), show_first_n_gcs=5, show_last_n_gcs=None, + show_reasons=False, ) ) @@ -636,3 +637,6 @@ def _more_custom(trace: ProcessedTrace) -> None: _more_custom(_TRACE) + + +# %% diff --git a/src/benchmarks/gc/src/analysis/aggregate_stats.py b/src/benchmarks/gc/src/analysis/aggregate_stats.py index f6c0e0a41d..b28a10e254 100644 --- a/src/benchmarks/gc/src/analysis/aggregate_stats.py +++ b/src/benchmarks/gc/src/analysis/aggregate_stats.py @@ -2,7 +2,6 @@ # The .NET Foundation licenses this file to you under the MIT license. # See the LICENSE file in the project root for more information. -from math import ceil, floor from statistics import mean, stdev from typing import Callable, Iterable, List, Mapping, Sequence, Tuple, Type, TypeVar @@ -17,7 +16,7 @@ ) from ..commonlib.result_utils import all_non_err, as_err, fn_to_ok, flat_map_ok, map_ok from ..commonlib.type_utils import check_cast, T -from ..commonlib.util import get_percent +from ..commonlib.util import get_95th_percentile, get_percent from .types import ( Failable, @@ -212,25 +211,6 @@ def _fail_if_empty( return lambda xs: Err(f"") if is_empty(xs) else Ok(cb(xs)) -# numpy has problems on ARM, so using this instead. 
-def get_percentile(values: Sequence[float], percent: float) -> float: - assert not is_empty(values) - assert 0.0 <= percent <= 100.0 - sorted_values = sorted(values) - fraction = percent / 100.0 - index_and_fraction = (len(values) - 1) * fraction - prev_index = floor(index_and_fraction) - next_index = ceil(index_and_fraction) - # The closer we are to 'next_index', the more 'next' should matter - next_factor = index_and_fraction - prev_index - prev_factor = 1.0 - next_factor - return sorted_values[prev_index] * prev_factor + sorted_values[next_index] - - -def _get_95th_percentile(values: Sequence[float]) -> FailableFloat: - return Err("") if is_empty(values) else Ok(get_percentile(values, 95)) - - def _stdev(values: Sequence[float]) -> FailableFloat: if len(values) <= 1: return Err("Not enough values for stdev") @@ -243,6 +223,6 @@ def _stdev(values: Sequence[float]) -> FailableFloat: "Max": _fail_if_empty(max), "Min": _fail_if_empty(min), "Sum": fn_to_ok(sum), - "95P": _get_95th_percentile, + "95P": get_95th_percentile, "Stdev": _stdev, } diff --git a/src/benchmarks/gc/src/analysis/analyze_joins.py b/src/benchmarks/gc/src/analysis/analyze_joins.py index 2e94cf207b..bf9b856527 100644 --- a/src/benchmarks/gc/src/analysis/analyze_joins.py +++ b/src/benchmarks/gc/src/analysis/analyze_joins.py @@ -13,16 +13,13 @@ from ..commonlib.command import Command, CommandKind, CommandsMapping from ..commonlib.document import ( Cell, + DocOutputArgs, Document, handle_doc, - OutputOptions, - OutputWidth, - OUTPUT_WIDTH_DOC, + output_options_from_args, Row, Section, Table, - TABLE_INDENT_DOC, - TXT_DOC, ) from ..commonlib.option import map_option, non_null, optional_to_iter from ..commonlib.result_utils import unwrap @@ -64,14 +61,13 @@ class StagesOrPhases(Enum): @with_slots @dataclass(frozen=True) -class AnalyzeJoinsAllGcsArgs: +class AnalyzeJoinsAllGcsArgs(DocOutputArgs): trace_path: Path = argument(name_optional=True, doc=TRACE_PATH_DOC) process: ProcessQuery = argument(default=None, doc=PROCESS_DOC) show_n_worst_stolen_time_instances: int = argument( default=10, doc=_DOC_N_WORST_STOLEN_TIME_INSTANCES ) show_n_worst_joins: int = argument(default=10, doc=_DOC_N_WORST_JOINS) - txt: Optional[Path] = argument(default=None, doc=TXT_DOC) def analyze_joins_all_gcs(args: AnalyzeJoinsAllGcsArgs) -> None: @@ -86,7 +82,7 @@ def analyze_joins_all_gcs(args: AnalyzeJoinsAllGcsArgs) -> None: show_n_worst_stolen_time_instances=args.show_n_worst_stolen_time_instances, show_n_worst_joins=args.show_n_worst_joins, ), - OutputOptions(txt=args.txt), + output_options_from_args(args), ) @@ -113,7 +109,7 @@ def analyze_joins_all_gcs_for_jupyter( @with_slots @dataclass(frozen=True) -class _AnalyzeJoinsSingleGcArgs: +class _AnalyzeJoinsSingleGcArgs(DocOutputArgs): trace_path: Path = argument(name_optional=True, doc=TRACE_PATH_DOC) gc_number: int = argument(doc=GC_NUMBER_DOC) process: ProcessQuery = argument(default=None, doc=PROCESS_DOC) @@ -133,10 +129,6 @@ class _AnalyzeJoinsSingleGcArgs: ) max_heaps: Optional[int] = argument(default=None, doc="Only show this many heaps") - txt: Optional[Path] = argument(default=None, doc=TXT_DOC) - output_width: Optional[OutputWidth] = argument(default=None, doc=OUTPUT_WIDTH_DOC) - table_indent: Optional[int] = argument(default=None, doc=TABLE_INDENT_DOC) - def _analyze_joins_single_gc(args: _AnalyzeJoinsSingleGcArgs) -> None: _check_join_analysis_ready() @@ -151,9 +143,7 @@ def _analyze_joins_single_gc(args: _AnalyzeJoinsSingleGcArgs) -> None: 
show_n_worst_stolen_time_instances=args.show_n_worst_stolen_time_instances, max_heaps=args.max_heaps, ) - handle_doc( - doc, OutputOptions(width=args.output_width, table_indent=args.table_indent, txt=args.txt) - ) + handle_doc(doc, output_options_from_args(args)) def _get_processed_trace_with_just_join_info( diff --git a/src/benchmarks/gc/src/analysis/analyze_single.py b/src/benchmarks/gc/src/analysis/analyze_single.py index c1d28080c9..8681ef49a9 100644 --- a/src/benchmarks/gc/src/analysis/analyze_single.py +++ b/src/benchmarks/gc/src/analysis/analyze_single.py @@ -2,6 +2,7 @@ # The .NET Foundation licenses this file to you under the MIT license. # See the LICENSE file in the project root for more information. +from collections import Counter from dataclasses import dataclass from math import inf from pathlib import Path @@ -12,16 +13,13 @@ from ..commonlib.command import Command, CommandKind, CommandsMapping from ..commonlib.document import ( Cell, + DocOutputArgs, Document, handle_doc, - OutputOptions, - OutputWidth, - OUTPUT_WIDTH_DOC, + output_options_from_args, Row, Section, Table, - TABLE_INDENT_DOC, - TXT_DOC, ) from ..commonlib.option import map_option, non_null, optional_to_iter from ..commonlib.result_utils import match @@ -59,7 +57,7 @@ @with_slots @dataclass(frozen=True) -class _AnalyzeSingleArgs: +class _AnalyzeSingleArgs(DocOutputArgs): path: Path = argument(doc=SINGLE_PATH_DOC, name_optional=True) process: ProcessQuery = argument(default=None, doc=PROCESS_DOC) run_metrics: Optional[Sequence[str]] = argument(default=None, doc=RUN_METRICS_DOC) @@ -95,17 +93,11 @@ class _AnalyzeSingleArgs: Should not be set if '--show-first-n-gcs' is. """, ) - - output_width: Optional[OutputWidth] = argument(default=None, doc=OUTPUT_WIDTH_DOC) - table_indent: Optional[int] = argument(default=None, doc=TABLE_INDENT_DOC) - txt: Optional[Path] = argument(default=None, doc=TXT_DOC) + show_reasons: bool = argument(default=False, doc="Show the reason for each GC") def analyze_single(args: _AnalyzeSingleArgs) -> None: - handle_doc( - _get_analyze_single_document(args), - OutputOptions(width=args.output_width, table_indent=args.table_indent, txt=args.txt), - ) + handle_doc(_get_analyze_single_document(args), output_options_from_args(args)) @with_slots @@ -142,7 +134,7 @@ def analyze_single_gc_for_processed_trace_file( ) -> Document: gc = get_gc_with_number(trace.gcs, gc_number) rows = [ - (Cell(metric.name), _value_cell(metric, gc.metric(metric))) for metric in single_gc_metrics + (Cell(metric.name), value_cell(metric, gc.metric(metric))) for metric in single_gc_metrics ] name = f"GC {gc_number} ({show_time_span_start_end(gc.StartRelativeMSec, gc.EndRelativeMSec)})" section = Section(name=name, tables=(Table(rows=rows),)) @@ -202,6 +194,7 @@ def _get_analyze_single_document(args: _AnalyzeSingleArgs) -> Document: show_first_n_gcs=args.show_first_n_gcs, show_last_n_gcs=args.show_last_n_gcs, single_heap_metrics=single_heap_metrics, + show_reasons=args.show_reasons, ) @@ -215,6 +208,7 @@ def analyze_single_for_processed_trace( single_heap_metrics: SingleHeapMetrics, show_first_n_gcs: Optional[int], show_last_n_gcs: Optional[int], + show_reasons: bool, ) -> Document: sections: List[Section] = [] @@ -232,17 +226,30 @@ def analyze_single_for_processed_trace( gcs_to_print = _get_sorted_gcs_to_print(show_first_n_gcs, show_last_n_gcs, gcs, sort_gcs_by) if is_empty(gcs_to_print.gcs): - if not is_empty(all_single_gc_metrics) or not is_empty(single_heap_metrics): + if not is_empty(all_single_gc_metrics) or not 
is_empty(single_heap_metrics) or show_reasons: sections.append(Section(text="No GCs in trace")) else: if not is_empty(all_single_gc_metrics): - sections.append(_get_single_gcs_section(gcs_to_print, all_single_gc_metrics)) + sections.append( + _get_single_gcs_section(gcs_to_print, all_single_gc_metrics, show_reasons) + ) if not is_empty(single_heap_metrics): sections.extend(_get_single_heaps_sections(gcs_to_print.gcs, single_heap_metrics)) + if show_reasons: + sections.append(_get_reasons_summary(gcs)) + return Document(sections=sections) +def _get_reasons_summary(gcs: Sequence[ProcessedGC]) -> Section: + rows = [ + (Cell(reason.name), Cell(count)) + for reason, count in Counter(gc.reason for gc in gcs).items() + ] + return Section(tables=(Table(headers=("Reason", "Count"), rows=rows),)) + + def _get_run_metrics_section(trace: ProcessedTrace, run_metrics: RunMetrics) -> Optional[Section]: if is_empty(run_metrics): return None @@ -250,7 +257,7 @@ def _get_run_metrics_section(trace: ProcessedTrace, run_metrics: RunMetrics) -> run_metrics_table = Table( headers=("Name", "Value"), rows=[ - (Cell(run_metric.name), _value_cell(run_metric, trace.metric(run_metric))) + (Cell(run_metric.name), value_cell(run_metric, trace.metric(run_metric))) for run_metric in run_metrics ], ) @@ -276,12 +283,22 @@ class _GCsAndDescription: descr: Optional[str] -def _get_single_gcs_section(gcs: _GCsAndDescription, single_gc_metrics: SingleGCMetrics) -> Section: +def _get_single_gcs_section( + gcs: _GCsAndDescription, single_gc_metrics: SingleGCMetrics, show_reasons: bool +) -> Section: def get_row_for_gc(gc: ProcessedGC) -> Optional[Row]: - return [Cell(gc.Number), *(_value_cell(m, gc.metric(m)) for m in single_gc_metrics)] + return ( + Cell(gc.Number), + *optional_to_iter(Cell(str(gc.reason)) if show_reasons else None), + *(value_cell(m, gc.metric(m)) for m in single_gc_metrics), + ) gcs_table = Table( - headers=["gc number", *(metric.name for metric in single_gc_metrics)], + headers=( + "gc number", + *optional_to_iter("reason" if show_reasons else None), + *(metric.name for metric in single_gc_metrics), + ), rows=[row for gc in gcs.gcs for row in optional_to_iter(get_row_for_gc(gc))], ) name = "Single gcs" + ("" if gcs.descr is None else f" ({gcs.descr})") @@ -295,7 +312,7 @@ def section_for_gc(gc: ProcessedGC) -> Section: heaps_table = Table( headers=("heap", *(metric.name for metric in single_heap_metrics)), rows=[ - (Cell(heap_i), *(_value_cell(m, heap.metric(m)) for m in single_heap_metrics)) + (Cell(heap_i), *(value_cell(m, heap.metric(m)) for m in single_heap_metrics)) for heap_i, heap in enumerate(gc.heaps) ], ) @@ -346,7 +363,7 @@ def _last_n(gcs: Sequence[ProcessedGC], n: int) -> _GCsAndDescription: return _GCsAndDescription(gcs[-n:], f"last {n}" if n < len(gcs) else None) -def _value_cell(metric: MetricBase, value: FailableValue) -> Cell: +def value_cell(metric: MetricBase, value: FailableValue) -> Cell: return match( value, cb_ok=lambda v: Cell("%.4f" % v) if metric.do_not_use_scientific_notation else Cell(v), diff --git a/src/benchmarks/gc/src/analysis/chart_utils.py b/src/benchmarks/gc/src/analysis/chart_utils.py index a59e0533eb..06d4a3cf07 100644 --- a/src/benchmarks/gc/src/analysis/chart_utils.py +++ b/src/benchmarks/gc/src/analysis/chart_utils.py @@ -12,29 +12,34 @@ from matplotlib.lines import lineStyles, lineMarkers import matplotlib.pyplot as plt -from ..commonlib.collection_util import flatten, indices, XYRanges, zip_check, zip_shorten_former +from ..commonlib.collection_util import flatten, 
indices, XYRanges, zip_check, zip_shorten_latter from ..commonlib.option import map_option, option_or from ..commonlib.type_utils import check_cast, T, with_slots -from ..commonlib.util import ensure_dir, remove_str_end +from ..commonlib.util import change_extension, ensure_dir, get_command_line, remove_str_end # TODO: add more styles if necessary # Solid, dashed with dots, dotted LINE_STYLES: Sequence[str] = ["-", "-.", ":"] assert all(l in lineStyles.keys() for l in LINE_STYLES) -_MARKER_STYLES: Sequence[str] = [ - ".", - "x", - # triangle_up, - "^", - # pentagon - "p", -] -assert all(m in lineMarkers.keys() for m in _MARKER_STYLES) +def _get_marker_styles() -> Sequence[str]: + first_four = ( + ".", + "x", + # triangle_up, + "^", + # pentagon + "p" + ) + assert all(m in lineMarkers.keys() for m in first_four) + return (*first_four, *(m for m in lineMarkers if m not in first_four)) + +_MARKER_STYLES: Sequence[str] = _get_marker_styles() def zip_with_marker_styles(s: Sequence[T]) -> Iterable[Tuple[T, str]]: - return zip_shorten_former(s, _MARKER_STYLES) + assert len(s) <= len(_MARKER_STYLES), f"Drawing {len(s)} lines, need to add more marker styles" + return zip_shorten_latter(s, _MARKER_STYLES) Color = Tuple[float, float, float, float] @@ -90,6 +95,11 @@ def show_or_save(out: Optional[Path], width_factor: Optional[float] = None) -> N plt.savefig(str(out), bbox_inches="tight", pad_inches=0) if width_factor is not None: _fix_svg_width(out, out, width_factor) + # Also write out the command used + + out_txt = change_extension(out, "txt") + assert out_txt != out + out_txt.write_text(get_command_line()) print(f"Saved to {out}") diff --git a/src/benchmarks/gc/src/analysis/clr.py b/src/benchmarks/gc/src/analysis/clr.py index b11972972f..1b12f9bdb7 100644 --- a/src/benchmarks/gc/src/analysis/clr.py +++ b/src/benchmarks/gc/src/analysis/clr.py @@ -40,11 +40,12 @@ ) +# pylint:disable=import-outside-toplevel + # This class contains no data, but is passed around as proof that we've set up CLR. 
class Clr: @property def _system(self) -> Any: - # pylint:disable=import-outside-toplevel import System # type: ignore return System diff --git a/src/benchmarks/gc/src/analysis/condemned_reasons.py b/src/benchmarks/gc/src/analysis/condemned_reasons.py index d4713ca852..f9b0d85eb5 100644 --- a/src/benchmarks/gc/src/analysis/condemned_reasons.py +++ b/src/benchmarks/gc/src/analysis/condemned_reasons.py @@ -11,11 +11,11 @@ from ..commonlib.command import Command, CommandKind, CommandsMapping from ..commonlib.document import ( Cell, + DocOutputArgs, Document, handle_doc, OutputOptions, - OutputWidth, - OUTPUT_WIDTH_DOC, + output_options_from_args, Row, Section, single_table_document, @@ -255,11 +255,10 @@ def _show_condemned_reasons(args: _ShowCondemnedReasonsArgs) -> None: @with_slots @dataclass(frozen=True) -class _ShowCondemnedReasonsForGCArgs: +class _ShowCondemnedReasonsForGCArgs(DocOutputArgs): path: Path = argument(name_optional=True, doc=SINGLE_PATH_DOC) gc_number: int = argument(doc=GC_NUMBER_DOC) process: ProcessQuery = argument(default=None, doc=PROCESS_DOC) - output_width: Optional[OutputWidth] = argument(default=None, doc=OUTPUT_WIDTH_DOC) def _show_condemned_reasons_for_gc(args: _ShowCondemnedReasonsForGCArgs) -> None: @@ -272,7 +271,7 @@ def _show_condemned_reasons_for_gc(args: _ShowCondemnedReasonsForGCArgs) -> None ).unwrap() handle_doc( show_condemned_reasons_for_gc_for_jupyter(trace, args.gc_number), - OutputOptions(width=args.output_width), + output_options_from_args(args), ) diff --git a/src/benchmarks/gc/src/analysis/core_analysis.py b/src/benchmarks/gc/src/analysis/core_analysis.py index 046053fe6d..4bd8b213b3 100644 --- a/src/benchmarks/gc/src/analysis/core_analysis.py +++ b/src/benchmarks/gc/src/analysis/core_analysis.py @@ -259,7 +259,16 @@ def get_process_info_from_process( show_name: str, ) -> ProcessInfo: mang = non_null(try_get_runtime(clr, process)) + return get_process_info_from_mang(p, trace_path, process, show_name, mang) + +def get_process_info_from_mang( + p: AbstractTracedProcesses, + trace_path: Path, + process: AbstractTraceProcess, + show_name: str, + mang: AbstractTraceLoadedDotNetRuntime, +) -> ProcessInfo: return ProcessInfo( event_names=p.event_names, name=show_name, diff --git a/src/benchmarks/gc/src/analysis/enums.py b/src/benchmarks/gc/src/analysis/enums.py index c6950f73d7..0aa7e71e4a 100644 --- a/src/benchmarks/gc/src/analysis/enums.py +++ b/src/benchmarks/gc/src/analysis/enums.py @@ -44,6 +44,9 @@ class gc_reason(OrderedEnum): lowmemory_host = 11 pm_full_gc = 12 lowmemory_host_blocking = 13 + bgc_tuning_soh = 14 + bgc_tuning_loh = 15 + bgc_stepping = 16 # See gc_heap_expand_mechanism in gcrecords.h diff --git a/src/benchmarks/gc/src/analysis/mem_utils.py b/src/benchmarks/gc/src/analysis/mem_utils.py index a9631647ee..4b68620311 100644 --- a/src/benchmarks/gc/src/analysis/mem_utils.py +++ b/src/benchmarks/gc/src/analysis/mem_utils.py @@ -11,9 +11,11 @@ from ..commonlib.command import Command, CommandKind, CommandsMapping from ..commonlib.document import ( Cell, + DocOutputArgs, Document, handle_doc, OutputOptions, + output_options_from_args, Section, single_table_document, Table, @@ -583,9 +585,8 @@ def _parse_fragmented_block(line: str) -> _DumpHeapFragmentedBlock: @with_slots @dataclass(frozen=True) -class _DiffDumpHeapArgs: +class _DiffDumpHeapArgs(DocOutputArgs): paths: Tuple[Path, Path] = argument(name_optional=True, doc="Paths of heap dumps to diff") - txt: Optional[Path] = argument(default=None, doc="Output file") @with_slots @@ -761,7 
+762,7 @@ def _diff_dump_heap(args: _DiffDumpHeapArgs) -> None: rows=rows, ) ) - handle_doc(doc, OutputOptions(txt=args.txt)) + handle_doc(doc, output_options_from_args(args)) MEM_UTILS_COMMANDS: CommandsMapping = { diff --git a/src/benchmarks/gc/src/analysis/parse_metrics.py b/src/benchmarks/gc/src/analysis/parse_metrics.py index b8f16fb7f6..3f40771a88 100644 --- a/src/benchmarks/gc/src/analysis/parse_metrics.py +++ b/src/benchmarks/gc/src/analysis/parse_metrics.py @@ -104,12 +104,12 @@ def parse_run_metric_arg(metric: str) -> RunMetric: "PctIsEphemeral", "FirstToLastGCSeconds", "TotalNonGCSeconds", - "NumHeaps", + "HeapCount", "HeapSizeBeforeMB_Max", "HeapSizeAfterMB_Max", "HeapSizeBeforeMB_Mean", "HeapSizeAfterMB_Mean", - "FirstToLastEventSeconds", + "TotalSecondsTaken", "FirstEventToFirstGCSeconds", "PctTimeInGC_WhereIsNonBackground", "PctTimePausedInGC", diff --git a/src/benchmarks/gc/src/analysis/process_trace.py b/src/benchmarks/gc/src/analysis/process_trace.py index c17eb52ed6..acb76470d3 100644 --- a/src/benchmarks/gc/src/analysis/process_trace.py +++ b/src/benchmarks/gc/src/analysis/process_trace.py @@ -8,10 +8,10 @@ from result import Err, Ok, Result -from ..commonlib.bench_file import is_trace_path, load_test_status, TestResult +from ..commonlib.bench_file import is_trace_path, load_test_status, TestResult, TestRunStatus from ..commonlib.collection_util import indices, map_to_mapping, repeat, zip_check, zip_check_3 from ..commonlib.option import map_option, non_null -from ..commonlib.result_utils import map_ok, match +from ..commonlib.result_utils import map_err, map_ok, match, option_to_result, unwrap from ..commonlib.util import change_extension, show_size_bytes from .clr import Clr, get_clr @@ -22,9 +22,13 @@ AbstractMarkInfo, AbstractServerGcHistory, AbstractTraceGC, + AbstractTraceLoadedDotNetRuntime, + AbstractTraceProcess, + AbstractTracedProcesses, cs_result_to_result, ) from .core_analysis import ( + get_process_info_from_mang, get_process_names_and_process_info, process_predicate_from_id, process_predicate_from_parts, @@ -36,6 +40,7 @@ MaybeMetricValuesForSingleIteration, ProcessedGC, ProcessedHeap, + ProcessInfo, ProcessedTrace, ProcessQuery, RunMetrics, @@ -66,7 +71,9 @@ def get_processed_trace( need_mechanisms_and_reasons: bool, need_join_info: bool, ) -> Result[str, ProcessedTrace]: - test_status = test_result.load_test_status() + test_status = option_to_result( + test_result.load_test_status(), lambda: "Need a test status file" + ) if test_result.trace_path is None: if need_join_info: @@ -88,59 +95,109 @@ def get_processed_trace( ) ) else: - if process is None: - assert test_status is not None, ( - "Didn't specify --process and there's no test status to specify PID\n" - " (hint: maybe specify the test output '.yaml' file instead of the trace file)" + return Ok( + _get_processed_trace_from_process( + clr, + test_status, + test_result, + test_result.trace_path, + process, + need_join_info=need_join_info, + need_mechanisms_and_reasons=need_mechanisms_and_reasons, ) - process_predicate = process_predicate_from_id(test_status.process_id) - else: - assert ( - test_status is None - ), "'--process' is unnecessary as the test result specifies the PID" - process_predicate = process_predicate_from_parts(process) - - process_names, proc = get_process_names_and_process_info( - clr, - test_result.trace_path, - str(test_result), - process_predicate, - # TODO: make this optional; though the metric FirstEventToFirstGCSeconds needs this too. 
- collect_event_names=True, ) - # TODO: just do this lazily (getting join info) - join_info = ( - get_join_info_for_all_gcs(clr, proc) - if need_join_info - else Err("Did not request join info") - ) - res = ProcessedTrace( + +def get_processed_trace_from_just_process( + clr: Clr, + trace_path: Path, + p: AbstractTracedProcesses, + process: AbstractTraceProcess, + mang: AbstractTraceLoadedDotNetRuntime, +) -> ProcessedTrace: + proc_info = get_process_info_from_mang(p, trace_path, process, trace_path.name, mang) + return _init_processed_trace( + ProcessedTrace( clr=clr, - test_result=test_result, - test_status=test_status, - process_info=proc, - process_names=process_names, - process_query=process, - join_info=join_info, - # TODO: just do this lazily - mechanisms_and_reasons=get_mechanisms_and_reasons_for_process_info(proc) - if need_mechanisms_and_reasons - else None, - gcs_result=Err("temporary err, will be overwritten"), - ) - gc_join_infos: Iterable[Result[str, AbstractJoinInfoForGC]] = match( - join_info, - lambda j: [cs_result_to_result(jgc) for jgc in j.GCs], - lambda e: repeat(Err(e), len(proc.gcs)), - ) - res.gcs_result = Ok( - [ - _get_processed_gc(res, i, gc_join_info) - for i, gc_join_info in zip_check(indices(proc.gcs), gc_join_infos) - ] + test_result=TestResult(trace_path=trace_path), + test_status=Err("get_processed_trace_from_just_process has no test status"), + process_info=proc_info, + process_names=cast(ThreadToProcessToName, None), + process_query=None, + join_info=Err("did not request join info"), + mechanisms_and_reasons=None, + gcs_result=Err("temp"), + ), + proc_info, + ) + + +def _get_processed_trace_from_process( + clr: Clr, + test_status: Failable[TestRunStatus], + test_result: TestResult, + trace_path: Path, + process: ProcessQuery, + need_join_info: bool, + need_mechanisms_and_reasons: bool, +) -> ProcessedTrace: + if process is None: + ts = unwrap( + map_err( + test_status, + lambda _: "Didn't specify --process and there's no test status to specify PID\n" + " (hint: maybe specify the test output '.yaml' file instead of the trace file)", + ) ) - return Ok(res) + process_predicate = process_predicate_from_id(ts.process_id) + else: + assert ( + test_status.is_err() + ), "'--process' is unnecessary as the test result specifies the PID" + process_predicate = process_predicate_from_parts(process) + process_names, proc = get_process_names_and_process_info( + clr, + trace_path, + str(test_result), + process_predicate, + # TODO: make this optional; though the metric FirstEventToFirstGCSeconds needs this too.
+ collect_event_names=True, + ) + + # TODO: just do this lazily (getting join info) + join_info = ( + get_join_info_for_all_gcs(clr, proc) if need_join_info else Err("Did not request join info") + ) + res = ProcessedTrace( + clr=clr, + test_result=test_result, + test_status=test_status, + process_info=proc, + process_names=process_names, + process_query=process, + join_info=join_info, + # TODO: just do this lazily + mechanisms_and_reasons=get_mechanisms_and_reasons_for_process_info(proc) + if need_mechanisms_and_reasons + else None, + gcs_result=Err("temporary err, will be overwritten"), + ) + return _init_processed_trace(res, proc) + + +def _init_processed_trace(res: ProcessedTrace, process_info: ProcessInfo) -> ProcessedTrace: + gc_join_infos: Iterable[Result[str, AbstractJoinInfoForGC]] = match( + res.join_info, + lambda j: [cs_result_to_result(jgc) for jgc in j.GCs], + lambda e: repeat(Err(e), len(process_info.gcs)), + ) + res.gcs_result = Ok( + [ + _get_processed_gc(res, i, gc_join_info) + for i, gc_join_info in zip_check(indices(process_info.gcs), gc_join_infos) + ] + ) + return res def _get_processed_gc( diff --git a/src/benchmarks/gc/src/analysis/report.py b/src/benchmarks/gc/src/analysis/report.py index 656761a655..b43188a3ec 100644 --- a/src/benchmarks/gc/src/analysis/report.py +++ b/src/benchmarks/gc/src/analysis/report.py @@ -36,20 +36,15 @@ from ..commonlib.document import ( Align, Cell, + DocOutputArgs, Document, handle_doc, HeaderGroup, - HTML_DOC, - OutputOptions, - OutputWidth, - OUTPUT_WIDTH_DOC, + output_options_from_args, print_document, Row, Section, Table, - TABLE_INDENT_DOC, - TXT_DOC, - XLSX_DOC, ) from ..commonlib.option import map_option, non_null, optional_to_iter, option_or from ..commonlib.result_utils import all_non_err, as_err, map_ok, match, unwrap @@ -436,7 +431,7 @@ class ReportReasonsArgs: @with_slots @dataclass(frozen=True) -class DiffArgs: +class DiffArgs(DocOutputArgs): trace_paths: Sequence[Path] = argument(name_optional=True, doc=DIFFABLE_PATHS_DOC) vary: Optional[Vary] = argument(default=None, doc=VARY_DOC) @@ -462,12 +457,6 @@ class DiffArgs: doc="Only show metrics where there is this much difference between configurations.", ) - output_width: Optional[OutputWidth] = argument(default=None, doc=OUTPUT_WIDTH_DOC) - table_indent: Optional[int] = argument(default=None, doc=TABLE_INDENT_DOC) - txt: Optional[Path] = argument(default=None, doc=TXT_DOC) - html: Optional[Path] = argument(default=None, doc=HTML_DOC) - excel: Optional[Path] = argument(default=None, hidden=True, doc=XLSX_DOC) - sample_kind: SampleKind = argument(default=0, doc=SAMPLE_KIND_DOC) max_iterations: Optional[int] = argument(default=None, doc=MAX_ITERATIONS_FOR_ANALYZE_DOC) @@ -525,16 +514,7 @@ def diff(args: DiffArgs) -> None: sort_by_metric=sort_by_metric, min_difference_pct=args.min_difference_pct, ) - handle_doc( - doc, - OutputOptions( - width=args.output_width, - table_indent=args.table_indent, - html=args.html, - txt=args.txt, - excel=args.excel, - ), - ) + handle_doc(doc, output_options_from_args(args)) def diff_for_jupyter( @@ -801,18 +781,19 @@ def get_mechanisms_and_reasons_for_test( @with_slots @dataclass(frozen=True) -class _PrintAllRunsArgs: +class _PrintAllRunsArgs(DocOutputArgs): bench_file_path: Path = argument(name_optional=True, doc="Path to a benchfile") run_metrics: Sequence[str] = argument(doc=RUN_METRICS_DOC) def _print_all_runs(args: _PrintAllRunsArgs) -> None: - print_document( + handle_doc( print_all_runs_for_jupyter( traces=ProcessedTraces(), 
bench_file_path=args.bench_file_path, run_metrics=parse_run_metrics_arg(args.run_metrics, default_to_important=False), - ) + ), + output_options_from_args(args), ) diff --git a/src/benchmarks/gc/src/analysis/run_metrics.py b/src/benchmarks/gc/src/analysis/run_metrics.py index 6f7adbdfc8..dbb267087e 100644 --- a/src/benchmarks/gc/src/analysis/run_metrics.py +++ b/src/benchmarks/gc/src/analysis/run_metrics.py @@ -7,28 +7,18 @@ from result import Err, Ok, Result -from ..commonlib.bench_file import GCPerfSimResult, TestRunStatus from ..commonlib.collection_util import ( combine_mappings, DequeWithSum, indices, is_empty, make_mapping, - map_mapping_values, -) -from ..commonlib.option import map_option, non_null -from ..commonlib.result_utils import ( - all_non_err, - flat_map_ok, - fn2_to_ok, - map_ok, - match, - option_to_result, - unwrap, ) +from ..commonlib.option import non_null +from ..commonlib.result_utils import all_non_err, flat_map_ok, fn2_to_ok, map_ok, match, unwrap from ..commonlib.score_spec import ScoreElement from ..commonlib.type_utils import enum_value -from ..commonlib.util import bytes_to_gb, bytes_to_mb, geometric_mean, get_percent, seconds_to_msec +from ..commonlib.util import bytes_to_mb, geometric_mean, get_percent, seconds_to_msec from .aggregate_stats import get_aggregate_stats from .enums import Gens, ServerGCThreadState @@ -37,7 +27,6 @@ Failable, FailableFloat, fn_of_property, - FailableInt, NamedRunMetric, ProcessedGC, ProcessInfo, @@ -50,63 +39,6 @@ FailableValues, ) -_GCPERFSIM_RESULT_GETTERS: Mapping[NamedRunMetric, Callable[[GCPerfSimResult], FailableValue]] = { - NamedRunMetric("InternalSecondsTaken", is_from_test_status=True): lambda g: Ok(g.seconds_taken), - NamedRunMetric("FinalHeapSizeGB", is_from_test_status=True): lambda g: Err( - "final_heap_size_bytes was not in test result, this can happen on runtimes < 3.0" - ) - if g.final_heap_size_bytes is None - else Ok(bytes_to_gb(g.final_heap_size_bytes)), - NamedRunMetric("FinalFragmentationGB", is_from_test_status=True): lambda g: Err( - "final_fragmentation_bytes was not in test result, this can happen on runtimes < 3.0" - ) - if g.final_fragmentation_bytes is None - else Ok(bytes_to_gb(g.final_fragmentation_bytes)), - NamedRunMetric("FinalTotalMemoryGB", is_from_test_status=True): lambda g: Ok( - bytes_to_gb(g.final_total_memory_bytes) - ), - NamedRunMetric("Gen0CollectionCount", is_from_test_status=True): lambda g: Ok( - g.collection_counts[0] - ), - NamedRunMetric("Gen1CollectionCount", is_from_test_status=True): lambda g: Ok( - g.collection_counts[1] - ), - NamedRunMetric("Gen2CollectionCount", is_from_test_status=True): lambda g: Ok( - g.collection_counts[2] - ), -} - - -def _gcperfsim_getter( - cb: Callable[[GCPerfSimResult], FailableValue] -) -> Callable[[TestRunStatus], FailableValue]: - return ( - lambda ts: Err("No gcperfsim_result") - if ts.gcperfsim_result is None - else cb(ts.gcperfsim_result) - ) - - -_TEST_STATUS_METRIC_GETTERS: Mapping[ - NamedRunMetric, Callable[[TestRunStatus], FailableValue] -] = combine_mappings( - { - NamedRunMetric("TotalSecondsTaken", is_from_test_status=True): lambda ts: Ok( - ts.seconds_taken - ), - NamedRunMetric("Gen0Size", is_from_test_status=True): lambda ts: option_to_result( - ts.test.config.config.complus_gcgen0size, lambda: "Gen0size not specified in config" - ), - NamedRunMetric("ThreadCount", is_from_test_status=True): lambda ts: option_to_result( - map_option(ts.test.benchmark.benchmark.get_argument("-tc"), int), - lambda: "tc not specified in benchmark", 
- ), - }, - map_mapping_values(_gcperfsim_getter, _GCPERFSIM_RESULT_GETTERS), -) - -TEST_STATUS_METRICS: Sequence[NamedRunMetric] = tuple(_TEST_STATUS_METRIC_GETTERS.keys()) - def stat_for_proc(proc: ProcessedTrace, metric: RunMetric) -> FailableValue: res = ( @@ -305,32 +237,13 @@ def _get_gc_aggregate_stats() -> Mapping[NamedRunMetric, Callable[[ProcessedTrac ) -def _get_num_heaps(proc: ProcessedTrace) -> FailableInt: - def f(gcs: Sequence[ProcessedGC]) -> int: - n_heaps = proc.gcs[0].trace_gc.HeapCount - for i, gc in enumerate(gcs): - assert gc.trace_gc.HeapCount == n_heaps - if gc.trace_gc.GlobalHeapHistory is None: - print(f"WARN: GC{i} has null GlobalHeapHistory. It's a {gc.Type}") - phh_count = gc.HeapCount - if n_heaps != phh_count: - print( - f"WARN: GC{i} has {phh_count} PerHeapHistories but {n_heaps} heaps. " - + f"It's a {gc.Type}" - ) - return n_heaps - - return map_ok(proc.gcs_result, f) _RUN_METRIC_GETTERS: Mapping[ NamedRunMetric, Callable[[ProcessedTrace], FailableFloat] ] = combine_mappings( { - NamedRunMetric("NumHeaps"): _get_num_heaps, - NamedRunMetric("FirstToLastEventSeconds"): fn_of_property( - ProcessedTrace.FirstToLastEventSeconds - ), + NamedRunMetric("HeapCount"): fn_of_property(ProcessedTrace.HeapCountResult), NamedRunMetric("TotalNonGCSeconds"): fn_of_property(ProcessedTrace.TotalNonGCSeconds), NamedRunMetric("FirstToLastGCSeconds"): fn_of_property(ProcessedTrace.FirstToLastGCSeconds), NamedRunMetric("FirstEventToFirstGCSeconds"): fn_of_property( @@ -344,21 +257,51 @@ def f(gcs: Sequence[ProcessedGC]) -> int: flat_map_ok(proc.gcs_result, _get_total_loh_allocated_bytes), bytes_to_mb ), NamedRunMetric("PctTimePausedInGC"): _get_percent_time_paused_in_gc, + NamedRunMetric("TotalSecondsTaken", is_from_test_status=True): fn_of_property( + ProcessedTrace.TotalSecondsTaken + ), + NamedRunMetric("Gen0Size", is_from_test_status=True): fn_of_property( + ProcessedTrace.Gen0Size + ), + NamedRunMetric("ThreadCount", is_from_test_status=True): fn_of_property( + ProcessedTrace.ThreadCount + ), + NamedRunMetric("InternalSecondsTaken", is_from_test_status=True): fn_of_property( + ProcessedTrace.InternalSecondsTaken + ), + NamedRunMetric("FinalHeapSizeGB", is_from_test_status=True): fn_of_property( + ProcessedTrace.FinalHeapSizeGB + ), + NamedRunMetric("FinalFragmentationGB", is_from_test_status=True): fn_of_property( + ProcessedTrace.FinalFragmentationGB + ), + NamedRunMetric("FinalTotalMemoryGB", is_from_test_status=True): fn_of_property( + ProcessedTrace.FinalTotalMemoryGB + ), + NamedRunMetric("NumCreatedWithFinalizers", is_from_test_status=True): fn_of_property( + ProcessedTrace.NumCreatedWithFinalizers + ), + NamedRunMetric("NumFinalized", is_from_test_status=True): fn_of_property( + ProcessedTrace.NumFinalized + ), + NamedRunMetric("Gen0CollectionCount", is_from_test_status=True): fn_of_property( + ProcessedTrace.Gen0CollectionCount + ), + NamedRunMetric("Gen1CollectionCount", is_from_test_status=True): fn_of_property( + ProcessedTrace.Gen1CollectionCount + ), + NamedRunMetric("Gen2CollectionCount", is_from_test_status=True): fn_of_property( + ProcessedTrace.Gen2CollectionCount + ), }, # This includes startup time so not a great metric # {NamedRunMetric("ProcessDurationMSec"): lambda proc: proc.stats.ProcessDuration}, _STOLEN_CPU_TIMES_GETTERS, _get_gc_aggregate_stats(), - map_mapping_values( - lambda test_status_getter: lambda trace: Err("no test status") - if trace.test_status is None - else test_status_getter(trace.test_status), - _TEST_STATUS_METRIC_GETTERS, - ), ) 
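The refactor above replaces a table of free-standing test-status getters with `Failable`-returning properties on `ProcessedTrace`, adapted back into the getter mapping via `fn_of_property`. A self-contained sketch of that pattern follows; the types are stand-ins, and the repo's own `Result` and `fn_of_property` differ in detail.

```python
from typing import Callable, Mapping, Optional, Union

class Err:
    """Stand-in for the repo's Err result; carries a failure reason."""
    def __init__(self, reason: str) -> None:
        self.reason = reason

FailableFloat = Union[float, Err]

class Trace:
    """Stand-in for ProcessedTrace: each metric is a Failable property."""
    def __init__(self, seconds_taken: Optional[float]) -> None:
        self._seconds_taken = seconds_taken

    @property
    def TotalSecondsTaken(self) -> FailableFloat:
        if self._seconds_taken is None:
            return Err("This metric only available for GCPerfSim")
        return self._seconds_taken

def fn_of_property(prop: property) -> Callable[[Trace], FailableFloat]:
    # Adapt a property object into a plain callable; the repo's
    # fn_of_property helper plays this role (exact signature assumed).
    return prop.fget  # type: ignore[return-value]

RUN_METRIC_GETTERS: Mapping[str, Callable[[Trace], FailableFloat]] = {
    "TotalSecondsTaken": fn_of_property(Trace.TotalSecondsTaken),
}

print(RUN_METRIC_GETTERS["TotalSecondsTaken"](Trace(12.3)))  # prints 12.3
```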
-ALL_RUN_METRICS: Sequence[RunMetric] = tuple(_RUN_METRIC_GETTERS.keys()) - +ALL_RUN_METRICS: RunMetrics = tuple(_RUN_METRIC_GETTERS.keys()) +TEST_STATUS_METRICS: RunMetrics = [m for m in ALL_RUN_METRICS if m.is_from_test_status] # Note: excludes bytes allocated after the last GC def _get_total_loh_allocated_bytes(gcs: Sequence[ProcessedGC]) -> Result[str, int]: diff --git a/src/benchmarks/gc/src/analysis/single_gc_metrics.py b/src/benchmarks/gc/src/analysis/single_gc_metrics.py index 4bd83d0006..585b81e79e 100644 --- a/src/benchmarks/gc/src/analysis/single_gc_metrics.py +++ b/src/benchmarks/gc/src/analysis/single_gc_metrics.py @@ -177,17 +177,17 @@ def _get_total_gc_time(gc: ProcessedGC) -> FailableFloat: _PER_GEN_METRIC_GETTERS: Mapping[str, Callable[[GenInfoGetter], FailableFloat]] = { # NOTE: these names will be prefixed with the generation name - "SurvivalPercent": GenInfoGetter.SurvivalPct, - "SizeBeforeMB": fn_to_ok(GenInfoGetter.SizeBeforeMB), - "SizeAfterMB": fn_to_ok(GenInfoGetter.SizeAfterMB), + "SurvivalPercent": fn_of_property(GenInfoGetter.SurvivalPct), + "SizeBeforeMB": ok_of_property(GenInfoGetter.SizeBeforeMB), + "SizeAfterMB": ok_of_property(GenInfoGetter.SizeAfterMB), # TODO: this is just the heap's FragmentationMB_Sum - "FragmentationMB": fn_to_ok(GenInfoGetter.FragmentationMB), - "FragmentationPercent": fn_to_ok(GenInfoGetter.FragmentationPct), + "FragmentationMB": ok_of_property(GenInfoGetter.FragmentationMB), + "FragmentationPercent": ok_of_property(GenInfoGetter.FragmentationPct), # TODO: this is just GenData[gen].In, already exposed elsewhere? - "InMB": fn_to_ok(GenInfoGetter.InMB), - "PromotedMB": fn_to_ok(GenInfoGetter.PromotedMB), - "BudgetMB": fn_to_ok(GenInfoGetter.BudgetMB), - "ObjSizeAfterMB": fn_to_ok(GenInfoGetter.ObjSizeAfterMB), + "InMB": ok_of_property(GenInfoGetter.InMB), + "PromotedMB": ok_of_property(GenInfoGetter.PromotedMB), + "BudgetMB": ok_of_property(GenInfoGetter.BudgetMB), + "ObjSizeAfterMB": ok_of_property(GenInfoGetter.ObjSizeAfterMB), } @@ -255,6 +255,7 @@ def _convert_heap_getter( ), SingleGCMetric("Type", doc="Value of GCType enum"): lambda gc: Ok(enum_value(gc.Type)), SingleGCMetric("AllocedSinceLastGCMB"): ok_of_property(ProcessedGC.AllocedSinceLastGCMB), + SingleGCMetric("AllocedMBAccumulated"): ok_of_property(ProcessedGC.AllocedMBAccumulated), SingleGCMetric("AllocRateMBSec"): ok_of_property(ProcessedGC.AllocRateMBSec), SingleGCMetric("BGCFinalPauseMSec"): ok_of_property(ProcessedGC.BGCFinalPauseMSec), SingleGCMetric("DurationMSec"): ok_of_property(ProcessedGC.DurationMSec), @@ -275,6 +276,7 @@ def _convert_heap_getter( SingleGCMetric("PromotedMB"): ok_of_property(ProcessedGC.PromotedMB), SingleGCMetric("RatioPeakAfter"): ok_of_property(ProcessedGC.RatioPeakAfter), SingleGCMetric("SuspendDurationMSec"): ok_of_property(ProcessedGC.suspend_duration_msec), + SingleGCMetric("MemoryPressure"): fn_of_property(ProcessedGC.MemoryPressure), }, { # Unrelated to the total PercentTimeInGc which is for the entire process diff --git a/src/benchmarks/gc/src/analysis/trace_commands.py b/src/benchmarks/gc/src/analysis/trace_commands.py index 00bac87c5f..0e70dfcbca 100644 --- a/src/benchmarks/gc/src/analysis/trace_commands.py +++ b/src/benchmarks/gc/src/analysis/trace_commands.py @@ -6,30 +6,34 @@ from itertools import islice from pathlib import Path from re import compile as compile_regexp, IGNORECASE -from typing import Mapping, Optional, Tuple +from typing import Optional, Sequence, Tuple + +from ..analysis.analyze_single import value_cell +from 
..analysis.parse_metrics import parse_run_metrics_arg +from ..analysis.types import ProcessedTrace, RunMetrics, RUN_METRICS_DOC + from ..commonlib.collection_util import map_to_mapping from ..commonlib.command import Command, CommandKind, CommandsMapping from ..commonlib.document import ( Cell, + DocOutputArgs, Document, handle_doc, - OutputOptions, - OutputWidth, - OUTPUT_WIDTH_DOC, + output_options_from_args, Row, Section, Table, - TXT_DOC, ) from ..commonlib.option import map_option, non_null, option_or +from ..commonlib.result_utils import ignore_err from ..commonlib.type_utils import argument, with_slots -from ..commonlib.util import opt_max, seconds_to_msec +from ..commonlib.util import seconds_to_msec -from .clr import Clr, get_clr +from .clr import get_clr from .clr_types import AbstractEtlxTraceProcess, AbstractTracedProcesses from .core_analysis import get_traced_processes, TRACE_PATH_DOC, try_get_runtime -from .process_trace import test_result_from_path +from .process_trace import get_processed_trace_from_just_process, test_result_from_path @with_slots @@ -76,66 +80,89 @@ def print_events_for_jupyter( @with_slots @dataclass(frozen=True) -class _PrintProcessesArgs: +class _PrintProcessesArgs(DocOutputArgs): trace_path: Path = argument(name_optional=True, doc=TRACE_PATH_DOC) + name_regex: Optional[str] = argument( + default=None, doc="Regular expression used to filter processes by their name" + ) command_line_regex: Optional[str] = argument( default=None, doc="Regular expression used to filter processes by their command-line arguments", ) clr_only: bool = argument(default=False, doc="Only include CLR processes") hide_threads: bool = argument(default=False, doc="Don't show threads for each process") - txt: Optional[Path] = argument(default=None, doc=TXT_DOC) - output_width: Optional[OutputWidth] = argument(default=None, doc=OUTPUT_WIDTH_DOC) + run_metrics: Optional[Sequence[str]] = argument(default=None, doc=RUN_METRICS_DOC) def _print_processes(args: _PrintProcessesArgs) -> None: trace_path = non_null(test_result_from_path(args.trace_path).trace_path) clr = get_clr() processes = get_traced_processes(clr, trace_path) - rgx = map_option(args.command_line_regex, lambda s: compile_regexp(s, IGNORECASE)) - proc_to_peak_heap_size = map_to_mapping( - processes.processes, lambda p: _get_peak_clr_heap_size(clr, p) + name_regex = map_option(args.name_regex, lambda s: compile_regexp(s, IGNORECASE)) + command_line_regex = map_option( + args.command_line_regex, lambda s: compile_regexp(s, IGNORECASE) + ) + proc_to_processed_trace = map_to_mapping( + processes.processes, + lambda p: map_option( + try_get_runtime(clr, p), + lambda rt: get_processed_trace_from_just_process(clr, trace_path, processes, p, rt), + ), ) filtered_processes = [ p for p in processes.processes - if (rgx is None or rgx.search(p.CommandLine) is not None) - and (not args.clr_only or proc_to_peak_heap_size[p] is not None) + if (name_regex is None or name_regex.search(p.Name) is not None) + and (command_line_regex is None or command_line_regex.search(p.CommandLine) is not None) + and (not args.clr_only or proc_to_processed_trace[p] is not None) ] + # TODO: can only show threads with updated PerfView, + # otherwise thread_id_to_process_id will be none + hide_threads = args.hide_threads or processes.thread_id_to_process_id is None + + run_metrics = parse_run_metrics_arg( + option_or(args.run_metrics, ["HeapSizePeakMB_Max", "TotalAllocatedMB"]) + ) table = Table( headers=( "pid", "name", - "peak heap size (MB)", - *([] if 
args.hide_threads else ["threads", "threads (my version)"]), + *(m.name for m in run_metrics), + *([] if hide_threads else ["threads", "threads (my version)"]), "command-line args", ), rows=[ - _process_row(p, processes, proc_to_peak_heap_size, args.hide_threads) + _process_row(p, processes, proc_to_processed_trace[p], run_metrics, hide_threads) for p in sorted( filtered_processes, - key=lambda p: option_or(proc_to_peak_heap_size[p], 0), + key=lambda p: option_or( + map_option( + proc_to_processed_trace[p], lambda pt: ignore_err(pt.HeapSizePeakMB_Max) + ), + 0, + ), reverse=True, ) ], ) - handle_doc( - Document(sections=(Section(tables=(table,)),)), - OutputOptions(txt=args.txt, width=args.output_width), - ) + handle_doc(Document(sections=(Section(tables=(table,)),)), output_options_from_args(args)) def _process_row( p: AbstractEtlxTraceProcess, processes: AbstractTracedProcesses, - proc_to_peak_heap_size: Mapping[AbstractEtlxTraceProcess, Optional[float]], + trace: Optional[ProcessedTrace], + run_metrics: RunMetrics, hide_threads: bool, ) -> Row: return ( Cell(p.ProcessID), Cell(p.Name), - Cell(proc_to_peak_heap_size[p]), + *( + Cell() if trace is None else value_cell(run_metric, trace.metric(run_metric)) + for run_metric in run_metrics + ), *( [] if hide_threads @@ -162,13 +189,6 @@ def _process_row( ) -def _get_peak_clr_heap_size(clr: Clr, p: AbstractEtlxTraceProcess) -> Optional[float]: - return map_option( - try_get_runtime(clr, p), - lambda rt: option_or(opt_max(gc.HeapSizePeakMB for gc in rt.GC.GCs), 0), - ) - - @with_slots @dataclass(frozen=True) class SliceTraceFileArgs: diff --git a/src/benchmarks/gc/src/analysis/types.py b/src/benchmarks/gc/src/analysis/types.py index 42caf91861..92457845e5 100644 --- a/src/benchmarks/gc/src/analysis/types.py +++ b/src/benchmarks/gc/src/analysis/types.py @@ -26,17 +26,27 @@ from result import Err, Ok, Result -from ..commonlib.bench_file import TestResult, TestRunStatus +from ..commonlib.bench_file import GCPerfSimResult, TestResult, TestRunStatus from ..commonlib.collection_util import count, empty_mapping, is_empty, map_to_mapping from ..commonlib.document import Cell from ..commonlib.frozen_dict import FrozenDict from ..commonlib.option import map_option, non_null -from ..commonlib.result_utils import all_non_err, fn_to_ok, flat_map_ok, map_ok, unwrap +from ..commonlib.result_utils import ( + all_non_err, + fn_to_ok, + flat_map_ok, + map_ok, + map_ok_2, + option_to_result, + unwrap, +) from ..commonlib.score_spec import ScoreElement, ScoreSpec from ..commonlib.type_utils import check_cast, enum_value, E, T, U, with_slots from ..commonlib.util import ( + bytes_to_gb, bytes_to_mb, float_to_str_smaller, + get_95th_percentile, get_or_did_you_mean, get_percent, mb_to_gb, @@ -545,6 +555,7 @@ def clr(self) -> Clr: return self.gc.clr def metric(self, metric: SingleHeapMetric) -> FailableValue: + # pylint:disable=import-outside-toplevel from .single_heap_metrics import get_single_heap_stat return get_single_heap_stat(self, metric) @@ -657,6 +668,7 @@ def PromotedMB(self) -> float: def BudgetMB(self) -> float: return _fixup_mb(self._trace_gc.GenBudgetMB(self._gen_value)) + @property def ObjSizeAfterMB(self) -> float: return _fixup_mb(self._trace_gc.GenObjSizeAfterMB(self._gen_value)) @@ -687,6 +699,15 @@ class ProcessedGC: join_info: Result[str, AbstractJoinInfoForGC] heaps: Sequence[ProcessedHeap] + @property + def prev_gc(self) -> Optional["ProcessedGC"]: + if self.index == 0: + return None + else: + res = self.proc.gcs[self.index - 1] + assert 
res.index == self.index - 1 + return res + @property def clr(self) -> Clr: return self.proc.clr @@ -696,11 +717,12 @@ def SuspendDurationMSec(self) -> float: return self.trace_gc.SuspendDurationMSec def metric(self, single_gc_metric: SingleGCMetric) -> FailableValue: - from .single_gc_metrics import get_single_gc_stat + from .single_gc_metrics import get_single_gc_stat # pylint:disable=import-outside-toplevel return get_single_gc_stat(self.proc, self.proc.gcs, self.index, single_gc_metric) def metric_from_name(self, name: str) -> FailableValue: + # pylint:disable=import-outside-toplevel from .parse_metrics import parse_single_gc_metric_arg return self.metric(parse_single_gc_metric_arg(name)) @@ -728,6 +750,17 @@ def collects_generation(self, gen: Gens) -> bool: def AllocedSinceLastGCMB(self) -> float: return self.trace_gc.AllocedSinceLastGCMB + _alloced_mb_accumulated: Optional[float] = None + + @property + def AllocedMBAccumulated(self) -> float: + if self._alloced_mb_accumulated is None: + prev = 0 if self.prev_gc is None else self.prev_gc.AllocedMBAccumulated + self._alloced_mb_accumulated = prev + self.AllocedSinceLastGCMB + return self._alloced_mb_accumulated + else: + return self._alloced_mb_accumulated + @property def AllocRateMBSec(self) -> float: return self.trace_gc.AllocRateMBSec @@ -1064,6 +1097,16 @@ def UsesLOHCompaction(self) -> FailableBool: return Err("") # return has_mechanisms(gc, lambda m: m.loh_compaction) + @property + def MemoryPressure(self) -> FailableFloat: + ghh = self.trace_gc.GlobalHeapHistory + if ghh is None: + return Err("No GlobalHeapHistory") + elif ghh.HasMemoryPressure: + return Ok(ghh.MemoryPressure) + else: + return Err("GlobalHeapHistory#HasMemoryPressure was false") + @property def HeapCount(self) -> int: return len(self.heaps) @@ -1186,7 +1229,7 @@ def union_mechanisms(a: MechanismsAndReasons, b: MechanismsAndReasons) -> Mechan class ProcessedTrace: clr: Clr test_result: TestResult - test_status: Optional[TestRunStatus] + test_status: Failable[TestRunStatus] process_info: Optional[ProcessInfo] process_names: ThreadToProcessToName # '--process' that was used to get this @@ -1195,17 +1238,126 @@ class ProcessedTrace: mechanisms_and_reasons: Optional[MechanismsAndReasons] join_info: Result[str, AbstractJoinInfoForProcess] + def Aggregate( + self, + cb_gc: Callable[[ProcessedGC], FailableFloat], + cb_aggregate: Callable[[Iterable[float]], float], + ) -> FailableFloat: + return flat_map_ok( + self.gcs_result, + lambda gcs: Err("") + if is_empty(gcs) + else map_ok(all_non_err(cb_gc(gc) for gc in gcs), cb_aggregate), + ) + + def Max(self, cb: Callable[[ProcessedGC], FailableFloat]) -> FailableFloat: + return self.Aggregate(cb, max) + + def Sum(self, cb: Callable[[ProcessedGC], FailableFloat]) -> FailableFloat: + return self.Aggregate(cb, sum) + + @property + def HeapSizePeakMB_Max(self) -> FailableFloat: + return self.Max(lambda gc: Ok(gc.HeapSizePeakMB)) + + def Get95P(self, cb: Callable[[ProcessedGC], float]) -> FailableFloat: + return get_95th_percentile([cb(gc) for gc in self.gcs]) + + @property + def gcperfsim_result(self) -> Failable[GCPerfSimResult]: + return flat_map_ok( + self.test_status, + lambda ts: option_to_result( + ts.gcperfsim_result, lambda: "This metric only available for GCPerfSim" + ), + ) + + @property + def TotalSecondsTaken(self) -> FailableFloat: + return map_ok(self.gcperfsim_result, lambda r: r.seconds_taken) + + @property + def Gen0Size(self) -> FailableFloat: + return flat_map_ok( + self.test_status, + lambda ts: 
option_to_result( + ts.test.config.config.complus_gcgen0size, lambda: "Gen0size not specified in config" + ), + ) + + @property + def ThreadCount(self) -> FailableFloat: + return flat_map_ok( + self.test_status, + lambda ts: option_to_result( + map_option(ts.test.benchmark.benchmark.get_argument("-tc"), int), + lambda: "tc not specified in benchmark", + ), + ) + + @property + def InternalSecondsTaken(self) -> FailableFloat: + return map_ok(self.gcperfsim_result, lambda g: g.seconds_taken) + + @property + def FinalHeapSizeGB(self) -> FailableFloat: + return flat_map_ok( + self.gcperfsim_result, + lambda g: Err( + "final_heap_size_bytes was not in test result\n" + + "this can happen on runtimes < 3.0" + ) + if g.final_heap_size_bytes is None + else Ok(bytes_to_gb(g.final_heap_size_bytes)), + ) + + @property + def FinalFragmentationGB(self) -> FailableFloat: + return flat_map_ok( + self.gcperfsim_result, + lambda g: Err( + "final_fragmentation_bytes was not in test result\n" + + "this can happen on runtimes < 3.0" + ) + if g.final_fragmentation_bytes is None + else Ok(bytes_to_gb(g.final_fragmentation_bytes)), + ) + + @property + def FinalTotalMemoryGB(self) -> FailableFloat: + return map_ok(self.gcperfsim_result, lambda g: bytes_to_gb(g.final_total_memory_bytes)) + + @property + def NumCreatedWithFinalizers(self) -> FailableValue: + return map_ok(self.gcperfsim_result, lambda g: g.num_created_with_finalizers) + + @property + def NumFinalized(self) -> FailableValue: + return map_ok(self.gcperfsim_result, lambda g: g.num_finalized) + + @property + def Gen0CollectionCount(self) -> FailableValue: + return map_ok(self.gcperfsim_result, lambda g: g.collection_counts[0]) + + @property + def Gen1CollectionCount(self) -> FailableValue: + return map_ok(self.gcperfsim_result, lambda g: g.collection_counts[1]) + + @property + def Gen2CollectionCount(self) -> FailableValue: + return map_ok(self.gcperfsim_result, lambda g: g.collection_counts[2]) + @property def gcs(self) -> Sequence[ProcessedGC]: return unwrap(self.gcs_result) def metric(self, run_metric: RunMetric) -> FailableValue: - from .run_metrics import stat_for_proc + from .run_metrics import stat_for_proc # pylint:disable=import-outside-toplevel return stat_for_proc(self, run_metric) def metric_from_name(self, name: str) -> FailableValue: - from .parse_metrics import parse_run_metric_arg + from .parse_metrics import parse_run_metric_arg # pylint:disable=import-outside-toplevel return self.metric(parse_run_metric_arg(name)) @@ -1267,32 +1419,37 @@ def FirstEventToFirstGCSeconds(self) -> FailableFloat: @property def TotalNonGCSeconds(self) -> FailableFloat: - return map_ok( - self.FirstToLastEventSeconds, - lambda t: t - msec_to_seconds(sum(gc.PauseDurationMSec for gc in self.gcs)), + return map_ok_2( + self.TotalSecondsTaken, + self.gcs_result, + lambda t, gcs: t - msec_to_seconds(sum(gc.PauseDurationMSec for gc in gcs)), ) - @property - def FirstToLastEventSeconds(self) -> FailableFloat: - ts = map_option(self.process_info, lambda p: p.events_time_span) - if ts is None: - return Err("Did not specify to collect events") - else: - return Ok(msec_to_seconds(ts.DurationMSec)) - @property def TotalAllocatedMB(self) -> FailableFloat: - if self.process_info is None: - return Err("Need a trace") - else: - return Ok(sum(gc.AllocedSinceLastGCMB for gc in self.gcs)) + return self.Sum(lambda gc: Ok(gc.AllocedSinceLastGCMB)) @property def HeapCount(self) -> int: - n = self.gcs[0].HeapCount - for gc in self.gcs: - assert gc.HeapCount == n - return n + return 
unwrap(self.HeapCountResult) + + @property + def HeapCountResult(self) -> FailableInt: + def f(gcs: Sequence[ProcessedGC]) -> int: + n_heaps = gcs[0].trace_gc.HeapCount + for i, gc in enumerate(gcs): + assert gc.trace_gc.HeapCount == n_heaps + if gc.trace_gc.GlobalHeapHistory is None: + print(f"WARN: GC{i} has null GlobalHeapHistory. It's a {gc.Type}") + phh_count = gc.HeapCount + if n_heaps != phh_count: + print( + f"WARN: GC{i} has {phh_count} PerHeapHistories but {n_heaps} heaps. " + + f"It's a {gc.Type}" + ) + return n_heaps + + return map_ok(self.gcs_result, f) @property def NumberGCs(self) -> int: diff --git a/src/benchmarks/gc/src/commonlib/bench_file.py b/src/benchmarks/gc/src/commonlib/bench_file.py index c4723fc001..8058c2f97e 100644 --- a/src/benchmarks/gc/src/commonlib/bench_file.py +++ b/src/benchmarks/gc/src/commonlib/bench_file.py @@ -10,13 +10,15 @@ from dataclasses import dataclass, fields from enum import Enum from pathlib import Path -from typing import Any, cast, Iterable, Mapping, Optional, Sequence, Tuple, Type, Union +from platform import machine as platform_machine +from typing import cast, Iterable, Mapping, Optional, Sequence, Tuple, Type, Union from overrides import overrides from .collection_util import ( combine_mappings, empty_mapping, + filter_not_none, find_only_or_only_matching, is_empty, optional_mapping, @@ -24,7 +26,13 @@ from .option import map_option, non_null, optional_to_iter, option_or, option_or_3 from .parse_and_serialize import HexInt, load_yaml, SerializeMappings, write_yaml_file from .score_spec import ScoreSpec -from .type_utils import doc_field, get_field_info_from_name, OrderedEnum, with_slots +from .type_utils import ( + combine_dataclasses_with_optional_fields, + doc_field, + get_field_info_from_name, + OrderedEnum, + with_slots, +) from .util import ( add_extension, assert_is_percent, @@ -45,6 +53,8 @@ class GCPerfSimResult: """See `PrintResult` in GCPerfSim.cs""" seconds_taken: float + num_created_with_finalizers: int + num_finalized: int final_total_memory_bytes: int # Indexed by generation (0-2) collection_counts: Tuple[int, int, int] @@ -78,6 +88,27 @@ def __post_init__(self) -> None: assert 0 < cpu <= 1 +@doc_field( + "percent", + "The memory load process will allocate memory until the system's memory load is this high.", +) +@doc_field( + "no_readjust", + """ +If true, the memory load process will never allocate or free any more memory after it's started. +If false, it will allocate or free in order to keep the system's memory at `percent`. +""", +) +@with_slots +@dataclass(frozen=True) +class MemoryLoadOptions: + percent: float + no_readjust: Optional[bool] = None + + def __post_init__(self) -> None: + assert_is_percent(self.percent) + + @doc_field("complus_gcserver", "Set to true to use server GC.") @doc_field("complus_gcconcurrent", "Set to true to allow background GCs.") @doc_field("complus_gcgen0size", "gen0size in bytes. 
(decimal)") @@ -110,6 +141,16 @@ def __post_init__(self) -> None: "Overrides the MaxThreads setting for the ThreadPool worker pool", ) @doc_field("complus_tieredcompilation", "Set to true to enable tiered compilation") +@doc_field( + "complus_bgcfltuningenabled", + "Set to true to enable https://github.com/dotnet/coreclr/pull/26695", +) +@doc_field("complus_bgcmemgoal", "See comment on https://github.com/dotnet/coreclr/pull/26695") +@doc_field("complus_bgcmemgoalslack", "See comment on https://github.com/dotnet/coreclr/pull/26695") +@doc_field( + "complus_gcconcurrentfinalization", + "Enable concurrent finalization (not available in normal coreclr builds)", +) @doc_field( "container", """ @@ -126,19 +167,13 @@ def __post_init__(self) -> None: """, ) @doc_field( - "memory_load_percent", + "memory_load", "If set, the test runner will launch a second process that ensures " + "this percentage of the system's memory is consumed.", ) @with_slots @dataclass(frozen=True) -class Config: - """ - Allows to set environment variables, and container and memory load options. - WARN: Normally complus environment variables are specified in hexadecimal on the command line. - But when specifying them in a yaml file, use decimal. - """ - +class ConfigOptions: complus_gcserver: Optional[bool] = None complus_gcconcurrent: Optional[bool] = None # This is in bytes @@ -154,16 +189,37 @@ class Config: complus_thread_useallcpugroups: Optional[bool] = None complus_threadpool_forcemaxworkerthreads: Optional[int] = None complus_tieredcompilation: Optional[bool] = None + complus_bgcfltuningenabled: Optional[bool] = None + complus_bgcmemgoal: Optional[int] = None + complus_bgcmemgoalslack: Optional[int] = None + complus_gcconcurrentfinalization: Optional[bool] = None container: Optional[TestConfigContainer] = None affinitize: Optional[bool] = None - memory_load_percent: Optional[float] = None - # Remember to update _parse_test_config when adding a new field + memory_load: Optional[MemoryLoadOptions] = None def __post_init__(self) -> None: if self.complus_gcheapaffinitizeranges is not None: _parse_heap_affinitize_ranges(self.complus_gcheapaffinitizeranges) - if self.memory_load_percent is not None: - assert_is_percent(self.memory_load_percent) + + +@doc_field( + "coreclr_specific", + """ +Maps coreclr name to config options for only that coreclr. +If present, should have an entry for every coreclr. +""", +) +@with_slots +@dataclass(frozen=True) +class Config(ConfigOptions): + """ + Allows to set environment variables, and container and memory load options. + WARN: Normally complus environment variables are specified in hexadecimal on the command line. + But when specifying them in a yaml file, use decimal. 
+ """ + + coreclr_specific: Optional[Mapping[str, ConfigOptions]] = None + # Remember to update TestConfigCombined.env when adding a new field @staticmethod def serialize_mappings() -> SerializeMappings: @@ -234,11 +290,82 @@ def _parse_heap_affinitize_range_after_group(group: Optional[int], s: str) -> He return HeapAffinitizeRange(group=group, lo=x, hi=x) +@with_slots +@dataclass(frozen=True) +class TestConfigCombinedWithCoreclr: + """ + This is the combination of: + * common_config + * a particular config + * coreclr_specific for that config + """ + + cfg: ConfigOptions + + def __post_init__(self) -> None: + cfg = self.cfg + assert (cfg.complus_gcheapaffinitizeranges is not None) == ( + cfg.complus_gccpugroup == True + ), ( + "Either both complus_gcheapaffinitizeranges and complus_gccpugroup should be set," + + " or neither should" + ) + if cfg.complus_bgcmemgoal or cfg.complus_bgcmemgoalslack: + assert ( + cfg.complus_bgcfltuningenabled + ), "bgcmemgoal does nothing without bgcfltuningenabled" + + if cfg.complus_gcconcurrentfinalization: + assert ( + cfg.complus_gcconcurrent + ), "gcconcurrentfinalization only has effect if gcconcurrent" + + def env(self, core_root: Optional[Path]) -> Mapping[str, str]: + cfg = self.cfg + + def od(name: str, v: Optional[int]) -> Optional[Mapping[str, str]]: + return optional_mapping(name, map_option(v, hex_no_0x)) + + def ob(name: str, v: Optional[bool]) -> Optional[Mapping[str, str]]: + return optional_mapping(name, map_option(v, lambda b: str(int(b)))) + + return combine_mappings( + empty_mapping() if core_root is None else {"CORE_ROOT": str(core_root)}, + *filter_not_none( + ( + ob("COMPlus_gcServer", cfg.complus_gcserver), + ob("COMPlus_gcConcurrent", cfg.complus_gcconcurrent), + od("COMPlus_GCgen0size", cfg.complus_gcgen0size), + od("COMPlus_GCGen0MaxBudget", cfg.complus_gcgen0maxbudget), + optional_mapping( + "COMPlus_GCHeapAffinitizeRanges", cfg.complus_gcheapaffinitizeranges + ), + od("COMPlus_GCHeapCount", cfg.complus_gcheapcount), + od("COMPlus_GCHeapHardLimit", cfg.complus_gcheaphardlimit), + ob("COMPlus_GCLargePages", cfg.complus_gclargepages), + ob("COMPlus_GCNoAffinitize", cfg.complus_gcnoaffinitize), + ob("COMPlus_GCCpuGroup", cfg.complus_gccpugroup), + ob("COMPlus_GCNumaAware", cfg.complus_gcnumaaware), + ob("COMPlus_Thread_UseAllCpuGroups", cfg.complus_thread_useallcpugroups), + od( + "COMPlus_ThreadPool_ForceMaxWorkerThreads", + cfg.complus_threadpool_forcemaxworkerthreads, + ), + ob("COMPlus_TieredCompilation", cfg.complus_tieredcompilation), + ob("COMPlus_BGCFLTuningEnabled", cfg.complus_bgcfltuningenabled), + od("COMPlus_BGCMemGoal", cfg.complus_bgcmemgoal), + od("COMPlus_BGCMemGoalSlack", cfg.complus_bgcmemgoalslack), + ob("COMPLUS_GCConcurrentFinalization", cfg.complus_gcconcurrentfinalization), + ) + ), + ) + + # Combined CommonConfig and individual test's config @with_slots @dataclass(frozen=True) class TestConfigCombined: - cfg: Config + cfg: Config # includes coreclr_specific @property def container(self) -> Optional[TestConfigContainer]: @@ -257,37 +384,15 @@ def complus_gcserver(self) -> Optional[bool]: return self.cfg.complus_gcserver @property - def memory_load_percent(self) -> Optional[float]: - return self.cfg.memory_load_percent - - def env(self, core_root: Optional[Path]) -> Mapping[str, str]: - cfg = self.cfg - - def od(name: str, v: Optional[int]) -> Mapping[str, str]: - return optional_mapping(name, map_option(v, hex_no_0x)) + def memory_load(self) -> Optional[MemoryLoadOptions]: + return self.cfg.memory_load - def 
-            return optional_mapping(name, map_option(v, lambda b: str(int(b))))
-
-        return combine_mappings(
-            empty_mapping() if core_root is None else {"CORE_ROOT": str(core_root)},
-            ob("COMPlus_gcServer", cfg.complus_gcserver),
-            ob("COMPlus_gcConcurrent", cfg.complus_gcconcurrent),
-            od("COMPlus_GCgen0size", cfg.complus_gcgen0size),
-            od("COMPlus_GCGen0MaxBudget", cfg.complus_gcgen0maxbudget),
-            optional_mapping("COMPlus_GCHeapAffinitizeRanges", cfg.complus_gcheapaffinitizeranges),
-            od("COMPlus_GCHeapCount", cfg.complus_gcheapcount),
-            od("COMPlus_GCHeapHardLimit", cfg.complus_gcheaphardlimit),
-            ob("COMPlus_GCLargePages", cfg.complus_gclargepages),
-            ob("COMPlus_GCNoAffinitize", cfg.complus_gcnoaffinitize),
-            ob("COMPlus_GCCpuGroup", cfg.complus_gccpugroup),
-            ob("COMPlus_GCNumaAware", cfg.complus_gcnumaaware),
-            ob("COMPlus_Thread_UseAllCpuGroups", cfg.complus_thread_useallcpugroups),
-            od(
-                "COMPlus_ThreadPool_ForceMaxWorkerThreads",
-                cfg.complus_threadpool_forcemaxworkerthreads,
-            ),
-            ob("COMPlus_TieredCompilation", cfg.complus_tieredcompilation),
+    def with_coreclr(self, coreclr_name: str) -> TestConfigCombinedWithCoreclr:
+        specific = option_or(
+            map_option(self.cfg.coreclr_specific, lambda cs: cs.get(coreclr_name)), ConfigOptions()
+        )
+        return TestConfigCombinedWithCoreclr(
+            combine_dataclasses_with_optional_fields(ConfigOptions, self.cfg, specific)
         )
@@ -598,9 +703,7 @@ def get_architecture_bitness(a: Architecture) -> Bitness:

 def get_this_machines_architecture() -> Architecture:
-    from platform import machine
-
-    m = machine()
+    m = platform_machine()
     if m in ("AMD64", "x86_64"):
         return Architecture.amd64
     elif m == "armv7l":
@@ -1218,22 +1321,9 @@ def get_test_path(

 def combine_test_configs(
     common_config: Optional[Config], named_config: Config
 ) -> TestConfigCombined:
-    if common_config is None:
-        return TestConfigCombined(named_config)
-    else:
-        # Ensure no overlap
-        def get_value(field_name: str) -> Any:
-            from_common = getattr(common_config, field_name)
-            from_named = getattr(named_config, field_name)
-            if from_common is None:
-                return getattr(named_config, field_name)
-            else:
-                assert (
-                    from_named is None
-                ), f"Overrides {field_name} (common_config: {from_common}, specified: {from_named})"
-                return from_common
-
-        return TestConfigCombined(Config(*(get_value(f.name) for f in fields(Config))))
+    return TestConfigCombined(
+        combine_dataclasses_with_optional_fields(Config, named_config, common_config)
+    )

 def get_coreclr(bench_file: BenchFile, coreclr_name: Optional[str]) -> CoreclrAndName:
diff --git a/src/benchmarks/gc/src/commonlib/collection_util.py b/src/benchmarks/gc/src/commonlib/collection_util.py
index 6e5230541d..79ba8a8e10 100644
--- a/src/benchmarks/gc/src/commonlib/collection_util.py
+++ b/src/benchmarks/gc/src/commonlib/collection_util.py
@@ -23,7 +23,7 @@
 )

 from .frozen_dict import FrozenDict
-from .option import optional_to_iter
+from .option import map_option, optional_to_iter
 from .type_utils import K, T, U, V, with_slots

@@ -103,8 +103,8 @@ def invert_multi_mapping(m: Mapping[K, Sequence[V]]) -> Mapping[V, K]:
     return out

-def optional_mapping(key: K, value: Optional[V]) -> Mapping[K, V]:
-    return empty_mapping() if value is None else {key: value}
+def optional_mapping(key: K, value: Optional[V]) -> Optional[Mapping[K, V]]:
+    return map_option(value, lambda v: {key: v})

 # sequence
@@ -245,6 +245,10 @@ def count(i: Iterable[Tuple[()]]) -> int:
     return sum(1 for _ in i)

+def filter_not_none(s: 
Iterable[Optional[T]]) -> Iterable[T]: + return (x for x in s if x is not None) + + def repeat(value: T, times: int) -> Sequence[T]: assert times >= 0 return [value for _ in range(times)] @@ -495,7 +499,7 @@ def zip_check_3(a: Iterable[T], b: Iterable[U], c: Iterable[V]) -> Iterable[Tupl return ((a, *bc) for a, bc in zip_check(a, zip_check(b, c))) -def zip_shorten_former(a: Sequence[T], b: Sequence[U]) -> Iterable[Tuple[T, U]]: +def zip_shorten_latter(a: Sequence[T], b: Sequence[U]) -> Iterable[Tuple[T, U]]: assert len(a) <= len(b) return zip(a, b) diff --git a/src/benchmarks/gc/src/commonlib/command.py b/src/benchmarks/gc/src/commonlib/command.py index 265b9ee8d4..700e299b81 100644 --- a/src/benchmarks/gc/src/commonlib/command.py +++ b/src/benchmarks/gc/src/commonlib/command.py @@ -26,6 +26,7 @@ is_a, is_field_name_optional, match_type, + NO_DEFAULT, non_optional_type, show_type_for_command, T, @@ -151,7 +152,7 @@ def help_command(args: HelpArgs) -> None: if args.command_name is None: print_document(_document_for_help_all_commands(args.hidden)) else: - from ..all_commands import ALL_COMMANDS + from ..all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel command = get_or_did_you_mean(ALL_COMMANDS, args.command_name, "command") _print_help_for_command(command, show_hidden_arguments=args.hidden) @@ -163,7 +164,7 @@ def help_command(args: HelpArgs) -> None: def _document_for_help_all_commands(show_hidden: bool) -> Document: - from ..all_commands import ALL_COMMANDS + from ..all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel def row_for_command(command_name: str, command: Command) -> Row: return (Cell(command_name), Cell(unindent_doc(command.doc), align=Align.left)) @@ -313,13 +314,13 @@ def _command_help_row_for_field(fld: Any, show_hidden_arguments: bool) -> Option def validate_all_commands_are_documented() -> None: - from ..all_commands import ALL_COMMANDS + from ..all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel for command in ALL_COMMANDS.values(): param_type = _get_command_parameter_type(command.fn) if param_type is not None and is_dataclass(param_type): for fld in fields(param_type): - assert fld.default is MISSING or is_a( + assert fld.default is NO_DEFAULT or is_a( fld.default, fld.type ), f"{param_type.__name__}.{fld.name}: default value does not match type" if try_get_field_argument_info(fld) is None: @@ -378,7 +379,7 @@ def get_value(field: Any) -> Any: from_cmd = fields_from_cmd.get(field.name) if from_cmd is None: assert ( - field.default is not MISSING + field.default is not NO_DEFAULT ), f"'{_to_cmd_line_arg_name(field.name)}' was not provided and has no default" return check_cast(field.type, field.default) else: diff --git a/src/benchmarks/gc/src/commonlib/document.py b/src/benchmarks/gc/src/commonlib/document.py index e2a67d877d..bd94b64be5 100644 --- a/src/benchmarks/gc/src/commonlib/document.py +++ b/src/benchmarks/gc/src/commonlib/document.py @@ -9,7 +9,6 @@ from math import ceil, floor from os import get_terminal_size, terminal_size from pathlib import Path -from sys import argv from typing import Callable, cast, Iterable, Mapping, Optional, List, Sequence, Tuple, Union from psutil import Process @@ -28,8 +27,8 @@ zip_check_3, ) from .option import map_option, optional_to_iter, option_or -from .type_utils import check_cast, with_slots -from .util import float_to_str, os_is_windows +from .type_utils import argument, check_cast, with_slots +from .util import float_to_str, get_command_line, os_is_windows Tag 
= SimpleDoc.Tag @@ -94,7 +93,9 @@ def __post_init__(self) -> None: self.header_groups is None or sum(x.size_cells for x in self.header_groups) == n_columns ) for row in self.rows: - assert len(row) == n_columns + assert ( + len(row) == n_columns + ), f"Row has {len(row)} entries but table has {n_columns} column headers" @with_slots @@ -166,14 +167,6 @@ class SpecialOutputWidth(Enum): OutputWidth = Union[int, SpecialOutputWidth] -OUTPUT_WIDTH_DOC = """ -Maximum width (in columns) of console or text file output. -Default is the current terminal size. -""" -TABLE_INDENT_DOC = """ -Indent tables by this many spaces. -""" - def print_document( doc: Document, @@ -635,9 +628,31 @@ def any_file_output(self) -> bool: EMPTY_OUTPUT_OPTIONS = OutputOptions() -TXT_DOC = "Output to a '.txt' file" -HTML_DOC = "Output to a '.html' file" -XLSX_DOC = "Output to a '.xlsx' file" + +@with_slots +@dataclass(frozen=True) +class DocOutputArgs: + output_width: Optional[OutputWidth] = argument( + default=None, + doc=""" + Maximum width (in columns) of console or text file output. + Default is the current terminal size. + """, + ) + table_indent: Optional[int] = argument(default=None, doc="Indent tables by this many spaces.") + txt: Optional[Path] = argument(default=None, doc="Output to a '.txt' file") + html: Optional[Path] = argument(default=None, doc="Output to a '.html' file") + xlsx: Optional[Path] = argument(default=None, doc="Output to a '.xlsx' file") + + +def output_options_from_args(args: DocOutputArgs) -> OutputOptions: + return OutputOptions( + width=args.output_width, + table_indent=args.table_indent, + html=args.html, + txt=args.txt, + excel=args.xlsx, + ) def handle_doc(doc: Document, output: OutputOptions = EMPTY_OUTPUT_OPTIONS) -> None: @@ -645,7 +660,7 @@ def handle_doc(doc: Document, output: OutputOptions = EMPTY_OUTPUT_OPTIONS) -> N output.html.write_text(render_to_html(doc)) if output.txt: doc_txt = render_to_plaintext(doc, max_width=output.width, table_indent=output.table_indent) - txt = f"> {' '.join(argv)}\n\n{doc_txt}" + txt = f"{get_command_line()}\n\n{doc_txt}" output.txt.write_text(txt, encoding="utf-8") if output.excel: render_to_excel(doc, output.excel) diff --git a/src/benchmarks/gc/src/commonlib/get_built.py b/src/benchmarks/gc/src/commonlib/get_built.py index b63c4bc760..70e2d9d2ac 100644 --- a/src/benchmarks/gc/src/commonlib/get_built.py +++ b/src/benchmarks/gc/src/commonlib/get_built.py @@ -63,7 +63,7 @@ def _get_platform_name() -> str: return "arm64" else: p = processor() - assert any(x in p for x in ("Intel64", "AMD64")) + assert any(x in p for x in ("AMD64", "Intel64", "x86_64")) return "x64" @@ -313,6 +313,9 @@ class _CopyBuildArgs: name: Optional[str] = argument( default=None, doc="Name of the output directory. Defaults to the commit hash." ) + overwrite: bool = argument( + default=False, doc="If true, the output directory will be copied over if it exists." 
+    )

 _BUILDS_PATH = BENCH_DIR_PATH / "builds"

@@ -321,7 +324,7 @@ class _CopyBuildArgs:
 def _copy_build(args: _CopyBuildArgs) -> None:
     core_root = _get_core_root(args.coreclr, args.kind)
     name = _get_default_build_name(args.coreclr, args.kind) if args.name is None else args.name
-    cp_dir(core_root, _BUILDS_PATH / name)
+    cp_dir(core_root, _BUILDS_PATH / name, args.overwrite)

 def _get_default_build_name(coreclr: Path, kind: _DebugKind) -> str:
@@ -360,11 +363,14 @@ def rebuild_coreclr(args: RebuildCoreclrArgs) -> None:
         _do_rebuild_coreclr(coreclr, args.just_copy, debug_kind)

+def _get_debug_or_release(debug_kind: _DebugKind) -> str:
+    return {_DebugKind.debug: "debug", _DebugKind.release: "release"}[debug_kind]
+
+
 def _get_debug_or_release_dir_name(debug_kind: _DebugKind) -> str:
-    plat = _get_platform_name()
-    debug_release = {_DebugKind.debug: "debug", _DebugKind.release: "release"}[debug_kind]
-    debug_release_dir_name = f"{_get_os_name()}.{plat}.{debug_release.capitalize()}"
-    return debug_release_dir_name
+    return (
+        f"{_get_os_name()}.{_get_platform_name()}.{_get_debug_or_release(debug_kind).capitalize()}"
+    )

 def _get_core_root(coreclr: Path, debug_kind: _DebugKind) -> Path:
@@ -382,7 +388,7 @@ def _do_rebuild_coreclr(coreclr: Path, just_copy: bool, debug_kind: _DebugKind)
         cmd=(
             str(coreclr / f"build.{get_build_ext()}"),
             plat,
-            debug_release,
+            _get_debug_or_release(debug_kind),
             # build.sh does not support --skiptests
             *optional_to_iter("skiptests" if os_is_windows() else None),
             "skipmscorlib",
@@ -412,8 +418,13 @@ def _do_rebuild_coreclr(coreclr: Path, just_copy: bool, debug_kind: _DebugKind)
         cp(from_path=product_dir / name, to_path=core_root / name)

-def cp_dir(from_dir: Path, to_dir: Path) -> None:
-    print(f"Copy {from_dir} to {to_dir}")
+def cp_dir(from_dir: Path, to_dir: Path, overwrite: bool) -> None:
+    if to_dir.exists():
+        if overwrite:
+            # Path.unlink() cannot remove a directory; rmtree (from shutil, like copytree) can
+            rmtree(to_dir)
+        else:
+            raise Exception(f"{to_dir} already exists. 
(Maybe you want to '--overwrite'?)") + print(f"Copy directory {from_dir} to {to_dir}") copytree(from_dir, to_dir) diff --git a/src/benchmarks/gc/src/commonlib/host_info.py b/src/benchmarks/gc/src/commonlib/host_info.py index 4d3764bf5b..57004bd126 100644 --- a/src/benchmarks/gc/src/commonlib/host_info.py +++ b/src/benchmarks/gc/src/commonlib/host_info.py @@ -5,8 +5,9 @@ from dataclasses import dataclass from operator import floordiv from pathlib import Path +from re import search from textwrap import indent -from typing import Mapping, Optional, Sequence, Tuple +from typing import Iterable, List, Mapping, Optional, Sequence, Tuple from .bench_file import ( change_path_machine, @@ -16,7 +17,7 @@ parse_machines_arg, ) from .get_built import get_built, Built -from .collection_util import empty_mapping +from .collection_util import empty_mapping, is_empty from .command import Command, CommandKind, CommandsMapping from .config import HOST_INFO_PATH from .option import map_option, map_option_2 @@ -57,6 +58,26 @@ class CacheInfo: l3: CacheInfoForLevel +@with_slots +@dataclass(frozen=True) +class Range: + # Both inclusive + lo: int + hi: int + + def with_hi(self, new_hi: int) -> "Range": + return Range(self.lo, new_hi) + + +@with_slots +@dataclass(frozen=True) +class NumaNodeInfo: + numa_node_number: int + # None on non-Windows + cpu_group_number: Optional[int] + ranges: Sequence[Range] + + @with_slots @dataclass(frozen=True) class HostInfo: @@ -64,7 +85,7 @@ class HostInfo: hostname: str n_physical_processors: int n_logical_processors: int - numa_nodes: int + numa_nodes: Sequence[NumaNodeInfo] cache_info: CacheInfo clock_ghz: Optional[float] = None total_physical_memory_mb: Optional[int] = None @@ -73,7 +94,7 @@ class HostInfo: @with_slots @dataclass(frozen=True) class _NumaNodesAndCacheInfo: - numa_nodes: int + numa_nodes: Sequence[NumaNodeInfo] n_physical_processors: int n_logical_processors: int caches: CacheInfo @@ -131,11 +152,13 @@ def get_opt_kb(name: str) -> Optional[int]: total_physical_memory_mb = round(kb_to_mb(float(remove_str_end(x["MemTotal"], " kB")))) + numa_nodes = _get_numa_nodes_posix() + return HostInfo( hostname=get_hostname(), n_physical_processors=n_physical_processors, n_logical_processors=n_logical_processors, - numa_nodes=sockets, + numa_nodes=numa_nodes, cache_info=CacheInfo( # TODO: figure out how to determine number of caches on posix l1=CacheInfoForLevel( @@ -152,6 +175,36 @@ def get_opt_kb(name: str) -> Optional[int]: ) +def _get_numa_nodes_posix() -> Sequence[NumaNodeInfo]: + return tuple( + _parse_numa_nodes_posix( + exec_and_get_output(ExecArgs(("numactl", "--hardware"), quiet_print=True)) + ) + ) + + +def _parse_numa_nodes_posix(s: str) -> Iterable[NumaNodeInfo]: + for line in s.splitlines(): + res = search(r"^node (\d+) cpus: ", line) + if res is not None: + node_number = int(res.group(1)) + yield NumaNodeInfo( + numa_node_number=node_number, + cpu_group_number=None, + ranges=_ranges_from_numbers([int(x) for x in line[res.span()[1] :].split()]), + ) + + +def _ranges_from_numbers(ns: Iterable[int]) -> Sequence[Range]: + ranges: List[Range] = [] + for n in ns: + if is_empty(ranges) or n != ranges[-1].hi + 1: + ranges.append(Range(n, n)) + else: + ranges.append(ranges.pop().with_hi(n)) + return ranges + + def _parse_keys_values_lines(s: str) -> Mapping[str, str]: return {k: v for line in s.split("\n") if line != "" for k, v in (_split_line(line),)} @@ -182,6 +235,8 @@ def _get_host_info_windows(built: Built) -> HostInfo: def _get_clock_ghz_windows() -> float: + # 
Import lazily as this is only available on Windows
+    # pylint:disable=import-outside-toplevel
     from winreg import ConnectRegistry, HKEY_LOCAL_MACHINE, OpenKey, QueryValueEx

     registry = ConnectRegistry(None, HKEY_LOCAL_MACHINE)
diff --git a/src/benchmarks/gc/src/commonlib/parse_and_serialize.py b/src/benchmarks/gc/src/commonlib/parse_and_serialize.py
index 553fdb2639..525b9ce54a 100644
--- a/src/benchmarks/gc/src/commonlib/parse_and_serialize.py
+++ b/src/benchmarks/gc/src/commonlib/parse_and_serialize.py
@@ -19,7 +19,14 @@
 from .frozen_dict import FrozenDict
 from .option import non_null, optional_to_iter
 from .result_utils import all_non_err, as_err, map_ok, map_ok_2, unwrap
-from .type_utils import check_cast, construct_class_from_fields, match_type, T, with_slots
+from .type_utils import (
+    check_cast,
+    construct_class_from_fields,
+    match_type,
+    NO_DEFAULT,
+    T,
+    with_slots,
+)

 # Parse the first member to succeed; else return all failure messages
@@ -201,8 +208,9 @@ def _get_field(fld: Field[object]) -> Result[str, object]:
             if all_optional:
                 return Ok(None)
             else:
-                assert (
-                    fld.default is not MISSING
+                assert fld.default not in (
+                    MISSING,
+                    NO_DEFAULT,
                 ), f"At {desc}: Did not find field {fld.name} (and it has no default)"
                 return Ok(fld.default)
diff --git a/src/benchmarks/gc/src/commonlib/result_utils.py b/src/benchmarks/gc/src/commonlib/result_utils.py
index 95dd3b7e8d..85bcdf8d1b 100644
--- a/src/benchmarks/gc/src/commonlib/result_utils.py
+++ b/src/benchmarks/gc/src/commonlib/result_utils.py
@@ -6,7 +6,7 @@

 from result import Err, Ok, Result

-from .type_utils import E, T, U, V, W
+from .type_utils import E, F, T, U, V, W

 def all_non_err(xs: Iterable[Result[E, T]]) -> Result[E, Sequence[T]]:
@@ -39,6 +39,10 @@ def ignore_err(r: Result[E, T]) -> Optional[T]:
     return match(r, lambda x: x, lambda _: None)

+def map_err(r: Result[E, T], cb: Callable[[E], F]) -> Result[F, T]:
+    return match(r, Ok, lambda e: Err(cb(e)))
+
+
 def map_ok(r: Result[E, T], cb: Callable[[T], U]) -> Result[E, U]:
     return flat_map_ok(r, lambda t: Ok(cb(t)))
diff --git a/src/benchmarks/gc/src/commonlib/type_utils.py b/src/benchmarks/gc/src/commonlib/type_utils.py
index e8f109d175..a7140160a0 100644
--- a/src/benchmarks/gc/src/commonlib/type_utils.py
+++ b/src/benchmarks/gc/src/commonlib/type_utils.py
@@ -5,7 +5,7 @@
 from __future__ import annotations  # Allow subscripting Field

 from collections.abc import Mapping as abc_Mapping, Sequence as abc_Sequence
-from dataclasses import dataclass, field, fields, Field, is_dataclass, MISSING
+from dataclasses import dataclass, field, fields, Field, is_dataclass
 from enum import Enum
 from functools import total_ordering
 from inspect import isclass
@@ -17,6 +17,7 @@

 E = TypeVar("E")
+F = TypeVar("F")
 K = TypeVar("K")
 V = TypeVar("V")
 T = TypeVar("T")
@@ -89,9 +90,20 @@ def get_field_info_from_name(cls: Type[Any], fld_name: str) -> FieldInfo:
         raise Exception(f"You forgot to document the field {cls.__name__}#{fld_name}") from None

+# Unlike dataclasses.MISSING,
+# Python won't object if a field whose default is _NO_DEFAULT follows one with a real default.
+# Normally you could just sort the default-less fields first,
+# but if we inherit from another class, those fields always go first.
+class _NO_DEFAULT:
+    pass
+
+
+NO_DEFAULT = _NO_DEFAULT()
+
+
 # Shorthand for creating a field in an arguments dataclass.
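+# For example, an arguments dataclass might declare (hypothetical field):
+#
+#     verbose: bool = argument(default=False, doc="Print extra output.")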
def argument( - doc: str, default: Any = MISSING, hidden: bool = False, name_optional: bool = False + doc: str, default: Any = NO_DEFAULT, hidden: bool = False, name_optional: bool = False ) -> Any: return field( default=default, @@ -318,3 +330,30 @@ def enum_count(e: Type[Enum]) -> int: xs = sorted(enum_value(x) for x in e) assert xs == list(range(len(xs))), "Enum values should be 0..N" return len(xs) + + +def combine_dataclasses_with_optional_fields(t: Type[T], a: T, b: Optional[T]) -> T: + if b is None: + return a + else: + + def combiner(field_name: str, from_a: object, from_b: object) -> object: + if from_a is None: + return from_b + elif from_b is None: + return from_a + else: + raise Exception(f"Conflicting values for field '{field_name}'") + + return combine_dataclasses(t, a, b, combiner) + + +def combine_dataclasses( + t: Type[T], a: T, b: T, combiner: Callable[[str, object, object], object] +) -> T: + """ + Combines dataclass instances 'a' and 'b' by calling 'combiner' on corresponding fields. + 'combiner' should be a generic function (str, U, U) -> U + """ + + return t(*(combiner(f.name, getattr(a, f.name), getattr(b, f.name)) for f in fields(t))) diff --git a/src/benchmarks/gc/src/commonlib/util.py b/src/benchmarks/gc/src/commonlib/util.py index 2823776c71..45edc36143 100644 --- a/src/benchmarks/gc/src/commonlib/util.py +++ b/src/benchmarks/gc/src/commonlib/util.py @@ -9,7 +9,7 @@ from enum import Enum from functools import reduce from inspect import getfile -from math import inf, isclose, isnan +from math import ceil, floor, inf, isclose, isnan from operator import mul import os from os import kill, name as os_name @@ -20,12 +20,14 @@ from subprocess import DEVNULL, PIPE, Popen, run from stat import S_IREAD, S_IWRITE, S_IRUSR, S_IWUSR, S_IRGRP, S_IWGRP, S_IROTH, S_IWOTH from statistics import median, StatisticsError +from sys import argv from threading import Event, Thread from time import sleep, time from typing import Any, Callable, cast, Iterable, Mapping, Optional, Sequence, Union from xml.etree.ElementTree import Element, parse as parse_xml from psutil import process_iter +from result import Err, Ok, Result from .collection_util import find, identity, is_empty, min_max_float from .option import option_or @@ -383,7 +385,7 @@ def kill_process(process: AnyPopen, time_allowed_seconds: float) -> None: sleep(1) if (time() - start_time_seconds) > time_allowed_seconds: print( - f"Process '{process.args}' refused to shut down normally. " + f"Process '{check_cast(str, process.args)}' refused to shut down normally. " + "Trying again without asking nicely." ) process.kill() @@ -413,6 +415,8 @@ def exec_and_get_output_and_exit_code(args: ExecArgs) -> OutputAndExitCode: r = run(args.cmd, stdout=PIPE, cwd=args.cwd, env=args.env, check=False) except FileNotFoundError: raise ExecutableNotFoundException(Path(args.cmd[0])) from None + except NotADirectoryError: + raise Exception(f"Invalid cwd: {args.cwd}") from None return OutputAndExitCode(decode_stdout(r.stdout), r.returncode) @@ -575,7 +579,8 @@ def assert_admin() -> None: def is_admin() -> bool: if os_is_windows(): - from win32com.shell.shell import IsUserAnAdmin + # Do this import lazily as it is only available on Windows + from win32com.shell.shell import IsUserAnAdmin # pylint:disable=import-outside-toplevel return IsUserAnAdmin() else: @@ -678,6 +683,25 @@ def opt_median(i: Iterable[float]) -> Optional[float]: return None +# numpy has problems on ARM, so using this instead. 
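+# Linear interpolation between the two nearest ranks, e.g.:
+#     get_percentile([10, 20, 30], 50.0) == 20.0
+#     get_percentile([10, 20], 75.0) == 17.5 (= 0.25 * 10 + 0.75 * 20)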
+def get_percentile(values: Sequence[float], percent: float) -> float: + assert not is_empty(values) + assert 0.0 <= percent <= 100.0 + sorted_values = sorted(values) + fraction = percent / 100.0 + index_and_fraction = (len(values) - 1) * fraction + prev_index = floor(index_and_fraction) + next_index = ceil(index_and_fraction) + # The closer we are to 'next_index', the more 'next' should matter + next_factor = index_and_fraction - prev_index + prev_factor = 1.0 - next_factor + return sorted_values[prev_index] * prev_factor + sorted_values[next_index] * next_factor + + +def get_95th_percentile(values: Sequence[float]) -> Result[str, float]: + return Err("") if is_empty(values) else Ok(get_percentile(values, 95)) + + def update_file(path: Path, text: str) -> None: if (not path.exists()) or path.read_text(encoding="utf-8") != text: print(f"Updating {path}") @@ -701,3 +725,7 @@ def check_no_processes(names: Sequence[str]) -> None: assert name not in proc.name().lower(), ( f"'{name}' is already running\n" + f"Try: `{suggestion}`" ) + + +def get_command_line() -> str: + return f"> py {' '.join(argv)}" diff --git a/src/benchmarks/gc/src/dead_code.py b/src/benchmarks/gc/src/dead_code.py index 05ef86920c..78383f1eba 100644 --- a/src/benchmarks/gc/src/dead_code.py +++ b/src/benchmarks/gc/src/dead_code.py @@ -2,13 +2,13 @@ # The .NET Foundation licenses this file to you under the MIT license. # See the LICENSE file in the project root for more information. -duration_msec # unused variable (jupyter_notebook.py:331) -alloced_mb # unused variable (jupyter_notebook.py:332) -gen0_in_mb # unused variable (jupyter_notebook.py:381) -gen0_out_mb # unused variable (jupyter_notebook.py:382) -_.SuspensionPercent # unused property (jupyter_notebook.py:473) -_.PctPauseFromSuspend # unused property (jupyter_notebook.py:481) -_.Action1 # unused property (src\analysis\clr.py:232) +duration_msec # unused variable (jupyter_notebook.py:332) +alloced_mb # unused variable (jupyter_notebook.py:333) +gen0_in_mb # unused variable (jupyter_notebook.py:382) +gen0_out_mb # unused variable (jupyter_notebook.py:383) +_.SuspensionPercent # unused property (jupyter_notebook.py:474) +_.PctPauseFromSuspend # unused property (jupyter_notebook.py:482) +_.Action1 # unused property (src\analysis\clr.py:233) Keys # unused variable (src\analysis\clr_types.py:42) ContainsKey # unused function (src\analysis\clr_types.py:45) nt_symbol_path # unused variable (src\analysis\clr_types.py:81) @@ -40,10 +40,6 @@ GCHandleCount # unused variable (src\analysis\clr_types.py:185) CondemnedGeneration # unused variable (src\analysis\clr_types.py:191) Gen0ReductionCount # unused variable (src\analysis\clr_types.py:192) -MemoryPressure # unused variable (src\analysis\clr_types.py:195) -HasMemoryPressure # unused variable (src\analysis\clr_types.py:196) -MemoryPressure # unused variable (src\analysis\clr_types.py:247) -HasMemoryPressure # unused variable (src\analysis\clr_types.py:248) VersionRecognized # unused variable (src\analysis\clr_types.py:249) HasFreeListAllocated # unused variable (src\analysis\clr_types.py:251) HasFreeListRejected # unused variable (src\analysis\clr_types.py:253) @@ -132,8 +128,8 @@ AnalyzeSingleGc # unused function (src\analysis\clr_types.py:867) get_parts # unused function (src\analysis\core_analysis.py:62) get_parts # unused function (src\analysis\core_analysis.py:127) -num_samples # unused variable (src\analysis\core_analysis.py:282) -_.num_samples # unused attribute (src\analysis\core_analysis.py:327) +num_samples # unused 
variable (src\analysis\core_analysis.py:291) +_.num_samples # unused attribute (src\analysis\core_analysis.py:336) alloc_soh # unused variable (src\analysis\enums.py:33) lowmemory # unused variable (src\analysis\enums.py:35) alloc_loh # unused variable (src\analysis\enums.py:37) @@ -146,93 +142,95 @@ lowmemory_host # unused variable (src\analysis\enums.py:44) pm_full_gc # unused variable (src\analysis\enums.py:45) lowmemory_host_blocking # unused variable (src\analysis\enums.py:46) -expand_reuse_normal # unused variable (src\analysis\enums.py:52) -expand_reuse_bestfit # unused variable (src\analysis\enums.py:53) -expand_new_set_ep # unused variable (src\analysis\enums.py:54) -expand_new_seg # unused variable (src\analysis\enums.py:55) -expand_no_memory # unused variable (src\analysis\enums.py:56) -expand_next_full_gc # unused variable (src\analysis\enums.py:57) -high_frag # unused variable (src\analysis\enums.py:67) -no_gaps # unused variable (src\analysis\enums.py:68) -loh_forced # unused variable (src\analysis\enums.py:69) -last_gc # unused variable (src\analysis\enums.py:70) -induced_compacting # unused variable (src\analysis\enums.py:71) -fragmented_gen0 # unused variable (src\analysis\enums.py:72) -high_mem_load # unused variable (src\analysis\enums.py:73) -high_mem_frag # unused variable (src\analysis\enums.py:74) -vhigh_mem_frag # unused variable (src\analysis\enums.py:75) -no_gc_mode # unused variable (src\analysis\enums.py:76) -_.using_concurrent # unused property (src\analysis\enums.py:91) -_.using_server # unused property (src\analysis\enums.py:95) -BGCPhase # unused class (src\analysis\enums.py:185) -BGC1stNonConcurrent # unused variable (src\analysis\enums.py:186) -BGC1stConcurrent # unused variable (src\analysis\enums.py:187) -BGC2ndNonConcurrent # unused variable (src\analysis\enums.py:188) -BGC2ndConcurrent # unused variable (src\analysis\enums.py:189) -Concurrent # unused variable (src\analysis\enums.py:194) -init_cpu_mapping # unused variable (src\analysis\enums.py:214) -rescan_dependent_handles # unused variable (src\analysis\enums.py:219) -start_bgc # unused variable (src\analysis\enums.py:232) -restart_ee # unused variable (src\analysis\enums.py:233) -concurrent_overflow # unused variable (src\analysis\enums.py:234) -suspend_ee # unused variable (src\analysis\enums.py:235) -bgc_after_ephemeral # unused variable (src\analysis\enums.py:236) -allow_fgc # unused variable (src\analysis\enums.py:237) -bgc_sweep # unused variable (src\analysis\enums.py:238) -suspend_ee_verify # unused variable (src\analysis\enums.py:239) -restart_ee_verify # unused variable (src\analysis\enums.py:240) -set_state_free # unused variable (src\analysis\enums.py:241) -after_absorb # unused variable (src\analysis\enums.py:244) -after_reset # unused variable (src\analysis\enums.py:246) -after_ephemeral_sweep # unused variable (src\analysis\enums.py:247) -after_profiler_heap_walk # unused variable (src\analysis\enums.py:248) -minimal_gc # unused variable (src\analysis\enums.py:249) -after_commit_soh_no_gc # unused variable (src\analysis\enums.py:250) -expand_loh_no_gc # unused variable (src\analysis\enums.py:251) -final_no_gc # unused variable (src\analysis\enums.py:252) -disable_software_write_watch # unused variable (src\analysis\enums.py:253) -restarting # unused variable (src\analysis\enums.py:260) -stolen # unused variable (src\analysis\enums.py:262) -idle_for_no_good_reason # unused variable (src\analysis\enums.py:263) -MarkStack # unused variable (src\analysis\enums.py:277) -MarkFQ # unused 
variable (src\analysis\enums.py:278) -MarkHandles # unused variable (src\analysis\enums.py:279) -MarkOlder # unused variable (src\analysis\enums.py:280) -MarkSizedRef # unused variable (src\analysis\enums.py:281) -MarkOverflow # unused variable (src\analysis\enums.py:282) -CondemnedReasonsGroup # unused class (src\analysis\enums.py:287) -Initial_Generation # unused variable (src\analysis\enums.py:291) -Final_Generation # unused variable (src\analysis\enums.py:292) -Alloc_Exceeded # unused variable (src\analysis\enums.py:293) -Time_Tuning # unused variable (src\analysis\enums.py:294) -Induced # unused variable (src\analysis\enums.py:299) -Low_Ephemeral # unused variable (src\analysis\enums.py:300) -Expand_Heap # unused variable (src\analysis\enums.py:301) -Fragmented_Ephemeral # unused variable (src\analysis\enums.py:302) -Fragmented_Gen1_To_Gen2 # unused variable (src\analysis\enums.py:303) -Fragmented_Gen2 # unused variable (src\analysis\enums.py:304) -Fragmented_Gen2_High_Mem # unused variable (src\analysis\enums.py:305) -GC_Before_OOM # unused variable (src\analysis\enums.py:306) -Too_Small_For_BGC # unused variable (src\analysis\enums.py:307) -Ephemeral_Before_BGC # unused variable (src\analysis\enums.py:308) -Internal_Tuning # unused variable (src\analysis\enums.py:309) -Max # unused variable (src\analysis\enums.py:310) -init # unused variable (src\analysis\enums.py:315) -ThreadWaitReason # unused class (src\analysis\enums.py:326) -Executive # unused variable (src\analysis\enums.py:327) -FreePage # unused variable (src\analysis\enums.py:328) -PageIn # unused variable (src\analysis\enums.py:329) -SystemAllocation # unused variable (src\analysis\enums.py:330) -ExecutionDelay # unused variable (src\analysis\enums.py:331) -Suspended # unused variable (src\analysis\enums.py:332) -UserRequest # unused variable (src\analysis\enums.py:333) -EventPairHigh # unused variable (src\analysis\enums.py:334) -EventPairLow # unused variable (src\analysis\enums.py:335) -LpcReceive # unused variable (src\analysis\enums.py:336) -LpcReply # unused variable (src\analysis\enums.py:337) -VirtualMemory # unused variable (src\analysis\enums.py:338) -PageOut # unused variable (src\analysis\enums.py:339) -Unknown # unused variable (src\analysis\enums.py:340) +bgc_tuning_soh # unused variable (src\analysis\enums.py:47) +bgc_tuning_loh # unused variable (src\analysis\enums.py:48) +bgc_stepping # unused variable (src\analysis\enums.py:49) +expand_reuse_normal # unused variable (src\analysis\enums.py:55) +expand_reuse_bestfit # unused variable (src\analysis\enums.py:56) +expand_new_set_ep # unused variable (src\analysis\enums.py:57) +expand_new_seg # unused variable (src\analysis\enums.py:58) +expand_no_memory # unused variable (src\analysis\enums.py:59) +expand_next_full_gc # unused variable (src\analysis\enums.py:60) +high_frag # unused variable (src\analysis\enums.py:70) +no_gaps # unused variable (src\analysis\enums.py:71) +loh_forced # unused variable (src\analysis\enums.py:72) +last_gc # unused variable (src\analysis\enums.py:73) +induced_compacting # unused variable (src\analysis\enums.py:74) +fragmented_gen0 # unused variable (src\analysis\enums.py:75) +high_mem_load # unused variable (src\analysis\enums.py:76) +high_mem_frag # unused variable (src\analysis\enums.py:77) +vhigh_mem_frag # unused variable (src\analysis\enums.py:78) +no_gc_mode # unused variable (src\analysis\enums.py:79) +_.using_concurrent # unused property (src\analysis\enums.py:94) +_.using_server # unused property (src\analysis\enums.py:98) 
+BGCPhase # unused class (src\analysis\enums.py:188) +BGC1stNonConcurrent # unused variable (src\analysis\enums.py:189) +BGC1stConcurrent # unused variable (src\analysis\enums.py:190) +BGC2ndNonConcurrent # unused variable (src\analysis\enums.py:191) +BGC2ndConcurrent # unused variable (src\analysis\enums.py:192) +Concurrent # unused variable (src\analysis\enums.py:197) +init_cpu_mapping # unused variable (src\analysis\enums.py:217) +rescan_dependent_handles # unused variable (src\analysis\enums.py:222) +start_bgc # unused variable (src\analysis\enums.py:235) +restart_ee # unused variable (src\analysis\enums.py:236) +concurrent_overflow # unused variable (src\analysis\enums.py:237) +suspend_ee # unused variable (src\analysis\enums.py:238) +bgc_after_ephemeral # unused variable (src\analysis\enums.py:239) +allow_fgc # unused variable (src\analysis\enums.py:240) +bgc_sweep # unused variable (src\analysis\enums.py:241) +suspend_ee_verify # unused variable (src\analysis\enums.py:242) +restart_ee_verify # unused variable (src\analysis\enums.py:243) +set_state_free # unused variable (src\analysis\enums.py:244) +after_absorb # unused variable (src\analysis\enums.py:247) +after_reset # unused variable (src\analysis\enums.py:249) +after_ephemeral_sweep # unused variable (src\analysis\enums.py:250) +after_profiler_heap_walk # unused variable (src\analysis\enums.py:251) +minimal_gc # unused variable (src\analysis\enums.py:252) +after_commit_soh_no_gc # unused variable (src\analysis\enums.py:253) +expand_loh_no_gc # unused variable (src\analysis\enums.py:254) +final_no_gc # unused variable (src\analysis\enums.py:255) +disable_software_write_watch # unused variable (src\analysis\enums.py:256) +restarting # unused variable (src\analysis\enums.py:263) +stolen # unused variable (src\analysis\enums.py:265) +idle_for_no_good_reason # unused variable (src\analysis\enums.py:266) +MarkStack # unused variable (src\analysis\enums.py:280) +MarkFQ # unused variable (src\analysis\enums.py:281) +MarkHandles # unused variable (src\analysis\enums.py:282) +MarkOlder # unused variable (src\analysis\enums.py:283) +MarkSizedRef # unused variable (src\analysis\enums.py:284) +MarkOverflow # unused variable (src\analysis\enums.py:285) +CondemnedReasonsGroup # unused class (src\analysis\enums.py:290) +Initial_Generation # unused variable (src\analysis\enums.py:294) +Final_Generation # unused variable (src\analysis\enums.py:295) +Alloc_Exceeded # unused variable (src\analysis\enums.py:296) +Time_Tuning # unused variable (src\analysis\enums.py:297) +Induced # unused variable (src\analysis\enums.py:302) +Low_Ephemeral # unused variable (src\analysis\enums.py:303) +Expand_Heap # unused variable (src\analysis\enums.py:304) +Fragmented_Ephemeral # unused variable (src\analysis\enums.py:305) +Fragmented_Gen1_To_Gen2 # unused variable (src\analysis\enums.py:306) +Fragmented_Gen2 # unused variable (src\analysis\enums.py:307) +Fragmented_Gen2_High_Mem # unused variable (src\analysis\enums.py:308) +GC_Before_OOM # unused variable (src\analysis\enums.py:309) +Too_Small_For_BGC # unused variable (src\analysis\enums.py:310) +Ephemeral_Before_BGC # unused variable (src\analysis\enums.py:311) +Internal_Tuning # unused variable (src\analysis\enums.py:312) +init # unused variable (src\analysis\enums.py:318) +ThreadWaitReason # unused class (src\analysis\enums.py:329) +Executive # unused variable (src\analysis\enums.py:330) +FreePage # unused variable (src\analysis\enums.py:331) +PageIn # unused variable (src\analysis\enums.py:332) 
+SystemAllocation # unused variable (src\analysis\enums.py:333) +ExecutionDelay # unused variable (src\analysis\enums.py:334) +Suspended # unused variable (src\analysis\enums.py:335) +UserRequest # unused variable (src\analysis\enums.py:336) +EventPairHigh # unused variable (src\analysis\enums.py:337) +EventPairLow # unused variable (src\analysis\enums.py:338) +LpcReceive # unused variable (src\analysis\enums.py:339) +LpcReply # unused variable (src\analysis\enums.py:340) +VirtualMemory # unused variable (src\analysis\enums.py:341) +PageOut # unused variable (src\analysis\enums.py:342) +Unknown # unused variable (src\analysis\enums.py:343) processor_number # unused variable (src\analysis\gui_join_analysis.py:88) _.span_msec # unused property (src\analysis\gui_join_analysis.py:326) _.span_msec # unused property (src\analysis\gui_join_analysis.py:337) @@ -266,64 +264,66 @@ tid # unused variable (src\analysis\gui_stolen_cpu_analysis.py:89) stolen_cpu_instances # unused variable (src\analysis\gui_stolen_cpu_analysis.py:97) loose # unused variable (src\analysis\join_analysis.py:23) -_.rw # unused property (src\analysis\mem_utils.py:72) -_.r_only # unused property (src\analysis\mem_utils.py:76) -print_maps # unused function (src\analysis\mem_utils.py:164) -parse_maps # unused function (src\analysis\mem_utils.py:204) -parse_valgrind_err # unused function (src\analysis\mem_utils.py:220) -addr # unused variable (src\analysis\mem_utils.py:427) -followed_by # unused variable (src\analysis\mem_utils.py:429) -_.abs_count_diff # unused property (src\analysis\mem_utils.py:617) -max_stdev_fraction # unused function (src\analysis\report.py:137) -_stats_list_for_proc # unused function (src\analysis\run_metrics.py:154) -_.startup_flags # unused property (src\analysis\types.py:228) -serialize_run_metric # unused function (src\analysis\types.py:386) -deserialize_run_metric # unused function (src\analysis\types.py:393) -single_gc_metric_must_exist_for_name # unused function (src\analysis\types.py:412) -single_heap_metric_must_exist_for_name # unused function (src\analysis\types.py:430) -_.Gen0UserAllocatedMB # unused property (src\analysis\types.py:778) -_.LOHUserAllocatedMB # unused property (src\analysis\types.py:784) -_.Gen0SizeBeforeMB # unused property (src\analysis\types.py:788) -_.Gen1SizeBeforeMB # unused property (src\analysis\types.py:792) -_.LOHSizeBeforeMB # unused property (src\analysis\types.py:800) -_.Gen0BudgetMB # unused property (src\analysis\types.py:804) -_.Gen1BudgetMB # unused property (src\analysis\types.py:808) -_.Gen0SizeAfterMB # unused property (src\analysis\types.py:820) -_.Gen1SizeAfterMB # unused property (src\analysis\types.py:824) -_.LOHSizeAfterMB # unused property (src\analysis\types.py:832) -_.Gen0FreeListSpaceBeforeMB # unused property (src\analysis\types.py:836) -_.Gen1FreeListSpaceBeforeMB # unused property (src\analysis\types.py:840) -_.LOHFreeListSpaceBeforeMB # unused property (src\analysis\types.py:848) -_.Gen0FreeListSpaceAfterMB # unused property (src\analysis\types.py:852) -_.Gen1FreeListSpaceAfterMB # unused property (src\analysis\types.py:856) -_.LOHFreeListSpaceAfterMB # unused property (src\analysis\types.py:864) -_.Gen0FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:868) -_.Gen1FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:872) -_.Gen2FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:876) -_.LOHFreeObjSpaceBeforeMB # unused property (src\analysis\types.py:880) -_.Gen0FreeObjSpaceAfterMB # unused property 
(src\analysis\types.py:884) -_.Gen1FreeObjSpaceAfterMB # unused property (src\analysis\types.py:888) -_.Gen2FreeObjSpaceAfterMB # unused property (src\analysis\types.py:892) -_.LOHFreeObjSpaceAfterMB # unused property (src\analysis\types.py:896) +_.rw # unused property (src\analysis\mem_utils.py:74) +_.r_only # unused property (src\analysis\mem_utils.py:78) +print_maps # unused function (src\analysis\mem_utils.py:166) +parse_maps # unused function (src\analysis\mem_utils.py:206) +parse_valgrind_err # unused function (src\analysis\mem_utils.py:222) +addr # unused variable (src\analysis\mem_utils.py:429) +followed_by # unused variable (src\analysis\mem_utils.py:431) +_.abs_count_diff # unused property (src\analysis\mem_utils.py:618) +max_stdev_fraction # unused function (src\analysis\report.py:132) +_stats_list_for_proc # unused function (src\analysis\run_metrics.py:87) +_.startup_flags # unused property (src\analysis\types.py:237) +serialize_run_metric # unused function (src\analysis\types.py:395) +deserialize_run_metric # unused function (src\analysis\types.py:402) +single_gc_metric_must_exist_for_name # unused function (src\analysis\types.py:421) +single_heap_metric_must_exist_for_name # unused function (src\analysis\types.py:439) +_.Gen0UserAllocatedMB # unused property (src\analysis\types.py:809) +_.LOHUserAllocatedMB # unused property (src\analysis\types.py:815) +_.Gen0SizeBeforeMB # unused property (src\analysis\types.py:819) +_.Gen1SizeBeforeMB # unused property (src\analysis\types.py:823) +_.LOHSizeBeforeMB # unused property (src\analysis\types.py:831) +_.Gen0BudgetMB # unused property (src\analysis\types.py:835) +_.Gen1BudgetMB # unused property (src\analysis\types.py:839) +_.Gen0SizeAfterMB # unused property (src\analysis\types.py:851) +_.Gen1SizeAfterMB # unused property (src\analysis\types.py:855) +_.LOHSizeAfterMB # unused property (src\analysis\types.py:863) +_.Gen0FreeListSpaceBeforeMB # unused property (src\analysis\types.py:867) +_.Gen1FreeListSpaceBeforeMB # unused property (src\analysis\types.py:871) +_.LOHFreeListSpaceBeforeMB # unused property (src\analysis\types.py:879) +_.Gen0FreeListSpaceAfterMB # unused property (src\analysis\types.py:883) +_.Gen1FreeListSpaceAfterMB # unused property (src\analysis\types.py:887) +_.LOHFreeListSpaceAfterMB # unused property (src\analysis\types.py:895) +_.Gen0FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:899) +_.Gen1FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:903) +_.Gen2FreeObjSpaceBeforeMB # unused property (src\analysis\types.py:907) +_.LOHFreeObjSpaceBeforeMB # unused property (src\analysis\types.py:911) +_.Gen0FreeObjSpaceAfterMB # unused property (src\analysis\types.py:915) +_.Gen1FreeObjSpaceAfterMB # unused property (src\analysis\types.py:919) +_.Gen2FreeObjSpaceAfterMB # unused property (src\analysis\types.py:923) +_.LOHFreeObjSpaceAfterMB # unused property (src\analysis\types.py:927) empty_set # unused function (src\commonlib\collection_util.py:38) group_by # unused function (src\commonlib\collection_util.py:54) filter_together # unused function (src\commonlib\collection_util.py:113) -_.y_min # unused property (src\commonlib\collection_util.py:403) -_.y_max # unused property (src\commonlib\collection_util.py:407) -reverse # unused function (src\commonlib\collection_util.py:445) +_.y_min # unused property (src\commonlib\collection_util.py:407) +_.y_max # unused property (src\commonlib\collection_util.py:411) +reverse # unused function (src\commonlib\collection_util.py:449) allow_out_of_date # 
unused variable (src\commonlib\get_built.py:47) -_.average_bytes # unused property (src\commonlib\host_info.py:47) -l1 # unused variable (src\commonlib\host_info.py:55) -l2 # unused variable (src\commonlib\host_info.py:56) -l3 # unused variable (src\commonlib\host_info.py:57) -hostname # unused variable (src\commonlib\host_info.py:64) -cache_info # unused variable (src\commonlib\host_info.py:68) -to_json # unused function (src\commonlib\parse_and_serialize.py:65) -_.sort_base_mapping_type_on_output # unused attribute (src\commonlib\parse_and_serialize.py:256) -_.__qualname__ # unused attribute (src\commonlib\type_utils.py:48) -bytes_to_kb # unused function (src\commonlib\util.py:470) +_.average_bytes # unused property (src\commonlib\host_info.py:48) +l1 # unused variable (src\commonlib\host_info.py:56) +l2 # unused variable (src\commonlib\host_info.py:57) +l3 # unused variable (src\commonlib\host_info.py:58) +numa_node_number # unused variable (src\commonlib\host_info.py:75) +cpu_group_number # unused variable (src\commonlib\host_info.py:77) +hostname # unused variable (src\commonlib\host_info.py:85) +cache_info # unused variable (src\commonlib\host_info.py:89) +to_json # unused function (src\commonlib\parse_and_serialize.py:72) +_.sort_base_mapping_type_on_output # unused attribute (src\commonlib\parse_and_serialize.py:264) +_.__qualname__ # unused attribute (src\commonlib\type_utils.py:49) +bytes_to_kb # unused function (src\commonlib\util.py:472) _float_range_around # unused function (src\exec\generate_tests.py:212) _survive_bench_file # unused function (src\exec\generate_tests.py:270) _gcsmall_benchyaml # unused function (src\exec\generate_tests.py:724) NonTemporaryDirectory # unused function (src\exec\run_single_test.py:190) -_run_single_test_linux_perfcollect # unused function (src\exec\run_single_test.py:815) \ No newline at end of file +_run_single_test_linux_perfcollect # unused function (src\exec\run_single_test.py:832) \ No newline at end of file diff --git a/src/benchmarks/gc/src/exec/GCPerfSim/GCPerfSim.cs b/src/benchmarks/gc/src/exec/GCPerfSim/GCPerfSim.cs index 05cf62b384..6e8a717658 100644 --- a/src/benchmarks/gc/src/exec/GCPerfSim/GCPerfSim.cs +++ b/src/benchmarks/gc/src/exec/GCPerfSim/GCPerfSim.cs @@ -453,6 +453,8 @@ abstract class ReferenceItemWithSize : ITypeWithPayload public static long NumConstructed = 0; public static long NumFreed = 0; #endif + public static long NumCreatedWithFinalizers = 0; + public static long NumFinalized = 0; // The size includes indirect children too. 
@@ -535,9 +537,6 @@ public void AddToEndOfList(ReferenceItemWithSize refItem)

 class ReferenceItemWithSizeFinalizable : ReferenceItemWithSize
 {
-    public static long NumCreatedWithFinalizers = 0;
-    public static long NumFinalized = 0;
-
     public ReferenceItemWithSizeFinalizable(uint size, bool isPinned)
         : base(size, isPinned)
     {
@@ -586,6 +585,7 @@ public SizeRange(uint low, uint high)
     public readonly uint weight;
     public BucketSpec(SizeRange sizeRange, uint survInterval, uint pinInterval, uint finalizableInterval, uint weight)
     {
+        Debug.Assert(weight != 0);
         this.sizeRange = sizeRange;
         this.survInterval = survInterval;
         this.pinInterval = pinInterval;
@@ -597,7 +597,9 @@ public BucketSpec(SizeRange sizeRange, uint survInterval, uint pinInterval, uint

         if (this.pinInterval != 0 || this.finalizableInterval != 0)
         {
-            Util.AlwaysAssert(this.survInterval != 0, "pinInterval and finalizableInterval only affect surviving objects, but nothing survives");
+            Util.AlwaysAssert(
+                this.survInterval != 0,
+                $"pinInterval and finalizableInterval only affect surviving objects, but nothing survives (in bucket with size range {sizeRange})");
         }
     }
@@ -804,12 +806,14 @@ class Args
 {
     public readonly uint threadCount;
     public readonly PerThreadArgs perThreadArgs;
+    public readonly bool finishWithFullCollect;
     public readonly bool endException;
-    public Args(uint threadCount, in PerThreadArgs perThreadArgs, bool endException)
+    public Args(uint threadCount, in PerThreadArgs perThreadArgs, bool finishWithFullCollect, bool endException)
     {
         this.threadCount = threadCount;
         this.perThreadArgs = perThreadArgs;
+        this.finishWithFullCollect = finishWithFullCollect;
         this.endException = endException;
     }
@@ -888,7 +892,14 @@ private static Args ParseFromFile(string fileName)
         State? s = TryReadTag(ref text);
         if (s == State.Eof)
         {
-            return new Args(threadCount: threadCount, perThreadArgs: new PerThreadArgs(verifyLiveSize: verifyLiveSize, printEveryNthIter: printEveryNthIter, ParsePhases(ref text, threadCount)), endException: false);
+            return new Args(
+                threadCount: threadCount,
+                perThreadArgs: new PerThreadArgs(
+                    verifyLiveSize: verifyLiveSize,
+                    printEveryNthIter: printEveryNthIter,
+                    phases: ParsePhases(ref text, threadCount)),
+                finishWithFullCollect: false,
+                endException: false);
         }
         ReadOnlySpan<char> word = text.TakeWord();
         text.TakeSpace();
@@ -1135,12 +1146,16 @@ private static Args ParseFromCommandLine(string[] args)
         ItemType allocType = ItemType.ReferenceItem;
         bool verifyLiveSize = false;
         uint printEveryNthIter = 0;
+        bool finishWithFullCollect = false;
         bool endException = false;
         for (uint i = 0; i < args.Length; ++i)
         {
             switch (args[i])
             {
+                case "-finishWithFullCollect":
+                    finishWithFullCollect = true;
+                    break;
                 case "-endException":
                     endException = true;
                     break;
@@ -1267,6 +1282,7 @@ private static Args ParseFromCommandLine(string[] args)
         return new Args(
             threadCount: threadCount,
             perThreadArgs: new PerThreadArgs(verifyLiveSize: verifyLiveSize, printEveryNthIter: printEveryNthIter, phases: new Phase[] { onlyPhase }),
+            finishWithFullCollect: finishWithFullCollect,
             endException: endException);
     }
@@ -2140,20 +2156,37 @@ static void DoTest(in Args args, int currentPid)

     public static int Main(string[] argsStrs)
     {
-        Console.WriteLine($"Running 64-bit? 
{Environment.Is64BitProcess}"); - - int currentPid = Process.GetCurrentProcess().Id; - Console.WriteLine("PID: {0}", currentPid); - - Stopwatch stopwatch = new Stopwatch(); - stopwatch.Start(); - - Args args; try { + Args args; args = ArgsParser.Parse(argsStrs); - args.Describe(); - DoTest(args, currentPid); + + double secondsTaken = MainInner(args); + + if (args.endException) + { + GC.Collect(2, GCCollectionMode.Forced, true); +#if TODO + EmptyWorkingSet(Process.GetCurrentProcess().Handle); +#endif + //Debugger.Break(); + throw new System.ArgumentException("Just an opportunity for debugging", "test"); + } + + if (args.finishWithFullCollect) + { + while (ReferenceItemWithSize.NumFinalized < ReferenceItemWithSize.NumCreatedWithFinalizers) + { + Console.WriteLine($"{ReferenceItemWithSize.NumFinalized} out of {ReferenceItemWithSize.NumCreatedWithFinalizers} finalizers have run, doing a full collect"); + GC.Collect(2, GCCollectionMode.Forced, blocking: true); + GC.WaitForPendingFinalizers(); + } + Util.AlwaysAssert(ReferenceItemWithSize.NumFinalized == ReferenceItemWithSize.NumCreatedWithFinalizers); + } + + PrintResult(secondsTaken: secondsTaken); + + return 0; } catch (Exception e) { @@ -2161,21 +2194,23 @@ public static int Main(string[] argsStrs) Console.Error.WriteLine(e.StackTrace); return 1; } + } - if (args.endException) - { - GC.Collect(2, GCCollectionMode.Forced, true); -#if TODO - EmptyWorkingSet(Process.GetCurrentProcess().Handle); -#endif - //Debugger.Break(); - throw new System.ArgumentException("Just an opportunity for debugging", "test"); - } + public static double MainInner(Args args) + { + Console.WriteLine($"Running 64-bit? {Environment.Is64BitProcess}"); + + int currentPid = Process.GetCurrentProcess().Id; + Console.WriteLine("PID: {0}", currentPid); + + Stopwatch stopwatch = new Stopwatch(); + stopwatch.Start(); + + args.Describe(); + DoTest(args, currentPid); stopwatch.Stop(); - PrintResult(secondsTaken: stopwatch.Elapsed.TotalSeconds); - - return 0; + return stopwatch.Elapsed.TotalSeconds; } private static void PrintResult(double secondsTaken) @@ -2194,6 +2229,8 @@ private static void PrintResult(double secondsTaken) } Console.WriteLine("]"); + Console.WriteLine($"num_created_with_finalizers: {ReferenceItemWithSize.NumCreatedWithFinalizers}"); + Console.WriteLine($"num_finalized: {ReferenceItemWithSize.NumFinalized}"); Console.WriteLine($"final_total_memory_bytes: {GC.GetTotalMemory(forceFullCollection: false)}"); // Use reflection to detect GC.GetGCMemoryInfo because it doesn't exist in dotnet core 2.0 or in .NET framework. 
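The counters added above are reported at the end of GCPerfSim's stdout, so a harness can verify finalizer behavior without collecting a trace. A minimal sketch of picking them up, assuming the `key: value` lines that `PrintResult` now emits; the helper name is hypothetical, not an existing function in this infra:

```python
import re

# Hypothetical helper: extract the two new finalizer counters from
# GCPerfSim's stdout (the "num_created_with_finalizers: N" and
# "num_finalized: N" lines printed by PrintResult).
def parse_finalizer_counts(stdout: str) -> dict:
    counts = {}
    for key in ("num_created_with_finalizers", "num_finalized"):
        match = re.search(rf"^{key}: (\d+)$", stdout, re.MULTILINE)
        assert match is not None, f"GCPerfSim output is missing '{key}'"
        counts[key] = int(match.group(1))
    return counts
```

With `-finishWithFullCollect`, the two counts should be equal by the time this output is printed.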
diff --git a/src/benchmarks/gc/src/exec/env/CMakeLists.txt b/src/benchmarks/gc/src/exec/env/CMakeLists.txt index 8df91f9cbe..2bad05df36 100644 --- a/src/benchmarks/gc/src/exec/env/CMakeLists.txt +++ b/src/benchmarks/gc/src/exec/env/CMakeLists.txt @@ -1,7 +1,13 @@ cmake_minimum_required (VERSION 3.15) project (env) +if(MSVC) + add_compile_options(/W4 /WX) +else() + add_compile_options(-Wall -Wextra -pedantic -Werror) +endif() + -add_executable(get_host_info get_host_info.c) +add_executable(get_host_info get_host_info.cpp) add_executable(is_in_job is_in_job.c) add_executable(make_memory_load make_memory_load.c) add_executable(run_in_job run_in_job.c) diff --git a/src/benchmarks/gc/src/exec/env/get_host_info.c b/src/benchmarks/gc/src/exec/env/get_host_info.cpp similarity index 61% rename from src/benchmarks/gc/src/exec/env/get_host_info.c rename to src/benchmarks/gc/src/exec/env/get_host_info.cpp index 73fbac93fb..64f6d6f008 100644 --- a/src/benchmarks/gc/src/exec/env/get_host_info.c +++ b/src/benchmarks/gc/src/exec/env/get_host_info.cpp @@ -6,6 +6,7 @@ #include <assert.h> #include <stdio.h> +#include <vector> #include <windows.h> typedef struct LogicalProcessorInfos { @@ -25,10 +26,10 @@ static LogicalProcessorInfos get_logical_processor_infos() { // (`sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)` is 76). if (!GetLogicalProcessorInformationEx(relation, NULL, &buffer_size_bytes) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { printf("Failed to get # elements\n"); - return (LogicalProcessorInfos) { 0, NULL }; + return LogicalProcessorInfos { 0, NULL }; } - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* infos = malloc(buffer_size_bytes); + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* infos = static_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(malloc(buffer_size_bytes)); assert(infos != NULL); DWORD old_buffer_size_bytes = buffer_size_bytes; @@ -36,11 +37,11 @@ static LogicalProcessorInfos get_logical_processor_infos() { { printf("Failed to get elements\n"); free(infos); - return (LogicalProcessorInfos) { 0, NULL }; + return LogicalProcessorInfos { 0, NULL }; } assert(buffer_size_bytes == old_buffer_size_bytes); - return (LogicalProcessorInfos) { buffer_size_bytes, infos }; + return LogicalProcessorInfos { buffer_size_bytes, infos }; } #define MIN_LEVEL 1 @@ -52,11 +53,43 @@ typedef struct CacheStatsForLevel size_t total_bytes; } CacheStatsForLevel; +struct RangeIter { + size_t i; + + size_t operator*() const { + return i; + } + + bool operator!=(const RangeIter other) const { + return i != other.i; + } + + RangeIter& operator++() { + ++i; + return *this; + } +}; + +struct Range { + // both inclusive + size_t lo; + size_t hi; + + RangeIter begin() const { return RangeIter{lo}; } + RangeIter end() const { return RangeIter{hi + 1}; } +}; + +struct NumaNodeInfo { + size_t numa_node_number; + size_t cpu_group_number; + std::vector<Range> ranges; +}; + typedef struct CacheStats { - size_t numa_nodes; - size_t n_physical_processors; - size_t n_logical_processors; + std::vector<NumaNodeInfo> numa_nodes; + size_t n_physical_processors = 0; + size_t n_logical_processors = 0; CacheStatsForLevel levels[MAX_LEVEL + 1]; // index with 1, 2, or 3; } CacheStats; @@ -64,14 +97,48 @@ static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* offset(SYSTEM_LOGICAL_PROCESSOR_ return (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) (((char*) ptr) + bytes); } +static bool has_bit(size_t a, size_t bit_index) { + return (a & (static_cast<size_t>(1) << bit_index)) != 0; +} + +static std::vector<Range> ranges_from_mask(const KAFFINITY mask) { + static_assert(sizeof(mask) == sizeof(size_t), "expecting KAFFINITY == size_t"); + assert(mask != 0); + std::vector<Range> ranges;
+ for (size_t i = 0; i < sizeof(mask) * 8; i++) { + if (has_bit(mask, i)) { + if (ranges.empty() || i != ranges.back().hi + 1) { + ranges.push_back(Range{i, i}); + } else { + ranges.back().hi = i; + } + } + } + assert(!ranges.empty()); + return ranges; +} + +static void check_numa_nodes_are_correct(const std::vector<NumaNodeInfo>& numa_nodes) { + for (const NumaNodeInfo& nn : numa_nodes) { + for (const Range& range : nn.ranges) { + for (size_t i : range) { + PROCESSOR_NUMBER pn { static_cast<WORD>(nn.cpu_group_number), static_cast<BYTE>(i), /*reserved*/ 0 }; + USHORT check_numa_node_number; + BOOL success = GetNumaProcessorNodeEx(&pn, &check_numa_node_number); + assert(success); + assert(check_numa_node_number == nn.numa_node_number); + } + } + } +} + static CacheStats getCacheStats() { - CacheStats res; + CacheStats res {}; res.n_physical_processors = 0; res.n_logical_processors = 0; - res.numa_nodes = 0; for (size_t level = 0; level <= MAX_LEVEL; level++) - res.levels[level] = (CacheStatsForLevel) { 0, 0 }; + res.levels[level] = CacheStatsForLevel { 0, 0 }; LogicalProcessorInfos infos = get_logical_processor_infos(); SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info_ptr = infos.first_info; @@ -101,10 +168,12 @@ static CacheStats getCacheStats() } case RelationNumaNode: { - // NUMA_NODE_RELATIONSHIP nn = info.NumaNode; + //printf("Processor %d is on numa node %d\n", info.Processor, info.NumaNode.NodeNumber); + NUMA_NODE_RELATIONSHIP nn = info.NumaNode; // printf("FOUND A NUMA NODE %ld\n", nn.NodeNumber); // DWORD node_number = info.NumaNode.NodeNumber; // This is the only member of that struct. - res.numa_nodes++; + GROUP_AFFINITY gm = nn.GroupMask; + res.numa_nodes.push_back(NumaNodeInfo{nn.NodeNumber, gm.Group, ranges_from_mask(gm.Mask)}); break; } @@ -135,13 +204,24 @@ static CacheStats getCacheStats() assert(info_ptr == end); + check_numa_nodes_are_correct(res.numa_nodes); + free(infos.first_info); // frees the whole buffer return res; } int main(void) { CacheStats stats = getCacheStats(); - printf("numa_nodes: %zd\n", stats.numa_nodes); + printf("numa_nodes:\n"); + for (const NumaNodeInfo& nn : stats.numa_nodes) { + printf(" -\n"); + printf(" numa_node_number: %zd\n", nn.numa_node_number); + printf(" cpu_group_number: %zd\n", nn.cpu_group_number); + printf(" ranges:\n"); + for (const Range& range : nn.ranges) { + printf(" - { lo: %zd, hi: %zd }\n", range.lo, range.hi); + } + } printf("n_physical_processors: %zd\n", stats.n_physical_processors); printf("n_logical_processors: %zd\n", stats.n_logical_processors); printf("caches:\n");
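The `ranges_from_mask` conversion above is the heart of the new per-NUMA-node output: consecutive set bits in a `KAFFINITY` mask collapse into inclusive `[lo, hi]` ranges. A quick cross-check of the same logic as a Python sketch (illustrative only; the real code is the C++ above):

```python
# Model of ranges_from_mask: set bits become inclusive (lo, hi) ranges,
# merging runs of consecutive bits.
def ranges_from_mask(mask: int) -> list:
    ranges = []
    for i in range(64):  # KAFFINITY is 64 bits on x64
        if mask & (1 << i):
            if ranges and i == ranges[-1][1] + 1:
                ranges[-1] = (ranges[-1][0], i)  # extend the current run
            else:
                ranges.append((i, i))  # start a new run
    return ranges

# Bits 0-2, 4-5, and 7 set:
assert ranges_from_mask(0b10110111) == [(0, 2), (4, 5), (7, 7)]
```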
fail("Percent must be > 0 and <= 99\n"); } - *args = (Args) { .desired_mem_usage_fraction = percent / 100, .never_release = never_release }; + *args = (Args) { .desired_mem_usage_fraction = percent / 100, .never_release = never_release, .no_readjust = no_readjust }; return 0; } @@ -322,7 +328,10 @@ int main(const int argc, char** argv) while (TRUE) { Sleep(100); // milliseconds - adjust(&mem); + if (!args.no_readjust) + { + adjust(&mem); + } } return 0; diff --git a/src/benchmarks/gc/src/exec/remote.py b/src/benchmarks/gc/src/exec/remote.py index 1ca8a18926..a5dbf52ee0 100644 --- a/src/benchmarks/gc/src/exec/remote.py +++ b/src/benchmarks/gc/src/exec/remote.py @@ -138,7 +138,7 @@ class RemoteDoArgs(_CommonArgs): def remote_do(argv: Sequence[str]) -> None: # Loading this lazily as it imports this - from ..all_commands import ALL_COMMANDS + from ..all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel # Separated by a `--` local_argv, remote_cmd_and_argv = split_once(argv, lambda a: a == "--") diff --git a/src/benchmarks/gc/src/exec/run_single_test.py b/src/benchmarks/gc/src/exec/run_single_test.py index 24ca36ff5e..f7c9edeab4 100644 --- a/src/benchmarks/gc/src/exec/run_single_test.py +++ b/src/benchmarks/gc/src/exec/run_single_test.py @@ -206,9 +206,17 @@ def run_single_test_temporary(clr: Clr, built: Built, t: SingleTest) -> ProcessI def check_env() -> Mapping[str, str]: e = environ - for k in e.keys(): - if any(k.lower().startswith(start) for start in ("complus", "core_root")): - raise Exception(f"Environment variable '{k}' should not be set") + bad_environment_variables = [ + k + for k in e.keys() + if any(k.lower().startswith(start) for start in ("complus", "core_root")) + ] + if not is_empty(bad_environment_variables): + start = f"Environment variables should not be set: {', '.join(bad_environment_variables)}" + msg = ( + start if os_is_windows() else f'{start}\nTry running: unset "${{!COMPlus@}}" CORE_ROOT' + ) + raise Exception(msg) return e @@ -373,17 +381,23 @@ def _run_single_test_windows_perfview( ensure_empty_dir(out.out_path_base) # Start with the memory load - mem_load_pct = t.config.memory_load_percent + mem_load = t.config.memory_load mem_load_process = None - if mem_load_pct is not None: + if mem_load is not None: print("setting up memory load...") - mem_load_process = Popen( - args=(str(built.win.make_memory_load), "-percent", str(mem_load_pct)), stderr=PIPE + mem_load_args: Sequence[str] = ( + str(built.win.make_memory_load), + "-percent", + str(mem_load.percent), + *optional_to_iter("-noReadjust" if mem_load.no_readjust else None), ) + mem_load_process = Popen(args=mem_load_args, stderr=PIPE) assert mem_load_process.stderr is not None # Wait on it to start up line = decode_stdout(mem_load_process.stderr.readline()) - assert line == "make_memory_load finished starting up" + assert ( + line == "make_memory_load finished starting up" + ), f"Unexpected make_memory_load output {line}" print("done") log_file = out.add_ext("perfview-log.txt") @@ -400,15 +414,17 @@ def _run_single_test_windows_perfview( test_cmd = _get_windows_test_cmd(built, t, ignore_container=False) run_process = exec_start(_get_exec_args(test_cmd.command, t, out), pipe_stdout=True) - run_result = wait_on_process_with_timeout( - run_process, start_time_seconds=start_time_seconds, timeout_seconds=timeout_seconds - ) - - exec_and_expect_output( - ExecArgs(_get_perfview_start_or_stop_cmd(t, log_file, trace_file, is_start=False)), - expected_output="", - err="PerfView stop failed", - ) + try: + 
diff --git a/src/benchmarks/gc/src/exec/remote.py b/src/benchmarks/gc/src/exec/remote.py index 1ca8a18926..a5dbf52ee0 100644 --- a/src/benchmarks/gc/src/exec/remote.py +++ b/src/benchmarks/gc/src/exec/remote.py @@ -138,7 +138,7 @@ class RemoteDoArgs(_CommonArgs): def remote_do(argv: Sequence[str]) -> None: # Loading this lazily as it imports this - from ..all_commands import ALL_COMMANDS + from ..all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel # Separated by a `--` local_argv, remote_cmd_and_argv = split_once(argv, lambda a: a == "--") diff --git a/src/benchmarks/gc/src/exec/run_single_test.py b/src/benchmarks/gc/src/exec/run_single_test.py index 24ca36ff5e..f7c9edeab4 100644 --- a/src/benchmarks/gc/src/exec/run_single_test.py +++ b/src/benchmarks/gc/src/exec/run_single_test.py @@ -206,9 +206,17 @@ def run_single_test_temporary(clr: Clr, built: Built, t: SingleTest) -> ProcessI def check_env() -> Mapping[str, str]: e = environ - for k in e.keys(): - if any(k.lower().startswith(start) for start in ("complus", "core_root")): - raise Exception(f"Environment variable '{k}' should not be set") + bad_environment_variables = [ + k + for k in e.keys() + if any(k.lower().startswith(start) for start in ("complus", "core_root")) + ] + if not is_empty(bad_environment_variables): + start = f"Environment variables should not be set: {', '.join(bad_environment_variables)}" + msg = ( + start if os_is_windows() else f'{start}\nTry running: unset "${{!COMPlus@}}" CORE_ROOT' + ) + raise Exception(msg) return e @@ -373,17 +381,23 @@ def _run_single_test_windows_perfview( ensure_empty_dir(out.out_path_base) # Start with the memory load - mem_load_pct = t.config.memory_load_percent + mem_load = t.config.memory_load mem_load_process = None - if mem_load_pct is not None: + if mem_load is not None: print("setting up memory load...") - mem_load_process = Popen( - args=(str(built.win.make_memory_load), "-percent", str(mem_load_pct)), stderr=PIPE + mem_load_args: Sequence[str] = ( + str(built.win.make_memory_load), + "-percent", + str(mem_load.percent), + *optional_to_iter("-noReadjust" if mem_load.no_readjust else None), ) + mem_load_process = Popen(args=mem_load_args, stderr=PIPE) assert mem_load_process.stderr is not None # Wait on it to start up line = decode_stdout(mem_load_process.stderr.readline()) - assert line == "make_memory_load finished starting up" + assert ( + line == "make_memory_load finished starting up" + ), f"Unexpected make_memory_load output {line}" print("done") log_file = out.add_ext("perfview-log.txt") @@ -400,15 +414,17 @@ def _run_single_test_windows_perfview( test_cmd = _get_windows_test_cmd(built, t, ignore_container=False) run_process = exec_start(_get_exec_args(test_cmd.command, t, out), pipe_stdout=True) - run_result = wait_on_process_with_timeout( - run_process, start_time_seconds=start_time_seconds, timeout_seconds=timeout_seconds - ) - - exec_and_expect_output( - ExecArgs(_get_perfview_start_or_stop_cmd(t, log_file, trace_file, is_start=False)), - expected_output="", - err="PerfView stop failed", - ) + try: + run_result = wait_on_process_with_timeout( + run_process, start_time_seconds=start_time_seconds, timeout_seconds=timeout_seconds + ) + finally: + # Stop PerfView even if the test failed + exec_and_expect_output( + ExecArgs(_get_perfview_start_or_stop_cmd(t, log_file, trace_file, is_start=False)), + expected_output="", + err="PerfView stop failed", + ) if run_result.time_taken is None: kill_test_processes() @@ -505,7 +521,7 @@ def _run_single_test_no_collect( ) -> _PartialTestRunStatus: if t.options.log is not None: raise Exception("TODO") - if t.config.memory_load_percent is not None: + if t.config.memory_load is not None: # The script only works on windows right now raise Exception("TODO") @@ -547,7 +563,7 @@ def _run_single_test_dotnet_trace( raise Exception("TODO") if t.config.affinitize: raise Exception("TODO") - if t.config.memory_load_percent is not None: + if t.config.memory_load is not None: # The script only works on windows right now raise Exception("TODO") @@ -798,9 +814,9 @@ def _rename_gcperfsim_out(out: TestPaths) -> Path: def _get_exec_args(cmd: Sequence[str], t: SingleTest, out: TestPaths) -> ExecArgs: env = combine_mappings( t.default_env, - t.config.env(map_option(t.coreclr, lambda c: c.core_root)), + t.config.with_coreclr(t.coreclr_name).env(map_option(t.coreclr, lambda c: c.core_root)), log_env(t.options.log, out.out_path_base), ) return ExecArgs( @@ -823,7 +839,7 @@ def _run_single_test_linux_perfcollect(t: SingleTest, out: TestPaths) -> TestRun cwd = non_null(t.coreclr).corerun.parent # TODO: handle self-contained executables env = combine_mappings( - t.config.env(map_option(t.coreclr, lambda c: c.core_root)), + t.config.with_coreclr(t.coreclr_name).env(map_option(t.coreclr, lambda c: c.core_root)), {"COMPlus_PerfMapEnabled": "1", "COMPlus_EnableEventLog": "1"}, ) cmd: Sequence[str] = _benchmark_command(t)
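Both runners now route the environment through `Config.with_coreclr` before computing it, so per-coreclr options (the `coreclr_specific` mapping in a benchfile config) land on the matching build. A rough model of the intended merge, using plain dicts rather than the real `Config`/`ConfigOptions` types (illustrative only, not the actual implementation):

```python
# Per-coreclr options overlay the shared ones; everything else is inherited.
def with_coreclr(shared: dict, coreclr_specific: dict, coreclr_name: str) -> dict:
    return {**shared, **coreclr_specific.get(coreclr_name, {})}

merged = with_coreclr(
    {"complus_gcserver": True, "complus_gcheapcount": 6},
    {"clr_b": {"complus_gcheapcount": 8}},
    "clr_b",
)
assert merged == {"complus_gcserver": True, "complus_gcheapcount": 8}
```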
diff --git a/src/benchmarks/gc/src/exec/run_tests.py b/src/benchmarks/gc/src/exec/run_tests.py index 0e46ecdf97..83fee18b88 100644 --- a/src/benchmarks/gc/src/exec/run_tests.py +++ b/src/benchmarks/gc/src/exec/run_tests.py @@ -248,7 +248,7 @@ def how_to_run_test(args: HowToRunTestArgs) -> None: benchmark = benchmark_and_name.benchmark env = combine_mappings( - config.env(map_option(coreclr_paths, lambda c: c.core_root)), + config.with_coreclr(coreclr.name).env(map_option(coreclr_paths, lambda c: c.core_root)), log_env(bench.options.log, Path.cwd() / "log"), ) container = config.container diff --git a/src/benchmarks/gc/src/requirements.txt b/src/benchmarks/gc/src/requirements.txt index ec38f1d78b..e85d93c35e 100644 --- a/src/benchmarks/gc/src/requirements.txt +++ b/src/benchmarks/gc/src/requirements.txt @@ -2,20 +2,20 @@ black==19.3b0 flask==1.1.1 gitignore_parser==0.0.5 jupyter==1.0.0 -jupyterlab==1.1.4 +jupyterlab==1.2.0 matplotlib==3.1.1 -mypy==0.730 -numpy==1.17.2 -overrides==2.0 +mypy==0.740 +numpy==1.17.3 +overrides==2.4 packaging==19.2 psutil==5.6.3 pur==5.2.2 -pylint==2.4.2 +pylint==2.4.3 pythonnet==2.4.0 pywin32==225; sys_platform == "win32" result==0.4.0 ruamel.yaml==0.16.5 termcolor==1.1.0 vulture==1.1 -xlsxwriter==1.2.1 +xlsxwriter==1.2.2 yattag==1.12.2 diff --git a/src/benchmarks/gc/src/suite.py b/src/benchmarks/gc/src/suite.py index ac5cc2ea03..6a89859e74 100644 --- a/src/benchmarks/gc/src/suite.py +++ b/src/benchmarks/gc/src/suite.py @@ -24,6 +24,7 @@ parse_bench_file, GCPerfSimArgs, MAX_ITERATIONS_FOR_ANALYZE_DOC, + MemoryLoadOptions, TestConfigContainer, Config, Vary, @@ -158,7 +159,7 @@ class SuiteRunCommandArgs: def suite_run_command(args: SuiteRunCommandArgs) -> None: - from .all_commands import ALL_COMMANDS + from .all_commands import ALL_COMMANDS # pylint:disable=import-outside-toplevel suite = load_yaml(SuiteFile, args.suite_path) commands = suite.command_groups[args.command_name] @@ -275,8 +276,8 @@ def _create_scenario_high_memory_load( ) # TODO: Don't specify a percent, specify an amount remaining in GB configs: Mapping[str, Config] = { - "80pct": Config(memory_load_percent=80), - "90pct": Config(memory_load_percent=90), + "80pct": Config(memory_load=MemoryLoadOptions(percent=80)), + "90pct": Config(memory_load=MemoryLoadOptions(percent=90)), } benchmarks: Mapping[str, Benchmark] = { "a": Benchmark(arguments=GCPerfSimArgs(tc=proc_count, tagb=40, tlgb=5, sohsi=30, sohpi=50))