nixpkgs/ci/eval/compare/cmp-stats.py
Philip Taron 3d0e2841b8
ci.eval.compare: keep warnings as warnings rather than raising them as errors
This reverts commit f8210561f3 (ci.eval.compare: turn warnings into errors, 2025-09-16).

It turns out that there are expected math warnings (for example, division by zero when computing percent change against a zero baseline), and we don't want to block CI on them.
2025-09-18 08:37:36 -07:00


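"""
Compare Nix evaluation statistics (`NIX_SHOW_STATS` output) between a
baseline run and a comparison run, and print a Markdown report of
unchanged and updated metrics.
"""
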
import argparse
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Final

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel
from tabulate import tabulate

def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens metrics from JSON data.

    This is needed because the JSON data can be nested.
    For example, a JSON data entry might look like this:

        "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

        "gc.cycles": 13
        "gc.heapSize": 5404549120
        ...

    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.

    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue
        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )
    return flat_metrics
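
# A minimal illustration of `flatten_data` (hypothetical stats excerpt,
# shaped like NIX_SHOW_STATS output):
#
#     flatten_data({"cpuTime": 1.5, "nrThunks": 7, "gc": {"cycles": 13}})
#     # -> {"nrThunks": 7, "gc.cycles": 13}
#
# `cpuTime` is skipped because Nix reports the same number as `time.cpu`.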

def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.

    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment
    variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()
            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)
                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))
    return metrics
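
# Illustrative layout for the directory case (system and file names are
# hypothetical; any `lib.systems.doubles.*` system works):
#
#     stats/
#         x86_64-linux/
#             chunk-0.json
#             chunk-1.json
#         aarch64-linux/
#             chunk-0.json
#
# This yields metric keys such as "x86_64-linux/chunk-0.json".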

def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
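
# For example, metric_table_name("gc.cycles", explain=True) returns
# "gc.cycles[^gc.cycles]", which pairs with the matching entry in
# METRIC_EXPLANATION_FOOTNOTE below.
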
METRIC_EXPLANATION_FOOTNOTE: Final[str] = """
[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
[^time.gc]: Number of seconds of CPU time accounted by the Boehm garbage collector to performing GC.
[^time.gcFraction]: What fraction of the total CPU time is accounted towards performing GC.
[^gc.cycles]: Number of times garbage collection has been performed.
[^gc.heapSize]: Size in bytes of the garbage collector heap.
[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.
[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` represents a variable scope created during evaluation, for instance by `let` expressions and function calls.
[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
[^envs.number]: The count of all `Env` objects allocated.
[^nrAvoided]: The number of thunks avoided being created.
[^nrExprs]: The number of expression objects ever created.
[^nrFunctionCalls]: The number of function calls ever made.
[^nrLookups]: The number of lookups into an attrset ever made.
[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
[^nrOpUpdates]: The number of attrsets merge operations (`//`) performed.
[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.
[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
[^sets.number]: The number of attrsets ever made.
[^symbols.number]: The number of symbols ever added to the symbol table.
[^values.number]: The number of values ever made.
[^envs.elements]: The number of values contained within an `Env` object.
[^list.concats]: The number of list concatenation operations (`++`) performed.
[^list.elements]: The number of values contained within a list.
[^sets.elements]: The number of values contained within an attrset.
[^sizes.Attr]: Size in bytes of the `Attr` type.
[^sizes.Bindings]: Size in bytes of the `Bindings` type.
[^sizes.Env]: Size in bytes of the `Env` type.
[^sizes.Value]: Size in bytes of the `Value` type.
"""

@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        # Header (get column names and format them)
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Check for no change and NaN in p_value/t_stat
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""
        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)
        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)
        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE
        return result
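
# Sketch of the rendered report (the numbers are made up; the column
# headers come from the dataclass field names below):
#
#     ## Unchanged values
#
#     | metric     |   value |
#     |------------|---------|
#     | sizes.Attr | 16.0000 |
#
#     ## Updated values
#
#     | metric | mean_before | mean_after | mean_diff | mean_pct_change | p_value | t_stat |
#     ...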

@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float

def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
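
# The key groups related metrics in the output, e.g. (hypothetical input):
#
#     sorted(["nrThunks", "sizes.Attr", "values.bytes", "gc.cycles", "time.cpu"],
#            key=metric_sort_key)
#     # -> ["time.cpu", "gc.cycles", "values.bytes", "nrThunks", "sizes.Attr"]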

def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted(
        {
            metric_key
            for file_metrics in before_metrics.values()
            for metric_key in file_metrics.keys()
        },
        key=metric_sort_key,
    )
    updated = []
    equivalent = []
    for key in all_keys:
        before_vals = []
        after_vals = []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])
        if len(before_vals) == 0:
            continue
        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)
        diff = after_arr - before_arr
        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            # May emit a RuntimeWarning when a baseline value is 0; such
            # warnings are expected and deliberately not treated as errors.
            pct_change = 100 * diff / before_arr
            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )
            # If there are enough values to perform a t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )
            updated.append(result)
    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
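
# Sketch of the pairing logic with hypothetical inputs:
#
#     before = {"a.json": {"nrThunks": 10.0}, "b.json": {"nrThunks": 12.0}}
#     after  = {"a.json": {"nrThunks": 11.0}, "b.json": {"nrThunks": 12.0}}
#
# "nrThunks" differs between the runs, so it lands in `updated`, and with
# two paired samples a t-test is performed; a metric that is identical in
# every file would land in `equivalent` instead.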

def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )
    options = parser.parse_args()

    before_stats = Path(options.before)
    after_stats = Path(options.after)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)

    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)


if __name__ == "__main__":
    main()
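
# Typical invocation (paths are illustrative):
#
#     python cmp-stats.py --explain before-stats/ after-stats/ > comparison.md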