Binary file added WasmCoremark/coremark-minimal.wasm
compare.py: 71 changes (58 additions, 13 deletions)
@@ -7,21 +7,54 @@
import math
from scipy import stats

METRIC_PERF_CORRELATION = {
"time": "inverse",
"reported_score": "direct",
}

def confidence_interval(data, confidence=0.95):
n = len(data)
mean = statistics.mean(data)
stderr = statistics.stdev(data) / math.sqrt(n) if n > 1 else 0
interval = stderr * stats.t.ppf((1 + confidence) / 2., n-1) if n > 1 else 0
return mean - interval, mean + interval

def calculate_totals(data):
def calculate_totals(data, test_metrics):
totals = {}
time_totals = {}
for suite, tests in data.items():
suite_total = sum(test["mean"] for test in tests.values())
suite_total = sum(tests[test][test_metrics[suite][test]]["mean"] for test in tests.keys())
suite_total_for_time = sum(tests[test]["time"]["mean"] for test in tests.keys())
totals[suite] = suite_total
all_suites_total = sum(totals.values())
time_totals[suite] = suite_total_for_time
all_suites_total = sum(time_totals.values())
return totals, all_suites_total

def infer_test_metrics(old_data, new_data):
metrics = {}
all_suites = set(old_data.keys()).union(set(new_data.keys()))
for suite in all_suites:
all_tests = set(old_data.get(suite, {}).keys()).union(set(new_data.get(suite, {}).keys()))
suite_metrics = {}
for test in all_tests:
all_test_metrics = set(old_data.get(suite, {}).get(test, {}).keys()).union(set(new_data.get(suite, {}).get(test, {}).keys()))
chosen_metric = "time"
if len(all_test_metrics) == 2:
chosen_metric = next(x for x in all_test_metrics if x != "time")
suite_metrics[test] = chosen_metric
metrics[suite] = suite_metrics

# Migrate old data to new format
for data in (old_data, new_data):
first = data[next(iter(data))]
if "time" not in first[next(iter(first))]:
for suite in data:
for test in data[suite]:
results = data[suite][test]
data[suite][test] = { "time": results }

return old_data, new_data, metrics

def main():
parser = argparse.ArgumentParser(description="Compare JavaScript benchmark results.")
parser.add_argument("-o", "--old", required=True, help="Old JSON results file.")
@@ -33,8 +66,10 @@ def main():
with open(args.new, "r") as f:
new_data = json.load(f)

old_totals, old_all_suites_total = calculate_totals(old_data)
new_totals, new_all_suites_total = calculate_totals(new_data)
old_data, new_data, test_metrics = infer_test_metrics(old_data, new_data)

old_totals, old_all_suites_total = calculate_totals(old_data, test_metrics)
new_totals, new_all_suites_total = calculate_totals(new_data, test_metrics)

table_data = []
for suite in old_data.keys():
@@ -43,20 +78,30 @@
for test in old_data[suite].keys():
if test not in new_data[suite]:
continue
old_mean = old_data[suite][test]["mean"]
new_mean = new_data[suite][test]["mean"]
old_min = min(old_data[suite][test]["runs"])
new_min = min(new_data[suite][test]["runs"])
old_max = max(old_data[suite][test]["runs"])
new_max = max(new_data[suite][test]["runs"])
speedup = old_mean / new_mean
test_metric = test_metrics[suite][test]
old_mean = old_data[suite][test][test_metric]["mean"]
new_mean = new_data[suite][test][test_metric]["mean"]
old_min = min(old_data[suite][test][test_metric]["runs"])
new_min = min(new_data[suite][test][test_metric]["runs"])
old_max = max(old_data[suite][test][test_metric]["runs"])
new_max = max(new_data[suite][test][test_metric]["runs"])

speedup = new_mean / old_mean
if METRIC_PERF_CORRELATION[test_metric] == "inverse":
speedup = 1. / speedup

formatted_speedup = f"{speedup:.3f}"
table_data.append([suite, test, formatted_speedup, f"{old_mean:.3f} ± {old_min:.3f} … {old_max:.3f}", f"{new_mean:.3f} ± {new_min:.3f} … {new_max:.3f}"])

# Add total times comparison to the table data
for suite in old_totals.keys():
if suite in new_totals:
speedup = old_totals[suite] / new_totals[suite]
speedup = new_totals[suite] / old_totals[suite]
suite_metrics = set(test_metrics[suite].values())
assert len(suite_metrics) == 1, f"Multiple metrics for suite {suite}: {suite_metrics}"
if METRIC_PERF_CORRELATION[suite_metrics.pop()] == "inverse":
speedup = 1. / speedup

table_data.append([suite, "Total", f"{speedup:.3f}", f"{old_totals[suite]:.3f}", f"{new_totals[suite]:.3f}"])

# Compare all suites total
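A note on the results-file schema this diff changes (a sketch; the numbers below are invented for illustration, not taken from a real run): run.py used to write one flat stats block per test, implicitly meaning wall-clock time, and now writes one block per metric. infer_test_metrics() wraps old-style files as {"time": block} so old and new results stay comparable, and it prefers "reported_score" over "time" whenever a test exposes both. Speedups are computed as new/old and inverted for time-like metrics, so a value above 1.0 always means the new run is better.

```python
# Illustrative only: field values are made up, not from a real benchmark run.

# Old layout: one flat stats block per test (implicitly wall-clock time).
old_style = {
    "SunSpider": {
        "3d-cube.js": {
            "mean": 0.112, "stdev": 0.004, "min": 0.108, "max": 0.121,
            "runs": [0.108, 0.110, 0.121],
        },
    },
}

# New layout: stats blocks keyed by metric. Time-only suites carry just
# "time"; WasmCoremark also carries "reported_score" parsed from stdout.
new_style = {
    "WasmCoremark": {
        "coremark-minimal.wasm": {
            "time": {
                "mean": 12.31, "stdev": 0.18, "min": 12.10, "max": 12.55,
                "runs": [12.10, 12.28, 12.55],
            },
            "reported_score": {
                "mean": 812.4, "stdev": 4.1, "min": 808.0, "max": 817.2,
                "runs": [808.0, 812.1, 817.2],
            },
        },
    },
}

# compare.py reports "reported_score" (higher is better) for CoreMark and
# "time" (lower is better) elsewhere; METRIC_PERF_CORRELATION flips the
# new/old ratio for "time" so speedup > 1.0 always means an improvement.
```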
run.py: 75 changes (52 additions, 23 deletions)
@@ -1,37 +1,57 @@
#!/usr/bin/env python3

import argparse
import enum
import json
import os
import subprocess
import re
import shlex
import statistics
import subprocess
import sys
from tabulate import tabulate

def run_benchmark(executable, executable_arguments, suite, test_file, iterations, index, total, suppress_output=False):
times = []
FLOAT_RE = re.compile(r"([0-9]*\.[0-9]+|[0-9]+)")

class ScoreMetric(enum.Enum):
time = "time"
output = "reported_score"

def run_benchmark(executable, executable_arguments, suite, test_file, score_metric, iterations, index, total, suppress_output=False):
unit = "s" if score_metric == ScoreMetric.time else ""
measures = { k:[] for k in ScoreMetric }

for i in range(iterations):
if not suppress_output:
print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(times):.3f}s)" if times else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(measures[score_metric]):.3f}{unit})" if measures[score_metric] else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
sys.stdout.flush()

result = subprocess.run([f"time -p {executable} {' '.join(executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL, text=True, executable="/bin/bash")
result = subprocess.run([f"time -p {shlex.quote(executable)} {' '.join(shlex.quote(arg) for arg in executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL if score_metric == ScoreMetric.time else subprocess.PIPE, text=True, executable="/bin/bash")
result.check_returncode()

time_output = result.stderr.split("\n")
real_time_line = [line for line in time_output if "real" in line][0]
time_taken = float(real_time_line.split(" ")[-1])
times.append(time_taken)

mean = statistics.mean(times)
stdev = statistics.stdev(times) if len(times) > 1 else 0
min_time = min(times)
max_time = max(times)
measures[ScoreMetric.time].append(time_taken)

if score_metric == ScoreMetric.output:
output = result.stdout.split("\n")
value = None
for line in output:
if match := FLOAT_RE.search(line):
value = float(match[1])
assert value is not None, "Expected a float in the benchmark output"
measures[ScoreMetric.output].append(value)

means = { key:statistics.mean(values) if len(values) > 0 else None for key, values in measures.items() }
stdevs = { key:statistics.stdev(values) if len(values) > 1 else 0 for key, values in measures.items() }
min_values = { key:min(values) if len(values) > 0 else None for key, values in measures.items() }
max_values = { key:max(values) if len(values) > 0 else None for key, values in measures.items() }
if not suppress_output:
print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {mean:.3f}s ± {stdev:.3f}s, Range: {min_time:.3f}s … {max_time:.3f}s\033[K")
print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {means[score_metric]:.3f}{unit} ± {stdevs[score_metric]:.3f}{unit}, Range: {min_values[score_metric]:.3f}{unit} … {max_values[score_metric]:.3f}{unit}\033[K")
sys.stdout.flush()

return mean, stdev, min_time, max_time, times
return means, stdevs, min_values, max_values, measures

def main():
parser = argparse.ArgumentParser(description="Run JavaScript benchmarks.")
@@ -44,7 +64,7 @@ def main():
args = parser.parse_args()

if args.suites == "all":
suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench"]
suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench", "WasmCoremark"]
else:
suites = args.suites.split(",")

@@ -54,7 +74,7 @@
for test_file in sorted(os.listdir("SunSpider")):
if not test_file.endswith(".js"):
continue
run_benchmark(args.executable, [], "SunSpider", test_file, 1, 0, 0, suppress_output=True)
run_benchmark(args.executable, [], "SunSpider", ScoreMetric.time, test_file, 1, 0, 0, suppress_output=True)

results = {}
table_data = []
@@ -64,32 +84,41 @@
for suite in suites:
results[suite] = {}
is_wasm_bench = suite == "WasmMicroBench"
is_wasm_coremark = suite == "WasmCoremark"

executable = ""
executable_arguments = []
score_metric = ScoreMetric.time
if (is_wasm_bench):
executable = args.wasm_executable
executable_arguments = ["-e", "run_microbench"]
elif is_wasm_coremark:
executable = args.wasm_executable
executable_arguments = ["-e", "run", "--export-js", "env.clock_ms:i64=BigInt(+new Date)"]
score_metric = ScoreMetric.output
else:
executable = args.executable

for test_file in sorted(os.listdir(suite)):
if (is_wasm_bench):
if (is_wasm_bench or is_wasm_coremark):
if not test_file.endswith(".wasm"):
continue
else:
if not test_file.endswith(".js"):
continue

mean, stdev, min_time, max_time, runs = run_benchmark(executable, executable_arguments, suite, test_file, args.iterations, current_test, total_tests)
stats = run_benchmark(executable, executable_arguments, suite, test_file, score_metric, args.iterations, current_test, total_tests)
results[suite][test_file] = {
"mean": mean,
"stdev": stdev,
"min": min_time,
"max": max_time,
"runs": runs
key.value: {
"mean": mean,
"stdev": stdev,
"min": min_val,
"max": max_val,
"runs": runs,
} for key, (mean, stdev, min_val, max_val, runs) in zip(stats[0].keys(), zip(*(x.values() for x in stats))) if runs
}
table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_time:.3f} … {max_time:.3f}"])
mean, stdev, min_val, max_val, _ = (stat[score_metric] for stat in stats)
table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_val:.3f} … {max_val:.3f}"])
current_test += 1

print(tabulate(table_data, headers=["Suite", "Test", "Mean ± σ", "Range (min … max)"]))
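One readability note on the comprehension that now builds results[suite][test_file]: it zips the five per-metric dicts returned by run_benchmark() back together and drops metrics that never collected a run. A more verbose but behaviorally equivalent sketch (assuming the dicts keep the ScoreMetric insertion order they were built with, which they do in CPython):

```python
# Equivalent expansion of the zip-based comprehension above (sketch only).
means, stdevs, min_values, max_values, measures = stats  # as returned by run_benchmark()

entry = {}
for key in ScoreMetric:              # same order as stats[0].keys()
    runs = measures[key]
    if not runs:                     # skip metrics with no samples,
        continue                     # e.g. reported_score for time-only suites
    entry[key.value] = {             # "time" or "reported_score"
        "mean": means[key],
        "stdev": stdevs[key],
        "min": min_values[key],
        "max": max_values[key],
        "runs": runs,
    }
results[suite][test_file] = entry
```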