diff --git a/WasmCoremark/coremark-minimal.wasm b/WasmCoremark/coremark-minimal.wasm
new file mode 100644
index 0000000..c5d6b87
Binary files /dev/null and b/WasmCoremark/coremark-minimal.wasm differ
diff --git a/compare.py b/compare.py
index e7b0f2b..c0ef3ea 100755
--- a/compare.py
+++ b/compare.py
@@ -7,6 +7,11 @@ import math
 
 from scipy import stats
 
+METRIC_PERF_CORRELATION = {
+    "time": "inverse",
+    "reported_score": "direct",
+}
+
 def confidence_interval(data, confidence=0.95):
     n = len(data)
     mean = statistics.mean(data)
@@ -14,14 +19,42 @@
     interval = stderr * stats.t.ppf((1 + confidence) / 2., n-1) if n > 1 else 0
     return mean - interval, mean + interval
 
-def calculate_totals(data):
+def calculate_totals(data, test_metrics):
     totals = {}
+    time_totals = {}
     for suite, tests in data.items():
-        suite_total = sum(test["mean"] for test in tests.values())
+        suite_total = sum(tests[test][test_metrics[suite][test]]["mean"] for test in tests.keys())
+        suite_total_for_time = sum(tests[test]["time"]["mean"] for test in tests.keys())
         totals[suite] = suite_total
-    all_suites_total = sum(totals.values())
+        time_totals[suite] = suite_total_for_time
+    all_suites_total = sum(time_totals.values())
     return totals, all_suites_total
 
+def infer_test_metrics(old_data, new_data):
+    metrics = {}
+    all_suites = set(old_data.keys()).union(set(new_data.keys()))
+    for suite in all_suites:
+        all_tests = set(old_data.get(suite, {}).keys()).union(set(new_data.get(suite, {}).keys()))
+        suite_metrics = {}
+        for test in all_tests:
+            all_test_metrics = set(old_data.get(suite, {}).get(test, {}).keys()).union(set(new_data.get(suite, {}).get(test, {}).keys()))
+            chosen_metric = "time"
+            if len(all_test_metrics) == 2:
+                chosen_metric = next(x for x in all_test_metrics if x != "time")
+            suite_metrics[test] = chosen_metric
+        metrics[suite] = suite_metrics
+
+    # Migrate old data to new format
+    for data in (old_data, new_data):
+        first = data[next(iter(data))]
+        if "time" not in first[next(iter(first))]:
+            for suite in data:
+                for test in data[suite]:
+                    results = data[suite][test]
+                    data[suite][test] = { "time": results }
+
+    return old_data, new_data, metrics
+
 def main():
     parser = argparse.ArgumentParser(description="Compare JavaScript benchmark results.")
     parser.add_argument("-o", "--old", required=True, help="Old JSON results file.")
@@ -33,8 +66,10 @@ def main():
     with open(args.new, "r") as f:
         new_data = json.load(f)
 
-    old_totals, old_all_suites_total = calculate_totals(old_data)
-    new_totals, new_all_suites_total = calculate_totals(new_data)
+    old_data, new_data, test_metrics = infer_test_metrics(old_data, new_data)
+
+    old_totals, old_all_suites_total = calculate_totals(old_data, test_metrics)
+    new_totals, new_all_suites_total = calculate_totals(new_data, test_metrics)
 
     table_data = []
     for suite in old_data.keys():
@@ -43,20 +78,30 @@
         for test in old_data[suite].keys():
             if test not in new_data[suite]:
                 continue
-            old_mean = old_data[suite][test]["mean"]
-            new_mean = new_data[suite][test]["mean"]
-            old_min = min(old_data[suite][test]["runs"])
-            new_min = min(new_data[suite][test]["runs"])
-            old_max = max(old_data[suite][test]["runs"])
-            new_max = max(new_data[suite][test]["runs"])
-            speedup = old_mean / new_mean
+            test_metric = test_metrics[suite][test]
+            old_mean = old_data[suite][test][test_metric]["mean"]
+            new_mean = new_data[suite][test][test_metric]["mean"]
+            old_min = min(old_data[suite][test][test_metric]["runs"])
+            new_min = min(new_data[suite][test][test_metric]["runs"])
+            old_max = max(old_data[suite][test][test_metric]["runs"])
+            new_max = max(new_data[suite][test][test_metric]["runs"])
+
+            speedup = new_mean / old_mean
+            if METRIC_PERF_CORRELATION[test_metric] == "inverse":
+                speedup = 1. / speedup
+
             formatted_speedup = f"{speedup:.3f}"
             table_data.append([suite, test, formatted_speedup, f"{old_mean:.3f} ± {old_min:.3f} … {old_max:.3f}", f"{new_mean:.3f} ± {new_min:.3f} … {new_max:.3f}"])
 
     # Add total times comparison to the table data
     for suite in old_totals.keys():
         if suite in new_totals:
-            speedup = old_totals[suite] / new_totals[suite]
+            speedup = new_totals[suite] / old_totals[suite]
+            suite_metrics = set(test_metrics[suite].values())
+            assert len(suite_metrics) == 1, f"Multiple metrics for suite {suite}: {suite_metrics}"
+            if METRIC_PERF_CORRELATION[suite_metrics.pop()] == "inverse":
+                speedup = 1. / speedup
+
             table_data.append([suite, "Total", f"{speedup:.3f}", f"{old_totals[suite]:.3f}", f"{new_totals[suite]:.3f}"])
 
     # Compare all suites total
diff --git a/run.py b/run.py
index 960780c..13ecc81 100755
--- a/run.py
+++ b/run.py
@@ -1,37 +1,57 @@
 #!/usr/bin/env python3
 
 import argparse
+import enum
 import json
 import os
-import subprocess
+import re
+import shlex
 import statistics
+import subprocess
 import sys
 from tabulate import tabulate
 
 
-def run_benchmark(executable, executable_arguments, suite, test_file, iterations, index, total, suppress_output=False):
-    times = []
+FLOAT_RE = re.compile(r"([0-9]*\.[0-9]+|[0-9]+)")
+
+class ScoreMetric(enum.Enum):
+    time = "time"
+    output = "reported_score"
+
+def run_benchmark(executable, executable_arguments, suite, test_file, score_metric, iterations, index, total, suppress_output=False):
+    unit = "s" if score_metric == ScoreMetric.time else ""
+    measures = { k:[] for k in ScoreMetric }
+
     for i in range(iterations):
         if not suppress_output:
-            print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(times):.3f}s)" if times else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
+            print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(measures[score_metric]):.3f}{unit})" if measures[score_metric] else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
             sys.stdout.flush()
-        result = subprocess.run([f"time -p {executable} {' '.join(executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL, text=True, executable="/bin/bash")
+        result = subprocess.run([f"time -p {shlex.quote(executable)} {' '.join(shlex.quote(arg) for arg in executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL if score_metric == ScoreMetric.time else subprocess.PIPE, text=True, executable="/bin/bash")
         result.check_returncode()
         time_output = result.stderr.split("\n")
         real_time_line = [line for line in time_output if "real" in line][0]
         time_taken = float(real_time_line.split(" ")[-1])
-        times.append(time_taken)
-
-    mean = statistics.mean(times)
-    stdev = statistics.stdev(times) if len(times) > 1 else 0
-    min_time = min(times)
-    max_time = max(times)
+        measures[ScoreMetric.time].append(time_taken)
+
+        if score_metric == ScoreMetric.output:
+            output = result.stdout.split("\n")
+            value = None
+            for line in output:
+                if match := FLOAT_RE.search(line):
+                    value = float(match[1])
+            assert value is not None, "Expected a float in the benchmark output"
+            measures[ScoreMetric.output].append(value)
+
+    means = { key:statistics.mean(values) if len(values) > 0 else None for key, values in measures.items() }
+    stdevs = { key:statistics.stdev(values) if len(values) > 1 else 0 for key, values in measures.items() }
+    min_values = { key:min(values) if len(values) > 0 else None for key, values in measures.items() }
+    max_values = { key:max(values) if len(values) > 0 else None for key, values in measures.items() }
 
     if not suppress_output:
-        print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {mean:.3f}s ± {stdev:.3f}s, Range: {min_time:.3f}s … {max_time:.3f}s\033[K")
+        print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {means[score_metric]:.3f}{unit} ± {stdevs[score_metric]:.3f}{unit}, Range: {min_values[score_metric]:.3f}{unit} … {max_values[score_metric]:.3f}{unit}\033[K")
         sys.stdout.flush()
 
-    return mean, stdev, min_time, max_time, times
+    return means, stdevs, min_values, max_values, measures
 
 def main():
     parser = argparse.ArgumentParser(description="Run JavaScript benchmarks.")
@@ -44,7 +64,7 @@ def main():
     args = parser.parse_args()
 
     if args.suites == "all":
-        suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench"]
+        suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench", "WasmCoremark"]
    else:
         suites = args.suites.split(",")
 
@@ -54,7 +74,7 @@ def main():
         for test_file in sorted(os.listdir("SunSpider")):
             if not test_file.endswith(".js"):
                 continue
-            run_benchmark(args.executable, [], "SunSpider", test_file, 1, 0, 0, suppress_output=True)
+            run_benchmark(args.executable, [], "SunSpider", test_file, ScoreMetric.time, 1, 0, 0, suppress_output=True)
 
     results = {}
     table_data = []
@@ -64,32 +84,41 @@ def main():
     for suite in suites:
         results[suite] = {}
         is_wasm_bench = suite == "WasmMicroBench"
+        is_wasm_coremark = suite == "WasmCoremark"
         executable = ""
         executable_arguments = []
+        score_metric = ScoreMetric.time
         if (is_wasm_bench):
             executable = args.wasm_executable
             executable_arguments = ["-e", "run_microbench"]
+        elif is_wasm_coremark:
+            executable = args.wasm_executable
+            executable_arguments = ["-e", "run", "--export-js", "env.clock_ms:i64=BigInt(+new Date)"]
+            score_metric = ScoreMetric.output
         else:
             executable = args.executable
 
         for test_file in sorted(os.listdir(suite)):
-            if (is_wasm_bench):
+            if (is_wasm_bench or is_wasm_coremark):
                 if not test_file.endswith(".wasm"):
                     continue
             else:
                 if not test_file.endswith(".js"):
                     continue
-            mean, stdev, min_time, max_time, runs = run_benchmark(executable, executable_arguments, suite, test_file, args.iterations, current_test, total_tests)
+            stats = run_benchmark(executable, executable_arguments, suite, test_file, score_metric, args.iterations, current_test, total_tests)
             results[suite][test_file] = {
-                "mean": mean,
-                "stdev": stdev,
-                "min": min_time,
-                "max": max_time,
-                "runs": runs
+                key.value: {
+                    "mean": mean,
+                    "stdev": stdev,
+                    "min": min_val,
+                    "max": max_val,
+                    "runs": runs,
+                } for key, (mean, stdev, min_val, max_val, runs) in zip(stats[0].keys(), zip(*(x.values() for x in stats))) if runs
             }
-            table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_time:.3f} … {max_time:.3f}"])
+            mean, stdev, min_val, max_val, _ = (stat[score_metric] for stat in stats)
+            table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_val:.3f} … {max_val:.3f}"])
             current_test += 1
 
 
     print(tabulate(table_data, headers=["Suite", "Test", "Mean ± σ", "Range (min … max)"]))
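
Below is a small illustrative sketch, not part of the patch, of how the two halves fit together: run.py now nests each test's statistics under a metric key, and compare.py's METRIC_PERF_CORRELATION decides which direction of change counts as an improvement. The speedup() helper here is hypothetical and only mirrors the logic in compare.py; the JSON shape follows the dict built in run.py.

    # Per-test results emitted by run.py after this change (the old flat
    # "mean"/"stdev"/"min"/"max"/"runs" layout is migrated on the fly by
    # infer_test_metrics() in compare.py):
    #
    #   {"WasmCoremark": {"coremark-minimal.wasm": {
    #       "time":           {"mean": ..., "stdev": ..., "min": ..., "max": ..., "runs": [...]},
    #       "reported_score": {"mean": ..., "stdev": ..., "min": ..., "max": ..., "runs": [...]}}}}

    METRIC_PERF_CORRELATION = {"time": "inverse", "reported_score": "direct"}

    def speedup(old_mean, new_mean, metric):
        # "direct" metrics (e.g. a CoreMark score): higher is better, so new/old.
        # "inverse" metrics (wall-clock time): lower is better, so old/new.
        ratio = new_mean / old_mean
        return 1.0 / ratio if METRIC_PERF_CORRELATION[metric] == "inverse" else ratio

    assert speedup(2.0, 1.0, "time") == 2.0                   # finished in half the time
    assert speedup(1000.0, 2000.0, "reported_score") == 2.0   # doubled the score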