Binary file added WasmCoremark/coremark-minimal.wasm
compare.py: 71 changes (58 additions, 13 deletions)
@@ -7,21 +7,54 @@
import math
from scipy import stats

METRIC_PERF_CORRELATION = {
"time": "inverse",
"reported_score": "direct",
}

def confidence_interval(data, confidence=0.95):
n = len(data)
mean = statistics.mean(data)
stderr = statistics.stdev(data) / math.sqrt(n) if n > 1 else 0
interval = stderr * stats.t.ppf((1 + confidence) / 2., n-1) if n > 1 else 0
return mean - interval, mean + interval

def calculate_totals(data):
def calculate_totals(data, test_metrics):
totals = {}
time_totals = {}
for suite, tests in data.items():
suite_total = sum(test["mean"] for test in tests.values())
suite_total = sum(tests[test][test_metrics[suite][test]]["mean"] for test in tests.keys())
suite_total_for_time = sum(tests[test]["time"]["mean"] for test in tests.keys())
totals[suite] = suite_total
all_suites_total = sum(totals.values())
time_totals[suite] = suite_total_for_time
all_suites_total = sum(time_totals.values())
return totals, all_suites_total

def infer_test_metrics(old_data, new_data):
metrics = {}
all_suites = set(old_data.keys()).union(set(new_data.keys()))
for suite in all_suites:
all_tests = set(old_data.get(suite, {}).keys()).union(set(new_data.get(suite, {}).keys()))
suite_metrics = {}
for test in all_tests:
all_test_metrics = set(old_data.get(suite, {}).get(test, {}).keys()).union(set(new_data.get(suite, {}).get(test, {}).keys()))
chosen_metric = "time"
if len(all_test_metrics) == 2:
chosen_metric = next(x for x in all_test_metrics if x != "time")
suite_metrics[test] = chosen_metric
metrics[suite] = suite_metrics

# Migrate old data to new format
for data in (old_data, new_data):
first = data[next(iter(data))]
if "time" not in first[next(iter(first))]:
for suite in data:
for test in data[suite]:
results = data[suite][test]
data[suite][test] = { "time": results }

return old_data, new_data, metrics

def main():
parser = argparse.ArgumentParser(description="Compare JavaScript benchmark results.")
parser.add_argument("-o", "--old", required=True, help="Old JSON results file.")
@@ -33,8 +66,10 @@ def main():
with open(args.new, "r") as f:
new_data = json.load(f)

old_totals, old_all_suites_total = calculate_totals(old_data)
new_totals, new_all_suites_total = calculate_totals(new_data)
old_data, new_data, test_metrics = infer_test_metrics(old_data, new_data)

old_totals, old_all_suites_total = calculate_totals(old_data, test_metrics)
new_totals, new_all_suites_total = calculate_totals(new_data, test_metrics)

table_data = []
for suite in old_data.keys():
@@ -43,20 +78,30 @@
for test in old_data[suite].keys():
if test not in new_data[suite]:
continue
old_mean = old_data[suite][test]["mean"]
new_mean = new_data[suite][test]["mean"]
old_min = min(old_data[suite][test]["runs"])
new_min = min(new_data[suite][test]["runs"])
old_max = max(old_data[suite][test]["runs"])
new_max = max(new_data[suite][test]["runs"])
speedup = old_mean / new_mean
test_metric = test_metrics[suite][test]
old_mean = old_data[suite][test][test_metric]["mean"]
new_mean = new_data[suite][test][test_metric]["mean"]
old_min = min(old_data[suite][test][test_metric]["runs"])
new_min = min(new_data[suite][test][test_metric]["runs"])
old_max = max(old_data[suite][test][test_metric]["runs"])
new_max = max(new_data[suite][test][test_metric]["runs"])

speedup = new_mean / old_mean
if METRIC_PERF_CORRELATION[test_metric] == "inverse":
speedup = 1. / speedup

formatted_speedup = f"{speedup:.3f}"
table_data.append([suite, test, formatted_speedup, f"{old_mean:.3f} ± {old_min:.3f} … {old_max:.3f}", f"{new_mean:.3f} ± {new_min:.3f} … {new_max:.3f}"])

# Add total times comparison to the table data
for suite in old_totals.keys():
if suite in new_totals:
speedup = old_totals[suite] / new_totals[suite]
speedup = new_totals[suite] / old_totals[suite]
suite_metrics = set(test_metrics[suite].values())
assert len(suite_metrics) == 1, f"Multiple metrics for suite {suite}: {suite_metrics}"
if METRIC_PERF_CORRELATION[suite_metrics.pop()] == "inverse":
speedup = 1. / speedup

table_data.append([suite, "Total", f"{speedup:.3f}", f"{old_totals[suite]:.3f}", f"{new_totals[suite]:.3f}"])

# Compare all suites total
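A note on the results-file schema this diff changes (a sketch; the numbers below are invented for illustration, not taken from a real run): run.py used to write one flat stats block per test, implicitly meaning wall-clock time, and now writes one block per metric. infer_test_metrics() wraps old-style files as {"time": block} so old and new results stay comparable, and it prefers "reported_score" over "time" whenever a test exposes both. Speedups are computed as new/old and inverted for time-like metrics, so a value above 1.0 always means the new run is better.

```python
# Illustrative only: field values are made up, not from a real benchmark run.

# Old layout: one flat stats block per test (implicitly wall-clock time).
old_style = {
    "SunSpider": {
        "3d-cube.js": {
            "mean": 0.112, "stdev": 0.004, "min": 0.108, "max": 0.121,
            "runs": [0.108, 0.110, 0.121],
        },
    },
}

# New layout: stats blocks keyed by metric. Time-only suites carry just
# "time"; WasmCoremark also carries "reported_score" parsed from stdout.
new_style = {
    "WasmCoremark": {
        "coremark-minimal.wasm": {
            "time": {
                "mean": 12.31, "stdev": 0.18, "min": 12.10, "max": 12.55,
                "runs": [12.10, 12.28, 12.55],
            },
            "reported_score": {
                "mean": 812.4, "stdev": 4.1, "min": 808.0, "max": 817.2,
                "runs": [808.0, 812.1, 817.2],
            },
        },
    },
}

# compare.py reports "reported_score" (higher is better) for CoreMark and
# "time" (lower is better) elsewhere; METRIC_PERF_CORRELATION flips the
# new/old ratio for "time" so speedup > 1.0 always means an improvement.
```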
run.py: 75 changes (52 additions, 23 deletions)
@@ -1,37 +1,57 @@
#!/usr/bin/env python3

import argparse
import enum
import json
import os
import subprocess
import re
import shlex
import statistics
import subprocess
import sys
from tabulate import tabulate

def run_benchmark(executable, executable_arguments, suite, test_file, iterations, index, total, suppress_output=False):
times = []
FLOAT_RE = re.compile(r"([0-9]*\.[0-9]+|[0-9]+)")

class ScoreMetric(enum.Enum):
time = "time"
output = "reported_score"

def run_benchmark(executable, executable_arguments, suite, test_file, score_metric, iterations, index, total, suppress_output=False):
unit = "s" if score_metric == ScoreMetric.time else ""
measures = { k:[] for k in ScoreMetric }

for i in range(iterations):
if not suppress_output:
print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(times):.3f}s)" if times else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(measures[score_metric]):.3f}{unit})" if measures[score_metric] else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
sys.stdout.flush()

result = subprocess.run([f"time -p {executable} {' '.join(executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL, text=True, executable="/bin/bash")
result = subprocess.run([f"time -p {shlex.quote(executable)} {' '.join(shlex.quote(arg) for arg in executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL if score_metric == ScoreMetric.time else subprocess.PIPE, text=True, executable="/bin/bash")
result.check_returncode()

time_output = result.stderr.split("\n")
real_time_line = [line for line in time_output if "real" in line][0]
time_taken = float(real_time_line.split(" ")[-1])
times.append(time_taken)

mean = statistics.mean(times)
stdev = statistics.stdev(times) if len(times) > 1 else 0
min_time = min(times)
max_time = max(times)
measures[ScoreMetric.time].append(time_taken)

if score_metric == ScoreMetric.output:
output = result.stdout.split("\n")
value = None
for line in output:
if match := FLOAT_RE.search(line):
value = float(match[1])
assert value is not None, "Expected a float in the benchmark output"
measures[ScoreMetric.output].append(value)

means = { key:statistics.mean(values) if len(values) > 0 else None for key, values in measures.items() }
stdevs = { key:statistics.stdev(values) if len(values) > 1 else 0 for key, values in measures.items() }
min_values = { key:min(values) if len(values) > 0 else None for key, values in measures.items() }
max_values = { key:max(values) if len(values) > 0 else None for key, values in measures.items() }
if not suppress_output:
print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {mean:.3f}s ± {stdev:.3f}s, Range: {min_time:.3f}s … {max_time:.3f}s\033[K")
print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {means[score_metric]:.3f}{unit} ± {stdevs[score_metric]:.3f}{unit}, Range: {min_values[score_metric]:.3f}{unit} … {max_values[score_metric]:.3f}{unit}\033[K")
sys.stdout.flush()

return mean, stdev, min_time, max_time, times
return means, stdevs, min_values, max_values, measures

def main():
parser = argparse.ArgumentParser(description="Run JavaScript benchmarks.")
@@ -44,7 +64,7 @@ def main():
args = parser.parse_args()

if args.suites == "all":
suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench"]
suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench", "WasmCoremark"]
else:
suites = args.suites.split(",")

@@ -54,7 +74,7 @@
for test_file in sorted(os.listdir("SunSpider")):
if not test_file.endswith(".js"):
continue
run_benchmark(args.executable, [], "SunSpider", test_file, 1, 0, 0, suppress_output=True)
run_benchmark(args.executable, [], "SunSpider", ScoreMetric.time, test_file, 1, 0, 0, suppress_output=True)

results = {}
table_data = []
@@ -64,32 +84,41 @@
for suite in suites:
results[suite] = {}
is_wasm_bench = suite == "WasmMicroBench"
is_wasm_coremark = suite == "WasmCoremark"

executable = ""
executable_arguments = []
score_metric = ScoreMetric.time
if (is_wasm_bench):
executable = args.wasm_executable
executable_arguments = ["-e", "run_microbench"]
elif is_wasm_coremark:
executable = args.wasm_executable
executable_arguments = ["-e", "run", "--export-js", "env.clock_ms:i64=BigInt(+new Date)"]
score_metric = ScoreMetric.output
else:
executable = args.executable

for test_file in sorted(os.listdir(suite)):
if (is_wasm_bench):
if (is_wasm_bench or is_wasm_coremark):
if not test_file.endswith(".wasm"):
continue
else:
if not test_file.endswith(".js"):
continue

mean, stdev, min_time, max_time, runs = run_benchmark(executable, executable_arguments, suite, test_file, args.iterations, current_test, total_tests)
stats = run_benchmark(executable, executable_arguments, suite, test_file, score_metric, args.iterations, current_test, total_tests)
results[suite][test_file] = {
"mean": mean,
"stdev": stdev,
"min": min_time,
"max": max_time,
"runs": runs
key.value: {
"mean": mean,
"stdev": stdev,
"min": min_val,
"max": max_val,
"runs": runs,
} for key, (mean, stdev, min_val, max_val, runs) in zip(stats[0].keys(), zip(*(x.values() for x in stats))) if runs
}
table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_time:.3f} … {max_time:.3f}"])
mean, stdev, min_val, max_val, _ = (stat[score_metric] for stat in stats)
table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_val:.3f} … {max_val:.3f}"])
current_test += 1

print(tabulate(table_data, headers=["Suite", "Test", "Mean ± σ", "Range (min … max)"]))
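One readability note on the comprehension that now builds results[suite][test_file]: it zips the five per-metric dicts returned by run_benchmark() back together and drops metrics that never collected a run. A more verbose but behaviorally equivalent sketch (assuming the dicts keep the ScoreMetric insertion order they were built with, which they do in CPython):

```python
# Equivalent expansion of the zip-based comprehension above (sketch only).
means, stdevs, min_values, max_values, measures = stats  # as returned by run_benchmark()

entry = {}
for key in ScoreMetric:              # same order as stats[0].keys()
    runs = measures[key]
    if not runs:                     # skip metrics with no samples,
        continue                     # e.g. reported_score for time-only suites
    entry[key.value] = {             # "time" or "reported_score"
        "mean": means[key],
        "stdev": stdevs[key],
        "min": min_values[key],
        "max": max_values[key],
        "runs": runs,
    }
results[suite][test_file] = entry
```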