Commit 694d528

Add coremark-wasm as a benchmark
This is a non-timing benchmark, and the output value is shown as a measure of performance (higher = better)
1 parent d151be8 commit 694d528
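A higher-is-better score only changes the direction in which the comparison ratio is read. A minimal standalone sketch of that idea, with made-up numbers, mirroring the METRIC_PERF_CORRELATION handling added to compare.py below:

    METRIC_PERF_CORRELATION = {
        "time": "inverse",          # lower is better
        "reported_score": "direct", # higher is better
    }

    def speedup(old_mean, new_mean, metric):
        # Raw ratio: > 1 means the new measurement is numerically larger.
        ratio = new_mean / old_mean
        # For time-like metrics a larger value is worse, so flip the ratio
        # so that > 1 always reads as "new build performs better".
        if METRIC_PERF_CORRELATION[metric] == "inverse":
            ratio = 1. / ratio
        return ratio

    # A run that drops from 4.0 s to 2.0 s and a CoreMark score that rises
    # from 100 to 150 both come out as a speedup greater than 1.
    assert speedup(4.0, 2.0, "time") == 2.0
    assert speedup(100.0, 150.0, "reported_score") == 1.5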

File tree: 3 files changed (+110, -36 lines)

WasmCoremark/coremark-minimal.wasm (7.59 KB): binary file not shown.

compare.py (58 additions, 13 deletions)

@@ -7,21 +7,54 @@
 import math
 from scipy import stats
 
+METRIC_PERF_CORRELATION = {
+    "time": "inverse",
+    "reported_score": "direct",
+}
+
 def confidence_interval(data, confidence=0.95):
     n = len(data)
     mean = statistics.mean(data)
     stderr = statistics.stdev(data) / math.sqrt(n) if n > 1 else 0
     interval = stderr * stats.t.ppf((1 + confidence) / 2., n-1) if n > 1 else 0
     return mean - interval, mean + interval
 
-def calculate_totals(data):
+def calculate_totals(data, test_metrics):
     totals = {}
+    time_totals = {}
     for suite, tests in data.items():
-        suite_total = sum(test["mean"] for test in tests.values())
+        suite_total = sum(tests[test][test_metrics[suite][test]]["mean"] for test in tests.keys())
+        suite_total_for_time = sum(tests[test]["time"]["mean"] for test in tests.keys())
         totals[suite] = suite_total
-    all_suites_total = sum(totals.values())
+        time_totals[suite] = suite_total_for_time
+    all_suites_total = sum(time_totals.values())
     return totals, all_suites_total
 
+def infer_test_metrics(old_data, new_data):
+    metrics = {}
+    all_suites = set(old_data.keys()).union(set(new_data.keys()))
+    for suite in all_suites:
+        all_tests = set(old_data.get(suite, {}).keys()).union(set(new_data.get(suite, {}).keys()))
+        suite_metrics = {}
+        for test in all_tests:
+            all_test_metrics = set(old_data.get(suite, {}).get(test, {}).keys()).union(set(new_data.get(suite, {}).get(test, {}).keys()))
+            chosen_metric = "time"
+            if len(all_test_metrics) == 2:
+                chosen_metric = next(x for x in all_test_metrics if x != "time")
+            suite_metrics[test] = chosen_metric
+        metrics[suite] = suite_metrics
+
+    # Migrate old data to new format
+    for data in (old_data, new_data):
+        first = data[next(iter(data))]
+        if "time" not in first[next(iter(first))]:
+            for suite in data:
+                for test in data[suite]:
+                    results = data[suite][test]
+                    data[suite][test] = { "time": results }
+
+    return old_data, new_data, metrics
+
 def main():
     parser = argparse.ArgumentParser(description="Compare JavaScript benchmark results.")
     parser.add_argument("-o", "--old", required=True, help="Old JSON results file.")
@@ -33,8 +66,10 @@ def main():
     with open(args.new, "r") as f:
         new_data = json.load(f)
 
-    old_totals, old_all_suites_total = calculate_totals(old_data)
-    new_totals, new_all_suites_total = calculate_totals(new_data)
+    old_data, new_data, test_metrics = infer_test_metrics(old_data, new_data)
+
+    old_totals, old_all_suites_total = calculate_totals(old_data, test_metrics)
+    new_totals, new_all_suites_total = calculate_totals(new_data, test_metrics)
 
     table_data = []
     for suite in old_data.keys():
@@ -43,20 +78,30 @@ def main():
         for test in old_data[suite].keys():
             if test not in new_data[suite]:
                 continue
-            old_mean = old_data[suite][test]["mean"]
-            new_mean = new_data[suite][test]["mean"]
-            old_min = min(old_data[suite][test]["runs"])
-            new_min = min(new_data[suite][test]["runs"])
-            old_max = max(old_data[suite][test]["runs"])
-            new_max = max(new_data[suite][test]["runs"])
-            speedup = old_mean / new_mean
+            test_metric = test_metrics[suite][test]
+            old_mean = old_data[suite][test][test_metric]["mean"]
+            new_mean = new_data[suite][test][test_metric]["mean"]
+            old_min = min(old_data[suite][test][test_metric]["runs"])
+            new_min = min(new_data[suite][test][test_metric]["runs"])
+            old_max = max(old_data[suite][test][test_metric]["runs"])
+            new_max = max(new_data[suite][test][test_metric]["runs"])
+
+            speedup = new_mean / old_mean
+            if METRIC_PERF_CORRELATION[test_metric] == "inverse":
+                speedup = 1. / speedup
+
             formatted_speedup = f"{speedup:.3f}"
             table_data.append([suite, test, formatted_speedup, f"{old_mean:.3f} ± {old_min:.3f}…{old_max:.3f}", f"{new_mean:.3f} ± {new_min:.3f}…{new_max:.3f}"])
 
     # Add total times comparison to the table data
     for suite in old_totals.keys():
         if suite in new_totals:
-            speedup = old_totals[suite] / new_totals[suite]
+            speedup = new_totals[suite] / old_totals[suite]
+            suite_metrics = set(test_metrics[suite].values())
+            assert len(suite_metrics) == 1, f"Multiple metrics for suite {suite}: {suite_metrics}"
+            if METRIC_PERF_CORRELATION[suite_metrics.pop()] == "inverse":
+                speedup = 1. / speedup
+
             table_data.append([suite, "Total", f"{speedup:.3f}", f"{old_totals[suite]:.3f}", f"{new_totals[suite]:.3f}"])
 
     # Compare all suites total
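For reference, a sketch of the two results-file shapes that the new infer_test_metrics() reconciles; the suite/test entries and numbers below are illustrative, not taken from a real run:

    # Old-style results file: one flat record per test, implicitly wall-clock time.
    old_style = {
        "SunSpider": {
            "3d-cube.js": {"mean": 0.042, "stdev": 0.001, "min": 0.041, "max": 0.044, "runs": [0.041, 0.042, 0.044]},
        },
    }

    # New-style results file: records keyed by metric name. WasmCoremark carries
    # "reported_score" (higher = better) next to the wall-clock "time".
    new_style = {
        "WasmCoremark": {
            "coremark-minimal.wasm": {
                "time": {"mean": 12.3, "stdev": 0.2, "min": 12.1, "max": 12.5, "runs": [12.1, 12.3, 12.5]},
                "reported_score": {"mean": 950.0, "stdev": 5.0, "min": 945.0, "max": 955.0, "runs": [945.0, 950.0, 955.0]},
            },
        },
    }

Old-format files are wrapped in place as {"time": record}, so a baseline captured before this change can still be compared against a run captured after it.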

run.py (52 additions, 23 deletions)

@@ -1,37 +1,57 @@
 #!/usr/bin/env python3
 
 import argparse
+import enum
 import json
 import os
-import subprocess
+import re
+import shlex
 import statistics
+import subprocess
 import sys
 from tabulate import tabulate
 
-def run_benchmark(executable, executable_arguments, suite, test_file, iterations, index, total, suppress_output=False):
-    times = []
+FLOAT_RE = re.compile(r"([0-9]*\.[0-9]+|[0-9]+)")
+
+class ScoreMetric(enum.Enum):
+    time = "time"
+    output = "reported_score"
+
+def run_benchmark(executable, executable_arguments, suite, test_file, score_metric, iterations, index, total, suppress_output=False):
+    unit = "s" if score_metric == ScoreMetric.time else ""
+    measures = { k:[] for k in ScoreMetric }
+
     for i in range(iterations):
         if not suppress_output:
-            print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(times):.3f}s)" if times else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
+            print(f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations}, Avg: {statistics.mean(measures[score_metric]):.3f}{unit})" if measures[score_metric] else f"[{index}/{total}] {suite}/{test_file} (Iteration {i+1}/{iterations})", end="\r")
             sys.stdout.flush()
 
-        result = subprocess.run([f"time -p {executable} {' '.join(executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL, text=True, executable="/bin/bash")
+        result = subprocess.run([f"time -p {shlex.quote(executable)} {' '.join(shlex.quote(arg) for arg in executable_arguments)} {suite}/{test_file}"], shell=True, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL if score_metric == ScoreMetric.time else subprocess.PIPE, text=True, executable="/bin/bash")
         result.check_returncode()
 
         time_output = result.stderr.split("\n")
         real_time_line = [line for line in time_output if "real" in line][0]
         time_taken = float(real_time_line.split(" ")[-1])
-        times.append(time_taken)
-
-    mean = statistics.mean(times)
-    stdev = statistics.stdev(times) if len(times) > 1 else 0
-    min_time = min(times)
-    max_time = max(times)
+        measures[ScoreMetric.time].append(time_taken)
+
+        if score_metric == ScoreMetric.output:
+            output = result.stdout.split("\n")
+            value = None
+            for line in output:
+                if match := FLOAT_RE.search(line):
+                    value = float(match[1])
+            assert value is not None, "Expected a float in the benchmark output"
+            measures[ScoreMetric.output].append(value)
+
+    means = { key:statistics.mean(values) if len(values) > 0 else None for key, values in measures.items() }
+    stdevs = { key:statistics.stdev(values) if len(values) > 1 else 0 for key, values in measures.items() }
+    min_values = { key:min(values) if len(values) > 0 else None for key, values in measures.items() }
+    max_values = { key:max(values) if len(values) > 0 else None for key, values in measures.items() }
     if not suppress_output:
-        print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {mean:.3f}s ± {stdev:.3f}s, Range: {min_time:.3f}s…{max_time:.3f}s\033[K")
+        print(f"[{index}/{total}] {suite}/{test_file} completed. Mean: {means[score_metric]:.3f}{unit} ± {stdevs[score_metric]:.3f}{unit}, Range: {min_values[score_metric]:.3f}{unit}…{max_values[score_metric]:.3f}{unit}\033[K")
         sys.stdout.flush()
 
-    return mean, stdev, min_time, max_time, times
+    return means, stdevs, min_values, max_values, measures
 
 def main():
     parser = argparse.ArgumentParser(description="Run JavaScript benchmarks.")
@@ -44,7 +64,7 @@ def main():
     args = parser.parse_args()
 
     if args.suites == "all":
-        suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench"]
+        suites = ["SunSpider", "Kraken", "Octane", "JetStream", "JetStream3", "RegExp", "MicroBench", "WasmMicroBench", "WasmCoremark"]
     else:
         suites = args.suites.split(",")
 
@@ -54,7 +74,7 @@ def main():
     for test_file in sorted(os.listdir("SunSpider")):
         if not test_file.endswith(".js"):
             continue
-        run_benchmark(args.executable, [], "SunSpider", test_file, 1, 0, 0, suppress_output=True)
+        run_benchmark(args.executable, [], "SunSpider", ScoreMetric.time, test_file, 1, 0, 0, suppress_output=True)
 
     results = {}
     table_data = []
@@ -64,32 +84,41 @@ def main():
     for suite in suites:
        results[suite] = {}
        is_wasm_bench = suite == "WasmMicroBench"
+        is_wasm_coremark = suite == "WasmCoremark"
 
        executable = ""
        executable_arguments = []
+        score_metric = ScoreMetric.time
        if (is_wasm_bench):
            executable = args.wasm_executable
            executable_arguments = ["-e", "run_microbench"]
+        elif is_wasm_coremark:
+            executable = args.wasm_executable
+            executable_arguments = ["-e", "run", "--export-js", "env.clock_ms:i64=BigInt(+new Date)"]
+            score_metric = ScoreMetric.output
        else:
            executable = args.executable
 
        for test_file in sorted(os.listdir(suite)):
-            if (is_wasm_bench):
+            if (is_wasm_bench or is_wasm_coremark):
                if not test_file.endswith(".wasm"):
                    continue
            else:
                if not test_file.endswith(".js"):
                    continue
 
-            mean, stdev, min_time, max_time, runs = run_benchmark(executable, executable_arguments, suite, test_file, args.iterations, current_test, total_tests)
+            stats = run_benchmark(executable, executable_arguments, suite, test_file, score_metric, args.iterations, current_test, total_tests)
            results[suite][test_file] = {
-                "mean": mean,
-                "stdev": stdev,
-                "min": min_time,
-                "max": max_time,
-                "runs": runs
+                key.value: {
+                    "mean": mean,
+                    "stdev": stdev,
+                    "min": min_val,
+                    "max": max_val,
+                    "runs": runs,
+                } for key, (mean, stdev, min_val, max_val, runs) in zip(stats[0].keys(), zip(*(x.values() for x in stats))) if runs
            }
-            table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_time:.3f}…{max_time:.3f}"])
+            mean, stdev, min_val, max_val, _ = (stat[score_metric] for stat in stats)
+            table_data.append([suite, test_file, f"{mean:.3f} ± {stdev:.3f}", f"{min_val:.3f}…{max_val:.3f}"])
            current_test += 1
 
    print(tabulate(table_data, headers=["Suite", "Test", "Mean ± σ", "Range (min … max)"]))
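The ScoreMetric.output path keeps the last float found in the benchmark's stdout as the reported score. A standalone sketch of that extraction (the sample output string is invented, not actual CoreMark output):

    import re

    FLOAT_RE = re.compile(r"([0-9]*\.[0-9]+|[0-9]+)")

    def parse_reported_score(stdout):
        value = None
        for line in stdout.split("\n"):
            # Keep overwriting so the last line containing a number wins.
            if match := FLOAT_RE.search(line):
                value = float(match[1])
        assert value is not None, "Expected a float in the benchmark output"
        return value

    # Invented CoreMark-style output; the score on the last matching line is used.
    sample = "CoreMark 1.0\nIterations/Sec : 1234.56"
    assert parse_reported_score(sample) == 1234.56

Taking the last match means any banner or progress lines earlier in the output are simply overwritten by the final score line.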
