Skip to content

Commit da1a31d

Browse files
committed
Add parquet h2o support
1 parent 38b87bf commit da1a31d

File tree

1 file changed

+75
-9
lines changed

1 file changed

+75
-9
lines changed

benchmarks/bench.sh

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,24 @@ clickbench_pushdown: ClickBench queries against partitioned (100 files) parqu
100100
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
101101
102102
# H2O.ai Benchmarks (Group By, Join, Window)
103-
h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv
104-
h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
105-
h2o_big: h2oai benchmark with large dataset (1e9 rows) for groupby, default file format is csv
106-
h2o_small_join: h2oai benchmark with small dataset (1e7 rows) for join, default file format is csv
107-
h2o_medium_join: h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
108-
h2o_big_join: h2oai benchmark with large dataset (1e9 rows) for join, default file format is csv
109-
h2o_small_window: Extended h2oai benchmark with small dataset (1e7 rows) for window, default file format is csv
110-
h2o_medium_window: Extended h2oai benchmark with medium dataset (1e8 rows) for window, default file format is csv
111-
h2o_big_window: Extended h2oai benchmark with large dataset (1e9 rows) for window, default file format is csv
103+
h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv
104+
h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
105+
h2o_big: h2oai benchmark with large dataset (1e9 rows) for groupby, default file format is csv
106+
h2o_small_join: h2oai benchmark with small dataset (1e7 rows) for join, default file format is csv
107+
h2o_medium_join: h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
108+
h2o_big_join: h2oai benchmark with large dataset (1e9 rows) for join, default file format is csv
109+
h2o_small_window: Extended h2oai benchmark with small dataset (1e7 rows) for window, default file format is csv
110+
h2o_medium_window: Extended h2oai benchmark with medium dataset (1e8 rows) for window, default file format is csv
111+
h2o_big_window: Extended h2oai benchmark with large dataset (1e9 rows) for window, default file format is csv
112+
h2o_small_parquet: h2oai benchmark with small dataset (1e7 rows) for groupby, file format is parquet
113+
h2o_medium_parquet: h2oai benchmark with medium dataset (1e8 rows) for groupby, file format is parquet
114+
h2o_big_parquet: h2oai benchmark with large dataset (1e9 rows) for groupby, file format is parquet
115+
h2o_small_join_parquet: h2oai benchmark with small dataset (1e7 rows) for join, file format is parquet
116+
h2o_medium_join_parquet: h2oai benchmark with medium dataset (1e8 rows) for join, file format is parquet
117+
h2o_big_join_parquet: h2oai benchmark with large dataset (1e9 rows) for join, file format is parquet
118+
h2o_small_window_parquet: Extended h2oai benchmark with small dataset (1e7 rows) for window, file format is parquet
119+
h2o_medium_window_parquet: Extended h2oai benchmark with medium dataset (1e8 rows) for window, file format is parquet
120+
h2o_big_window_parquet: Extended h2oai benchmark with large dataset (1e9 rows) for window, file format is parquet
112121
113122
# Join Order Benchmark (IMDB)
114123
imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
@@ -245,6 +254,34 @@ main() {
245254
h2o_big_window)
246255
data_h2o_join "BIG" "CSV"
247256
;;
257+
h2o_small_parquet)
258+
data_h2o "SMALL" "PARQUET"
259+
;;
260+
h2o_medium_parquet)
261+
data_h2o "MEDIUM" "PARQUET"
262+
;;
263+
h2o_big_parquet)
264+
data_h2o "BIG" "PARQUET"
265+
;;
266+
h2o_small_join_parquet)
267+
data_h2o_join "SMALL" "PARQUET"
268+
;;
269+
h2o_medium_join_parquet)
270+
data_h2o_join "MEDIUM" "PARQUET"
271+
;;
272+
h2o_big_join_parquet)
273+
data_h2o_join "BIG" "PARQUET"
274+
;;
275+
# The h2o window benchmark uses the same data as the h2o join benchmark
276+
h2o_small_window_parquet)
277+
data_h2o_join "SMALL" "PARQUET"
278+
;;
279+
h2o_medium_window_parquet)
280+
data_h2o_join "MEDIUM" "PARQUET"
281+
;;
282+
h2o_big_window_parquet)
283+
data_h2o_join "BIG" "PARQUET"
284+
;;
248285
external_aggr)
249286
# same data as for tpch
250287
data_tpch "1"
@@ -381,6 +418,34 @@ main() {
381418
h2o_big_window)
382419
run_h2o_window "BIG" "CSV" "window"
383420
;;
421+
h2o_small_parquet)
422+
run_h2o "SMALL" "PARQUET"
423+
;;
424+
h2o_medium_parquet)
425+
run_h2o "MEDIUM" "PARQUET"
426+
;;
427+
h2o_big_parquet)
428+
run_h2o "BIG" "PARQUET"
429+
;;
430+
h2o_small_join_parquet)
431+
run_h2o_join "SMALL" "PARQUET"
432+
;;
433+
h2o_medium_join_parquet)
434+
run_h2o_join "MEDIUM" "PARQUET"
435+
;;
436+
h2o_big_join_parquet)
437+
run_h2o_join "BIG" "PARQUET"
438+
;;
439+
# The h2o window benchmark runs against the same data as the h2o join benchmark
440+
h2o_small_window_parquet)
441+
run_h2o_window "SMALL" "PARQUET"
442+
;;
443+
h2o_medium_window_parquet)
444+
run_h2o_window "MEDIUM" "PARQUET"
445+
;;
446+
h2o_big_window_parquet)
447+
run_h2o_window "BIG" "PARQUET"
448+
;;
384449
external_aggr)
385450
run_external_aggr
386451
;;
@@ -775,6 +840,7 @@ data_h2o() {
775840

776841
# Set virtual environment directory
777842
VIRTUAL_ENV="${PWD}/venv"
843+
rm -rf "$VIRTUAL_ENV"
778844

779845
# Create a virtual environment using the detected Python command
780846
$PYTHON_CMD -m venv "$VIRTUAL_ENV"

0 commit comments

Comments
 (0)