Skip to content

Commit 6807622

Browse files
committed
test: add initial tests of parquet metadata
1 parent 9fb8f36 commit 6807622

File tree

3 files changed

+101
-1
lines changed

3 files changed

+101
-1
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,8 @@ filterwarnings = [
171171
markers = [
172172
"snowflake",
173173
"library",
174-
"s3"
174+
"s3",
175+
"slow",
175176
]
176177

177178

python/letsql/backends/let/tests/conftest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import pathlib
2+
13
import pytest
24
import pyarrow as pa
5+
import pyarrow.parquet as pq
36

47
import letsql as ls
58

@@ -120,3 +123,39 @@ def df():
120123
)
121124

122125
return batch.to_pandas()
126+
127+
128+
@pytest.fixture(scope="function")
def parquet_metadata():
    """Return the bytes key/value mapping used as the expected parquet metadata."""
    expected = {b"mykey": b"myvalue"}
    return expected
131+
132+
133+
@pytest.fixture(scope="function")
def parquet_path_without_metadata(tmpdir):
    """Write a one-row parquet file under ``tmpdir`` and return its path.

    The table schema carries the ``b"mykey"``/``b"myvalue"`` key/value
    metadata, but no ``sorting_columns`` are passed to the writer.
    """
    target = pathlib.Path(tmpdir) / "without-metadata.parquet"
    kv_metadata = {b"mykey": b"myvalue"}
    plain = pa.Table.from_pydict({"a": [1], "b": ["two"]})
    table = plain.replace_schema_metadata(kv_metadata)
    with pq.ParquetWriter(target, table.schema) as writer:
        writer.write_table(table)
    return target
145+
146+
147+
@pytest.fixture(scope="function")
def parquet_path_with_metadata(tmpdir):
    """Write a one-row parquet file with sorting columns and return its path.

    The table schema carries the ``b"mykey"``/``b"myvalue"`` key/value
    metadata, and the writer is given ascending sorting columns for every
    field, listed in reverse schema order.
    """
    target = pathlib.Path(tmpdir) / "with-metadata.parquet"
    kv_metadata = {b"mykey": b"myvalue"}
    plain = pa.Table.from_pydict({"a": [1], "b": ["two"]})
    table = plain.replace_schema_metadata(kv_metadata)
    # The order of the sort keys matters: they are declared last-field-first.
    ordering = [(name, "ascending") for name in reversed(table.schema.names)]
    sorting_columns = pq.SortingColumn.from_ordering(table.schema, ordering)
    with pq.ParquetWriter(
        target, table.schema, sorting_columns=sorting_columns
    ) as writer:
        writer.write_table(table)
    return target

python/letsql/backends/let/tests/test_client.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import ibis
22
import pyarrow as pa
3+
import pyarrow.parquet as pq
4+
import pytest
5+
from pytest import param
36

47
import letsql as ls
58
from letsql.tests.util import (
@@ -75,3 +78,60 @@ def test_register_table_with_uppercase_multiple_times(ls_con):
7578
assert uppercase_table_name in ls_con.list_tables()
7679
assert ls.execute(t) is not None
7780
assert t.schema() == expected_schema
81+
82+
83+
@pytest.mark.xfail(reason="datafusion metadata reading not working")
def test_parquet_expr_metadata_available(
    ls_con, parquet_metadata, parquet_path_with_metadata
):
    """Check that the registered table's schema metadata contains the expected pairs."""
    name = "t"
    ls_con.read_parquet(parquet_path_with_metadata, table_name=name)
    registered_schema = ls_con.con.table(name).schema()
    expected_pairs = set(parquet_metadata.items())
    found_pairs = set(registered_schema.metadata.items())
    # Every expected key/value pair must be present; extra pairs are allowed.
    missing = expected_pairs - found_pairs
    assert not missing
91+
92+
93+
@pytest.mark.parametrize(
    "path",
    (
        param(
            "parquet_path_with_metadata",
            id="pathlib",
            marks=[],
        ),
        param(
            "s3://letsql-pytest/with-metadata.parquet",
            id="s3",
            marks=[pytest.mark.slow],
        ),
        param(
            "https://letsql-pytest.s3.us-east-2.amazonaws.com/with-metadata.parquet",
            id="https",
            marks=[
                pytest.mark.xfail(
                    reason="pyarrow.parquet.read_metadata can't do http/https"
                )
            ],
        ),
    ),
)
def test_parquet_metadata_readable(request, parquet_metadata, path):
    """Check that the expected key/value pairs can be read from the parquet file.

    ``path`` is either the name of a fixture that yields a local file path or
    a literal URI. Fixture names are resolved through ``request``; anything
    that is not a known fixture is passed to pyarrow unchanged.
    """
    try:
        path = request.getfixturevalue(path)
    except pytest.FixtureLookupError:
        # Not a fixture name, so ``path`` is already a literal location.
        # Only the lookup failure is ignored: an error raised while building
        # a real fixture must propagate instead of being masked by a later
        # "file not found" on the fixture's name.
        pass
    metadata = pq.read_metadata(path)
    # Every expected key/value pair must be present; extra pairs are allowed.
    assert not set(parquet_metadata.items()).difference(set(metadata.metadata.items()))
124+
125+
126+
def test_file_sort_order_injected(ls_con, parquet_path_with_metadata):
    """Check that the physical plan of a group-by over the file reports sorted ordering."""
    t = ls_con.read_parquet(parquet_path_with_metadata, table_name="t")
    expr = t.group_by("b").agg(max_a=t["a"].max())
    explain_sql = f"EXPLAIN {ls.to_sql(expr)}"
    # EXPLAIN output is read as a frame with one row per plan type.
    plan_frame = ls_con.con.sql(explain_sql).to_pandas()
    plans_by_type = plan_frame.set_index("plan_type")
    physical_plan = plans_by_type.loc["physical_plan", "plan"]
    assert "ordering_mode=Sorted" in physical_plan

0 commit comments

Comments
 (0)