# Patch summary (GH-47728): stricter 'source' validation in the read_table
# code path whose error messages end in "when the pyarrow.dataset module is
# not available".
#
# python/pyarrow/parquet/core.py (hunk inside read_table):
#   * Raises ValueError if `source` is a list ("the 'source' argument cannot
#     be a list of files ...").
#   * After _resolve_filesystem_and_path(), when a filesystem was resolved,
#     raises ValueError if filesystem.get_file_info(path).is_file is false;
#     this check runs before filesystem.open_input_file(path).
#   * Removes the "TODO test that source is not a directory or a list" comment.
#
# python/pyarrow/tests/parquet/test_basic.py:
#   * Adds `import sys`.
#   * The existing pq.read_table(tempdir) assertion now expects
#     ValueError matching "the 'source' argument" instead of OSError.
#   * Adds test_read_table_raises_value_error_when_ds_is_unavailable, which
#     sets sys.modules["pyarrow.dataset"] to None via monkeypatch and is
#     parametrized over "/tmp/" and a two-element list of "/tmp/*.parquet"
#     paths, expecting the same ValueError for both.
#
# NOTE(review): the is_file guard presumably also rejects paths that do not
# exist, so a missing file would now surface as ValueError rather than the
# OSError the old test expected -- confirm this is intended.
# NOTE(review): the new test hard-codes "/tmp/" paths rather than using the
# `tempdir` fixture seen in the neighbouring tests -- confirm portability.
# NOTE(review): line breaks inside the hunks below appear to have been
# collapsed to spaces; re-wrap before attempting to apply this patch.
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..5a2752eba61 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1887,10 +1887,23 @@ def read_table(source, *, columns=None, use_threads=True, "the 'schema' argument is not supported when the " "pyarrow.dataset module is not available" ) + if isinstance(source, list): + raise ValueError( + "the 'source' argument cannot be a list of files " + "when the pyarrow.dataset module is not available" + ) + filesystem, path = _resolve_filesystem_and_path(source, filesystem) if filesystem is not None: + if not filesystem.get_file_info(path).is_file: + raise ValueError( + "the 'source' argument should be " + "an existing .parquet file and not a directory, " + "when the pyarrow.dataset module is not available" + ) + source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list + dataset = ParquetFile( source, read_dictionary=read_dictionary, binary_type=binary_type, diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 591bcffc1ac..b588528db55 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -16,6 +16,7 @@ # under the License. 
import os +import sys from collections import OrderedDict import io import warnings @@ -185,8 +186,7 @@ def __init__(self, *args, **kwargs): pq.read_table(path, partitioning=['week', 'color']) with pytest.raises(ValueError, match="the 'schema' argument"): pq.read_table(path, schema=table.schema) - # Error message varies depending on OS - with pytest.raises(OSError): + with pytest.raises(ValueError, match="the 'source' argument"): pq.read_table(tempdir) result = pq.read_table(path) assert result == table @@ -993,3 +993,14 @@ def test_checksum_write_to_dataset(tempdir): # checksum verification enabled raises an exception with pytest.raises(OSError, match="CRC checksum verification"): _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) + + +@pytest.mark.parametrize( + "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]]) +def test_read_table_raises_value_error_when_ds_is_unavailable( + monkeypatch, source): + # GH-47728 + monkeypatch.setitem(sys.modules, "pyarrow.dataset", None) + + with pytest.raises(ValueError, match="the 'source' argument"): + pq.read_table(source=source)