diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
index 44f604e9ee5..50788eecff0 100644
--- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
+++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -58,6 +58,12 @@ class FolderBasedBuilder(datasets.GeneratorBasedBuilder):
     METADATA_FILENAMES: list[str] = ["metadata.csv", "metadata.jsonl", "metadata.parquet"]
 
     def _info(self):
+        if not self.config.data_dir and not self.config.data_files:
+            raise ValueError(
+                "Folder-based datasets require either `data_dir` or `data_files` to be specified. "
+                "Neither was provided."
+            )
+
         return datasets.DatasetInfo(features=self.config.features)
 
     def _split_generators(self, dl_manager):
diff --git a/tests/packaged_modules/test_folder_based_builder.py b/tests/packaged_modules/test_folder_based_builder.py
index 7bc4c26e716..8e17f2839c7 100644
--- a/tests/packaged_modules/test_folder_based_builder.py
+++ b/tests/packaged_modules/test_folder_based_builder.py
@@ -285,7 +285,7 @@ def test_default_folder_builder_not_usable(data_files_with_labels_no_metadata, c
 # test that AutoFolder is extended for streaming when it's child class is instantiated:
 # see line 115 in src/datasets/streaming.py
 def test_streaming_patched():
-    _ = DummyFolderBasedBuilder()
+    _ = DummyFolderBasedBuilder(data_dir=".")
     module = importlib.import_module(FolderBasedBuilder.__module__)
     assert hasattr(module, "_patched_for_streaming")
     assert module._patched_for_streaming