Commit 89bd1f9

Tests typing and fixes for push_to_hub (#7608)
* tests typing and fixes for push_to_hub
* fix
1 parent 38d4d0e commit 89bd1f9

File tree

3 files changed: +170 -21 lines

src/datasets/dataset_dict.py
src/datasets/iterable_dataset.py
tests/test_upstream_hub.py


src/datasets/dataset_dict.py

Lines changed: 70 additions & 4 deletions
@@ -31,6 +31,7 @@
 from .features import Features
 from .features.features import FeatureType
 from .info import DatasetInfo, DatasetInfosDict
+from .iterable_dataset import IterableDataset
 from .naming import _split_re
 from .splits import NamedSplit, Split, SplitDict, SplitInfo
 from .table import Table
@@ -49,7 +50,7 @@ def __call__(self, *fn_args, **fn_kwargs):
         return self.func(*fn_args, *self.args, **fn_kwargs)
 
 
-class DatasetDict(dict):
+class DatasetDict(dict[Union[str, NamedSplit], "Dataset"]):
     """A dictionary (dict of str: datasets.Dataset) with dataset transforms methods (map, filter, etc.)"""
 
     def _check_values_type(self):
@@ -1616,6 +1617,7 @@ def push_to_hub(
         max_shard_size: Optional[Union[int, str]] = None,
         num_shards: Optional[dict[str, int]] = None,
         embed_external_files: bool = True,
+        num_proc: Optional[int] = None,
     ) -> CommitInfo:
         """Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
         The [`DatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.
@@ -1676,6 +1678,12 @@ def push_to_hub(
                 In particular, this will do the following before the push for the fields of type:
 
                 - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.
+            num_proc (`int`, *optional*, defaults to `None`):
+                Number of processes when preparing and uploading the dataset.
+                This is helpful if the dataset is made of many samples or media files to embed.
+                Multiprocessing is disabled by default.
+
+                <Added version="4.0.0"/>
 
         Return:
             huggingface_hub.CommitInfo
@@ -1756,6 +1764,7 @@ def push_to_hub(
                 max_shard_size=max_shard_size,
                 num_shards=num_shards.get(split),
                 embed_external_files=embed_external_files,
+                num_proc=num_proc,
             )
             additions += split_additions
             total_uploaded_size += uploaded_size
@@ -1910,12 +1919,61 @@ def push_to_hub(
         return commit_info
 
 
-class IterableDatasetDict(dict):
+class IterableDatasetDict(dict[Union[str, NamedSplit], IterableDataset]):
+    def _check_values_type(self):
+        for dataset in self.values():
+            if not isinstance(dataset, IterableDataset):
+                raise TypeError(f"Values in `DatasetDict` should be of type `Dataset` but got type '{type(dataset)}'")
+
+    def _check_values_features(self):
+        items = [(key, dataset._resolve_features()) for key, dataset in self.items()]
+        for item_a, item_b in zip(items[:-1], items[1:]):
+            if item_a[1].features != item_b[1].features:
+                raise ValueError(
+                    f"All datasets in `DatasetDict` should have the same features but features for '{item_a[0]}' and '{item_b[0]}' don't match: {item_a[1].features} != {item_b[1].features}"
+                )
+
     def __repr__(self):
         repr = "\n".join([f"{k}: {v}" for k, v in self.items()])
         repr = re.sub(r"^", " " * 4, repr, count=0, flags=re.M)
         return f"IterableDatasetDict({{\n{repr}\n}})"
 
+    @property
+    def num_columns(self) -> dict[str, Optional[int]]:
+        """Number of columns in each split of the dataset.
+        This can contain None values if some splits have unknown features (e.g. after a map() operation).
+
+        Example:
+
+        ```py
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
+        >>> ds.num_columns
+        {'test': 2, 'train': 2, 'validation': 2}
+        ```
+        """
+        self._check_values_type()
+        return {k: dataset.num_columns for k, dataset in self.items()}
+
+    @property
+    def column_names(self) -> dict[str, Optional[list[str]]]:
+        """Names of the columns in each split of the dataset.
+        This can contain None values if some splits have unknown features (e.g. after a map() operation).
+
+        Example:
+
+        ```py
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
+        >>> ds.column_names
+        {'test': ['text', 'label'],
+         'train': ['text', 'label'],
+         'validation': ['text', 'label']}
+        ```
+        """
+        self._check_values_type()
+        return {k: dataset.column_names for k, dataset in self.items()}
+
     def with_format(
         self,
         type: Optional[str] = None,
@@ -2385,6 +2443,7 @@ def push_to_hub(
         # max_shard_size: Optional[Union[int, str]] = None, # TODO(QL): add arg
         num_shards: Optional[dict[str, int]] = None,
         embed_external_files: bool = True,
+        num_proc: Optional[int] = None,
     ) -> CommitInfo:
         """Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
         The [`DatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.
@@ -2436,6 +2495,12 @@ def push_to_hub(
                 In particular, this will do the following before the push for the fields of type:
 
                 - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.
+            num_proc (`int`, *optional*, defaults to `None`):
+                Number of processes when preparing and uploading the dataset.
+                This is helpful if the dataset is made of many samples or media files to embed.
+                Multiprocessing is disabled by default.
+
+                <Added version="4.0.0"/>
 
         Return:
             huggingface_hub.CommitInfo
@@ -2505,7 +2570,7 @@ def push_to_hub(
         for split in self.keys():
             logger.info(f"Pushing split {split} to the Hub.")
             # The split=key needs to be removed before merging
-            split_additions, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub(
+            split_additions, uploaded_size, dataset_nbytes, num_examples = self[split]._push_parquet_shards_to_hub(
                 repo_id,
                 data_dir=data_dir,
                 split=split,
@@ -2515,11 +2580,12 @@ def push_to_hub(
                 # max_shard_size=max_shard_size, # TODO(QL): add arg
                 num_shards=num_shards.get(split),
                 embed_external_files=embed_external_files,
+                num_proc=num_proc,
             )
             additions += split_additions
             total_uploaded_size += uploaded_size
             total_dataset_nbytes += dataset_nbytes
-            info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=len(self[split]))
+            info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=num_examples)
         info_to_dump.download_checksums = None
         info_to_dump.download_size = total_uploaded_size
         info_to_dump.dataset_size = total_dataset_nbytes
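Note: a minimal usage sketch of the new `num_proc` argument added above; the repository id is a placeholder, not part of the commit:

```python
from datasets import Dataset, DatasetDict

# Sketch only: push a small DatasetDict using 2 worker processes to prepare and
# upload the Parquet shards. "username/my-dataset" is a placeholder repo id.
ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
DatasetDict({"train": ds}).push_to_hub("username/my-dataset", num_proc=2)
```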

src/datasets/iterable_dataset.py

Lines changed: 37 additions & 17 deletions
@@ -2015,6 +2015,38 @@ def __init__(
         self._prepare_ex_iterable_for_iteration()  # set state_dict
         _maybe_add_torch_iterable_dataset_parent_class(self.__class__)  # subclass of torch IterableDataset
 
+    @property
+    def num_columns(self) -> Optional[int]:
+        """Number of columns in the dataset.
+        This can be None if the dataset has unknown features (e.g. after a map() operation).
+
+        Example:
+
+        ```py
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
+        >>> ds.num_columns
+        2
+        ```
+        """
+        return None if self.features is None else len(self.features)
+
+    @property
+    def column_names(self) -> Optional[list[str]]:
+        """Names of the columns in the dataset.
+        This can be None if the dataset has unknown features (e.g. after a map() operation).
+
+        Example:
+
+        ```py
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True)
+        >>> ds.column_names
+        ['text', 'label']
+        ```
+        """
+        return None if self.features is None else list(self.features)
+
     def state_dict(self) -> dict:
         """Get the current state_dict of the dataset.
         It corresponds to the state at the latest example it yielded.
@@ -3007,21 +3039,6 @@ def shard(
             token_per_repo_id=self._token_per_repo_id,
         )
 
-    @property
-    def column_names(self) -> Optional[list[str]]:
-        """Names of the columns in the dataset.
-
-        Example:
-
-        ```py
-        >>> from datasets import load_dataset
-        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True)
-        >>> ds.column_names
-        ['text', 'label']
-        ```
-        """
-        return list(self._info.features.keys()) if self._info.features is not None else None
-
     def add_column(self, name: str, column: Union[list, np.array]) -> "IterableDataset":
         """Add column to Dataset.
 
@@ -3791,7 +3808,7 @@ def _push_parquet_shards_to_hub(
         num_shards: Optional[int],
         embed_external_files: bool,
         num_proc: Optional[int],
-    ) -> tuple[list[CommitOperationAdd], int, int]:
+    ) -> tuple[list[CommitOperationAdd], int, int, int]:
         """Pushes the dataset shards as Parquet files to the hub.
 
         Returns:
@@ -3841,7 +3858,7 @@ def _push_parquet_shards_to_hub(
             total=num_shards,
             desc=desc,
         )
-        with contextlib.nullcontext() if num_proc is None and num_proc > 1 else Pool(num_proc) as pool:
+        with contextlib.nullcontext() if num_proc is None or num_proc <= 1 else Pool(num_proc) as pool:
             update_stream = (
                 IterableDataset._push_parquet_shards_to_hub_single(**kwargs_iterable[0])
                 if pool is None
@@ -3858,6 +3875,9 @@ def _push_parquet_shards_to_hub(
                 additions += content[0]
                 dataset_nbytes += content[1]
                 num_examples += content[2]
+            if pool is not None:
+                pool.close()
+                pool.join()
 
         uploaded_size = sum(addition.upload_info.size for addition in additions)
         return additions, uploaded_size, dataset_nbytes, num_examples
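Note on the pool-guard fix above: the old condition `num_proc is None and num_proc > 1` raised a `TypeError` when `num_proc` was `None` and evaluated to `False` for every integer, so a `Pool` was created even when multiprocessing was not requested. A standalone sketch of the corrected pattern (an illustration, not the library function itself):

```python
import contextlib
from multiprocessing import Pool

def open_pool(num_proc=None):
    # Corrected guard: fall back to nullcontext() when multiprocessing is not
    # requested. nullcontext() yields None, so `pool is None` selects the
    # single-process path, mirroring the fixed line in _push_parquet_shards_to_hub.
    return contextlib.nullcontext() if num_proc is None or num_proc <= 1 else Pool(num_proc)

if __name__ == "__main__":
    with open_pool() as pool:
        print(pool)   # None -> single-process path
    with open_pool(2) as pool:
        print(pool)   # <multiprocessing.pool.Pool ...> -> work dispatched to 2 processes
```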

tests/test_upstream_hub.py

Lines changed: 63 additions & 0 deletions
@@ -22,6 +22,7 @@
     DownloadManager,
     Features,
     Image,
+    IterableDatasetDict,
     Value,
     load_dataset,
     load_dataset_builder,
@@ -873,6 +874,68 @@ def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporar
                 "*/another_config/random-00000-of-00001.parquet",
             )
 
+    def test_push_dataset_dict_to_hub_num_proc(self, temporary_repo, set_ci_hub_access_token):
+        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
+
+        local_ds = DatasetDict({"train": ds})
+
+        with temporary_repo() as ds_name:
+            local_ds.push_to_hub(ds_name, num_proc=2)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            assert local_ds.column_names == hub_ds.column_names
+            assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
+            assert local_ds["train"].features == hub_ds["train"].features
+
+            # Ensure that the files on the repository have the expected names
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            assert files == [
+                ".gitattributes",
+                "README.md",
+                "data/train-00000-of-00002.parquet",
+                "data/train-00001-of-00002.parquet",
+            ]
+
+    def test_push_dataset_dict_to_hub_iterable(self, temporary_repo, set_ci_hub_access_token):
+        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}).to_iterable_dataset()
+
+        local_ds = IterableDatasetDict({"train": ds})
+
+        with temporary_repo() as ds_name:
+            local_ds.push_to_hub(ds_name)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            assert local_ds.column_names == hub_ds.column_names
+            assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
+            assert local_ds["train"].features == hub_ds["train"].features
+
+            # Ensure that there is a single file on the repository that has the correct name
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            assert files == [".gitattributes", "README.md", "data/train-00000-of-00001.parquet"]
+
+    def test_push_dataset_dict_to_hub_iterable_num_proc(self, temporary_repo, set_ci_hub_access_token):
+        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}).to_iterable_dataset(num_shards=3)
+
+        local_ds = IterableDatasetDict({"train": ds})
+
+        with temporary_repo() as ds_name:
+            local_ds.push_to_hub(ds_name, num_proc=2)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            assert local_ds.column_names == hub_ds.column_names
+            assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
+            assert local_ds["train"].features == hub_ds["train"].features
+
+            # Ensure that the files on the repository have the expected names
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            assert files == [
+                ".gitattributes",
+                "README.md",
+                "data/train-00000-of-00003.parquet",
+                "data/train-00001-of-00003.parquet",
+                "data/train-00002-of-00003.parquet",
+            ]
+
 
 class DummyFolderBasedBuilder(FolderBasedBuilder):
     BASE_FEATURE = dict
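Note: the iterable tests rely on the dataset's shard layout; the 3-shard `IterableDataset` above is uploaded as three Parquet files, which gives `num_proc=2` shards to process in parallel. A minimal sketch along the same lines (the repository id is a placeholder):

```python
from datasets import Dataset, IterableDatasetDict

# Sketch only: "username/streaming-demo" is a placeholder repo id.
stream = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}).to_iterable_dataset(num_shards=3)
print(stream.num_columns)   # 2 -- property added in this commit
print(stream.column_names)  # ['x', 'y']
IterableDatasetDict({"train": stream}).push_to_hub("username/streaming-demo", num_proc=2)
```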
