from .features import Features
from .features.features import FeatureType
from .info import DatasetInfo, DatasetInfosDict
+ from .iterable_dataset import IterableDataset
from .naming import _split_re
from .splits import NamedSplit, Split, SplitDict, SplitInfo
from .table import Table
@@ -49,7 +50,7 @@ def __call__(self, *fn_args, **fn_kwargs):
        return self.func(*fn_args, *self.args, **fn_kwargs)


- class DatasetDict(dict):
+ class DatasetDict(dict[Union[str, NamedSplit], "Dataset"]):
    """A dictionary (dict of str: datasets.Dataset) with dataset transforms methods (map, filter, etc.)"""

    def _check_values_type(self):
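Not part of the diff, but a minimal sketch of what the new generic base class buys: once `DatasetDict` subclasses `dict[Union[str, NamedSplit], "Dataset"]`, indexing a `DatasetDict` is seen by static type checkers as yielding a `Dataset`. The dataset name below is just the one used elsewhere in these docstrings, not something this PR prescribes.

```py
from datasets import Dataset, DatasetDict, load_dataset

ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
assert isinstance(ds, DatasetDict)

# With the generic dict[Union[str, NamedSplit], "Dataset"] base class,
# a type checker (mypy, pyright) can infer `train` as a Dataset here,
# so attributes like .num_rows resolve without an explicit cast.
train = ds["train"]
print(train.num_rows)
```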
@@ -1616,6 +1617,7 @@ def push_to_hub(
        max_shard_size: Optional[Union[int, str]] = None,
        num_shards: Optional[dict[str, int]] = None,
        embed_external_files: bool = True,
+         num_proc: Optional[int] = None,
    ) -> CommitInfo:
        """Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
        The [`DatasetDict`] is pushed using HTTP requests and does not require git or git-lfs to be installed.
@@ -1676,6 +1678,12 @@ def push_to_hub(
                In particular, this will do the following before the push for the fields of type:

                - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.
+             num_proc (`int`, *optional*, defaults to `None`):
+                 Number of processes when preparing and uploading the dataset.
+                 This is helpful if the dataset consists of many samples or contains media files to embed.
+                 Multiprocessing is disabled by default.
+
+                 <Added version="4.0.0"/>

        Return:
            huggingface_hub.CommitInfo
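A short usage sketch for the new `num_proc` argument, assuming an existing `DatasetDict`; the repository id and the process count are placeholders, not values taken from this diff.

```py
from datasets import load_dataset

dset_dict = load_dataset("cornell-movie-review-data/rotten_tomatoes")  # a DatasetDict

# Prepare and upload the Parquet shards with 4 worker processes instead of the
# default single process; mainly useful for datasets with many samples or with
# audio/image files that need to be embedded before the push.
dset_dict.push_to_hub("my-username/rotten_tomatoes-copy", num_proc=4)
```

As documented above, leaving `num_proc=None` keeps the previous single-process behavior.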
@@ -1756,6 +1764,7 @@ def push_to_hub(
                max_shard_size=max_shard_size,
                num_shards=num_shards.get(split),
                embed_external_files=embed_external_files,
+                 num_proc=num_proc,
            )
            additions += split_additions
            total_uploaded_size += uploaded_size
@@ -1910,12 +1919,61 @@ def push_to_hub(
        return commit_info


- class IterableDatasetDict(dict):
+ class IterableDatasetDict(dict[Union[str, NamedSplit], IterableDataset]):
+     def _check_values_type(self):
+         for dataset in self.values():
+             if not isinstance(dataset, IterableDataset):
+                 raise TypeError(f"Values in `IterableDatasetDict` should be of type `IterableDataset` but got type '{type(dataset)}'")
+
+     def _check_values_features(self):
+         items = [(key, dataset._resolve_features()) for key, dataset in self.items()]
+         for item_a, item_b in zip(items[:-1], items[1:]):
+             if item_a[1].features != item_b[1].features:
+                 raise ValueError(
+                     f"All datasets in `IterableDatasetDict` should have the same features but features for '{item_a[0]}' and '{item_b[0]}' don't match: {item_a[1].features} != {item_b[1].features}"
+                 )
+
    def __repr__(self):
        repr = "\n".join([f"{k}: {v}" for k, v in self.items()])
        repr = re.sub(r"^", " " * 4, repr, count=0, flags=re.M)
        return f"IterableDatasetDict({{\n{repr}\n}})"

+     @property
+     def num_columns(self) -> dict[str, Optional[int]]:
+         """Number of columns in each split of the dataset.
+         This can contain `None` values if some splits have unknown features (e.g. after a `map()` operation).
+
+         Example:
+
+         ```py
+         >>> from datasets import load_dataset
+         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
+         >>> ds.num_columns
+         {'test': 2, 'train': 2, 'validation': 2}
+         ```
+         """
+         self._check_values_type()
+         return {k: dataset.num_columns for k, dataset in self.items()}
+
+     @property
+     def column_names(self) -> dict[str, Optional[list[str]]]:
+         """Names of the columns in each split of the dataset.
+         This can contain `None` values if some splits have unknown features (e.g. after a `map()` operation).
+
+         Example:
+
+         ```py
+         >>> from datasets import load_dataset
+         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
+         >>> ds.column_names
+         {'test': ['text', 'label'],
+          'train': ['text', 'label'],
+          'validation': ['text', 'label']}
+         ```
+         """
+         self._check_values_type()
+         return {k: dataset.column_names for k, dataset in self.items()}
+
    def with_format(
        self,
        type: Optional[str] = None,
@@ -2385,6 +2443,7 @@ def push_to_hub(
        # max_shard_size: Optional[Union[int, str]] = None,  # TODO(QL): add arg
        num_shards: Optional[dict[str, int]] = None,
        embed_external_files: bool = True,
+         num_proc: Optional[int] = None,
    ) -> CommitInfo:
        """Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
        The [`DatasetDict`] is pushed using HTTP requests and does not require git or git-lfs to be installed.
@@ -2436,6 +2495,12 @@ def push_to_hub(
                In particular, this will do the following before the push for the fields of type:

                - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.
+             num_proc (`int`, *optional*, defaults to `None`):
+                 Number of processes when preparing and uploading the dataset.
+                 This is helpful if the dataset consists of many samples or contains media files to embed.
+                 Multiprocessing is disabled by default.
+
+                 <Added version="4.0.0"/>

        Return:
            huggingface_hub.CommitInfo
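The same argument sketched for `IterableDatasetDict.push_to_hub`; this assumes a streaming load, and the repository id and process count are again placeholders rather than values from the diff.

```py
from datasets import load_dataset

ids_dict = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)  # an IterableDatasetDict

# The iterable variant exposes the same num_proc knob: shards for each split are
# prepared and uploaded by 2 worker processes rather than sequentially.
ids_dict.push_to_hub("my-username/rotten_tomatoes-streaming-copy", num_proc=2)
```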
@@ -2505,7 +2570,7 @@ def push_to_hub(
        for split in self.keys():
            logger.info(f"Pushing split {split} to the Hub.")
            # The split=key needs to be removed before merging
-             split_additions, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub(
+             split_additions, uploaded_size, dataset_nbytes, num_examples = self[split]._push_parquet_shards_to_hub(
                repo_id,
                data_dir=data_dir,
                split=split,
@@ -2515,11 +2580,12 @@ def push_to_hub(
                # max_shard_size=max_shard_size,  # TODO(QL): add arg
                num_shards=num_shards.get(split),
                embed_external_files=embed_external_files,
+                 num_proc=num_proc,
            )
            additions += split_additions
            total_uploaded_size += uploaded_size
            total_dataset_nbytes += dataset_nbytes
-             info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=len(self[split]))
+             info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=num_examples)
        info_to_dump.download_checksums = None
        info_to_dump.download_size = total_uploaded_size
        info_to_dump.dataset_size = total_dataset_nbytes