11import os
2- from . _internal import PyDataset
2+
33from huggingface_hub import hf_hub_download , list_repo_files
44
5+ from ._internal import PyDataset
6+
7+
58__all__ = ["Dataset" ]
69
710
@@ -12,9 +15,9 @@ class Dataset(PyDataset):
1215 # ...
1316 # ]
1417
15- def from_hub (repo , metadata_store ):
18+ def from_hub (repo , metadata_store , revision = None ):
1619 """Create a Dataset from Hugging Face Hub."""
17- repo_files = list_repo_files (repo , repo_type = "dataset" )
20+ repo_files = list_repo_files (repo , repo_type = "dataset" , revision = revision )
1821
1922 parquet_files = []
2023 for filename in repo_files :
@@ -31,12 +34,16 @@ def from_hub(repo, metadata_store):
3134 raise ValueError (f"No parquet files found in the dataset '{ repo } '." )
3235
3336 return Dataset (
34- repo , parquet_files , data_store = "hf://" , metadata_store = metadata_store
37+ repo ,
38+ parquet_files ,
39+ revision = revision ,
40+ data_store = f"hf://datasets/{ repo } " ,
41+ metadata_store = metadata_store ,
3542 )
3643
37- def from_cache (repo , metadata_store , download = False ):
44+ def from_cache (repo , metadata_store , revision = None , download = False ):
3845 """Create a Dataset from HF local cache."""
39- repo_files = list_repo_files (repo , repo_type = "dataset" )
46+ repo_files = list_repo_files (repo , repo_type = "dataset" , revision = revision )
4047
4148 parquet_files = []
4249 for filename in repo_files :
@@ -57,5 +64,9 @@ def from_cache(repo, metadata_store, download=False):
5764 raise ValueError (f"No parquet files found in the dataset '{ repo } '." )
5865
5966 return Dataset (
60- repo , parquet_files , data_store = "file://" , metadata_store = metadata_store
67+ repo ,
68+ parquet_files ,
69+ revision = revision ,
70+ data_store = "file://" ,
71+ metadata_store = metadata_store ,
6172 )
0 commit comments